@painitehq/structured-llm 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +145 -59
- package/dist/index.cjs +280 -74
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +19 -5
- package/dist/index.d.ts +19 -5
- package/dist/index.js +279 -75
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -1,75 +1,161 @@
|
|
|
1
|
-
#
|
|
1
|
+
# @painitehq/structured-llm
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
Force LLM output into structured, type-safe JSON. Stop your app from crashing on malformed AI responses.
|
|
4
4
|
|
|
5
|
-
|
|
5
|
+
## Install
|
|
6
6
|
|
|
7
|
-
|
|
8
|
-
|
|
7
|
+
```bash
|
|
8
|
+
npm install @painitehq/structured-llm
|
|
9
|
+
# or
|
|
10
|
+
bun add @painitehq/structured-llm
|
|
11
|
+
```
|
|
9
12
|
|
|
10
|
-
##
|
|
13
|
+
## Quick Start
|
|
11
14
|
|
|
12
|
-
|
|
15
|
+
```ts
|
|
16
|
+
import { extract, defineSchema } from "@painitehq/structured-llm";
|
|
13
17
|
|
|
14
|
-
|
|
18
|
+
const schema = defineSchema("invoice", {
|
|
19
|
+
invoiceNumber: { type: "string" },
|
|
20
|
+
totalAmount: { type: "number" },
|
|
21
|
+
vendor: { type: "string" },
|
|
22
|
+
items: { type: "array", items: { type: "string" } },
|
|
23
|
+
});
|
|
15
24
|
|
|
16
|
-
|
|
25
|
+
const result = await extract(messyText, schema, {
|
|
26
|
+
apiKey: "your-openrouter-api-key",
|
|
27
|
+
});
|
|
17
28
|
|
|
18
|
-
|
|
29
|
+
console.log(result.data);
|
|
30
|
+
// { invoiceNumber: "INV-2024-042", totalAmount: 500, vendor: "Acme Corp", items: [...] }
|
|
19
31
|
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
{
|
|
24
|
-
files: ['**/*.{ts,tsx}'],
|
|
25
|
-
extends: [
|
|
26
|
-
// Other configs...
|
|
32
|
+
console.log(result.confidence); // 100
|
|
33
|
+
console.log(result.attempts); // 1
|
|
34
|
+
```
|
|
27
35
|
|
|
28
|
-
|
|
29
|
-
tseslint.configs.recommendedTypeChecked,
|
|
30
|
-
// Alternatively, use this for stricter rules
|
|
31
|
-
tseslint.configs.strictTypeChecked,
|
|
32
|
-
// Optionally, add this for stylistic rules
|
|
33
|
-
tseslint.configs.stylisticTypeChecked,
|
|
36
|
+
## What It Does
|
|
34
37
|
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
38
|
+
LLMs return unstructured text. Sometimes it's valid JSON. Sometimes it's wrapped in markdown. Sometimes it's completely broken. This SDK:
|
|
39
|
+
|
|
40
|
+
1. **Forces** the model to output valid JSON via strict prompt engineering
|
|
41
|
+
2. **Repairs** malformed JSON (trailing commas, missing brackets, broken quotes)
|
|
42
|
+
3. **Unwraps** named wrappers like `{"invoice": {...}}` → `{...}`
|
|
43
|
+
4. **Validates** the output against your schema
|
|
44
|
+
5. **Coerces** wrong types (`"42"` → `42`, `"true"` → `true`)
|
|
45
|
+
6. **Fills** missing fields with sensible defaults
|
|
46
|
+
7. **Retries** with escalating instructions if the model fails
|
|
47
|
+
|
|
48
|
+
## Features
|
|
49
|
+
|
|
50
|
+
### Forced Structured Output
|
|
51
|
+
|
|
52
|
+
The SDK doesn't ask the model to "give JSON". It forces it:
|
|
53
|
+
|
|
54
|
+
```
|
|
55
|
+
CRITICAL RULES:
|
|
56
|
+
- Output ONLY valid JSON. No text before or after.
|
|
57
|
+
- No markdown. No code blocks. No explanations.
|
|
58
|
+
- Every field MUST be present with the correct type.
|
|
46
59
|
```
|
|
47
60
|
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
61
|
+
### Escalating Retries
|
|
62
|
+
|
|
63
|
+
If the model fails, the SDK retries with increasingly strict instructions:
|
|
64
|
+
|
|
65
|
+
- **Attempt 1**: Clean forced prompt
|
|
66
|
+
- **Attempt 2**: "Your response was invalid. Here's the error. Fix it."
|
|
67
|
+
- **Attempt 3**: "FINAL ATTEMPT. THIS IS YOUR LAST CHANCE."
|
|
68
|
+
|
|
69
|
+
### Post-Validation Repair
|
|
70
|
+
|
|
71
|
+
Even if the JSON parses, the SDK fixes type mismatches:
|
|
72
|
+
|
|
73
|
+
| Model returns | Schema expects | SDK does |
|
|
74
|
+
|---------------|----------------|----------|
|
|
75
|
+
| `"42"` | `number` | coerces to `42` |
|
|
76
|
+
| `"true"` | `boolean` | coerces to `true` |
|
|
77
|
+
| `42` | `string` | coerces to `"42"` |
|
|
78
|
+
| missing field | any type | fills with default |
|
|
79
|
+
|
|
80
|
+
### Confidence Scoring
|
|
81
|
+
|
|
82
|
+
Every extraction returns a confidence score (0-100):
|
|
83
|
+
|
|
84
|
+
```ts
|
|
85
|
+
const result = await extract(text, schema, { apiKey });
|
|
86
|
+
|
|
87
|
+
result.confidence; // 85
|
|
88
|
+
result.repairLog; // [{ type: "type_coercion", detail: "Coerced \"price\" to number" }]
|
|
89
|
+
result.attempts; // 2
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
Confidence deductions:
|
|
93
|
+
- JSON not valid first try: -15
|
|
94
|
+
- Each retry: -10
|
|
95
|
+
- Type coercion per field: -5
|
|
96
|
+
- Missing field filled: -8 (-3 when the field's schema `default` is used)
|
|
97
|
+
|
|
98
|
+
## Schema Definition
|
|
99
|
+
|
|
100
|
+
```ts
|
|
101
|
+
import { defineSchema } from "@painitehq/structured-llm";
|
|
102
|
+
|
|
103
|
+
const schema = defineSchema("person", {
|
|
104
|
+
name: { type: "string", description: "Full name" },
|
|
105
|
+
age: { type: "number" },
|
|
106
|
+
isStudent: { type: "boolean" },
|
|
107
|
+
hobbies: { type: "array", items: { type: "string" } },
|
|
108
|
+
address: {
|
|
109
|
+
type: "object",
|
|
110
|
+
properties: {
|
|
111
|
+
city: { type: "string" },
|
|
112
|
+
zip: { type: "string" },
|
|
72
113
|
},
|
|
73
114
|
},
|
|
74
|
-
|
|
115
|
+
});
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
Supported types: `string`, `number`, `boolean`, `array`, `object`
|
|
119
|
+
|
|
120
|
+
## API
|
|
121
|
+
|
|
122
|
+
### `extract<T>(input, schema, options)`
|
|
123
|
+
|
|
124
|
+
Returns `Promise<ExtractionResult<T>>`:
|
|
125
|
+
|
|
126
|
+
```ts
|
|
127
|
+
interface ExtractionResult<T> {
|
|
128
|
+
data: T; // typed structured data
|
|
129
|
+
raw: string; // raw LLM response
|
|
130
|
+
model: string; // model used
|
|
131
|
+
confidence: number; // 0-100 score
|
|
132
|
+
repairLog: RepairAction[]; // what was fixed
|
|
133
|
+
attempts: number; // how many tries
|
|
134
|
+
usage?: {
|
|
135
|
+
promptTokens: number;
|
|
136
|
+
completionTokens: number;
|
|
137
|
+
totalTokens: number;
|
|
138
|
+
};
|
|
139
|
+
}
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
### Options
|
|
143
|
+
|
|
144
|
+
```ts
|
|
145
|
+
{
|
|
146
|
+
apiKey?: string; // OpenRouter API key (or set OPENROUTER_API_KEY env var)
|
|
147
|
+
model?: string; // model to use (default: "openrouter/free")
|
|
148
|
+
temperature?: number; // 0-1 (default: 0)
|
|
149
|
+
maxRetries?: number; // max retry attempts (default: 3)
|
|
150
|
+
timeout?: number; // request timeout in ms (default: 60000)
|
|
151
|
+
}
|
|
75
152
|
```
|
|
153
|
+
|
|
154
|
+
## Requirements
|
|
155
|
+
|
|
156
|
+
- OpenRouter API key (get one at https://openrouter.ai)
|
|
157
|
+
- Any runtime: Node.js, Bun, Deno, browsers
|
|
158
|
+
|
|
159
|
+
## License
|
|
160
|
+
|
|
161
|
+
MIT
|
package/dist/index.cjs
CHANGED
|
@@ -111,12 +111,6 @@ function repairJSON(raw) {
|
|
|
111
111
|
candidate = candidate.replace(/,\s*([}\]])/g, "$1");
|
|
112
112
|
candidate = candidate.replace(/'/g, '"');
|
|
113
113
|
candidate = candidate.replace(/(\w+)\s*:/g, '"$1":');
|
|
114
|
-
candidate = candidate.replace(/:\s*"([^"]*?)"/g, (match, content) => {
|
|
115
|
-
if (content.includes('"')) {
|
|
116
|
-
return match;
|
|
117
|
-
}
|
|
118
|
-
return match;
|
|
119
|
-
});
|
|
120
114
|
if (!candidate.startsWith("{") && !candidate.startsWith("[")) {
|
|
121
115
|
const firstBrace = candidate.indexOf("{");
|
|
122
116
|
const firstBracket = candidate.indexOf("[");
|
|
@@ -141,6 +135,128 @@ function repairJSON(raw) {
|
|
|
141
135
|
}
|
|
142
136
|
return null;
|
|
143
137
|
}
|
|
138
|
+
/**
 * Tries to convert `value` into the type a schema field expects.
 *
 * @param {*} value - Value taken from the parsed model output.
 * @param {string} targetType - One of "string", "number", "boolean", "array";
 *   any other type name leaves the value untouched.
 * @returns {{ value: *, coerced: boolean }} The (possibly converted) value and
 *   a flag that is true only when a conversion actually happened.
 */
function coerceType(value, targetType) {
  // Absent values are never coerced.
  if (value === void 0 || value === null) {
    return { value, coerced: false };
  }
  switch (targetType) {
    case "string":
      if (typeof value === "string") return { value, coerced: false };
      // String() on objects/arrays produces garbage such as "[object Object]",
      // so only primitives are stringified; structured values are left as-is.
      if (typeof value === "object" || typeof value === "function") {
        return { value, coerced: false };
      }
      return { value: String(value), coerced: true };
    case "number":
      if (typeof value === "number") return { value, coerced: false };
      // Number("") and Number("   ") evaluate to 0, which would turn a blank
      // answer into a fabricated zero; blank strings are therefore rejected.
      if (typeof value === "string" && value.trim() !== "") {
        const num = Number(value);
        if (!Number.isNaN(num)) return { value: num, coerced: true };
      }
      return { value, coerced: false };
    case "boolean":
      if (typeof value === "boolean") return { value, coerced: false };
      if (typeof value === "string") {
        // Surrounding whitespace and letter case are ignored.
        const lower = value.trim().toLowerCase();
        if (lower === "true" || lower === "yes" || lower === "1") return { value: true, coerced: true };
        if (lower === "false" || lower === "no" || lower === "0") return { value: false, coerced: true };
      }
      if (typeof value === "number") {
        return { value: value !== 0, coerced: true };
      }
      return { value, coerced: false };
    case "array":
      if (Array.isArray(value)) return { value, coerced: false };
      // A lone value is wrapped into a one-element array.
      return { value: [value], coerced: true };
    default:
      return { value, coerced: false };
  }
}
|
|
171
|
+
/**
 * Repairs a parsed object so that it matches a schema's field map.
 *
 * Steps, in order:
 *   1. If the object is wrapped as `{ [schemaName]: {...} }`, unwrap it.
 *   2. For every declared field that is missing, fill in the field's `default`
 *      or an empty value for its type.
 *   3. Coerce present values to the declared type via `coerceType`.
 *   4. Repair array items via `repairItem` and nested objects recursively.
 *
 * The object is repaired in place; the same (possibly unwrapped) object is
 * returned as `data`.
 *
 * @param {*} data - Parsed model output. Non-objects and arrays are returned untouched.
 * @param {Object} fields - Map of field name to field definition
 *   (`type`, optional `default`, `items`, `properties`).
 * @param {string} schemaName - Wrapper key to unwrap; "" means "no wrapper".
 * @returns {{ data: *, actions: Array<{attempt: number, type: string, detail: string}> }}
 */
function repairData(data, fields, schemaName) {
  const actions = [];
  if (typeof data !== "object" || data === null || Array.isArray(data)) {
    return { data, actions };
  }
  let obj = data;
  // Only an OWN property counts as a wrapper: the `in` operator also matches
  // inherited keys (e.g. "__proto__"), which would "unwrap" to Object.prototype.
  // The wrapper value must be a plain object; unwrapping an array would leave
  // the field loop below writing named properties onto an array.
  const hasWrapper = schemaName !== "" && Object.prototype.hasOwnProperty.call(obj, schemaName);
  const wrapped = hasWrapper ? obj[schemaName] : void 0;
  if (typeof wrapped === "object" && wrapped !== null && !Array.isArray(wrapped)) {
    actions.push({ attempt: 0, type: "unwrap", detail: `Unwrapped "${schemaName}" wrapper` });
    obj = wrapped;
  }
  for (const [key, field] of Object.entries(fields)) {
    if (obj[key] === void 0) {
      // Missing field: prefer the schema default, otherwise an empty value of the declared type.
      if (field.default !== void 0) {
        obj[key] = field.default;
        actions.push({ attempt: 0, type: "default_fill", detail: `Set "${key}" to default: ${JSON.stringify(field.default)}` });
      } else if (field.type === "string") {
        obj[key] = "";
        actions.push({ attempt: 0, type: "missing_field", detail: `Added empty string for "${key}"` });
      } else if (field.type === "number") {
        obj[key] = 0;
        actions.push({ attempt: 0, type: "missing_field", detail: `Added 0 for "${key}"` });
      } else if (field.type === "boolean") {
        obj[key] = false;
        actions.push({ attempt: 0, type: "missing_field", detail: `Added false for "${key}"` });
      } else if (field.type === "array") {
        obj[key] = [];
        actions.push({ attempt: 0, type: "missing_field", detail: `Added empty array for "${key}"` });
      } else if (field.type === "object") {
        obj[key] = {};
        actions.push({ attempt: 0, type: "missing_field", detail: `Added empty object for "${key}"` });
      }
    }
    if (obj[key] !== void 0) {
      const coerced = coerceType(obj[key], field.type);
      if (coerced.coerced) {
        obj[key] = coerced.value;
        actions.push({ attempt: 0, type: "type_coercion", detail: `Coerced "${key}" to ${field.type}: ${JSON.stringify(coerced.value)}` });
      }
      if (field.type === "array" && Array.isArray(obj[key]) && field.items) {
        obj[key] = obj[key].map((item, i) => {
          const itemResult = repairItem(item, field.items);
          if (itemResult.actions.length > 0) {
            // Prefix each item-level action with the item's index.
            actions.push(...itemResult.actions.map((a) => ({ ...a, detail: `[${i}] ${a.detail}` })));
          }
          return itemResult.data;
        });
      }
      if (field.type === "object" && typeof obj[key] === "object" && obj[key] !== null && field.properties) {
        // "" tells the nested call that there is no wrapper to unwrap.
        const nested = repairData(obj[key], field.properties, "");
        if (nested.actions.length > 0) {
          actions.push(...nested.actions);
        }
        obj[key] = nested.data;
      }
    }
  }
  return { data: obj, actions };
}
|
|
229
|
+
/**
 * Repairs a single array element against the array's `items` definition.
 *
 * - Object items: every declared property that is missing is filled (with the
 *   property's `default` when given, otherwise an empty value of its type) and
 *   present values are coerced via `coerceType`. The item is modified in place.
 * - Primitive items (string, number, boolean): coerced to `field.type` via
 *   `coerceType`, so e.g. `[42]` for `items: { type: "string" }` becomes `["42"]`.
 * - Anything else (null, arrays, objects for a non-object item type) is
 *   returned unchanged.
 *
 * @param {*} item - One element of an array field.
 * @param {Object} field - The `items` definition (`type`, optional `properties`).
 * @returns {{ data: *, actions: Array<{attempt: number, type: string, detail: string}> }}
 */
function repairItem(item, field) {
  const actions = [];
  if (field.type === "object") {
    if (typeof item === "object" && item !== null && !Array.isArray(item) && field.properties) {
      const obj = item;
      for (const [key, prop] of Object.entries(field.properties)) {
        if (obj[key] === void 0) {
          if (prop.default !== void 0) {
            obj[key] = prop.default;
            actions.push({ attempt: 0, type: "default_fill", detail: `Set "${key}" to default: ${JSON.stringify(prop.default)}` });
          } else if (prop.type === "string") {
            obj[key] = "";
            actions.push({ attempt: 0, type: "missing_field", detail: `Added empty string for "${key}"` });
          } else if (prop.type === "number") {
            obj[key] = 0;
            actions.push({ attempt: 0, type: "missing_field", detail: `Added 0 for "${key}"` });
          } else if (prop.type === "boolean") {
            obj[key] = false;
            actions.push({ attempt: 0, type: "missing_field", detail: `Added false for "${key}"` });
          } else if (prop.type === "array") {
            obj[key] = [];
            actions.push({ attempt: 0, type: "missing_field", detail: `Added empty array for "${key}"` });
          } else if (prop.type === "object") {
            obj[key] = {};
            actions.push({ attempt: 0, type: "missing_field", detail: `Added empty object for "${key}"` });
          }
        }
        if (obj[key] !== void 0) {
          const coerced = coerceType(obj[key], prop.type);
          if (coerced.coerced) {
            obj[key] = coerced.value;
            actions.push({ attempt: 0, type: "type_coercion", detail: `Coerced "${key}" to ${prop.type}` });
          }
        }
      }
    }
    return { data: item, actions };
  }
  // Structured or absent values are left alone for non-object item types;
  // only primitives are safe to convert.
  if (item === void 0 || item === null || typeof item === "object") {
    return { data: item, actions };
  }
  const coercedItem = coerceType(item, field.type);
  if (!coercedItem.coerced) {
    return { data: item, actions };
  }
  actions.push({ attempt: 0, type: "type_coercion", detail: `Coerced item to ${field.type}` });
  return { data: coercedItem.value, actions };
}
|
|
144
260
|
function validateAgainstSchema(data, schema) {
|
|
145
261
|
if (typeof data !== "object" || data === null || Array.isArray(data)) {
|
|
146
262
|
return { valid: false, errors: ["Root value must be an object"] };
|
|
@@ -194,40 +310,120 @@ function validateAgainstSchema(data, schema) {
|
|
|
194
310
|
/**
 * Bundles a schema name and its field map into a schema object.
 *
 * @param {string} name - Schema name.
 * @param {Object} fields - Map of field name to field definition.
 * @returns {{ name: string, fields: Object }} The two arguments, stored by reference.
 */
function defineSchema(name, fields) {
  const schema = {};
  schema.name = name;
  schema.fields = fields;
  return schema;
}
|
|
197
|
-
function
|
|
198
|
-
const
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
313
|
+
/**
 * Builds a sample object illustrating the JSON shape a field map describes.
 *
 * Placeholders: strings use the field's `description` (or "..."), numbers 0,
 * booleans true, arrays a single sample item (or [] when `items` is absent),
 * objects a recursively built sample (or {} when `properties` is absent).
 * Fields with an unrecognised type are omitted.
 *
 * @param {Object} fields - Map of field name to field definition.
 * @returns {Object} Sample object with one entry per recognised field.
 */
function schemaToJSONExample(fields) {
  // Sample value for one field definition; undefined when the type is unknown.
  const exampleFor = (field) => {
    switch (field.type) {
      case "string":
        return field.description || "...";
      case "number":
        return 0;
      case "boolean":
        return true;
      case "array": {
        if (!field.items) {
          return [];
        }
        // The sample item must follow the item's own type: an array of strings
        // is shown as ["..."], not [{}], otherwise the example would tell the
        // model to emit objects where primitives are expected.
        const itemExample = exampleFor(field.items);
        if (itemExample !== void 0) {
          return [itemExample];
        }
        // Unknown item type: fall back to an object built from its properties.
        return [schemaToJSONExample(field.items.properties || {})];
      }
      case "object":
        return field.properties ? schemaToJSONExample(field.properties) : {};
      default:
        return void 0;
    }
  };
  const result = {};
  for (const [key, field] of Object.entries(fields)) {
    const example = exampleFor(field);
    if (example !== void 0) {
      result[key] = example;
    }
  }
  return result;
}
|
|
211
|
-
function
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
344
|
+
/**
 * Builds the first-attempt prompt: a task statement, the output rules, the
 * required JSON structure (a sample wrapped under the schema name) and the
 * input text fenced by triple quotes.
 *
 * @param {string} input - Text to extract data from.
 * @param {{ name: string, fields: Object }} schema - Target schema.
 * @returns {string} The complete prompt.
 */
function buildForcedPrompt(input, schema) {
  const skeleton = schemaToJSONExample(schema.fields);
  const wrappedSkeleton = { [schema.name]: skeleton };
  const exampleStr = JSON.stringify(wrappedSkeleton, null, 2);
  // One array entry per prompt line; "" entries are the blank separator lines.
  const promptLines = [
    "TASK: Extract structured data from the text below into EXACTLY this JSON format.",
    "",
    "CRITICAL RULES:",
    "- Output ONLY valid JSON. No text before or after.",
    "- No markdown. No code blocks. No explanations.",
    "- Every field MUST be present with the correct type.",
    '- String fields: use "" if unknown.',
    "- Number fields: use 0 if unknown.",
    "- Boolean fields: use true or false, never null.",
    "- Array fields: use [] if empty, never null.",
    "- Object fields: use {} if empty, never null.",
    "",
    "REQUIRED JSON STRUCTURE:",
    `${exampleStr}`,
    "",
    "TEXT TO EXTRACT FROM:",
    '"""',
    `${input}`,
    '"""',
    "",
    "OUTPUT ONLY THE JSON OBJECT. NOTHING ELSE.",
  ];
  return promptLines.join("\n");
}
|
|
369
|
+
/**
 * Builds the second-attempt prompt: it reports the error, echoes the first
 * 1000 characters of the previous response, and restates the rules, the
 * required JSON structure and the input text.
 *
 * @param {string} input - Text to extract data from.
 * @param {{ name: string, fields: Object }} schema - Target schema.
 * @param {string} previousResponse - Raw text of the failed response.
 * @param {string} error - Description of why the previous response was rejected.
 * @returns {string} The complete prompt.
 */
function buildRetryPrompt(input, schema, previousResponse, error) {
  const skeleton = schemaToJSONExample(schema.fields);
  const wrappedSkeleton = { [schema.name]: skeleton };
  const exampleStr = JSON.stringify(wrappedSkeleton, null, 2);
  // One array entry per prompt line; "" entries are the blank separator lines.
  const promptLines = [
    "YOUR PREVIOUS RESPONSE WAS INVALID. You MUST fix it.",
    "",
    `ERROR: ${error}`,
    "",
    "YOUR PREVIOUS RESPONSE:",
    `${previousResponse.slice(0, 1e3)}`,
    "",
    "WHAT YOU MUST DO NOW:",
    "1. Output ONLY valid JSON matching this EXACT structure",
    "2. No text before or after the JSON",
    "3. No markdown, no code blocks, no explanations",
    "4. Fix the errors listed above",
    "5. Every field MUST be present",
    "",
    "REQUIRED JSON STRUCTURE:",
    `${exampleStr}`,
    "",
    "TEXT TO EXTRACT FROM:",
    '"""',
    `${input}`,
    '"""',
    "",
    "OUTPUT ONLY THE JSON OBJECT. NOTHING ELSE.",
  ];
  return promptLines.join("\n");
}
|
|
396
|
+
/**
 * Builds the last-resort prompt: it quotes the validation error and the first
 * 800 characters of the previous response, then lists the prohibitions, the
 * required structure and the input text. The prompt ends with "OUTPUT ONLY:"
 * followed by a newline.
 *
 * @param {string} input - Text to extract data from.
 * @param {{ name: string, fields: Object }} schema - Target schema.
 * @param {string} previousResponse - Raw text of the failed response.
 * @param {string} error - Description of why the previous response was rejected.
 * @returns {string} The complete prompt.
 */
function buildFinalPrompt(input, schema, previousResponse, error) {
  const skeleton = schemaToJSONExample(schema.fields);
  const wrappedSkeleton = { [schema.name]: skeleton };
  const exampleStr = JSON.stringify(wrappedSkeleton, null, 2);
  // One array entry per prompt line; the final "" yields the trailing newline.
  const promptLines = [
    "FINAL ATTEMPT. THIS IS YOUR LAST CHANCE.",
    "",
    "YOUR PREVIOUS RESPONSE FAILED VALIDATION:",
    `${error}`,
    "",
    "YOUR PREVIOUS RESPONSE:",
    `${previousResponse.slice(0, 800)}`,
    "",
    "YOU MUST OUTPUT EXACTLY THIS STRUCTURE. NOTHING MORE. NOTHING LESS.",
    "DO NOT ADD FIELDS THAT ARE NOT IN THE SCHEMA.",
    "DO NOT OMIT ANY FIELDS.",
    "DO NOT WRAP IN markdown OR code blocks.",
    "DO NOT ADD ANY TEXT BEFORE OR AFTER THE JSON.",
    "",
    "STRUCTURE:",
    `${exampleStr}`,
    "",
    "INPUT TEXT:",
    '"""',
    `${input}`,
    '"""',
    "",
    "OUTPUT ONLY:",
    "",
  ];
  return promptLines.join("\n");
}
|
|
424
|
+
/**
 * Renders a schema as pretty-printed JSON: the sample object produced by
 * `schemaToJSONExample`, nested under the schema's name, indented by 2 spaces.
 *
 * @param {{ name: string, fields: Object }} schema - Schema to render.
 * @returns {string} JSON text of `{ [schema.name]: sample }`.
 */
function schemaToPrompt(schema) {
  const sample = schemaToJSONExample(schema.fields);
  const wrapped = { [schema.name]: sample };
  const indentWidth = 2;
  return JSON.stringify(wrapped, null, indentWidth);
}
|
|
232
428
|
function schemaToZodishString(schema) {
|
|
233
429
|
const lines = [];
|
|
@@ -241,11 +437,16 @@ function schemaToZodishString(schema) {
|
|
|
241
437
|
|
|
242
438
|
// src/extractor.ts
|
|
243
439
|
var MAX_REPAIR_ATTEMPTS = 3;
|
|
244
|
-
function
|
|
245
|
-
|
|
246
|
-
|
|
440
|
+
/**
 * Scores an extraction from 0 to 100.
 *
 * Starts at 100 and subtracts: 15 when the first response was not valid JSON,
 * 10 per attempt beyond the first, 5 per "type_coercion" action, 8 per
 * "missing_field" action and 3 per "default_fill" action. Other action types
 * do not affect the score. The result is clamped to the range [0, 100].
 *
 * @param {number} attempts - Number of attempts used (1 = first try).
 * @param {Array<{type: string}>} repairActions - Logged repair actions.
 * @param {boolean} jsonValidFirstTry - Whether the first response parsed as JSON.
 * @returns {number} Confidence score between 0 and 100.
 */
function calculateConfidence(attempts, repairActions, jsonValidFirstTry) {
  // Count the penalised action kinds in a single pass.
  const tally = { coercions: 0, missing: 0, defaults: 0 };
  repairActions.forEach((action) => {
    switch (action.type) {
      case "type_coercion":
        tally.coercions += 1;
        break;
      case "missing_field":
        tally.missing += 1;
        break;
      case "default_fill":
        tally.defaults += 1;
        break;
      default:
        break;
    }
  });
  let score = jsonValidFirstTry ? 100 : 100 - 15;
  score -= (attempts - 1) * 10;
  score -= tally.coercions * 5;
  score -= tally.missing * 8;
  score -= tally.defaults * 3;
  // Clamp to [0, 100].
  if (score > 100) {
    return 100;
  }
  if (score < 0) {
    return 0;
  }
  return score;
}
|
|
250
451
|
async function extract(input, schema, options = {}) {
|
|
251
452
|
const config = {
|
|
@@ -259,59 +460,62 @@ async function extract(input, schema, options = {}) {
|
|
|
259
460
|
"OpenRouter API key required. Pass it in options or set OPENROUTER_API_KEY environment variable."
|
|
260
461
|
);
|
|
261
462
|
}
|
|
262
|
-
const
|
|
263
|
-
const exampleJSON = schemaToPrompt(schema);
|
|
264
|
-
const userPrompt = `Extract structured data from the following text.
|
|
265
|
-
|
|
266
|
-
${schemaDescription}
|
|
267
|
-
|
|
268
|
-
Return ONLY valid JSON matching this structure:
|
|
269
|
-
${exampleJSON}
|
|
270
|
-
|
|
271
|
-
Text to extract from:
|
|
272
|
-
"""
|
|
273
|
-
${input}
|
|
274
|
-
"""`;
|
|
275
|
-
const schemaFields = {};
|
|
276
|
-
for (const [key, field] of Object.entries(schema.fields)) {
|
|
277
|
-
schemaFields[key] = { type: field.type, required: field.required };
|
|
278
|
-
}
|
|
463
|
+
const allRepairActions = [];
|
|
279
464
|
let lastRaw = "";
|
|
280
465
|
let lastError = "";
|
|
466
|
+
let jsonValidFirstTry = false;
|
|
467
|
+
let successAttempt = 0;
|
|
281
468
|
for (let attempt = 0; attempt <= MAX_REPAIR_ATTEMPTS; attempt++) {
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
Fix the JSON and return ONLY valid JSON. No explanations, no markdown, just the raw JSON object.`;
|
|
469
|
+
let prompt;
|
|
470
|
+
if (attempt === 0) {
|
|
471
|
+
prompt = buildForcedPrompt(input, schema);
|
|
472
|
+
} else if (attempt === 1) {
|
|
473
|
+
prompt = buildRetryPrompt(input, schema, lastRaw, lastError);
|
|
474
|
+
} else {
|
|
475
|
+
prompt = buildFinalPrompt(input, schema, lastRaw, lastError);
|
|
476
|
+
}
|
|
291
477
|
const response = await callLLM(prompt, config);
|
|
292
478
|
lastRaw = response.content;
|
|
293
479
|
const extracted = extractJSON(response.content);
|
|
294
480
|
if (!extracted) {
|
|
295
|
-
lastError = "Could not extract JSON from response";
|
|
481
|
+
lastError = "Could not extract JSON from response. Your response contained no valid JSON.";
|
|
296
482
|
continue;
|
|
297
483
|
}
|
|
298
484
|
const parsed = tryParseJSON(extracted);
|
|
299
485
|
if (!parsed.success) {
|
|
300
|
-
lastError = parsed.error
|
|
486
|
+
lastError = `Invalid JSON: ${parsed.error}`;
|
|
301
487
|
continue;
|
|
302
488
|
}
|
|
489
|
+
if (attempt === 0) {
|
|
490
|
+
jsonValidFirstTry = true;
|
|
491
|
+
}
|
|
303
492
|
const repaired = repairJSON(extracted);
|
|
304
|
-
|
|
305
|
-
const
|
|
306
|
-
|
|
493
|
+
let finalData = repaired ? tryParseJSON(repaired).success ? JSON.parse(repaired) : parsed.data : parsed.data;
|
|
494
|
+
const repairResult = repairData(finalData, schema.fields, schema.name);
|
|
495
|
+
finalData = repairResult.data;
|
|
496
|
+
allRepairActions.push(...repairResult.actions);
|
|
497
|
+
const schemaFields = {};
|
|
498
|
+
for (const [key, field] of Object.entries(schema.fields)) {
|
|
499
|
+
schemaFields[key] = field;
|
|
500
|
+
}
|
|
501
|
+
const validation = validateAgainstSchema(finalData, schemaFields);
|
|
307
502
|
if (!validation.valid) {
|
|
308
|
-
lastError = `Schema validation failed: ${validation.errors.join("
|
|
503
|
+
lastError = `Schema validation failed: ${validation.errors.join("; ")}`;
|
|
504
|
+
allRepairActions.push({
|
|
505
|
+
attempt,
|
|
506
|
+
type: "type_coercion",
|
|
507
|
+
detail: `Validation failed: ${validation.errors.join("; ")}`
|
|
508
|
+
});
|
|
309
509
|
continue;
|
|
310
510
|
}
|
|
511
|
+
successAttempt = attempt + 1;
|
|
311
512
|
return {
|
|
312
|
-
data:
|
|
513
|
+
data: finalData,
|
|
313
514
|
raw: response.content,
|
|
314
515
|
model: response.model,
|
|
516
|
+
confidence: calculateConfidence(successAttempt, allRepairActions, jsonValidFirstTry),
|
|
517
|
+
repairLog: allRepairActions,
|
|
518
|
+
attempts: successAttempt,
|
|
315
519
|
usage: response.usage
|
|
316
520
|
};
|
|
317
521
|
}
|
|
@@ -320,9 +524,11 @@ Fix the JSON and return ONLY valid JSON. No explanations, no markdown, just the
|
|
|
320
524
|
);
|
|
321
525
|
}
|
|
322
526
|
|
|
527
|
+
exports.coerceType = coerceType;
|
|
323
528
|
exports.defineSchema = defineSchema;
|
|
324
529
|
exports.extract = extract;
|
|
325
530
|
exports.extractJSON = extractJSON;
|
|
531
|
+
exports.repairData = repairData;
|
|
326
532
|
exports.repairJSON = repairJSON;
|
|
327
533
|
exports.schemaToPrompt = schemaToPrompt;
|
|
328
534
|
exports.schemaToZodishString = schemaToZodishString;
|