npm - @painitehq/structured-llm - Versions diffs - 0.1.0 → 0.2.0 - Mend

@painitehq/structured-llm 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8)

package/README.md CHANGED

@@ -1,75 +1,161 @@
-# React + TypeScript + Vite
+# @painitehq/structured-llm
-This template provides a minimal setup to get React working in Vite with HMR and some ESLint rules.
+Force LLM output into structured, type-safe JSON. Stop your app from crashing on malformed AI responses.
-Currently, two official plugins are available:
+## Install
-- [@vitejs/plugin-react](https://github.com/vitejs/vite-plugin-react/blob/main/packages/plugin-react) uses [Oxc](https://oxc.rs)
-- [@vitejs/plugin-react-swc](https://github.com/vitejs/vite-plugin-react/blob/main/packages/plugin-react-swc) uses [SWC](https://swc.rs/)
+```bash
+npm install @painitehq/structured-llm
+# or
+bun add @painitehq/structured-llm
+```
-## React Compiler
+## Quick Start
-The React Compiler is enabled on this template. See [this documentation](https://react.dev/learn/react-compiler) for more information.
+```ts
+import { extract, defineSchema } from "@painitehq/structured-llm";
-Note: This will impact Vite dev & build performances.
+const schema = defineSchema("invoice", {
+  invoiceNumber: { type: "string" },
+  totalAmount: { type: "number" },
+  vendor: { type: "string" },
+  items: { type: "array", items: { type: "string" } },
+});
-## Expanding the ESLint configuration
+const result = await extract(messyText, schema, {
+  apiKey: "your-openrouter-api-key",
+});
-If you are developing a production application, we recommend updating the configuration to enable type-aware lint rules:
+console.log(result.data);
+// { invoiceNumber: "INV-2024-042", totalAmount: 500, vendor: "Acme Corp", items: [...] }
-```js
-export default defineConfig([
-  globalIgnores(['dist']),
-  {
-    files: ['**/*.{ts,tsx}'],
-    extends: [
-      // Other configs...
+console.log(result.confidence); // 100
+console.log(result.attempts);   // 1
+```
-      // Remove tseslint.configs.recommended and replace with this
-      tseslint.configs.recommendedTypeChecked,
-      // Alternatively, use this for stricter rules
-      tseslint.configs.strictTypeChecked,
-      // Optionally, add this for stylistic rules
-      tseslint.configs.stylisticTypeChecked,
+## What It Does
-      // Other configs...
-    ],
-    languageOptions: {
-      parserOptions: {
-        project: ['./tsconfig.node.json', './tsconfig.app.json'],
-        tsconfigRootDir: import.meta.dirname,
-      },
-      // other options...
-    },
-  },
-])
+LLMs return unstructured text. Sometimes it's valid JSON. Sometimes it's wrapped in markdown. Sometimes it's completely broken. This SDK:
+1. **Forces** the model to output valid JSON via strict prompt engineering
+2. **Repairs** malformed JSON (trailing commas, missing brackets, broken quotes)
+3. **Unwraps** named wrappers like `{"invoice": {...}}` → `{...}`
+4. **Validates** the output against your schema
+5. **Coerces** wrong types (`"42"` → `42`, `"true"` → `true`)
+6. **Fills** missing fields with sensible defaults
+7. **Retries** with escalating instructions if the model fails
+## Features
+### Forced Structured Output
+The SDK doesn't ask the model to "give JSON". It forces it:
+```
+CRITICAL RULES:
+- Output ONLY valid JSON. No text before or after.
+- No markdown. No code blocks. No explanations.
+- Every field MUST be present with the correct type.
 ```
-You can also install [eslint-plugin-react-x](https://github.com/Rel1cx/eslint-react/tree/main/packages/plugins/eslint-plugin-react-x) and [eslint-plugin-react-dom](https://github.com/Rel1cx/eslint-react/tree/main/packages/plugins/eslint-plugin-react-dom) for React-specific lint rules:
-```js
-// eslint.config.js
-import reactX from 'eslint-plugin-react-x'
-import reactDom from 'eslint-plugin-react-dom'
-export default defineConfig([
-  globalIgnores(['dist']),
-  {
-    files: ['**/*.{ts,tsx}'],
-    extends: [
-      // Other configs...
-      // Enable lint rules for React
-      reactX.configs['recommended-typescript'],
-      // Enable lint rules for React DOM
-      reactDom.configs.recommended,
-    ],
-    languageOptions: {
-      parserOptions: {
-        project: ['./tsconfig.node.json', './tsconfig.app.json'],
-        tsconfigRootDir: import.meta.dirname,
-      },
-      // other options...
+### Escalating Retries
+If the model fails, the SDK retries with increasingly strict instructions:
+- **Attempt 1**: Clean forced prompt
+- **Attempt 2**: "Your response was invalid. Here's the error. Fix it."
+- **Attempt 3**: "FINAL ATTEMPT. THIS IS YOUR LAST CHANCE."
+### Post-Validation Repair
+Even if the JSON parses, the SDK fixes type mismatches:
+| Model returns | Schema expects | SDK does |
+|---------------|----------------|----------|
+| `"42"` | `number` | coerces to `42` |
+| `"true"` | `boolean` | coerces to `true` |
+| `42` | `string` | coerces to `"42"` |
+| missing field | any type | fills with default |
+### Confidence Scoring
+Every extraction returns a confidence score (0-100):
+```ts
+const result = await extract(text, schema, { apiKey });
+result.confidence; // 85
+result.repairLog;  // [{ type: "type_coercion", detail: "Coerced \"price\" to number" }]
+result.attempts;   // 2
+```
+Confidence deductions:
+- JSON not valid on the first try: -15
+- Each retry: -10
+- Type coercion per field: -5
+- Missing field filled: -8
+- Missing field filled from its schema default: -3
+## Schema Definition
+```ts
+import { defineSchema } from "@painitehq/structured-llm";
+const schema = defineSchema("person", {
+  name: { type: "string", description: "Full name" },
+  age: { type: "number" },
+  isStudent: { type: "boolean" },
+  hobbies: { type: "array", items: { type: "string" } },
+  address: {
+    type: "object",
+    properties: {
+      city: { type: "string" },
+      zip: { type: "string" },
     },
   },
-])
+});
+```
+Supported types: `string`, `number`, `boolean`, `array`, `object`
+## API
+### `extract<T>(input, schema, options)`
+Returns `Promise<ExtractionResult<T>>`:
+```ts
+interface ExtractionResult<T> {
+  data: T;                    // typed structured data
+  raw: string;                // raw LLM response
+  model: string;              // model used
+  confidence: number;         // 0-100 score
+  repairLog: RepairAction[];  // what was fixed
+  attempts: number;           // how many tries
+  usage?: {
+    promptTokens: number;
+    completionTokens: number;
+    totalTokens: number;
+  };
+}
+```
+### Options
+```ts
+{
+  apiKey?: string;      // OpenRouter API key (or set OPENROUTER_API_KEY env var)
+  model?: string;       // model to use (default: "openrouter/free")
+  temperature?: number; // 0-1 (default: 0)
+  maxRetries?: number;  // max retry attempts (default: 3)
+  timeout?: number;     // request timeout in ms (default: 60000)
+}
 ```
+## Requirements
+- OpenRouter API key (get one at https://openrouter.ai)
+- Any runtime: Node.js, Bun, Deno, browsers
+## License
+MIT

package/dist/index.cjs CHANGED

@@ -111,12 +111,6 @@ function repairJSON(raw) {
   candidate = candidate.replace(/,\s*([}\]])/g, "$1");
   candidate = candidate.replace(/'/g, '"');
   candidate = candidate.replace(/(\w+)\s*:/g, '"$1":');
-  candidate = candidate.replace(/:\s*"([^"]*?)"/g, (match, content) => {
-    if (content.includes('"')) {
-      return match;
-    }
-    return match;
-  });
   if (!candidate.startsWith("{") && !candidate.startsWith("[")) {
     const firstBrace = candidate.indexOf("{");
     const firstBracket = candidate.indexOf("[");
@@ -141,6 +135,128 @@ function repairJSON(raw) {
   }
   return null;
 }
+function coerceType(value, targetType) {
+  if (value === void 0 || value === null) {
+    return { value, coerced: false };
+  }
+  switch (targetType) {
+    case "string":
+      if (typeof value === "string") return { value, coerced: false };
+      return { value: String(value), coerced: true };
+    case "number":
+      if (typeof value === "number") return { value, coerced: false };
+      if (typeof value === "string") {
+        const num = Number(value);
+        if (!isNaN(num)) return { value: num, coerced: true };
+      }
+      return { value, coerced: false };
+    case "boolean":
+      if (typeof value === "boolean") return { value, coerced: false };
+      if (typeof value === "string") {
+        const lower = value.toLowerCase();
+        if (lower === "true" || lower === "yes" || lower === "1") return { value: true, coerced: true };
+        if (lower === "false" || lower === "no" || lower === "0") return { value: false, coerced: true };
+      }
+      if (typeof value === "number") {
+        return { value: value !== 0, coerced: true };
+      }
+      return { value, coerced: false };
+    case "array":
+      if (Array.isArray(value)) return { value, coerced: false };
+      return { value: [value], coerced: true };
+    default:
+      return { value, coerced: false };
+  }
+}
+function repairData(data, fields, schemaName) {
+  const actions = [];
+  if (typeof data !== "object" || data === null || Array.isArray(data)) {
+    return { data, actions };
+  }
+  let obj = data;
+  if (schemaName in obj && typeof obj[schemaName] === "object" && obj[schemaName] !== null) {
+    actions.push({ attempt: 0, type: "unwrap", detail: `Unwrapped "${schemaName}" wrapper` });
+    obj = obj[schemaName];
+  }
+  for (const [key, field] of Object.entries(fields)) {
+    if (obj[key] === void 0) {
+      if (field.default !== void 0) {
+        obj[key] = field.default;
+        actions.push({ attempt: 0, type: "default_fill", detail: `Set "${key}" to default: ${JSON.stringify(field.default)}` });
+      } else if (field.type === "string") {
+        obj[key] = "";
+        actions.push({ attempt: 0, type: "missing_field", detail: `Added empty string for "${key}"` });
+      } else if (field.type === "number") {
+        obj[key] = 0;
+        actions.push({ attempt: 0, type: "missing_field", detail: `Added 0 for "${key}"` });
+      } else if (field.type === "boolean") {
+        obj[key] = false;
+        actions.push({ attempt: 0, type: "missing_field", detail: `Added false for "${key}"` });
+      } else if (field.type === "array") {
+        obj[key] = [];
+        actions.push({ attempt: 0, type: "missing_field", detail: `Added empty array for "${key}"` });
+      } else if (field.type === "object") {
+        obj[key] = {};
+        actions.push({ attempt: 0, type: "missing_field", detail: `Added empty object for "${key}"` });
+      }
+    }
+    if (obj[key] !== void 0) {
+      const coerced = coerceType(obj[key], field.type);
+      if (coerced.coerced) {
+        obj[key] = coerced.value;
+        actions.push({ attempt: 0, type: "type_coercion", detail: `Coerced "${key}" to ${field.type}: ${JSON.stringify(coerced.value)}` });
+      }
+      if (field.type === "array" && Array.isArray(obj[key]) && field.items) {
+        obj[key] = obj[key].map((item, i) => {
+          const itemResult = repairItem(item, field.items);
+          if (itemResult.actions.length > 0) {
+            actions.push(...itemResult.actions.map((a) => ({ ...a, detail: `[${i}] ${a.detail}` })));
+          }
+          return itemResult.data;
+        });
+      }
+      if (field.type === "object" && typeof obj[key] === "object" && obj[key] !== null && field.properties) {
+        const nested = repairData(obj[key], field.properties, "");
+        if (nested.actions.length > 0) {
+          actions.push(...nested.actions);
+        }
+        obj[key] = nested.data;
+      }
+    }
+  }
+  return { data: obj, actions };
+}
+function repairItem(item, field) {
+  const actions = [];
+  if (field.type === "object" && typeof item === "object" && item !== null && !Array.isArray(item) && field.properties) {
+    const obj = item;
+    for (const [key, prop] of Object.entries(field.properties)) {
+      if (obj[key] === void 0) {
+        if (prop.type === "string") {
+          obj[key] = "";
+          actions.push({ attempt: 0, type: "missing_field", detail: `Added empty string for "${key}"` });
+        } else if (prop.type === "number") {
+          obj[key] = 0;
+          actions.push({ attempt: 0, type: "missing_field", detail: `Added 0 for "${key}"` });
+        } else if (prop.type === "boolean") {
+          obj[key] = false;
+          actions.push({ attempt: 0, type: "missing_field", detail: `Added false for "${key}"` });
+        } else if (prop.type === "array") {
+          obj[key] = [];
+          actions.push({ attempt: 0, type: "missing_field", detail: `Added empty array for "${key}"` });
+        }
+      }
+      if (obj[key] !== void 0) {
+        const coerced = coerceType(obj[key], prop.type);
+        if (coerced.coerced) {
+          obj[key] = coerced.value;
+          actions.push({ attempt: 0, type: "type_coercion", detail: `Coerced "${key}" to ${prop.type}` });
+        }
+      }
+    }
+  }
+  return { data: item, actions };
+}
 function validateAgainstSchema(data, schema) {
   if (typeof data !== "object" || data === null || Array.isArray(data)) {
     return { valid: false, errors: ["Root value must be an object"] };
@@ -194,40 +310,120 @@ function validateAgainstSchema(data, schema) {
 function defineSchema(name, fields) {
   return { name, fields };
 }
-function schemaToPrompt(schema) {
-  const fieldDescriptions = Object.entries(schema.fields).map(([key, field]) => {
-    const parts = [`"${key}": ${fieldTypeToExample(field)}`];
-    if (field.description) {
-      parts.push(`// ${field.description}`);
+function schemaToJSONExample(fields) {
+  const result = {};
+  for (const [key, field] of Object.entries(fields)) {
+    switch (field.type) {
+      case "string":
+        result[key] = field.description || "...";
+        break;
+      case "number":
+        result[key] = 0;
+        break;
+      case "boolean":
+        result[key] = true;
+        break;
+      case "array":
+        if (field.items) {
+          result[key] = [schemaToJSONExample(field.items.properties || {})];
+        } else {
+          result[key] = [];
+        }
+        break;
+      case "object":
+        if (field.properties) {
+          result[key] = schemaToJSONExample(field.properties);
+        } else {
+          result[key] = {};
+        }
+        break;
     }
-    return parts.join(" ");
-  }).join(",\n  ");
-  return `{
-  "${schema.name}": {
-  ${fieldDescriptions}
   }
-}`;
+  return result;
 }
-function fieldTypeToExample(field) {
-  switch (field.type) {
-    case "string":
-      return `"..."`;
-    case "number":
-      return "0";
-    case "boolean":
-      return "true";
-    case "array":
-      if (field.items) {
-        return `[${fieldTypeToExample(field.items)}]`;
-      }
-      return "[]";
-    case "object":
-      if (field.properties) {
-        const inner = Object.entries(field.properties).map(([k, v]) => `"${k}": ${fieldTypeToExample(v)}`).join(", ");
-        return `{ ${inner} }`;
-      }
-      return "{}";
-  }
+function buildForcedPrompt(input, schema) {
+  const example = schemaToJSONExample(schema.fields);
+  const exampleStr = JSON.stringify({ [schema.name]: example }, null, 2);
+  return `TASK: Extract structured data from the text below into EXACTLY this JSON format.
+CRITICAL RULES:
+- Output ONLY valid JSON. No text before or after.
+- No markdown. No code blocks. No explanations.
+- Every field MUST be present with the correct type.
+- String fields: use "" if unknown.
+- Number fields: use 0 if unknown.
+- Boolean fields: use true or false, never null.
+- Array fields: use [] if empty, never null.
+- Object fields: use {} if empty, never null.
+REQUIRED JSON STRUCTURE:
+${exampleStr}
+TEXT TO EXTRACT FROM:
+"""
+${input}
+"""
+OUTPUT ONLY THE JSON OBJECT. NOTHING ELSE.`;
+}
+function buildRetryPrompt(input, schema, previousResponse, error) {
+  const example = schemaToJSONExample(schema.fields);
+  const exampleStr = JSON.stringify({ [schema.name]: example }, null, 2);
+  return `YOUR PREVIOUS RESPONSE WAS INVALID. You MUST fix it.
+ERROR: ${error}
+YOUR PREVIOUS RESPONSE:
+${previousResponse.slice(0, 1e3)}
+WHAT YOU MUST DO NOW:
+1. Output ONLY valid JSON matching this EXACT structure
+2. No text before or after the JSON
+3. No markdown, no code blocks, no explanations
+4. Fix the errors listed above
+5. Every field MUST be present
+REQUIRED JSON STRUCTURE:
+${exampleStr}
+TEXT TO EXTRACT FROM:
+"""
+${input}
+"""
+OUTPUT ONLY THE JSON OBJECT. NOTHING ELSE.`;
+}
+function buildFinalPrompt(input, schema, previousResponse, error) {
+  const example = schemaToJSONExample(schema.fields);
+  const exampleStr = JSON.stringify({ [schema.name]: example }, null, 2);
+  return `FINAL ATTEMPT. THIS IS YOUR LAST CHANCE.
+YOUR PREVIOUS RESPONSE FAILED VALIDATION:
+${error}
+YOUR PREVIOUS RESPONSE:
+${previousResponse.slice(0, 800)}
+YOU MUST OUTPUT EXACTLY THIS STRUCTURE. NOTHING MORE. NOTHING LESS.
+DO NOT ADD FIELDS THAT ARE NOT IN THE SCHEMA.
+DO NOT OMIT ANY FIELDS.
+DO NOT WRAP IN markdown OR code blocks.
+DO NOT ADD ANY TEXT BEFORE OR AFTER THE JSON.
+STRUCTURE:
+${exampleStr}
+INPUT TEXT:
+"""
+${input}
+"""
+OUTPUT ONLY:
+`;
+}
+function schemaToPrompt(schema) {
+  const example = schemaToJSONExample(schema.fields);
+  return JSON.stringify({ [schema.name]: example }, null, 2);
 }
 function schemaToZodishString(schema) {
   const lines = [];
@@ -241,11 +437,16 @@ function schemaToZodishString(schema) {
 // src/extractor.ts
 var MAX_REPAIR_ATTEMPTS = 3;
-function unwrapNamedResponse(data, schemaName) {
-  if (typeof data === "object" && data !== null && !Array.isArray(data) && schemaName in data && typeof data[schemaName] === "object") {
-    return data[schemaName];
+function calculateConfidence(attempts, repairActions, jsonValidFirstTry) {
+  let score = 100;
+  if (!jsonValidFirstTry) {
+    score -= 15;
   }
-  return data;
+  score -= (attempts - 1) * 10;
+  score -= repairActions.filter((a) => a.type === "type_coercion").length * 5;
+  score -= repairActions.filter((a) => a.type === "missing_field").length * 8;
+  score -= repairActions.filter((a) => a.type === "default_fill").length * 3;
+  return Math.max(0, Math.min(100, score));
 }
 async function extract(input, schema, options = {}) {
   const config = {
@@ -259,59 +460,62 @@ async function extract(input, schema, options = {}) {
       "OpenRouter API key required. Pass it in options or set OPENROUTER_API_KEY environment variable."
     );
   }
-  const schemaDescription = schemaToZodishString(schema);
-  const exampleJSON = schemaToPrompt(schema);
-  const userPrompt = `Extract structured data from the following text.
-${schemaDescription}
-Return ONLY valid JSON matching this structure:
-${exampleJSON}
-Text to extract from:
-"""
-${input}
-"""`;
-  const schemaFields = {};
-  for (const [key, field] of Object.entries(schema.fields)) {
-    schemaFields[key] = { type: field.type, required: field.required };
-  }
+  const allRepairActions = [];
   let lastRaw = "";
   let lastError = "";
+  let jsonValidFirstTry = false;
+  let successAttempt = 0;
   for (let attempt = 0; attempt <= MAX_REPAIR_ATTEMPTS; attempt++) {
-    const prompt = attempt === 0 ? userPrompt : `${userPrompt}
-IMPORTANT: Your previous response was invalid JSON. Here was the error:
-${lastError}
-Previous raw response:
-${lastRaw}
-Fix the JSON and return ONLY valid JSON. No explanations, no markdown, just the raw JSON object.`;
+    let prompt;
+    if (attempt === 0) {
+      prompt = buildForcedPrompt(input, schema);
+    } else if (attempt === 1) {
+      prompt = buildRetryPrompt(input, schema, lastRaw, lastError);
+    } else {
+      prompt = buildFinalPrompt(input, schema, lastRaw, lastError);
+    }
     const response = await callLLM(prompt, config);
     lastRaw = response.content;
     const extracted = extractJSON(response.content);
     if (!extracted) {
-      lastError = "Could not extract JSON from response";
+      lastError = "Could not extract JSON from response. Your response contained no valid JSON.";
       continue;
     }
     const parsed = tryParseJSON(extracted);
     if (!parsed.success) {
-      lastError = parsed.error;
+      lastError = `Invalid JSON: ${parsed.error}`;
       continue;
     }
+    if (attempt === 0) {
+      jsonValidFirstTry = true;
+    }
     const repaired = repairJSON(extracted);
-    const finalData = repaired ? tryParseJSON(repaired).success ? JSON.parse(repaired) : parsed.data : parsed.data;
-    const unwrapped = unwrapNamedResponse(finalData, schema.name);
-    const validation = validateAgainstSchema(unwrapped, schemaFields);
+    let finalData = repaired ? tryParseJSON(repaired).success ? JSON.parse(repaired) : parsed.data : parsed.data;
+    const repairResult = repairData(finalData, schema.fields, schema.name);
+    finalData = repairResult.data;
+    allRepairActions.push(...repairResult.actions);
+    const schemaFields = {};
+    for (const [key, field] of Object.entries(schema.fields)) {
+      schemaFields[key] = field;
+    }
+    const validation = validateAgainstSchema(finalData, schemaFields);
     if (!validation.valid) {
-      lastError = `Schema validation failed: ${validation.errors.join(", ")}`;
+      lastError = `Schema validation failed: ${validation.errors.join("; ")}`;
+      allRepairActions.push({
+        attempt,
+        type: "type_coercion",
+        detail: `Validation failed: ${validation.errors.join("; ")}`
+      });
       continue;
     }
+    successAttempt = attempt + 1;
     return {
-      data: unwrapped,
+      data: finalData,
       raw: response.content,
       model: response.model,
+      confidence: calculateConfidence(successAttempt, allRepairActions, jsonValidFirstTry),
+      repairLog: allRepairActions,
+      attempts: successAttempt,
       usage: response.usage
     };
   }
@@ -320,9 +524,11 @@ Fix the JSON and return ONLY valid JSON. No explanations, no markdown, just the
   );
 }
+exports.coerceType = coerceType;
 exports.defineSchema = defineSchema;
 exports.extract = extract;
 exports.extractJSON = extractJSON;
+exports.repairData = repairData;
 exports.repairJSON = repairJSON;
 exports.schemaToPrompt = schemaToPrompt;
 exports.schemaToZodishString = schemaToZodishString;