llmbic 1.0.0 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +23 -0
- package/README.md +110 -29
- package/dist/extractor.d.ts +13 -7
- package/dist/extractor.d.ts.map +1 -1
- package/dist/extractor.js +37 -21
- package/dist/extractor.js.map +1 -1
- package/dist/index.d.ts +1 -1
- package/dist/index.d.ts.map +1 -1
- package/dist/prompt.d.ts +20 -13
- package/dist/prompt.d.ts.map +1 -1
- package/dist/prompt.js +97 -39
- package/dist/prompt.js.map +1 -1
- package/dist/types/extractor.types.d.ts +29 -4
- package/dist/types/extractor.types.d.ts.map +1 -1
- package/dist/types/prompt.types.d.ts +29 -0
- package/dist/types/prompt.types.d.ts.map +1 -1
- package/package.json +3 -1
package/CHANGELOG.md
CHANGED
|
@@ -5,6 +5,29 @@ All notable changes to this project will be documented in this file.
|
|
|
5
5
|
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
|
|
6
6
|
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
7
|
|
|
8
|
+
## [1.1.0] - 2026-04-16
|
|
9
|
+
|
|
10
|
+
Non-breaking. Unblocks hybrid workflows that rely on nested schemas, agreement/conflict detection, and extractor-level merge options.
|
|
11
|
+
|
|
12
|
+
### Added
|
|
13
|
+
|
|
14
|
+
- `prompt.build` now supports `z.array(...)`, `z.object(...)`, `z.optional(...)` and `z.default(...)` in the response JSON Schema. Optional fields are preserved in `properties` but excluded from `required`.
|
|
15
|
+
- Cross-check mode on `prompt.build` and `ExtractorLlmConfig`: `mode: 'cross-check'` asks the LLM about every schema field, not just `partial.missing`, enabling the per-field agreement / conflict machinery in `merge.apply`. `crossCheckHints: 'bias' | 'unbiased'` (default `unbiased`) controls whether rule values are surfaced to the LLM as hints.
|
|
16
|
+
- `ExtractorConfig` now accepts `normalizers`, `validators`, `policy` and `logger` directly; previously these had to be threaded into a manual `merge.apply` call. The options are forwarded to every internal merge, so `extract`, `extractSync` and `extractor.merge` all honor them.
|
|
17
|
+
- Zod `.describe("...")` (equivalent to `.meta({ description })`) is now propagated to the generated JSON Schema at the level it was declared; providers' structured-output features consume it natively, so per-field prompt guidance no longer requires an expanded system prompt.
|
|
18
|
+
- README "Batch / async mode" section expanded with a worked OpenAI Batch API example (JSONL shape, upload / poll / download / merge), plus a full runnable script at `examples/openai-batch.ts`.
|
|
19
|
+
|
|
20
|
+
### Fixed
|
|
21
|
+
|
|
22
|
+
- Object schemas emitted by `prompt.build` now carry `additionalProperties: false`, matching the requirement of OpenAI Chat Completions Structured Outputs with `strict: true`. Other providers (Anthropic tool use, Ollama JSON Schema) ignore the extra key. This aligns with `prompt.parse`, which already drops unexpected fields with a warning.
|
|
23
|
+
- `createExtractor` was not forwarding the configured `logger` to `rule.apply`, so schema-rejection warnings from the rules pass were silently dropped. The logger is now plumbed through every phase.
|
|
24
|
+
|
|
25
|
+
### Public types
|
|
26
|
+
|
|
27
|
+
- `PromptBuildMode`, `CrossCheckHints`, `PromptBuildOptions` exported from the package root.
|
|
28
|
+
- `ExtractorConfig<S>` gains optional `normalizers`, `validators`, `policy`, `logger`.
|
|
29
|
+
- `ExtractorLlmConfig` gains optional `mode`, `crossCheckHints`.
|
|
30
|
+
|
|
8
31
|
## [1.0.0] - 2026-04-15
|
|
9
32
|
|
|
10
33
|
Initial public release.
|
package/README.md
CHANGED
|
@@ -29,7 +29,7 @@ Llmbic has a single dependency: [Zod](https://zod.dev). No vendor SDK is pulled
|
|
|
29
29
|
|
|
30
30
|
```typescript
|
|
31
31
|
import { z } from 'zod';
|
|
32
|
-
import { createExtractor, rule
|
|
32
|
+
import { createExtractor, rule } from 'llmbic';
|
|
33
33
|
|
|
34
34
|
const InvoiceSchema = z.object({
|
|
35
35
|
total: z.number().nullable(),
|
|
@@ -41,14 +41,14 @@ const InvoiceSchema = z.object({
|
|
|
41
41
|
const extractor = createExtractor({
|
|
42
42
|
schema: InvoiceSchema,
|
|
43
43
|
rules: [
|
|
44
|
-
rule('total', (text) => {
|
|
44
|
+
rule.create('total', (text) => {
|
|
45
45
|
const m = text.match(/Total[:\s]*(\d[\d.,\s]+)\s*€/i);
|
|
46
46
|
if (!m) return null;
|
|
47
|
-
return confidence(parseFloat(m[1].replace(/[\s.]/g, '').replace(',', '.')), 1.0);
|
|
47
|
+
return rule.confidence(parseFloat(m[1].replace(/[\s.]/g, '').replace(',', '.')), 1.0);
|
|
48
48
|
}),
|
|
49
|
-
rule('currency', (text) => {
|
|
50
|
-
if (/€|EUR/i.test(text)) return confidence('EUR', 1.0);
|
|
51
|
-
if (/\$|USD/i.test(text)) return confidence('USD', 1.0);
|
|
49
|
+
rule.create('currency', (text) => {
|
|
50
|
+
if (/€|EUR/i.test(text)) return rule.confidence('EUR', 1.0);
|
|
51
|
+
if (/\$|USD/i.test(text)) return rule.confidence('USD', 1.0);
|
|
52
52
|
return null;
|
|
53
53
|
}),
|
|
54
54
|
],
|
|
@@ -69,7 +69,7 @@ console.log(result.missing);
|
|
|
69
69
|
### Rules + LLM
|
|
70
70
|
|
|
71
71
|
```typescript
|
|
72
|
-
import { createExtractor, rule
|
|
72
|
+
import { createExtractor, rule } from 'llmbic';
|
|
73
73
|
import type { LlmProvider } from 'llmbic';
|
|
74
74
|
import OpenAI from 'openai';
|
|
75
75
|
|
|
@@ -135,6 +135,48 @@ const llmResult = extractor.parse(rawJsonResponse);
|
|
|
135
135
|
const result = extractor.merge(partial, llmResult, markdown);
|
|
136
136
|
```
|
|
137
137
|
|
|
138
|
+
Steps 1, 2 and 4 are pure and synchronous. Persist `partial` between steps 2 and 4: `extractor.merge` rebuilds the rules result from `partial` itself (the rules are not re-run), so no other state has to survive the async gap.
|
|
139
|
+
|
|
140
|
+
#### Worked example: OpenAI Batch API
|
|
141
|
+
|
|
142
|
+
The Batch API expects a JSONL file where each line is a Chat Completions request. Using `extractor.prompt(...)` as the per-document payload builder maps 1:1 onto that format:
|
|
143
|
+
|
|
144
|
+
```typescript
|
|
145
|
+
// For each document, build one JSONL line:
|
|
146
|
+
const partial = extractor.extractSync(doc.markdown);
|
|
147
|
+
const request = extractor.prompt(doc.markdown, partial);
|
|
148
|
+
|
|
149
|
+
const line = JSON.stringify({
|
|
150
|
+
custom_id: doc.id, // how you'll re-match later
|
|
151
|
+
method: 'POST',
|
|
152
|
+
url: '/v1/chat/completions',
|
|
153
|
+
body: {
|
|
154
|
+
model: 'gpt-4o-mini',
|
|
155
|
+
messages: [
|
|
156
|
+
{ role: 'system', content: request.systemPrompt },
|
|
157
|
+
{ role: 'user', content: request.userContent },
|
|
158
|
+
],
|
|
159
|
+
response_format: {
|
|
160
|
+
type: 'json_schema',
|
|
161
|
+
json_schema: { name: 'extraction', strict: true, schema: request.responseSchema },
|
|
162
|
+
},
|
|
163
|
+
},
|
|
164
|
+
});
|
|
165
|
+
```
|
|
166
|
+
|
|
167
|
+
Upload the JSONL, create the batch, poll until `status === 'completed'`, download the output file. Each output line carries the same `custom_id` so you can map back to the `partial` you kept in memory (or in Redis, or on disk):
|
|
168
|
+
|
|
169
|
+
```typescript
|
|
170
|
+
for (const entry of prepared) {
|
|
171
|
+
const raw = responsesById.get(entry.id); // from output JSONL
|
|
172
|
+
const llmResult = extractor.parse(raw);
|
|
173
|
+
const result = extractor.merge(entry.partial, llmResult, entry.markdown);
|
|
174
|
+
// ... persist result ...
|
|
175
|
+
}
|
|
176
|
+
```
|
|
177
|
+
|
|
178
|
+
End-to-end runnable example (upload + poll + download + merge): [`examples/openai-batch.ts`](./examples/openai-batch.ts). At current OpenAI pricing the Batch API is ~50% cheaper than realtime Chat Completions, with a 24h completion window.
|
|
179
|
+
|
|
138
180
|
## Features
|
|
139
181
|
|
|
140
182
|
### Per-field confidence scoring
|
|
@@ -161,6 +203,35 @@ result.conflicts;
|
|
|
161
203
|
|
|
162
204
|
Three conflict strategies: `'flag'` (default — keep rule value, record conflict), `'prefer-rule'`, or `'prefer-llm'`.
|
|
163
205
|
|
|
206
|
+
In the default `'fill-gaps'` mode the LLM is only asked about fields the rules could not resolve, so conflicts are impossible. To actually trigger conflict detection, opt into cross-check (see below).
|
|
207
|
+
|
|
208
|
+
### Cross-check mode
|
|
209
|
+
|
|
210
|
+
Switch the LLM call from fill-gaps (ask only about missing fields) to cross-check (ask about every schema field, whether the rules resolved it or not):
|
|
211
|
+
|
|
212
|
+
```typescript
|
|
213
|
+
const extractor = createExtractor({
|
|
214
|
+
schema: InvoiceSchema,
|
|
215
|
+
rules: [...],
|
|
216
|
+
llm: {
|
|
217
|
+
provider,
|
|
218
|
+
mode: 'cross-check',
|
|
219
|
+
crossCheckHints: 'unbiased', // default; hides rule values from the LLM
|
|
220
|
+
},
|
|
221
|
+
});
|
|
222
|
+
```
|
|
223
|
+
|
|
224
|
+
The merge step now sees two candidates per field and surfaces real disagreements through `result.conflicts`. `crossCheckHints: 'bias'` re-exposes the rule values as hints to save tokens, at the cost of confirmation bias (the LLM tends to agree with what it was shown).
|
|
225
|
+
|
|
226
|
+
### Rich schemas
|
|
227
|
+
|
|
228
|
+
The JSON Schema handed to the LLM supports the Zod constructs that show up in real-world extraction targets:
|
|
229
|
+
|
|
230
|
+
- Primitives: `z.string()`, `z.number()`, `z.boolean()`, `z.enum([...])`.
|
|
231
|
+
- Composition: `z.array(...)`, `z.object({...})`, nested arbitrarily.
|
|
232
|
+
- Wrappers: `.nullable()`, `.optional()`, `.default(...)`.
|
|
233
|
+
- Descriptions: `z.string().describe("price in EUR, tax included")` propagates to the JSON Schema `description` at the declared level (array root vs items, object root vs property), and providers' structured-output features consume it natively. No need to inflate the system prompt with per-field hints.
|
|
234
|
+
|
|
164
235
|
### Normalizers
|
|
165
236
|
|
|
166
237
|
Post-merge transformations. Run in sequence, receive the merged data + original content:
|
|
@@ -184,9 +255,9 @@ const extractor = createExtractor({
|
|
|
184
255
|
Check the final output for logical consistency:
|
|
185
256
|
|
|
186
257
|
```typescript
|
|
187
|
-
import {
|
|
258
|
+
import { validator } from 'llmbic';
|
|
188
259
|
|
|
189
|
-
const { field, crossField } =
|
|
260
|
+
const { field, crossField } = validator.of<MySchemaShape>();
|
|
190
261
|
|
|
191
262
|
const extractor = createExtractor({
|
|
192
263
|
schema: MySchema,
|
|
@@ -227,7 +298,7 @@ const provider: LlmProvider = {
|
|
|
227
298
|
|
|
228
299
|
Ready-made snippets for common backends:
|
|
229
300
|
|
|
230
|
-
**OpenAI** (Chat Completions + Structured Outputs):
|
|
301
|
+
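**OpenAI** (Chat Completions + Structured Outputs). The response schema llmbic emits always carries `additionalProperties: false`, as `strict: true` requires. Strict mode also requires every property to be listed in `required`, and `.optional()` fields are excluded from it, so declare fields that may be absent with `.nullable()` rather than `.optional()` when using `strict: true`: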
|
|
231
302
|
|
|
232
303
|
```typescript
|
|
233
304
|
const client = new OpenAI();
|
|
@@ -307,28 +378,38 @@ Creates an extractor instance. Config:
|
|
|
307
378
|
|
|
308
379
|
| Field | Type | Required | Description |
|
|
309
380
|
|-------|------|----------|-------------|
|
|
310
|
-
| `schema` | `ZodObject` | yes | Output schema |
|
|
311
|
-
| `rules` | `ExtractionRule[]` | yes | Deterministic extraction rules |
|
|
312
|
-
| `llm` | `
|
|
313
|
-
| `normalizers` | `Normalizer[]` | no | Post-merge transformations |
|
|
314
|
-
| `validators` | `Validator[]` | no |
|
|
315
|
-
| `
|
|
316
|
-
| `logger` | `Logger` | no |
|
|
381
|
+
| `schema` | `ZodObject` | yes | Output schema (drives field enumeration and re-validation). |
|
|
382
|
+
| `rules` | `ExtractionRule[]` | yes | Deterministic extraction rules. |
|
|
383
|
+
| `llm` | `ExtractorLlmConfig` | no | LLM fallback. Omit for rules-only mode. See below. |
|
|
384
|
+
| `normalizers` | `Normalizer<T>[]` | no | Post-merge transformations, run in declared order. |
|
|
385
|
+
| `validators` | `Validator<ExtractedData<T>>[]` | no | Invariants populating `result.validation`. |
|
|
386
|
+
| `policy` | `Partial<FieldMergePolicy>` | no | Overrides the per-field merge policy (conflict strategy, confidence defaults, equality). |
|
|
387
|
+
| `logger` | `Logger` | no | Pino/Winston/console-compatible. Warnings from `rule.apply` and `merge.apply` flow through. |
|
|
317
388
|
|
|
318
|
-
|
|
389
|
+
`ExtractorLlmConfig`:
|
|
319
390
|
|
|
320
|
-
|
|
391
|
+
| Field | Type | Required | Description |
|
|
392
|
+
|-------|------|----------|-------------|
|
|
393
|
+
| `provider` | `LlmProvider` | yes | Single-method adapter the extractor calls. |
|
|
394
|
+
| `systemPrompt` | `string` | no | Overrides the built-in system prompt. |
|
|
395
|
+
| `mode` | `'fill-gaps' \| 'cross-check'` | no | `'fill-gaps'` (default) asks the LLM only about fields the rules did not resolve. `'cross-check'` asks about every schema field so `merge.apply` can surface agreements / conflicts. |
|
|
396
|
+
| `crossCheckHints` | `'bias' \| 'unbiased'` | no | In cross-check mode only. `'unbiased'` (default) hides rule values from the LLM for genuine disagreement detection; `'bias'` re-exposes them to save tokens. |
|
|
321
397
|
|
|
322
|
-
### `
|
|
398
|
+
### `rule` namespace
|
|
323
399
|
|
|
324
|
-
|
|
400
|
+
| Member | Signature | Description |
|
|
401
|
+
|---|---|---|
|
|
402
|
+
| `rule.create` | `(field, extract) => ExtractionRule` | Declare a rule. `extract(content)` returns a `RuleMatch` or `null`. |
|
|
403
|
+
| `rule.regex` | `(field, pattern, score, transform?) => ExtractionRule` | Regex-based rule. On match, capture group 1 (or the full match) is fed to `transform`. |
|
|
404
|
+
| `rule.confidence` | `(value, score) => RuleMatch` | Wrap a value and a confidence score; sugar for custom `extract` callbacks. |
|
|
405
|
+
| `rule.apply` | `(content, rules, schema, logger?) => RulesResult` | Run every rule, pick the highest-confidence match per field, type-check against the schema. |
|
|
325
406
|
|
|
326
|
-
### `
|
|
407
|
+
### `validator.of<T>()`
|
|
327
408
|
|
|
328
|
-
|
|
409
|
+
Binds a target data shape `T` and returns two validator builders:
|
|
329
410
|
|
|
330
|
-
- `field(name,
|
|
331
|
-
- `crossField(
|
|
411
|
+
- `field(name, ruleName, check, message, severity?)`: single-field validator. `check(value, data)` receives the precise type of the field (`T[name]`) as first argument.
|
|
412
|
+
- `crossField(ruleName, check, message, severity?)`: whole-object validator, produces a violation without a `field` property.
|
|
332
413
|
|
|
333
414
|
Binding `T` once lets TypeScript infer each field's type from the field name, so predicates are fully typed without manual annotations.
|
|
334
415
|
|
|
@@ -336,10 +417,10 @@ Binding `T` once lets TypeScript infer each field's type from the field name, so
|
|
|
336
417
|
|
|
337
418
|
| Method | Sync | Description |
|
|
338
419
|
|--------|------|-------------|
|
|
339
|
-
| `extract(content)` | async | Full pipeline: rules
|
|
340
|
-
| `extractSync(content)` | sync | Rules only. Returns partial result + missing fields. |
|
|
341
|
-
| `prompt(content, partial)` | sync | Builds LLM
|
|
342
|
-
| `parse(raw)` | sync | Parses raw LLM JSON response. |
|
|
420
|
+
| `extract(content)` | async | Full pipeline: rules -> LLM (if configured) -> merge -> normalize -> validate. |
|
|
421
|
+
| `extractSync(content)` | sync | Rules only. Returns the partial result + `missing` fields. |
|
|
422
|
+
| `prompt(content, partial)` | sync | Builds the LLM request. Covers `partial.missing` in fill-gaps mode, every schema field in cross-check mode. |
|
|
423
|
+
| `parse(raw)` | sync | Parses a raw LLM JSON response, validating each field individually. Never throws. |
|
|
343
424
|
| `merge(partial, llmResult, content)` | sync | Merges rules + LLM, detects conflicts, normalizes, validates. |
|
|
344
425
|
|
|
345
426
|
## License
|
package/dist/extractor.d.ts
CHANGED
|
@@ -1,18 +1,24 @@
|
|
|
1
1
|
import type { z } from 'zod';
|
|
2
2
|
import type { Extractor, ExtractorConfig } from './types/extractor.types.js';
|
|
3
3
|
/**
|
|
4
|
-
* Bind a schema, deterministic rules and
|
|
4
|
+
* Bind a schema, deterministic rules and their merge-time options into an
|
|
5
5
|
* {@link Extractor}. The returned object exposes the extraction pipeline as
|
|
6
6
|
* pre-configured methods; call sites stop having to thread `schema`,
|
|
7
|
-
* `rules` and provider wiring through
|
|
7
|
+
* `rules`, `policy`, normalizers/validators and provider wiring through
|
|
8
|
+
* every step.
|
|
8
9
|
*
|
|
9
|
-
* {@link Extractor.extract} runs {@link rule.apply}, then
|
|
10
|
-
* configured
|
|
11
|
-
*
|
|
12
|
-
*
|
|
10
|
+
* {@link Extractor.extract} runs {@link rule.apply}, then - when an LLM is
|
|
11
|
+
* configured - asks the provider either for the missing fields only
|
|
12
|
+
* (`mode: 'fill-gaps'`, default) or for every schema field
|
|
13
|
+
* (`mode: 'cross-check'`, which always triggers the LLM call so conflicts
|
|
14
|
+
* can be detected even when the rules resolved every field). The response
|
|
15
|
+
* is parsed with {@link prompt.parse} and fused through {@link merge.apply}.
|
|
13
16
|
*
|
|
14
17
|
* @typeParam S - A Zod object schema describing the target data shape.
|
|
15
|
-
* @param config - Schema, deterministic rules, and optional LLM fallback
|
|
18
|
+
* @param config - Schema, deterministic rules, and optional LLM fallback,
|
|
19
|
+
* plus `policy`, `normalizers`, `validators` and `logger` forwarded to
|
|
20
|
+
* every internal {@link merge.apply} call. The logger is also forwarded
|
|
21
|
+
* to {@link rule.apply} so schema-rejection warnings stay visible.
|
|
16
22
|
* @returns An {@link Extractor} bound to `config.schema`.
|
|
17
23
|
*/
|
|
18
24
|
export declare function createExtractor<S extends z.ZodObject<z.ZodRawShape>>(config: ExtractorConfig<S>): Extractor<z.infer<S>>;
|
package/dist/extractor.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"extractor.d.ts","sourceRoot":"","sources":["../src/extractor.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAI7B,OAAO,KAAK,EAAE,SAAS,EAAE,eAAe,EAAE,MAAM,4BAA4B,CAAC;AA8C7E
|
|
1
|
+
{"version":3,"file":"extractor.d.ts","sourceRoot":"","sources":["../src/extractor.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAI7B,OAAO,KAAK,EAAE,SAAS,EAAE,eAAe,EAAE,MAAM,4BAA4B,CAAC;AA8C7E;;;;;;;;;;;;;;;;;;;;GAoBG;AACH,wBAAgB,eAAe,CAAC,CAAC,SAAS,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC,WAAW,CAAC,EAClE,MAAM,EAAE,eAAe,CAAC,CAAC,CAAC,GACzB,SAAS,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAqEvB"}
|
package/dist/extractor.js
CHANGED
|
@@ -36,18 +36,24 @@ function stampDuration(result, startedAt) {
|
|
|
36
36
|
};
|
|
37
37
|
}
|
|
38
38
|
/**
|
|
39
|
-
* Bind a schema, deterministic rules and
|
|
39
|
+
* Bind a schema, deterministic rules and their merge-time options into an
|
|
40
40
|
* {@link Extractor}. The returned object exposes the extraction pipeline as
|
|
41
41
|
* pre-configured methods; call sites stop having to thread `schema`,
|
|
42
|
-
* `rules` and provider wiring through
|
|
42
|
+
* `rules`, `policy`, normalizers/validators and provider wiring through
|
|
43
|
+
* every step.
|
|
43
44
|
*
|
|
44
|
-
* {@link Extractor.extract} runs {@link rule.apply}, then
|
|
45
|
-
* configured
|
|
46
|
-
*
|
|
47
|
-
*
|
|
45
|
+
* {@link Extractor.extract} runs {@link rule.apply}, then - when an LLM is
|
|
46
|
+
* configured - asks the provider either for the missing fields only
|
|
47
|
+
* (`mode: 'fill-gaps'`, default) or for every schema field
|
|
48
|
+
* (`mode: 'cross-check'`, which always triggers the LLM call so conflicts
|
|
49
|
+
* can be detected even when the rules resolved every field). The response
|
|
50
|
+
* is parsed with {@link prompt.parse} and fused through {@link merge.apply}.
|
|
48
51
|
*
|
|
49
52
|
* @typeParam S - A Zod object schema describing the target data shape.
|
|
50
|
-
* @param config - Schema, deterministic rules, and optional LLM fallback
|
|
53
|
+
* @param config - Schema, deterministic rules, and optional LLM fallback,
|
|
54
|
+
* plus `policy`, `normalizers`, `validators` and `logger` forwarded to
|
|
55
|
+
* every internal {@link merge.apply} call. The logger is also forwarded
|
|
56
|
+
* to {@link rule.apply} so schema-rejection warnings stay visible.
|
|
51
57
|
* @returns An {@link Extractor} bound to `config.schema`.
|
|
52
58
|
*/
|
|
53
59
|
export function createExtractor(config) {
|
|
@@ -55,32 +61,42 @@ export function createExtractor(config) {
|
|
|
55
61
|
if (allFields.length === 0) {
|
|
56
62
|
throw new Error('createExtractor: schema must declare at least one field');
|
|
57
63
|
}
|
|
64
|
+
const buildOptions = {
|
|
65
|
+
systemPrompt: config.llm?.systemPrompt,
|
|
66
|
+
mode: config.llm?.mode ?? 'fill-gaps',
|
|
67
|
+
crossCheckHints: config.llm?.crossCheckHints ?? 'unbiased',
|
|
68
|
+
};
|
|
69
|
+
const mergeOptions = {
|
|
70
|
+
policy: config.policy,
|
|
71
|
+
normalizers: config.normalizers,
|
|
72
|
+
validators: config.validators,
|
|
73
|
+
logger: config.logger,
|
|
74
|
+
};
|
|
58
75
|
return {
|
|
59
76
|
async extract(content) {
|
|
60
77
|
const startedAt = performance.now();
|
|
61
|
-
const rulesResult = rule.apply(content, config.rules, config.schema);
|
|
62
|
-
const partial = merge.apply(config.schema, rulesResult, null, content);
|
|
63
|
-
|
|
78
|
+
const rulesResult = rule.apply(content, config.rules, config.schema, config.logger);
|
|
79
|
+
const partial = merge.apply(config.schema, rulesResult, null, content, mergeOptions);
|
|
80
|
+
const shouldCallLlm = config.llm !== undefined &&
|
|
81
|
+
(buildOptions.mode === 'cross-check' || partial.missing.length > 0);
|
|
82
|
+
if (!shouldCallLlm) {
|
|
64
83
|
return stampDuration(partial, startedAt);
|
|
65
84
|
}
|
|
66
|
-
const request = prompt.build(config.schema, partial, content,
|
|
67
|
-
systemPrompt: config.llm.systemPrompt,
|
|
68
|
-
});
|
|
85
|
+
const request = prompt.build(config.schema, partial, content, buildOptions);
|
|
69
86
|
const completion = await config.llm.provider.complete(request);
|
|
70
|
-
const
|
|
71
|
-
const
|
|
87
|
+
const llmTargetFields = buildOptions.mode === 'cross-check' ? allFields : partial.missing;
|
|
88
|
+
const llmResult = prompt.parse(config.schema, llmTargetFields, completion.values);
|
|
89
|
+
const final = merge.apply(config.schema, rulesResult, llmResult, content, mergeOptions);
|
|
72
90
|
return stampDuration(final, startedAt);
|
|
73
91
|
},
|
|
74
92
|
extractSync(content) {
|
|
75
93
|
const startedAt = performance.now();
|
|
76
|
-
const rulesResult = rule.apply(content, config.rules, config.schema);
|
|
77
|
-
const partial = merge.apply(config.schema, rulesResult, null, content);
|
|
94
|
+
const rulesResult = rule.apply(content, config.rules, config.schema, config.logger);
|
|
95
|
+
const partial = merge.apply(config.schema, rulesResult, null, content, mergeOptions);
|
|
78
96
|
return stampDuration(partial, startedAt);
|
|
79
97
|
},
|
|
80
98
|
prompt(content, partial) {
|
|
81
|
-
return prompt.build(config.schema, partial, content,
|
|
82
|
-
systemPrompt: config.llm?.systemPrompt,
|
|
83
|
-
});
|
|
99
|
+
return prompt.build(config.schema, partial, content, buildOptions);
|
|
84
100
|
},
|
|
85
101
|
parse(raw) {
|
|
86
102
|
return prompt.parse(config.schema, allFields, raw);
|
|
@@ -88,7 +104,7 @@ export function createExtractor(config) {
|
|
|
88
104
|
merge(partial, llmResult, content) {
|
|
89
105
|
const startedAt = performance.now();
|
|
90
106
|
const rulesResult = rulesResultFromPartial(partial, allFields);
|
|
91
|
-
const result = merge.apply(config.schema, rulesResult, llmResult, content);
|
|
107
|
+
const result = merge.apply(config.schema, rulesResult, llmResult, content, mergeOptions);
|
|
92
108
|
return stampDuration(result, startedAt);
|
|
93
109
|
},
|
|
94
110
|
};
|
package/dist/extractor.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"extractor.js","sourceRoot":"","sources":["../src/extractor.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,IAAI,EAAE,MAAM,YAAY,CAAC;AAClC,OAAO,EAAE,KAAK,EAAE,MAAM,YAAY,CAAC;AACnC,OAAO,EAAE,MAAM,EAAE,MAAM,aAAa,CAAC;AAKrC;;;;;GAKG;AACH,SAAS,sBAAsB,CAC7B,OAA4B,EAC5B,SAA+B;IAE/B,MAAM,MAAM,GAAe,EAAE,CAAC;IAC9B,MAAM,UAAU,GAAqC,EAAE,CAAC;IACxD,KAAK,MAAM,KAAK,IAAI,SAAS,EAAE,CAAC;QAC9B,MAAM,KAAK,GAAG,OAAO,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;QAClC,IAAI,KAAK,KAAK,IAAI,EAAE,CAAC;YACnB,SAAS;QACX,CAAC;QACD,MAAM,CAAC,KAAK,CAAC,GAAG,KAAmB,CAAC;QACpC,MAAM,eAAe,GAAG,OAAO,CAAC,UAAU,CAAC,KAAK,CAAC,CAAC;QAClD,IAAI,eAAe,KAAK,IAAI,EAAE,CAAC;YAC7B,UAAU,CAAC,KAAK,CAAC,GAAG,eAAe,CAAC;QACtC,CAAC;IACH,CAAC;IACD,OAAO,EAAE,MAAM,EAAE,UAAU,EAAE,OAAO,EAAE,CAAC,GAAG,OAAO,CAAC,OAAO,CAAC,EAAE,CAAC;AAC/D,CAAC;AAED;;;;;GAKG;AACH,SAAS,aAAa,CACpB,MAA2B,EAC3B,SAAiB;IAEjB,OAAO;QACL,GAAG,MAAM;QACT,IAAI,EAAE,EAAE,GAAG,MAAM,CAAC,IAAI,EAAE,UAAU,EAAE,WAAW,CAAC,GAAG,EAAE,GAAG,SAAS,EAAE;KACpE,CAAC;AACJ,CAAC;AAED
|
|
1
|
+
{"version":3,"file":"extractor.js","sourceRoot":"","sources":["../src/extractor.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,IAAI,EAAE,MAAM,YAAY,CAAC;AAClC,OAAO,EAAE,KAAK,EAAE,MAAM,YAAY,CAAC;AACnC,OAAO,EAAE,MAAM,EAAE,MAAM,aAAa,CAAC;AAKrC;;;;;GAKG;AACH,SAAS,sBAAsB,CAC7B,OAA4B,EAC5B,SAA+B;IAE/B,MAAM,MAAM,GAAe,EAAE,CAAC;IAC9B,MAAM,UAAU,GAAqC,EAAE,CAAC;IACxD,KAAK,MAAM,KAAK,IAAI,SAAS,EAAE,CAAC;QAC9B,MAAM,KAAK,GAAG,OAAO,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;QAClC,IAAI,KAAK,KAAK,IAAI,EAAE,CAAC;YACnB,SAAS;QACX,CAAC;QACD,MAAM,CAAC,KAAK,CAAC,GAAG,KAAmB,CAAC;QACpC,MAAM,eAAe,GAAG,OAAO,CAAC,UAAU,CAAC,KAAK,CAAC,CAAC;QAClD,IAAI,eAAe,KAAK,IAAI,EAAE,CAAC;YAC7B,UAAU,CAAC,KAAK,CAAC,GAAG,eAAe,CAAC;QACtC,CAAC;IACH,CAAC;IACD,OAAO,EAAE,MAAM,EAAE,UAAU,EAAE,OAAO,EAAE,CAAC,GAAG,OAAO,CAAC,OAAO,CAAC,EAAE,CAAC;AAC/D,CAAC;AAED;;;;;GAKG;AACH,SAAS,aAAa,CACpB,MAA2B,EAC3B,SAAiB;IAEjB,OAAO;QACL,GAAG,MAAM;QACT,IAAI,EAAE,EAAE,GAAG,MAAM,CAAC,IAAI,EAAE,UAAU,EAAE,WAAW,CAAC,GAAG,EAAE,GAAG,SAAS,EAAE;KACpE,CAAC;AACJ,CAAC;AAED;;;;;;;;;;;;;;;;;;;;GAoBG;AACH,MAAM,UAAU,eAAe,CAC7B,MAA0B;IAG1B,MAAM,SAAS,GAAG,MAAM,CAAC,IAAI,CAAC,MAAM,CAAC,MAAM,CAAC,KAAK,CAAmB,CAAC;IAErE,IAAI,SAAS,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QAC3B,MAAM,IAAI,KAAK,CAAC,yDAAyD,CAAC,CAAC;IAC7E,CAAC;IAED,MAAM,YAAY,GAAG;QACnB,YAAY,EAAE,MAAM,CAAC,GAAG,EAAE,YAAY;QACtC,IAAI,EAAE,MAAM,CAAC,GAAG,EAAE,IAAI,IAAI,WAAW;QACrC,eAAe,EAAE,MAAM,CAAC,GAAG,EAAE,eAAe,IAAI,UAAU;KAClD,CAAC;IAEX,MAAM,YAAY,GAA4B;QAC5C,MAAM,EAAE,MAAM,CAAC,MAAM;QACrB,WAAW,EAAE,MAAM,CAAC,WAAW;QAC/B,UAAU,EAAE,MAAM,CAAC,UAAU;QAC7B,MAAM,EAAE,MAAM,CAAC,MAAM;KACtB,CAAC;IAEF,OAAO;QACL,KAAK,CAAC,OAAO,CAAC,OAAO;YACnB,MAAM,SAAS,GAAG,WAAW,CAAC,GAAG,EAAE,CAAC;YACpC,MAAM,WAAW,GAAG,IAAI,CAAC,KAAK,CAAC,OAAO,EAAE,MAAM,CAAC,KAAK,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,MAAM,CAAC,CAAC;YACpF,MAAM,OAAO,GAAG,KAAK,CAAC,KAAK,CAAC,MAAM,CAAC,MAAM,EAAE,WAAW,EAAE,IAAI,EAAE,OAAO,EAAE,YAAY,CAAC,CAAC;YAErF,MAAM,aAAa,GACjB,MAAM,CAAC,GAAG,KAAK,SAAS;gBACxB,CAAC,YAAY,CAAC,IAAI,KAAK,aAAa,IAAI,OAAO,CAAC,OAAO,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;YACtE
,IAAI,CAAC,aAAa,EAAE,CAAC;gBACnB,OAAO,aAAa,CAAC,OAAO,EAAE,SAAS,CAAC,CAAC;YAC3C,CAAC;YAED,MAAM,OAAO,GAAG,MAAM,CAAC,KAAK,CAAC,MAAM,CAAC,MAAM,EAAE,OAAO,EAAE,OAAO,EAAE,YAAY,CAAC,CAAC;YAC5E,MAAM,UAAU,GAAG,MAAM,MAAM,CAAC,GAAI,CAAC,QAAQ,CAAC,QAAQ,CAAC,OAAO,CAAC,CAAC;YAChE,MAAM,eAAe,GACnB,YAAY,CAAC,IAAI,KAAK,aAAa,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC,OAAO,CAAC,OAAO,CAAC;YACpE,MAAM,SAAS,GAAG,MAAM,CAAC,KAAK,CAC5B,MAAM,CAAC,MAAM,EACb,eAAe,EACf,UAAU,CAAC,MAAM,CAClB,CAAC;YACF,MAAM,KAAK,GAAG,KAAK,CAAC,KAAK,CAAC,MAAM,CAAC,MAAM,EAAE,WAAW,EAAE,SAAS,EAAE,OAAO,EAAE,YAAY,CAAC,CAAC;YACxF,OAAO,aAAa,CAAC,KAAK,EAAE,SAAS,CAAC,CAAC;QACzC,CAAC;QAED,WAAW,CAAC,OAAO;YACjB,MAAM,SAAS,GAAG,WAAW,CAAC,GAAG,EAAE,CAAC;YACpC,MAAM,WAAW,GAAG,IAAI,CAAC,KAAK,CAAC,OAAO,EAAE,MAAM,CAAC,KAAK,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,MAAM,CAAC,CAAC;YACpF,MAAM,OAAO,GAAG,KAAK,CAAC,KAAK,CAAC,MAAM,CAAC,MAAM,EAAE,WAAW,EAAE,IAAI,EAAE,OAAO,EAAE,YAAY,CAAC,CAAC;YACrF,OAAO,aAAa,CAAC,OAAO,EAAE,SAAS,CAAC,CAAC;QAC3C,CAAC;QAED,MAAM,CAAC,OAAO,EAAE,OAAO;YACrB,OAAO,MAAM,CAAC,KAAK,CAAC,MAAM,CAAC,MAAM,EAAE,OAAO,EAAE,OAAO,EAAE,YAAY,CAAC,CAAC;QACrE,CAAC;QAED,KAAK,CAAC,GAAG;YACP,OAAO,MAAM,CAAC,KAAK,CAAC,MAAM,CAAC,MAAM,EAAE,SAAS,EAAE,GAAG,CAAC,CAAC;QACrD,CAAC;QAED,KAAK,CAAC,OAAO,EAAE,SAAS,EAAE,OAAO;YAC/B,MAAM,SAAS,GAAG,WAAW,CAAC,GAAG,EAAE,CAAC;YACpC,MAAM,WAAW,GAAG,sBAAsB,CAAC,OAAO,EAAE,SAAS,CAAC,CAAC;YAC/D,MAAM,MAAM,GAAG,KAAK,CAAC,KAAK,CAAC,MAAM,CAAC,MAAM,EAAE,WAAW,EAAE,SAAS,EAAE,OAAO,EAAE,YAAY,CAAC,CAAC;YACzF,OAAO,aAAa,CAAC,MAAM,EAAE,SAAS,CAAC,CAAC;QAC1C,CAAC;KACF,CAAC;AACJ,CAAC"}
|
package/dist/index.d.ts
CHANGED
|
@@ -16,7 +16,7 @@ export { prompt } from './prompt.js';
|
|
|
16
16
|
export { validator } from './validate.js';
|
|
17
17
|
export type { ExtractionRule, RuleMatch, RulesResult, } from './types/rule.types.js';
|
|
18
18
|
export type { Extractor, ExtractorConfig, ExtractorLlmConfig, } from './types/extractor.types.js';
|
|
19
|
-
export type { LlmRequest } from './types/prompt.types.js';
|
|
19
|
+
export type { CrossCheckHints, LlmRequest, PromptBuildMode, PromptBuildOptions, } from './types/prompt.types.js';
|
|
20
20
|
export type { LlmProvider } from './types/provider.types.js';
|
|
21
21
|
export type { Logger } from './types/logger.types.js';
|
|
22
22
|
export type { Severity, Violation, Validator, } from './types/validate.types.js';
|
package/dist/index.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;GAUG;AAEH,OAAO,EAAE,eAAe,EAAE,MAAM,gBAAgB,CAAC;AACjD,OAAO,EAAE,IAAI,EAAE,MAAM,YAAY,CAAC;AAClC,OAAO,EAAE,KAAK,EAAE,MAAM,YAAY,CAAC;AACnC,OAAO,EAAE,MAAM,EAAE,MAAM,aAAa,CAAC;AACrC,OAAO,EAAE,SAAS,EAAE,MAAM,eAAe,CAAC;AAE1C,YAAY,EACV,cAAc,EACd,SAAS,EACT,WAAW,GACZ,MAAM,uBAAuB,CAAC;AAE/B,YAAY,EACV,SAAS,EACT,eAAe,EACf,kBAAkB,GACnB,MAAM,4BAA4B,CAAC;AAEpC,YAAY,
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;GAUG;AAEH,OAAO,EAAE,eAAe,EAAE,MAAM,gBAAgB,CAAC;AACjD,OAAO,EAAE,IAAI,EAAE,MAAM,YAAY,CAAC;AAClC,OAAO,EAAE,KAAK,EAAE,MAAM,YAAY,CAAC;AACnC,OAAO,EAAE,MAAM,EAAE,MAAM,aAAa,CAAC;AACrC,OAAO,EAAE,SAAS,EAAE,MAAM,eAAe,CAAC;AAE1C,YAAY,EACV,cAAc,EACd,SAAS,EACT,WAAW,GACZ,MAAM,uBAAuB,CAAC;AAE/B,YAAY,EACV,SAAS,EACT,eAAe,EACf,kBAAkB,GACnB,MAAM,4BAA4B,CAAC;AAEpC,YAAY,EACV,eAAe,EACf,UAAU,EACV,eAAe,EACf,kBAAkB,GACnB,MAAM,yBAAyB,CAAC;AACjC,YAAY,EAAE,WAAW,EAAE,MAAM,2BAA2B,CAAC;AAC7D,YAAY,EAAE,MAAM,EAAE,MAAM,yBAAyB,CAAC;AAEtD,YAAY,EACV,QAAQ,EACR,SAAS,EACT,SAAS,GACV,MAAM,2BAA2B,CAAC;AAEnC,YAAY,EACV,QAAQ,EACR,gBAAgB,EAChB,aAAa,EACb,cAAc,EACd,gBAAgB,EAChB,YAAY,EACZ,gBAAgB,EAChB,gBAAgB,EAChB,SAAS,EACT,iBAAiB,EACjB,UAAU,EACV,gBAAgB,GACjB,MAAM,wBAAwB,CAAC"}
|
package/dist/prompt.d.ts
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import type { z } from 'zod';
|
|
2
2
|
import type { ExtractionResult, LlmResult } from './types/merge.types.js';
|
|
3
|
-
import type { LlmRequest } from './types/prompt.types.js';
|
|
3
|
+
import type { LlmRequest, PromptBuildOptions } from './types/prompt.types.js';
|
|
4
4
|
/**
|
|
5
5
|
* Prompt-building primitives that turn a partial extraction result into an
|
|
6
6
|
* {@link LlmRequest} targeted at the fields the deterministic pass could not
|
|
@@ -8,27 +8,34 @@ import type { LlmRequest } from './types/prompt.types.js';
|
|
|
8
8
|
*/
|
|
9
9
|
export declare const prompt: {
|
|
10
10
|
/**
|
|
11
|
-
* Build an LLM request
|
|
12
|
-
* is a JSON Schema covering only those fields, and values already produced
|
|
13
|
-
* by the deterministic pass are surfaced both as `knownValues` and as a
|
|
14
|
-
* hint block prepended to `userContent`.
|
|
11
|
+
* Build an LLM request targeting a subset of the schema's fields.
|
|
15
12
|
*
|
|
16
|
-
*
|
|
13
|
+
* - In `'fill-gaps'` mode (default) the response schema covers only
|
|
14
|
+
* `partial.missing`, and rule values flow back to the LLM as hints both
|
|
15
|
+
* through `knownValues` and a prepended "Already extracted" block in
|
|
16
|
+
* `userContent`.
|
|
17
|
+
* - In `'cross-check'` mode the response schema covers every schema field,
|
|
18
|
+
* so {@link merge.apply} can surface agreements or disagreements with
|
|
19
|
+
* the rule pass. `crossCheckHints: 'unbiased'` (default) drops the hint
|
|
20
|
+
* block and empties `knownValues` so the LLM re-extracts from scratch;
|
|
21
|
+
* `'bias'` keeps the hints to save tokens at the cost of confirmation
|
|
22
|
+
* bias.
|
|
23
|
+
*
|
|
24
|
+
* Orchestration only: the four phases (response-schema build, known-values
|
|
17
25
|
* collection, user-content formatting, request assembly) each live in their
|
|
18
26
|
* own private helper above.
|
|
19
27
|
*
|
|
20
28
|
* @typeParam S - A Zod object schema describing the full target shape.
|
|
21
29
|
* @param schema - Zod object schema that drives the field selection.
|
|
22
30
|
* @param partial - Output of {@link merge.apply} (or any equivalent partial)
|
|
23
|
-
*
|
|
31
|
+
* `data` is always read; `missing` drives the fill-gaps schema and the
|
|
32
|
+
* hint block.
|
|
24
33
|
* @param content - Original text the request will refer to.
|
|
25
|
-
* @param options - Optional
|
|
26
|
-
* @throws When a
|
|
27
|
-
*
|
|
34
|
+
* @param options - Optional overrides: `systemPrompt`, `mode`, `crossCheckHints`.
|
|
35
|
+
* @throws When a target field uses an unsupported Zod kind; the error
|
|
36
|
+
* message names the offending field.
|
|
28
37
|
*/
|
|
29
|
-
build<S extends z.ZodObject<z.ZodRawShape>>(schema: S, partial: Pick<ExtractionResult<z.infer<S>>, "data" | "missing">, content: string, options?:
|
|
30
|
-
systemPrompt?: string;
|
|
31
|
-
}): LlmRequest;
|
|
38
|
+
build<S extends z.ZodObject<z.ZodRawShape>>(schema: S, partial: Pick<ExtractionResult<z.infer<S>>, "data" | "missing">, content: string, options?: PromptBuildOptions): LlmRequest;
|
|
32
39
|
/**
|
|
33
40
|
* Parse a raw LLM response permissively. Accepts either an already-decoded
|
|
34
41
|
* object or a JSON-encoded string. Each field listed in `missing` is
|
package/dist/prompt.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"prompt.d.ts","sourceRoot":"","sources":["../src/prompt.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAC7B,OAAO,KAAK,EAEV,gBAAgB,EAChB,SAAS,EACV,MAAM,wBAAwB,CAAC;AAChC,OAAO,KAAK,EAAE,UAAU,EAAE,MAAM,yBAAyB,CAAC;
|
|
1
|
+
{"version":3,"file":"prompt.d.ts","sourceRoot":"","sources":["../src/prompt.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAC7B,OAAO,KAAK,EAEV,gBAAgB,EAChB,SAAS,EACV,MAAM,wBAAwB,CAAC;AAChC,OAAO,KAAK,EAAE,UAAU,EAAE,kBAAkB,EAAE,MAAM,yBAAyB,CAAC;AAuN9E;;;;GAIG;AACH,eAAO,MAAM,MAAM;IACjB;;;;;;;;;;;;;;;;;;;;;;;;;;;OA2BG;UACG,CAAC,SAAS,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC,WAAW,CAAC,UAChC,CAAC,WACA,IAAI,CAAC,gBAAgB,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,EAAE,MAAM,GAAG,SAAS,CAAC,WACtD,MAAM,YACL,kBAAkB,GAC3B,UAAU;IAsBb;;;;;;;;;;;;;;;OAeG;UACG,CAAC,SAAS,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC,WAAW,CAAC,UAChC,CAAC,WACA,SAAS,CAAC,MAAM,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,EAAE,OACjC,OAAO,GACX,SAAS;CAiBb,CAAC"}
|
package/dist/prompt.js
CHANGED
|
@@ -1,41 +1,80 @@
|
|
|
1
1
|
const DEFAULT_SYSTEM_PROMPT = 'Extract the listed fields from the content as a JSON object.';
|
|
2
2
|
/**
|
|
3
|
-
* Convert a
|
|
4
|
-
*
|
|
5
|
-
*
|
|
6
|
-
* their schema.
|
|
3
|
+
* Convert a `z.nullable(inner)` into JSON Schema by recursing into `inner`
|
|
4
|
+
* and widening its `type` to `[innerType, 'null']`. Refuses nested nullables
|
|
5
|
+
* whose inner already carries a tuple `type`.
|
|
7
6
|
*/
|
|
8
|
-
function
|
|
9
|
-
const
|
|
10
|
-
const
|
|
11
|
-
if (
|
|
12
|
-
|
|
13
|
-
}
|
|
14
|
-
if (kind === 'number') {
|
|
15
|
-
return { type: 'number' };
|
|
16
|
-
}
|
|
17
|
-
if (kind === 'boolean') {
|
|
18
|
-
return { type: 'boolean' };
|
|
19
|
-
}
|
|
20
|
-
if (kind === 'enum') {
|
|
21
|
-
const entries = def.entries;
|
|
22
|
-
return { type: 'string', enum: Object.values(entries) };
|
|
7
|
+
function nullableToJsonSchema(def, field) {
|
|
8
|
+
const inner = zodFieldToJsonSchema(def.innerType, field);
|
|
9
|
+
const innerType = inner.type;
|
|
10
|
+
if (typeof innerType !== 'string') {
|
|
11
|
+
throw new Error(`Unsupported nested nullable on field "${field}"`);
|
|
23
12
|
}
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
13
|
+
return { ...inner, type: [innerType, 'null'] };
|
|
14
|
+
}
|
|
15
|
+
/**
|
|
16
|
+
* Convert a `z.object(shape)` into JSON Schema by recursing over every
|
|
17
|
+
* property. Children wrapped in `z.optional(...)` are kept in `properties`
|
|
18
|
+
* but excluded from the object-level `required` list.
|
|
19
|
+
*/
|
|
20
|
+
function objectToJsonSchema(def, field) {
|
|
21
|
+
const shape = def.shape;
|
|
22
|
+
const properties = {};
|
|
23
|
+
const required = [];
|
|
24
|
+
for (const [key, child] of Object.entries(shape)) {
|
|
25
|
+
properties[key] = zodFieldToJsonSchema(child, `${field}.${key}`);
|
|
26
|
+
if (child.def.type !== 'optional') {
|
|
27
|
+
required.push(key);
|
|
28
28
|
}
|
|
29
|
-
return { ...inner, type: [inner.type, 'null'] };
|
|
30
29
|
}
|
|
31
|
-
|
|
30
|
+
return { type: 'object', properties, required, additionalProperties: false };
|
|
31
|
+
}
|
|
32
|
+
/**
|
|
33
|
+
* Dispatch the conversion by Zod kind. Primitives short-circuit, wrappers
|
|
34
|
+
* (`optional`, `default`, `nullable`, `array`, `object`) recurse, unsupported
|
|
35
|
+
* kinds throw with the offending `field` named so the caller can restructure
|
|
36
|
+
* their schema.
|
|
37
|
+
*/
|
|
38
|
+
function zodKindToJsonSchema(def, kind, field) {
|
|
39
|
+
switch (kind) {
|
|
40
|
+
case 'string':
|
|
41
|
+
case 'number':
|
|
42
|
+
case 'boolean':
|
|
43
|
+
return { type: kind };
|
|
44
|
+
case 'enum':
|
|
45
|
+
return { type: 'string', enum: Object.values(def.entries) };
|
|
46
|
+
case 'nullable':
|
|
47
|
+
return nullableToJsonSchema(def, field);
|
|
48
|
+
case 'optional':
|
|
49
|
+
case 'default':
|
|
50
|
+
return zodFieldToJsonSchema(def.innerType, field);
|
|
51
|
+
case 'array':
|
|
52
|
+
return { type: 'array', items: zodFieldToJsonSchema(def.element, field) };
|
|
53
|
+
case 'object':
|
|
54
|
+
return objectToJsonSchema(def, field);
|
|
55
|
+
default:
|
|
56
|
+
throw new Error(`Unsupported Zod type "${kind}" on field "${field}"`);
|
|
57
|
+
}
|
|
58
|
+
}
|
|
59
|
+
/**
|
|
60
|
+
* Convert a single Zod field schema to JSON Schema. Wraps the kind-level
|
|
61
|
+
* dispatch with a `description` pass so `.describe()` / `.meta({ description })`
|
|
62
|
+
* registered at this recursion level flows through to the output; providers'
|
|
63
|
+
* structured-output features consume it natively.
|
|
64
|
+
*/
|
|
65
|
+
function zodFieldToJsonSchema(zodType, field) {
|
|
66
|
+
const schema = zodKindToJsonSchema(zodType.def, zodType.def.type, field);
|
|
67
|
+
const description = zodType.description;
|
|
68
|
+
return description ? { ...schema, description } : schema;
|
|
32
69
|
}
|
|
33
70
|
/**
|
|
34
71
|
* Build the JSON Schema handed to the LLM, restricted to the fields the
|
|
35
|
-
* deterministic pass could not produce.
|
|
72
|
+
* deterministic pass could not produce. Optional top-level fields are kept
|
|
73
|
+
* in `properties` but excluded from `required`.
|
|
36
74
|
*/
|
|
37
75
|
function buildResponseSchema(schema, missing) {
|
|
38
76
|
const properties = {};
|
|
77
|
+
const required = [];
|
|
39
78
|
const shape = schema.shape;
|
|
40
79
|
for (const field of missing) {
|
|
41
80
|
const zodField = shape[field];
|
|
@@ -43,8 +82,11 @@ function buildResponseSchema(schema, missing) {
|
|
|
43
82
|
continue;
|
|
44
83
|
}
|
|
45
84
|
properties[field] = zodFieldToJsonSchema(zodField, field);
|
|
85
|
+
if (zodField.def.type !== 'optional') {
|
|
86
|
+
required.push(field);
|
|
87
|
+
}
|
|
46
88
|
}
|
|
47
|
-
return { type: 'object', properties, required:
|
|
89
|
+
return { type: 'object', properties, required, additionalProperties: false };
|
|
48
90
|
}
|
|
49
91
|
/**
|
|
50
92
|
* Pick the non-null, non-missing entries of the partial result — the values
|
|
@@ -142,28 +184,44 @@ function collectUnexpectedKeys(object, missing) {
|
|
|
142
184
|
*/
|
|
143
185
|
export const prompt = {
|
|
144
186
|
/**
|
|
145
|
-
* Build an LLM request
|
|
146
|
-
*
|
|
147
|
-
*
|
|
148
|
-
*
|
|
187
|
+
* Build an LLM request targeting a subset of the schema's fields.
|
|
188
|
+
*
|
|
189
|
+
* - In `'fill-gaps'` mode (default) the response schema covers only
|
|
190
|
+
* `partial.missing`, and rule values flow back to the LLM as hints both
|
|
191
|
+
* through `knownValues` and a prepended "Already extracted" block in
|
|
192
|
+
* `userContent`.
|
|
193
|
+
* - In `'cross-check'` mode the response schema covers every schema field,
|
|
194
|
+
* so {@link merge.apply} can surface agreements or disagreements with
|
|
195
|
+
* the rule pass. `crossCheckHints: 'unbiased'` (default) drops the hint
|
|
196
|
+
* block and empties `knownValues` so the LLM re-extracts from scratch;
|
|
197
|
+
* `'bias'` keeps the hints to save tokens at the cost of confirmation
|
|
198
|
+
* bias.
|
|
149
199
|
*
|
|
150
|
-
* Orchestration only
|
|
200
|
+
* Orchestration only: the four phases (response-schema build, known-values
|
|
151
201
|
* collection, user-content formatting, request assembly) each live in their
|
|
152
202
|
* own private helper above.
|
|
153
203
|
*
|
|
154
204
|
* @typeParam S - A Zod object schema describing the full target shape.
|
|
155
205
|
* @param schema - Zod object schema that drives the field selection.
|
|
156
206
|
* @param partial - Output of {@link merge.apply} (or any equivalent partial)
|
|
157
|
-
*
|
|
207
|
+
* `data` is always read; `missing` drives the fill-gaps schema and the
|
|
208
|
+
* hint block.
|
|
158
209
|
* @param content - Original text the request will refer to.
|
|
159
|
-
* @param options - Optional
|
|
160
|
-
* @throws When a
|
|
161
|
-
*
|
|
210
|
+
* @param options - Optional overrides: `systemPrompt`, `mode`, `crossCheckHints`.
|
|
211
|
+
* @throws When a target field uses an unsupported Zod kind; the error
|
|
212
|
+
* message names the offending field.
|
|
162
213
|
*/
|
|
163
214
|
build(schema, partial, content, options) {
|
|
164
|
-
const
|
|
165
|
-
const
|
|
166
|
-
const
|
|
215
|
+
const mode = options?.mode ?? 'fill-gaps';
|
|
216
|
+
const crossCheckHints = options?.crossCheckHints ?? 'unbiased';
|
|
217
|
+
const targetFields = mode === 'cross-check'
|
|
218
|
+
? Object.keys(schema.shape)
|
|
219
|
+
: partial.missing;
|
|
220
|
+
const responseSchema = buildResponseSchema(schema, targetFields);
|
|
221
|
+
const exposeHints = mode === 'fill-gaps' || crossCheckHints === 'bias';
|
|
222
|
+
const knownValues = exposeHints
|
|
223
|
+
? collectKnownValues(partial.data, partial.missing)
|
|
224
|
+
: {};
|
|
167
225
|
const userContent = formatUserContent(content, knownValues);
|
|
168
226
|
return {
|
|
169
227
|
systemPrompt: options?.systemPrompt ?? DEFAULT_SYSTEM_PROMPT,
|
package/dist/prompt.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"prompt.js","sourceRoot":"","sources":["../src/prompt.ts"],"names":[],"mappings":"AAQA,MAAM,qBAAqB,GACzB,8DAA8D,CAAC;AAKjE
|
|
1
|
+
{"version":3,"file":"prompt.js","sourceRoot":"","sources":["../src/prompt.ts"],"names":[],"mappings":"AAQA,MAAM,qBAAqB,GACzB,8DAA8D,CAAC;AAKjE;;;;GAIG;AACH,SAAS,oBAAoB,CAAC,GAAgB,EAAE,KAAa;IAC3D,MAAM,KAAK,GAAG,oBAAoB,CAAC,GAAG,CAAC,SAAoB,EAAE,KAAK,CAAC,CAAC;IACpE,MAAM,SAAS,GAAG,KAAK,CAAC,IAAI,CAAC;IAC7B,IAAI,OAAO,SAAS,KAAK,QAAQ,EAAE,CAAC;QAClC,MAAM,IAAI,KAAK,CAAC,yCAAyC,KAAK,GAAG,CAAC,CAAC;IACrE,CAAC;IACD,OAAO,EAAE,GAAG,KAAK,EAAE,IAAI,EAAE,CAAC,SAAS,EAAE,MAAM,CAAC,EAAE,CAAC;AACjD,CAAC;AAED;;;;GAIG;AACH,SAAS,kBAAkB,CAAC,GAAgB,EAAE,KAAa;IACzD,MAAM,KAAK,GAAG,GAAG,CAAC,KAAgC,CAAC;IACnD,MAAM,UAAU,GAA4B,EAAE,CAAC;IAC/C,MAAM,QAAQ,GAAa,EAAE,CAAC;IAC9B,KAAK,MAAM,CAAC,GAAG,EAAE,KAAK,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,KAAK,CAAC,EAAE,CAAC;QACjD,UAAU,CAAC,GAAG,CAAC,GAAG,oBAAoB,CAAC,KAAK,EAAE,GAAG,KAAK,IAAI,GAAG,EAAE,CAAC,CAAC;QACjE,IAAI,KAAK,CAAC,GAAG,CAAC,IAAI,KAAK,UAAU,EAAE,CAAC;YAClC,QAAQ,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;QACrB,CAAC;IACH,CAAC;IACD,OAAO,EAAE,IAAI,EAAE,QAAQ,EAAE,UAAU,EAAE,QAAQ,EAAE,oBAAoB,EAAE,KAAK,EAAE,CAAC;AAC/E,CAAC;AAED;;;;;GAKG;AACH,SAAS,mBAAmB,CAC1B,GAAgB,EAChB,IAAY,EACZ,KAAa;IAEb,QAAQ,IAAI,EAAE,CAAC;QACb,KAAK,QAAQ,CAAC;QACd,KAAK,QAAQ,CAAC;QACd,KAAK,SAAS;YACZ,OAAO,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;QACxB,KAAK,MAAM;YACT,OAAO,EAAE,IAAI,EAAE,QAAQ,EAAE,IAAI,EAAE,MAAM,CAAC,MAAM,CAAC,GAAG,CAAC,OAA0C,CAAC,EAAE,CAAC;QACjG,KAAK,UAAU;YACb,OAAO,oBAAoB,CAAC,GAAG,EAAE,KAAK,CAAC,CAAC;QAC1C,KAAK,UAAU,CAAC;QAChB,KAAK,SAAS;YACZ,OAAO,oBAAoB,CAAC,GAAG,CAAC,SAAoB,EAAE,KAAK,CAAC,CAAC;QAC/D,KAAK,OAAO;YACV,OAAO,EAAE,IAAI,EAAE,OAAO,EAAE,KAAK,EAAE,oBAAoB,CAAC,GAAG,CAAC,OAAkB,EAAE,KAAK,CAAC,EAAE,CAAC;QACvF,KAAK,QAAQ;YACX,OAAO,kBAAkB,CAAC,GAAG,EAAE,KAAK,CAAC,CAAC;QACxC;YACE,MAAM,IAAI,KAAK,CAAC,yBAAyB,IAAI,eAAe,KAAK,GAAG,CAAC,CAAC;IAC1E,CAAC;AACH,CAAC;AAED;;;;;GAKG;AACH,SAAS,oBAAoB,CAAC,OAAgB,EAAE,KAAa;IAC3D,MAAM,MAAM,GAAG,mBAAmB,CAAC,OAAO,CAAC,GAAG,EAAE,OAAO,CAAC,GAAG,CAAC,IAAI,EAAE,KAAK,CAAC,CAAC;IACzE,MAAM,WAAW,GAAG,OAAO,CAAC,WAAW,CAAC;IACxC,OAAO,WAAW,CAAC,CAAC,CAAC,EAAE,GAAG,MAAM,EAAE,WAAW,EA
AE,CAAC,CAAC,CAAC,MAAM,CAAC;AAC3D,CAAC;AAED;;;;GAIG;AACH,SAAS,mBAAmB,CAC1B,MAAkC,EAClC,OAA0B;IAE1B,MAAM,UAAU,GAA4C,EAAE,CAAC;IAC/D,MAAM,QAAQ,GAAa,EAAE,CAAC;IAC9B,MAAM,KAAK,GAAG,MAAM,CAAC,KAA2C,CAAC;IACjE,KAAK,MAAM,KAAK,IAAI,OAAO,EAAE,CAAC;QAC5B,MAAM,QAAQ,GAAG,KAAK,CAAC,KAAK,CAAC,CAAC;QAC9B,IAAI,QAAQ,KAAK,SAAS,EAAE,CAAC;YAC3B,SAAS;QACX,CAAC;QACD,UAAU,CAAC,KAAK,CAAC,GAAG,oBAAoB,CAAC,QAAQ,EAAE,KAAK,CAAC,CAAC;QAC1D,IAAI,QAAQ,CAAC,GAAG,CAAC,IAAI,KAAK,UAAU,EAAE,CAAC;YACrC,QAAQ,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;QACvB,CAAC;IACH,CAAC;IACD,OAAO,EAAE,IAAI,EAAE,QAAQ,EAAE,UAAU,EAAE,QAAQ,EAAE,oBAAoB,EAAE,KAAK,EAAE,CAAC;AAC/E,CAAC;AAED;;;GAGG;AACH,SAAS,kBAAkB,CACzB,IAAsB,EACtB,OAA6B;IAE7B,MAAM,UAAU,GAAG,IAAI,GAAG,CAAS,OAA4B,CAAC,CAAC;IACjE,MAAM,KAAK,GAA4B,EAAE,CAAC;IAC1C,KAAK,MAAM,CAAC,GAAG,EAAE,KAAK,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,IAAI,CAAC,EAAE,CAAC;QAChD,IAAI,UAAU,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC;YACxB,SAAS;QACX,CAAC;QACD,IAAI,KAAK,KAAK,IAAI,IAAI,KAAK,KAAK,SAAS,EAAE,CAAC;YAC1C,SAAS;QACX,CAAC;QACD,KAAK,CAAC,GAAG,CAAC,GAAG,KAAK,CAAC;IACrB,CAAC;IACD,OAAO,KAAK,CAAC;AACf,CAAC;AAED;;;;GAIG;AACH,SAAS,iBAAiB,CAAC,OAAe,EAAE,WAAoC;IAC9E,MAAM,IAAI,GAAG,MAAM,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC;IACtC,IAAI,IAAI,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACtB,OAAO,OAAO,CAAC;IACjB,CAAC;IACD,MAAM,KAAK,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,GAAG,EAAE,EAAE,CAAC,KAAK,GAAG,MAAM,IAAI,CAAC,SAAS,CAAC,WAAW,CAAC,GAAG,CAAC,CAAC,EAAE,CAAC,CAAC;IAClF,OAAO,uBAAuB,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,OAAO,OAAO,EAAE,CAAC;AACjE,CAAC;AAED;;;;GAIG;AACH,SAAS,SAAS,CAChB,GAAY;IAEZ,IAAI,SAAS,GAAY,GAAG,CAAC;IAC7B,IAAI,OAAO,GAAG,KAAK,QAAQ,EAAE,CAAC;QAC5B,IAAI,CAAC;YACH,SAAS,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC;QAC9B,CAAC;QAAC,MAAM,CAAC;YACP,OAAO,EAAE,OAAO,EAAE,4BAA4B,EAAE,CAAC;QACnD,CAAC;IACH,CAAC;IACD,IACE,SAAS,KAAK,IAAI;QAClB,OAAO,SAAS,KAAK,QAAQ;QAC7B,KAAK,CAAC,OAAO,CAAC,SAAS,CAAC,EACxB,CAAC;QACD,OAAO,EAAE,OAAO,EAAE,4BAA4B,EAAE,CAAC;IACnD,CAAC;IACD,OAAO,EAAE,MAAM,EAAE,SAAoC,EAAE,CAAC;AAC1D,CAAC;AAED;;;GAGG;AACH,SAAS,qBAAqB,CAC5B,MAAkC,EAClC,OAA0B,EAC
1B,MAA+B;IAE/B,MAAM,KAAK,GAAG,MAAM,CAAC,KAAkC,CAAC;IACxD,MAAM,MAAM,GAA4B,EAAE,CAAC;IAC3C,MAAM,QAAQ,GAAa,EAAE,CAAC;IAC9B,KAAK,MAAM,KAAK,IAAI,OAAO,EAAE,CAAC;QAC5B,IAAI,CAAC,CAAC,KAAK,IAAI,MAAM,CAAC,EAAE,CAAC;YACvB,SAAS;QACX,CAAC;QACD,MAAM,WAAW,GAAG,KAAK,CAAC,KAAK,CAAC,CAAC;QACjC,IAAI,WAAW,KAAK,SAAS,EAAE,CAAC;YAC9B,SAAS;QACX,CAAC;QACD,MAAM,MAAM,GAAG,WAAW,CAAC,SAAS,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,CAAC;QACpD,IAAI,MAAM,CAAC,OAAO,EAAE,CAAC;YACnB,MAAM,CAAC,KAAK,CAAC,GAAG,MAAM,CAAC,IAAI,CAAC;QAC9B,CAAC;aAAM,CAAC;YACN,MAAM,MAAM,GAAG,MAAM,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,OAAO,IAAI,eAAe,CAAC;YAClE,QAAQ,CAAC,IAAI,CAAC,SAAS,KAAK,KAAK,MAAM,EAAE,CAAC,CAAC;QAC7C,CAAC;IACH,CAAC;IACD,OAAO,EAAE,MAAM,EAAE,QAAQ,EAAE,CAAC;AAC9B,CAAC;AAED;;;;GAIG;AACH,SAAS,qBAAqB,CAC5B,MAA+B,EAC/B,OAA0B;IAE1B,MAAM,UAAU,GAAG,IAAI,GAAG,CAAC,OAAO,CAAC,CAAC;IACpC,OAAO,MAAM,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,EAAE,CAAC,CAAC,UAAU,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC,CAAC;AACnE,CAAC;AAED;;;;GAIG;AACH,MAAM,CAAC,MAAM,MAAM,GAAG;IACpB;;;;;;;;;;;;;;;;;;;;;;;;;;;OA2BG;IACH,KAAK,CACH,MAAS,EACT,OAA+D,EAC/D,OAAe,EACf,OAA4B;QAG5B,MAAM,IAAI,GAAG,OAAO,EAAE,IAAI,IAAI,WAAW,CAAC;QAC1C,MAAM,eAAe,GAAG,OAAO,EAAE,eAAe,IAAI,UAAU,CAAC;QAC/D,MAAM,YAAY,GAChB,IAAI,KAAK,aAAa;YACpB,CAAC,CAAE,MAAM,CAAC,IAAI,CAAC,MAAM,CAAC,KAAK,CAAuB;YAClD,CAAC,CAAE,OAAO,CAAC,OAA6B,CAAC;QAC7C,MAAM,cAAc,GAAG,mBAAmB,CAAC,MAAM,EAAE,YAAY,CAAC,CAAC;QACjE,MAAM,WAAW,GAAG,IAAI,KAAK,WAAW,IAAI,eAAe,KAAK,MAAM,CAAC;QACvE,MAAM,WAAW,GAAG,WAAW;YAC7B,CAAC,CAAC,kBAAkB,CAAO,OAAO,CAAC,IAAI,EAAE,OAAO,CAAC,OAAO,CAAC;YACzD,CAAC,CAAC,EAAE,CAAC;QACP,MAAM,WAAW,GAAG,iBAAiB,CAAC,OAAO,EAAE,WAAW,CAAC,CAAC;QAC5D,OAAO;YACL,YAAY,EAAE,OAAO,EAAE,YAAY,IAAI,qBAAqB;YAC5D,WAAW;YACX,cAAc;YACd,WAAW;SACZ,CAAC;IACJ,CAAC;IAED;;;;;;;;;;;;;;;OAeG;IACH,KAAK,CACH,MAAS,EACT,OAAsC,EACtC,GAAY;QAEZ,MAAM,WAAW,GAAG,OAA4B,CAAC;QACjD,MAAM,OAAO,GAAG,SAAS,CAAC,GAAG,CAAC,CAAC;QAC/B,IAAI,SAAS,IAAI,OAAO,EAAE,CAAC;YACzB,OAAO,EAAE,MAAM,EAAE,EAAE,EAAE,QAAQ,EAAE,CAAC,OAAO,CAAC,OAAO,CAAC,EAAE,CAAC;QACrD,
CAAC;QACD,MAAM,EAAE,MAAM,EAAE,QAAQ,EAAE,GAAG,qBAAqB,CAChD,MAAM,EACN,WAAW,EACX,OAAO,CAAC,MAAM,CACf,CAAC;QACF,MAAM,UAAU,GAAG,qBAAqB,CAAC,OAAO,CAAC,MAAM,EAAE,WAAW,CAAC,CAAC;QACtE,IAAI,UAAU,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAC1B,QAAQ,CAAC,IAAI,CAAC,8BAA8B,UAAU,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;QACvE,CAAC;QACD,OAAO,QAAQ,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,EAAE,MAAM,EAAE,QAAQ,EAAE,CAAC,CAAC,CAAC,EAAE,MAAM,EAAE,CAAC;IACjE,CAAC;CACF,CAAC"}
|
|
@@ -1,8 +1,10 @@
|
|
|
1
1
|
import type { z } from 'zod';
|
|
2
2
|
import type { ExtractionRule } from './rule.types.js';
|
|
3
|
-
import type { ExtractionResult, LlmResult } from './merge.types.js';
|
|
3
|
+
import type { ExtractedData, ExtractionResult, FieldMergePolicy, LlmResult, Normalizer } from './merge.types.js';
|
|
4
4
|
import type { LlmProvider } from './provider.types.js';
|
|
5
|
-
import type {
|
|
5
|
+
import type { Logger } from './logger.types.js';
|
|
6
|
+
import type { Validator } from './validate.types.js';
|
|
7
|
+
import type { CrossCheckHints, LlmRequest, PromptBuildMode } from './prompt.types.js';
|
|
6
8
|
/**
|
|
7
9
|
* LLM-fallback section of {@link ExtractorConfig}. When present, the
|
|
8
10
|
* extractor hands the fields the deterministic rules could not produce to
|
|
@@ -13,6 +15,18 @@ export type ExtractorLlmConfig = {
|
|
|
13
15
|
provider: LlmProvider;
|
|
14
16
|
/** Optional override for {@link LlmRequest.systemPrompt}; defaults to the {@link prompt.build} built-in. */
|
|
15
17
|
systemPrompt?: string;
|
|
18
|
+
/**
|
|
19
|
+
* Field-selection strategy passed to {@link prompt.build}. In
|
|
20
|
+
* `'cross-check'` mode the extractor always calls the LLM (even when the
|
|
21
|
+
* rules resolved every field) so the merge step can surface agreements or
|
|
22
|
+
* conflicts. Defaults to `'fill-gaps'`.
|
|
23
|
+
*/
|
|
24
|
+
mode?: PromptBuildMode;
|
|
25
|
+
/**
|
|
26
|
+
* Hint-exposure policy for cross-check mode. Defaults to `'unbiased'`.
|
|
27
|
+
* Ignored when `mode !== 'cross-check'`.
|
|
28
|
+
*/
|
|
29
|
+
crossCheckHints?: CrossCheckHints;
|
|
16
30
|
};
|
|
17
31
|
/**
|
|
18
32
|
* Configuration accepted by {@link createExtractor}. A schema describes the
|
|
@@ -28,6 +42,14 @@ export type ExtractorConfig<S extends z.ZodObject<z.ZodRawShape>> = {
|
|
|
28
42
|
rules: ExtractionRule[];
|
|
29
43
|
/** Optional LLM fallback invoked for fields the rules could not produce. */
|
|
30
44
|
llm?: ExtractorLlmConfig;
|
|
45
|
+
/** Post-merge transformations, forwarded to every `merge.apply` call. */
|
|
46
|
+
normalizers?: Normalizer<z.infer<S>>[];
|
|
47
|
+
/** Invariants checked on the normalized data; populate `result.validation`. */
|
|
48
|
+
validators?: Validator<ExtractedData<z.infer<S>>>[];
|
|
49
|
+
/** Overrides for the per-field merge policy (conflict strategy, confidences, compare). */
|
|
50
|
+
policy?: Partial<FieldMergePolicy>;
|
|
51
|
+
/** Logger propagated through the merge pipeline for warnings and fallbacks. */
|
|
52
|
+
logger?: Logger;
|
|
31
53
|
};
|
|
32
54
|
/**
|
|
33
55
|
* Public surface returned by {@link createExtractor}. Methods are added to
|
|
@@ -50,8 +72,11 @@ export type Extractor<T> = {
|
|
|
50
72
|
*/
|
|
51
73
|
extractSync(content: string): ExtractionResult<T>;
|
|
52
74
|
/**
|
|
53
|
-
* Build the LLM request for
|
|
54
|
-
*
|
|
75
|
+
* Build the LLM request for `partial`. The target field set depends on the
|
|
76
|
+
* configured `llm.mode`: `'fill-gaps'` (default) covers only
|
|
77
|
+
* `partial.missing`; `'cross-check'` covers every schema field. Delegates
|
|
78
|
+
* to {@link prompt.build} with the bound schema and the configured
|
|
79
|
+
* `systemPrompt` / `crossCheckHints`.
|
|
55
80
|
*/
|
|
56
81
|
prompt(content: string, partial: ExtractionResult<T>): LlmRequest;
|
|
57
82
|
/**
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"extractor.types.d.ts","sourceRoot":"","sources":["../../src/types/extractor.types.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAC7B,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,iBAAiB,CAAC;AACtD,OAAO,KAAK,
|
|
1
|
+
{"version":3,"file":"extractor.types.d.ts","sourceRoot":"","sources":["../../src/types/extractor.types.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAC7B,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,iBAAiB,CAAC;AACtD,OAAO,KAAK,EACV,aAAa,EACb,gBAAgB,EAChB,gBAAgB,EAChB,SAAS,EACT,UAAU,EACX,MAAM,kBAAkB,CAAC;AAC1B,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,qBAAqB,CAAC;AACvD,OAAO,KAAK,EAAE,MAAM,EAAE,MAAM,mBAAmB,CAAC;AAChD,OAAO,KAAK,EAAE,SAAS,EAAE,MAAM,qBAAqB,CAAC;AACrD,OAAO,KAAK,EAAE,eAAe,EAAE,UAAU,EAAE,eAAe,EAAE,MAAM,mBAAmB,CAAC;AAEtF;;;;GAIG;AACH,MAAM,MAAM,kBAAkB,GAAG;IAC/B,+DAA+D;IAC/D,QAAQ,EAAE,WAAW,CAAC;IACtB,4GAA4G;IAC5G,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB;;;;;OAKG;IACH,IAAI,CAAC,EAAE,eAAe,CAAC;IACvB;;;OAGG;IACH,eAAe,CAAC,EAAE,eAAe,CAAC;CACnC,CAAC;AAEF;;;;;;GAMG;AACH,MAAM,MAAM,eAAe,CAAC,CAAC,SAAS,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC,WAAW,CAAC,IAAI;IAClE,2FAA2F;IAC3F,MAAM,EAAE,CAAC,CAAC;IACV,qFAAqF;IACrF,KAAK,EAAE,cAAc,EAAE,CAAC;IACxB,4EAA4E;IAC5E,GAAG,CAAC,EAAE,kBAAkB,CAAC;IACzB,yEAAyE;IACzE,WAAW,CAAC,EAAE,UAAU,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;IACvC,+EAA+E;IAC/E,UAAU,CAAC,EAAE,SAAS,CAAC,aAAa,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;IACpD,0FAA0F;IAC1F,MAAM,CAAC,EAAE,OAAO,CAAC,gBAAgB,CAAC,CAAC;IACnC,+EAA+E;IAC/E,MAAM,CAAC,EAAE,MAAM,CAAC;CACjB,CAAC;AAEF;;;;;GAKG;AACH,MAAM,MAAM,SAAS,CAAC,CAAC,IAAI;IACzB;;;;OAIG;IACH,OAAO,CAAC,OAAO,EAAE,MAAM,GAAG,OAAO,CAAC,gBAAgB,CAAC,CAAC,CAAC,CAAC,CAAC;IACvD;;;;;OAKG;IACH,WAAW,CAAC,OAAO,EAAE,MAAM,GAAG,gBAAgB,CAAC,CAAC,CAAC,CAAC;IAClD;;;;;;OAMG;IACH,MAAM,CAAC,OAAO,EAAE,MAAM,EAAE,OAAO,EAAE,gBAAgB,CAAC,CAAC,CAAC,GAAG,UAAU,CAAC;IAClE;;;;;OAKG;IACH,KAAK,CAAC,GAAG,EAAE,OAAO,GAAG,SAAS,CAAC;IAC/B;;;;;OAKG;IACH,KAAK,CACH,OAAO,EAAE,gBAAgB,CAAC,CAAC,CAAC,EAC5B,SAAS,EAAE,SAAS,EACpB,OAAO,EAAE,MAAM,GACd,gBAAgB,CAAC,CAAC,CAAC,CAAC;CACxB,CAAC"}
|
|
@@ -1,3 +1,32 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Selects whether the LLM is asked only about fields the deterministic pass
|
|
3
|
+
* could not produce (`'fill-gaps'`, default) or about every schema field
|
|
4
|
+
* (`'cross-check'`, enabling agreement/conflict detection with the rules).
|
|
5
|
+
*/
|
|
6
|
+
export type PromptBuildMode = 'fill-gaps' | 'cross-check';
|
|
7
|
+
/**
|
|
8
|
+
* Whether a cross-check request exposes the rule values to the LLM as hints.
|
|
9
|
+
*
|
|
10
|
+
* - `'unbiased'` (default): no hints, the LLM re-extracts every field from
|
|
11
|
+
* scratch, enabling genuine disagreement detection.
|
|
12
|
+
* - `'bias'`: prepend the rule values as an "Already extracted" block, same
|
|
13
|
+
* shape as fill-gaps. Saves tokens when the caller trusts the rules and
|
|
14
|
+
* only wants a quick sanity check.
|
|
15
|
+
*
|
|
16
|
+
* Ignored when `mode !== 'cross-check'`.
|
|
17
|
+
*/
|
|
18
|
+
export type CrossCheckHints = 'bias' | 'unbiased';
|
|
19
|
+
/**
|
|
20
|
+
* Optional behavior overrides for `prompt.build`.
|
|
21
|
+
*/
|
|
22
|
+
export type PromptBuildOptions = {
|
|
23
|
+
/** Custom system prompt sent to the provider; falls back to a built-in. */
|
|
24
|
+
systemPrompt?: string;
|
|
25
|
+
/** Field-selection strategy. Defaults to `'fill-gaps'`. */
|
|
26
|
+
mode?: PromptBuildMode;
|
|
27
|
+
/** Hint-exposure policy in cross-check mode. Defaults to `'unbiased'`. */
|
|
28
|
+
crossCheckHints?: CrossCheckHints;
|
|
29
|
+
};
|
|
1
30
|
/**
|
|
2
31
|
* A fully-built request ready to be handed to an LLM provider. Produced by
|
|
3
32
|
* {@link prompt.build} from a schema and a partial extraction result.
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"prompt.types.d.ts","sourceRoot":"","sources":["../../src/types/prompt.types.ts"],"names":[],"mappings":"AAAA;;;GAGG;AACH,MAAM,MAAM,UAAU,GAAG;IACvB,0DAA0D;IAC1D,YAAY,EAAE,MAAM,CAAC;IACrB,6EAA6E;IAC7E,WAAW,EAAE,MAAM,CAAC;IACpB;;;OAGG;IACH,cAAc,EAAE,MAAM,CAAC;IACvB;;;;OAIG;IACH,WAAW,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;CACtC,CAAC"}
|
|
1
|
+
{"version":3,"file":"prompt.types.d.ts","sourceRoot":"","sources":["../../src/types/prompt.types.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AACH,MAAM,MAAM,eAAe,GAAG,WAAW,GAAG,aAAa,CAAC;AAE1D;;;;;;;;;;GAUG;AACH,MAAM,MAAM,eAAe,GAAG,MAAM,GAAG,UAAU,CAAC;AAElD;;GAEG;AACH,MAAM,MAAM,kBAAkB,GAAG;IAC/B,2EAA2E;IAC3E,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,2DAA2D;IAC3D,IAAI,CAAC,EAAE,eAAe,CAAC;IACvB,0EAA0E;IAC1E,eAAe,CAAC,EAAE,eAAe,CAAC;CACnC,CAAC;AAEF;;;GAGG;AACH,MAAM,MAAM,UAAU,GAAG;IACvB,0DAA0D;IAC1D,YAAY,EAAE,MAAM,CAAC;IACrB,6EAA6E;IAC7E,WAAW,EAAE,MAAM,CAAC;IACpB;;;OAGG;IACH,cAAc,EAAE,MAAM,CAAC;IACvB;;;;OAIG;IACH,WAAW,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;CACtC,CAAC"}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "llmbic",
|
|
3
|
-
"version": "1.
|
|
3
|
+
"version": "1.1.0",
|
|
4
4
|
"description": "Hybrid data extraction — deterministic rules + LLM fallback, with per-field confidence scoring.",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"license": "MIT",
|
|
@@ -29,6 +29,7 @@
|
|
|
29
29
|
"test:watch": "vitest",
|
|
30
30
|
"typecheck": "tsc --noEmit",
|
|
31
31
|
"example:ollama": "tsx examples/ollama.ts",
|
|
32
|
+
"example:openai-batch": "tsx examples/openai-batch.ts",
|
|
32
33
|
"prepublishOnly": "npm run typecheck && npm test && npm run build"
|
|
33
34
|
},
|
|
34
35
|
"peerDependencies": {
|
|
@@ -37,6 +38,7 @@
|
|
|
37
38
|
"devDependencies": {
|
|
38
39
|
"@types/node": "^25.6.0",
|
|
39
40
|
"ollama": "^0.5.0",
|
|
41
|
+
"openai": "^6.34.0",
|
|
40
42
|
"tsx": "^4.19.0",
|
|
41
43
|
"typescript": "^5.7.0",
|
|
42
44
|
"vitest": "^4.0.0",
|