llmbic 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56) hide show
  1. package/CHANGELOG.md +22 -0
  2. package/LICENSE +21 -0
  3. package/README.md +351 -0
  4. package/dist/extractor.d.ts +19 -0
  5. package/dist/extractor.d.ts.map +1 -0
  6. package/dist/extractor.js +96 -0
  7. package/dist/extractor.js.map +1 -0
  8. package/dist/index.d.ts +24 -0
  9. package/dist/index.d.ts.map +1 -0
  10. package/dist/index.js +17 -0
  11. package/dist/index.js.map +1 -0
  12. package/dist/merge.d.ts +76 -0
  13. package/dist/merge.d.ts.map +1 -0
  14. package/dist/merge.js +230 -0
  15. package/dist/merge.js.map +1 -0
  16. package/dist/prompt.d.ts +50 -0
  17. package/dist/prompt.d.ts.map +1 -0
  18. package/dist/prompt.js +205 -0
  19. package/dist/prompt.js.map +1 -0
  20. package/dist/rules.d.ts +73 -0
  21. package/dist/rules.d.ts.map +1 -0
  22. package/dist/rules.js +118 -0
  23. package/dist/rules.js.map +1 -0
  24. package/dist/types/extractor.types.d.ts +72 -0
  25. package/dist/types/extractor.types.d.ts.map +1 -0
  26. package/dist/types/extractor.types.js +2 -0
  27. package/dist/types/extractor.types.js.map +1 -0
  28. package/dist/types/logger.types.d.ts +12 -0
  29. package/dist/types/logger.types.d.ts.map +1 -0
  30. package/dist/types/logger.types.js +2 -0
  31. package/dist/types/logger.types.js.map +1 -0
  32. package/dist/types/merge.types.d.ts +159 -0
  33. package/dist/types/merge.types.d.ts.map +1 -0
  34. package/dist/types/merge.types.js +2 -0
  35. package/dist/types/merge.types.js.map +1 -0
  36. package/dist/types/prompt.types.d.ts +22 -0
  37. package/dist/types/prompt.types.d.ts.map +1 -0
  38. package/dist/types/prompt.types.js +2 -0
  39. package/dist/types/prompt.types.js.map +1 -0
  40. package/dist/types/provider.types.d.ts +21 -0
  41. package/dist/types/provider.types.d.ts.map +1 -0
  42. package/dist/types/provider.types.js +2 -0
  43. package/dist/types/provider.types.js.map +1 -0
  44. package/dist/types/rule.types.d.ts +38 -0
  45. package/dist/types/rule.types.d.ts.map +1 -0
  46. package/dist/types/rule.types.js +2 -0
  47. package/dist/types/rule.types.js.map +1 -0
  48. package/dist/types/validate.types.d.ts +25 -0
  49. package/dist/types/validate.types.d.ts.map +1 -0
  50. package/dist/types/validate.types.js +2 -0
  51. package/dist/types/validate.types.js.map +1 -0
  52. package/dist/validate.d.ts +57 -0
  53. package/dist/validate.d.ts.map +1 -0
  54. package/dist/validate.js +46 -0
  55. package/dist/validate.js.map +1 -0
  56. package/package.json +59 -0
package/CHANGELOG.md ADDED
@@ -0,0 +1,22 @@
1
+ # Changelog
2
+
3
+ All notable changes to this project will be documented in this file.
4
+
5
+ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
6
+ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
+
8
+ ## [1.0.0] — 2026-04-15
9
+
10
+ Initial public release.
11
+
12
+ ### Added
13
+
14
+ - `createExtractor(config)` — factory binding a Zod schema, deterministic rules and an optional LLM fallback into an extractor with `extract`, `extractSync`, `prompt`, `parse` and `merge` methods. Covers both one-shot async extraction and 4-step batch flows (extractSync → prompt → external LLM call → parse → merge).
15
+ - `rule` namespace — `rule.create(field, extractFn)`, `rule.regex(field, pattern, score, transform?)`, `rule.confidence(value, score)`, `rule.apply(content, rules, schema, logger?)`. Deterministic rules are pure synchronous functions returning typed matches with a confidence score in `[0, 1]`.
16
+ - `merge` namespace — `merge.apply(schema, rulesResult, llmResult, content, options?)` fuses rules output with LLM output, detects per-field conflicts, runs normalizers, re-validates against the Zod schema, and runs custom validators. `merge.defaultFieldPolicy` exposes the built-in fusion rules.
17
+ - `prompt` namespace — `prompt.build(schema, partial, content, options?)` emits an `LlmRequest` (`systemPrompt`, `userContent`, `responseSchema`, `knownValues`) restricted to fields missing from the deterministic pass. `prompt.parse(schema, missing, raw)` is a permissive parser that validates each field individually via Zod, drops invalid or unexpected keys, and never throws.
18
+ - `validator` namespace — `validator.of<T>()` returns `{ field, crossField }` factories bound to the data shape `T`, so predicates are fully typed from the field name.
19
+ - `LlmProvider` contract — single-method interface (`complete(request) → { values }`) consumers implement to wire any backend (OpenAI, Anthropic, Ollama, custom HTTP, ...). No vendor SDK is pulled into the import graph.
20
+ - Per-field confidence scoring, conflict detection (`flag` / `prefer-rule` / `prefer-llm` strategies), and extraction metadata (`durationMs`, rule/LLM field counts).
21
+ - Full TypeScript `.d.ts` output with JSDoc on every public type, method and configuration field.
22
+ - Example wiring a local Ollama runtime under `examples/ollama.ts`.
package/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Nicolas Smeets <contact@devpixel.be>
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
package/README.md ADDED
@@ -0,0 +1,351 @@
1
+ # Llmbic
2
+
3
+ [![npm version](https://img.shields.io/npm/v/llmbic.svg)](https://www.npmjs.com/package/llmbic)
4
+ [![CI](https://github.com/devpixel-be/llmbic/actions/workflows/ci.yml/badge.svg)](https://github.com/devpixel-be/llmbic/actions/workflows/ci.yml)
5
+ [![license](https://img.shields.io/npm/l/llmbic.svg)](./LICENSE)
6
+ [![node](https://img.shields.io/node/v/llmbic.svg)](https://nodejs.org)
7
+
8
+ Hybrid data extraction — deterministic rules + LLM fallback, with per-field confidence scoring.
9
+
10
+ The name folds **LLM** into [*lambic*](https://en.wikipedia.org/wiki/Lambic), the Belgian beer made by blending wild fermentation with a controlled process. Same idea here: LLMs are unpredictable, rules are rigid, and the mix produces something reliable.
11
+
12
+ ## Why
13
+
14
+ Extracting structured data from unstructured text is a solved problem — until you need it to be *reliable*. Rules (regex, parsers) are deterministic but brittle. LLMs understand context but hallucinate. Neither is enough alone.
15
+
16
+ Llmbic combines both: deterministic rules extract what they can with full confidence, the LLM fills in the gaps, and a merge layer detects conflicts between the two. Every field carries a confidence score. You know exactly what's trustworthy and what needs review.
17
+
18
+ ## Install
19
+
20
+ ```bash
21
+ npm install llmbic
22
+ ```
23
+
24
+ Llmbic has a single dependency: [Zod](https://zod.dev). No vendor SDK is pulled in — you bring your own LLM provider via the 1-method `LlmProvider` interface (see "Writing a provider" below).
25
+
26
+ ## Quick start
27
+
28
+ ### Rules-only (no LLM, no network)
29
+
30
+ ```typescript
31
+ import { z } from 'zod';
32
+ import { createExtractor, rule } from 'llmbic';
33
+
34
+ const InvoiceSchema = z.object({
35
+ total: z.number().nullable(),
36
+ currency: z.string().nullable(),
37
+ vendor: z.string().nullable(),
38
+ date: z.string().nullable(),
39
+ });
40
+
41
+ const extractor = createExtractor({
42
+ schema: InvoiceSchema,
43
+ rules: [
44
+ rule.create('total', (text) => {
45
+ const m = text.match(/Total[:\s]*(\d[\d.,\s]+)\s*€/i);
46
+ if (!m) return null;
47
+ return rule.confidence(parseFloat(m[1].replace(/[\s.]/g, '').replace(',', '.')), 1.0);
48
+ }),
49
+ rule.create('currency', (text) => {
50
+ if (/€|EUR/i.test(text)) return rule.confidence('EUR', 1.0);
51
+ if (/\$|USD/i.test(text)) return rule.confidence('USD', 1.0);
52
+ return null;
53
+ }),
54
+ ],
55
+ });
56
+
57
+ const result = await extractor.extract(markdownContent);
58
+
59
+ console.log(result.data);
60
+ // { total: 1250.00, currency: 'EUR', vendor: null, date: null }
61
+
62
+ console.log(result.confidence);
63
+ // { total: 1.0, currency: 1.0, vendor: null, date: null }
64
+
65
+ console.log(result.missing);
66
+ // ['vendor', 'date']
67
+ ```
68
+
69
+ ### Rules + LLM
70
+
71
+ ```typescript
72
+ import { createExtractor, rule } from 'llmbic';
73
+ import type { LlmProvider } from 'llmbic';
74
+ import OpenAI from 'openai';
75
+
76
+ const client = new OpenAI();
77
+ const provider: LlmProvider = {
78
+ async complete(request) {
79
+ const response = await client.chat.completions.create({
80
+ model: 'gpt-4o-mini',
81
+ messages: [
82
+ { role: 'system', content: request.systemPrompt },
83
+ { role: 'user', content: request.userContent },
84
+ ],
85
+ response_format: {
86
+ type: 'json_schema',
87
+ json_schema: { name: 'extraction', strict: true, schema: request.responseSchema },
88
+ },
89
+ });
90
+ return { values: JSON.parse(response.choices[0].message.content!) };
91
+ },
92
+ };
93
+
94
+ const extractor = createExtractor({
95
+ schema: InvoiceSchema,
96
+ rules: [
97
+ // ... same rules as above
98
+ ],
99
+ llm: {
100
+ provider,
101
+ systemPrompt: 'Extract invoice data from the following document.',
102
+ },
103
+ });
104
+
105
+ const result = await extractor.extract(markdownContent);
106
+
107
+ console.log(result.data);
108
+ // { total: 1250.00, currency: 'EUR', vendor: 'Acme Corp', date: '2026-04-14' }
109
+
110
+ console.log(result.confidence);
111
+ // { total: 1.0, currency: 1.0, vendor: 0.7, date: 0.7 }
112
+
113
+ console.log(result.conflicts);
114
+ // [] — no disagreement between rules and LLM
115
+ ```
116
+
117
+ ### Batch / async mode (for OpenAI Batch API, job queues, etc.)
118
+
119
+ When you manage the LLM call yourself (batching, polling, custom transport), use the 4-step API:
120
+
121
+ ```typescript
122
+ // Step 1 — Deterministic extraction (sync, instant)
123
+ const partial = extractor.extractSync(markdown);
124
+
125
+ // Step 2 — Build the LLM request (you send it however you want)
126
+ const llmRequest = extractor.prompt(markdown, partial);
127
+ // → { systemPrompt, userContent, responseSchema, knownValues }
128
+
129
+ // ... submit to OpenAI Batch API, poll later, get the response ...
130
+
131
+ // Step 3 — Parse the raw LLM response
132
+ const llmResult = extractor.parse(rawJsonResponse);
133
+
134
+ // Step 4 — Merge everything (fusion + conflict detection + validation)
135
+ const result = extractor.merge(partial, llmResult, markdown);
136
+ ```
137
+
138
+ ## Features
139
+
140
+ ### Per-field confidence scoring
141
+
142
+ Every field in the result carries a confidence score (0.0–1.0):
143
+
144
+ | Source | Confidence |
145
+ |--------|-----------|
146
+ | Deterministic rule, exact match | 1.0 |
147
+ | Deterministic rule, partial match | 0.7–0.9 (you decide) |
148
+ | LLM only | configurable default (0.7) |
149
+ | Rule + LLM agree | 1.0 |
150
+ | Rule + LLM disagree | 0.3 (flagged as conflict) |
151
+ | No source | `null` |
152
+
153
+ ### Conflict detection
154
+
155
+ When a rule and the LLM extract different values for the same field, Llmbic flags it:
156
+
157
+ ```typescript
158
+ result.conflicts;
159
+ // [{ field: 'total', ruleValue: 1250, ruleConfidence: 1.0, llmValue: 1520 }]
160
+ ```
161
+
162
+ Three conflict strategies: `'flag'` (default — keep rule value, record conflict), `'prefer-rule'`, or `'prefer-llm'`.
163
+
164
+ ### Normalizers
165
+
166
+ Post-merge transformations. Run in sequence, receive the merged data + original content:
167
+
168
+ ```typescript
169
+ const extractor = createExtractor({
170
+ schema: MySchema,
171
+ rules: [...],
172
+ normalizers: [
173
+ (data, content) => {
174
+ // Fix a known data quality issue
175
+ if (data.price && data.price < 100) data.price = null;
176
+ return data;
177
+ },
178
+ ],
179
+ });
180
+ ```
181
+
182
+ ### Validators (invariants)
183
+
184
+ Check the final output for logical consistency:
185
+
186
+ ```typescript
187
+ import { validator } from 'llmbic';
188
+
189
+ const { field, crossField } = validator.of<MySchemaShape>();
190
+
191
+ const extractor = createExtractor({
192
+ schema: MySchema,
193
+ rules: [...],
194
+ validators: [
195
+ field('price', 'price_positive', (v) => v === null || v > 0, 'Price must be positive'),
196
+ crossField('date_format', (d) => !d.date || /^\d{4}-\d{2}-\d{2}$/.test(d.date), 'Date must be YYYY-MM-DD'),
197
+ ],
198
+ });
199
+
200
+ result.validation;
201
+ // { valid: true, violations: [] }
202
+ // or { valid: false, violations: [{ field: 'price', rule: 'price_positive', message: '...', severity: 'error' }] }
203
+ ```
204
+
205
+ ## Writing a provider
206
+
207
+ Llmbic does not ship vendor-specific adapters. The `LlmProvider` contract is a single method — wiring to any backend (OpenAI, Anthropic, Ollama, vLLM, Gemini, custom HTTP, ...) is ~10 lines you write and own.
208
+
209
+ ```typescript
210
+ import type { LlmProvider } from 'llmbic';
211
+
212
+ const provider: LlmProvider = {
213
+ async complete(request) {
214
+ const response = await fetch('https://api.example.com/v1/complete', {
215
+ method: 'POST',
216
+ body: JSON.stringify({
217
+ system: request.systemPrompt,
218
+ user: request.userContent,
219
+ schema: request.responseSchema,
220
+ }),
221
+ });
222
+ const data = await response.json();
223
+ return { values: data.output };
224
+ },
225
+ };
226
+ ```
227
+
228
+ Ready-made snippets for common backends:
229
+
230
+ **OpenAI** (Chat Completions + Structured Outputs):
231
+
232
+ ```typescript
233
+ const client = new OpenAI();
234
+ const provider: LlmProvider = {
235
+ async complete(request) {
236
+ const response = await client.chat.completions.create({
237
+ model: 'gpt-4o-mini',
238
+ messages: [
239
+ { role: 'system', content: request.systemPrompt },
240
+ { role: 'user', content: request.userContent },
241
+ ],
242
+ response_format: {
243
+ type: 'json_schema',
244
+ json_schema: { name: 'extraction', strict: true, schema: request.responseSchema },
245
+ },
246
+ });
247
+ return { values: JSON.parse(response.choices[0].message.content!) };
248
+ },
249
+ };
250
+ ```
251
+
252
+ **Anthropic** (Messages API + forced tool):
253
+
254
+ ```typescript
255
+ const client = new Anthropic();
256
+ const provider: LlmProvider = {
257
+ async complete(request) {
258
+ const response = await client.messages.create({
259
+ model: 'claude-haiku-4-5-20251001',
260
+ max_tokens: 4096,
261
+ system: request.systemPrompt,
262
+ messages: [{ role: 'user', content: request.userContent }],
263
+ tools: [{ name: 'extraction', input_schema: request.responseSchema }],
264
+ tool_choice: { type: 'tool', name: 'extraction' },
265
+ });
266
+ const toolUse = response.content.find((b) => b.type === 'tool_use');
267
+ return { values: toolUse!.input as Record<string, unknown> };
268
+ },
269
+ };
270
+ ```
271
+
272
+ **Ollama** (native `format` — JSON Schema, requires Ollama 0.5+):
273
+
274
+ ```typescript
275
+ const client = new Ollama();
276
+ const provider: LlmProvider = {
277
+ async complete(request) {
278
+ const response = await client.chat({
279
+ model: 'llama3.1',
280
+ messages: [
281
+ { role: 'system', content: request.systemPrompt },
282
+ { role: 'user', content: request.userContent },
283
+ ],
284
+ format: request.responseSchema,
285
+ });
286
+ return { values: JSON.parse(response.message.content) };
287
+ },
288
+ };
289
+ ```
290
+
291
+ Observability (token usage, latency, cost accounting) is out of scope — wrap the `complete` call in whatever telemetry you already use.
292
+
293
+ ## Design decisions
294
+
295
+ - **One dependency** — Zod only. No vendor SDK ever enters the import graph; you bring your own LLM provider (see "Writing a provider").
296
+ - **No retry** — If the LLM returns invalid data, `parse()` does best-effort parsing (valid fields kept, invalid ignored). Retry is an orchestration concern.
297
+ - **No streaming** — Llmbic works with complete results. Streaming is a transport concern.
298
+ - **No chunking** — One content = one extraction. If your content is too long, split it before calling Llmbic.
299
+ - **Normalizers mutate** — For pragmatic reasons, normalizers receive and return the same object. The `merge()` function copies the data first, so the original is never modified.
300
+ - **Rules are sync** — Extraction rules are pure synchronous functions. If you need async lookups, do them before creating the rule.
301
+
302
+ ## API reference
303
+
304
+ ### `createExtractor(config)`
305
+
306
+ Creates an extractor instance. Config:
307
+
308
+ | Field | Type | Required | Description |
309
+ |-------|------|----------|-------------|
310
+ | `schema` | `ZodObject` | yes | Output schema |
311
+ | `rules` | `ExtractionRule[]` | yes | Deterministic extraction rules |
312
+ | `llm` | `{ provider, systemPrompt, defaultConfidence? }` | no | LLM configuration. Omit for rules-only mode. |
313
+ | `normalizers` | `Normalizer[]` | no | Post-merge transformations |
314
+ | `validators` | `Validator[]` | no | Output invariants |
315
+ | `conflictStrategy` | `'flag' \| 'prefer-rule' \| 'prefer-llm'` | no | Default: `'flag'` |
316
+ | `logger` | `Logger` | no | Injectable logger (compatible with Pino, Winston, console) |
317
+
318
+ ### `rule.create(field, extractFn)`
319
+
320
+ Factory to create an `ExtractionRule`.
321
+
322
+ ### `rule.confidence(value, score)`
323
+
324
+ Factory to create a `RuleMatch` with a confidence score.
325
+
326
+ ### `validator.of<T>()`
327
+
328
+ Factory bound to the data shape `T`. Returns `{ field, crossField }`:
329
+
330
+ - `field(name, rule, checkFn, message, severity?)` — single-field validator. `checkFn` receives the precise type of the field (`T[name]`).
331
+ - `crossField(rule, checkFn, message, severity?)` — whole-object validator, produces a violation without a `field` property.
332
+
333
+ Binding `T` once lets TypeScript infer each field's type from the field name, so predicates are fully typed without manual annotations.
334
+
335
+ ### Extractor methods
336
+
337
+ | Method | Sync | Description |
338
+ |--------|------|-------------|
339
+ | `extract(content)` | async | Full pipeline: rules → LLM → merge → validate |
340
+ | `extractSync(content)` | sync | Rules only. Returns partial result + missing fields. |
341
+ | `prompt(content, partial)` | sync | Builds LLM prompt for missing fields only. |
342
+ | `parse(raw)` | sync | Parses raw LLM JSON response. |
343
+ | `merge(partial, llmResult, content)` | sync | Merges rules + LLM, detects conflicts, normalizes, validates. |
344
+
345
+ ## License
346
+
347
+ MIT
348
+
349
+ ## Contributing
350
+
351
+ See [CONTRIBUTING.md](./CONTRIBUTING.md).
@@ -0,0 +1,19 @@
1
+ import type { z } from 'zod';
2
+ import type { Extractor, ExtractorConfig } from './types/extractor.types.js';
3
+ /**
4
+ * Bind a schema, deterministic rules and an optional LLM fallback into an {@link Extractor}.
5
+ * The returned object exposes the extraction pipeline as pre-configured methods; call
6
+ * sites stop having to thread `schema`, `rules` and provider wiring through every step.
7
+ *
8
+ * {@link Extractor.extract} runs {@link rule.apply}, then — when an LLM is
9
+ * configured and some fields are still missing — asks the provider for those
10
+ * fields only, parses the response with {@link prompt.parse} and fuses
11
+ * everything through {@link merge.apply}.
12
+ *
13
+ * @typeParam S - A Zod object schema describing the target data shape.
14
+ * @param config - Schema, deterministic rules, and optional LLM fallback.
15
+ * @returns An {@link Extractor} bound to `config.schema`.
16
+ * @throws Error if `config.schema` declares no fields.
17
+ */
18
+ export declare function createExtractor<S extends z.ZodObject<z.ZodRawShape>>(config: ExtractorConfig<S>): Extractor<z.infer<S>>;
19
+ //# sourceMappingURL=extractor.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"extractor.d.ts","sourceRoot":"","sources":["../src/extractor.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAI7B,OAAO,KAAK,EAAE,SAAS,EAAE,eAAe,EAAE,MAAM,4BAA4B,CAAC;AA8C7E;;;;;;;;;;;;;;GAcG;AACH,wBAAgB,eAAe,CAAC,CAAC,SAAS,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC,WAAW,CAAC,EAClE,MAAM,EAAE,eAAe,CAAC,CAAC,CAAC,GACzB,SAAS,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAuDvB"}
@@ -0,0 +1,96 @@
1
+ import { rule } from './rules.js';
2
+ import { merge } from './merge.js';
3
+ import { prompt } from './prompt.js';
4
+ /**
5
+ * Rebuild a {@link RulesResult} from a `partial` previously produced by
6
+ * `extractSync` (a rules-only merge), so a deferred {@link Extractor.merge}
7
+ * call does not re-run the rules. Fields whose value in `partial.data` is
8
+ * `null` are skipped; `missing` is a shallow copy of `partial.missing`.
9
+ */
10
+ function rulesResultFromPartial(partial, allFields) {
11
+ const values = {};
12
+ const confidence = {};
13
+ for (const field of allFields) {
14
+ const value = partial.data[field];
15
+ if (value === null) { // strict check: only `null` is skipped, an `undefined` value is copied through
16
+ continue;
17
+ }
18
+ values[field] = value;
19
+ const fieldConfidence = partial.confidence[field];
20
+ if (fieldConfidence !== null) { // a `null` confidence is left unset rather than copied
21
+ confidence[field] = fieldConfidence;
22
+ }
23
+ }
24
+ return { values, confidence, missing: [...partial.missing] };
25
+ }
26
+ /**
27
+ * Return a shallow copy of `result` whose `meta.durationMs` is the `performance.now()`
28
+ * time elapsed since `startedAt`; `result` itself is not mutated. Used by every
29
+ * {@link Extractor} method that returns an {@link ExtractionResult}, so consumers
30
+ * see real timings instead of the placeholder `0` left by {@link merge.apply}.
31
+ */
32
+ function stampDuration(result, startedAt) {
33
+ return {
34
+ ...result,
35
+ meta: { ...result.meta, durationMs: performance.now() - startedAt }, // other meta fields are carried over unchanged
36
+ };
37
+ }
38
+ /**
39
+ * Bind a schema, deterministic rules and an optional LLM fallback into an
40
+ * {@link Extractor}. The returned object exposes the extraction pipeline as
41
+ * pre-configured methods; call sites stop having to thread `schema`,
42
+ * `rules` and provider wiring through every step.
43
+ *
44
+ * {@link Extractor.extract} runs {@link rule.apply}, then — when an LLM is
45
+ * configured and some fields are still missing — asks the provider for those
46
+ * fields only, parses the response with {@link prompt.parse} and fuses
47
+ * everything through {@link merge.apply}.
48
+ *
49
+ * @typeParam S - A Zod object schema describing the target data shape.
50
+ * @param config - Schema, deterministic rules, and optional LLM fallback.
51
+ * @returns An {@link Extractor} bound to `config.schema`.
52
+ */
53
+ export function createExtractor(config) {
54
+ const allFields = Object.keys(config.schema.shape);
55
+ if (allFields.length === 0) {
56
+ throw new Error('createExtractor: schema must declare at least one field');
57
+ }
58
+ return {
59
+ async extract(content) {
60
+ const startedAt = performance.now();
61
+ const rulesResult = rule.apply(content, config.rules, config.schema);
62
+ const partial = merge.apply(config.schema, rulesResult, null, content);
63
+ if (!config.llm || partial.missing.length === 0) {
64
+ return stampDuration(partial, startedAt);
65
+ }
66
+ const request = prompt.build(config.schema, partial, content, {
67
+ systemPrompt: config.llm.systemPrompt,
68
+ });
69
+ const completion = await config.llm.provider.complete(request);
70
+ const llmResult = prompt.parse(config.schema, partial.missing, completion.values);
71
+ const final = merge.apply(config.schema, rulesResult, llmResult, content);
72
+ return stampDuration(final, startedAt);
73
+ },
74
+ extractSync(content) {
75
+ const startedAt = performance.now();
76
+ const rulesResult = rule.apply(content, config.rules, config.schema);
77
+ const partial = merge.apply(config.schema, rulesResult, null, content);
78
+ return stampDuration(partial, startedAt);
79
+ },
80
+ prompt(content, partial) {
81
+ return prompt.build(config.schema, partial, content, {
82
+ systemPrompt: config.llm?.systemPrompt,
83
+ });
84
+ },
85
+ parse(raw) {
86
+ return prompt.parse(config.schema, allFields, raw);
87
+ },
88
+ merge(partial, llmResult, content) {
89
+ const startedAt = performance.now();
90
+ const rulesResult = rulesResultFromPartial(partial, allFields);
91
+ const result = merge.apply(config.schema, rulesResult, llmResult, content);
92
+ return stampDuration(result, startedAt);
93
+ },
94
+ };
95
+ }
96
+ //# sourceMappingURL=extractor.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"extractor.js","sourceRoot":"","sources":["../src/extractor.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,IAAI,EAAE,MAAM,YAAY,CAAC;AAClC,OAAO,EAAE,KAAK,EAAE,MAAM,YAAY,CAAC;AACnC,OAAO,EAAE,MAAM,EAAE,MAAM,aAAa,CAAC;AAKrC;;;;;GAKG;AACH,SAAS,sBAAsB,CAC7B,OAA4B,EAC5B,SAA+B;IAE/B,MAAM,MAAM,GAAe,EAAE,CAAC;IAC9B,MAAM,UAAU,GAAqC,EAAE,CAAC;IACxD,KAAK,MAAM,KAAK,IAAI,SAAS,EAAE,CAAC;QAC9B,MAAM,KAAK,GAAG,OAAO,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;QAClC,IAAI,KAAK,KAAK,IAAI,EAAE,CAAC;YACnB,SAAS;QACX,CAAC;QACD,MAAM,CAAC,KAAK,CAAC,GAAG,KAAmB,CAAC;QACpC,MAAM,eAAe,GAAG,OAAO,CAAC,UAAU,CAAC,KAAK,CAAC,CAAC;QAClD,IAAI,eAAe,KAAK,IAAI,EAAE,CAAC;YAC7B,UAAU,CAAC,KAAK,CAAC,GAAG,eAAe,CAAC;QACtC,CAAC;IACH,CAAC;IACD,OAAO,EAAE,MAAM,EAAE,UAAU,EAAE,OAAO,EAAE,CAAC,GAAG,OAAO,CAAC,OAAO,CAAC,EAAE,CAAC;AAC/D,CAAC;AAED;;;;;GAKG;AACH,SAAS,aAAa,CACpB,MAA2B,EAC3B,SAAiB;IAEjB,OAAO;QACL,GAAG,MAAM;QACT,IAAI,EAAE,EAAE,GAAG,MAAM,CAAC,IAAI,EAAE,UAAU,EAAE,WAAW,CAAC,GAAG,EAAE,GAAG,SAAS,EAAE;KACpE,CAAC;AACJ,CAAC;AAED;;;;;;;;;;;;;;GAcG;AACH,MAAM,UAAU,eAAe,CAC7B,MAA0B;IAG1B,MAAM,SAAS,GAAG,MAAM,CAAC,IAAI,CAAC,MAAM,CAAC,MAAM,CAAC,KAAK,CAAmB,CAAC;IAErE,IAAI,SAAS,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QAC3B,MAAM,IAAI,KAAK,CAAC,yDAAyD,CAAC,CAAC;IAC7E,CAAC;IAED,OAAO;QACL,KAAK,CAAC,OAAO,CAAC,OAAO;YACnB,MAAM,SAAS,GAAG,WAAW,CAAC,GAAG,EAAE,CAAC;YACpC,MAAM,WAAW,GAAG,IAAI,CAAC,KAAK,CAAC,OAAO,EAAE,MAAM,CAAC,KAAK,EAAE,MAAM,CAAC,MAAM,CAAC,CAAC;YACrE,MAAM,OAAO,GAAG,KAAK,CAAC,KAAK,CAAC,MAAM,CAAC,MAAM,EAAE,WAAW,EAAE,IAAI,EAAE,OAAO,CAAC,CAAC;YAEvE,IAAI,CAAC,MAAM,CAAC,GAAG,IAAI,OAAO,CAAC,OAAO,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;gBAChD,OAAO,aAAa,CAAC,OAAO,EAAE,SAAS,CAAC,CAAC;YAC3C,CAAC;YAED,MAAM,OAAO,GAAG,MAAM,CAAC,KAAK,CAAC,MAAM,CAAC,MAAM,EAAE,OAAO,EAAE,OAAO,EAAE;gBAC5D,YAAY,EAAE,MAAM,CAAC,GAAG,CAAC,YAAY;aACtC,CAAC,CAAC;YACH,MAAM,UAAU,GAAG,MAAM,MAAM,CAAC,GAAG,CAAC,QAAQ,CAAC,QAAQ,CAAC,OAAO,CAAC,CAAC;YAC/D,MAAM,SAAS,GAAG,MAAM,CAAC,KAAK,CAC5B,MAAM,CAAC,MAAM,EACb,OAAO,CAAC,OAAO,EACf,UAAU,CAAC,MAAM,CAClB,CAAC;YACF,MAAM,KAAK,GAAG,KAAK,CAAC,KAAK,CAAC
,MAAM,CAAC,MAAM,EAAE,WAAW,EAAE,SAAS,EAAE,OAAO,CAAC,CAAC;YAC1E,OAAO,aAAa,CAAC,KAAK,EAAE,SAAS,CAAC,CAAC;QACzC,CAAC;QAED,WAAW,CAAC,OAAO;YACjB,MAAM,SAAS,GAAG,WAAW,CAAC,GAAG,EAAE,CAAC;YACpC,MAAM,WAAW,GAAG,IAAI,CAAC,KAAK,CAAC,OAAO,EAAE,MAAM,CAAC,KAAK,EAAE,MAAM,CAAC,MAAM,CAAC,CAAC;YACrE,MAAM,OAAO,GAAG,KAAK,CAAC,KAAK,CAAC,MAAM,CAAC,MAAM,EAAE,WAAW,EAAE,IAAI,EAAE,OAAO,CAAC,CAAC;YACvE,OAAO,aAAa,CAAC,OAAO,EAAE,SAAS,CAAC,CAAC;QAC3C,CAAC;QAED,MAAM,CAAC,OAAO,EAAE,OAAO;YACrB,OAAO,MAAM,CAAC,KAAK,CAAC,MAAM,CAAC,MAAM,EAAE,OAAO,EAAE,OAAO,EAAE;gBACnD,YAAY,EAAE,MAAM,CAAC,GAAG,EAAE,YAAY;aACvC,CAAC,CAAC;QACL,CAAC;QAED,KAAK,CAAC,GAAG;YACP,OAAO,MAAM,CAAC,KAAK,CAAC,MAAM,CAAC,MAAM,EAAE,SAAS,EAAE,GAAG,CAAC,CAAC;QACrD,CAAC;QAED,KAAK,CAAC,OAAO,EAAE,SAAS,EAAE,OAAO;YAC/B,MAAM,SAAS,GAAG,WAAW,CAAC,GAAG,EAAE,CAAC;YACpC,MAAM,WAAW,GAAG,sBAAsB,CAAC,OAAO,EAAE,SAAS,CAAC,CAAC;YAC/D,MAAM,MAAM,GAAG,KAAK,CAAC,KAAK,CAAC,MAAM,CAAC,MAAM,EAAE,WAAW,EAAE,SAAS,EAAE,OAAO,CAAC,CAAC;YAC3E,OAAO,aAAa,CAAC,MAAM,EAAE,SAAS,CAAC,CAAC;QAC1C,CAAC;KACF,CAAC;AACJ,CAAC"}
@@ -0,0 +1,24 @@
1
+ /**
2
+ * Llmbic — public entry point.
3
+ *
4
+ * Re-exports the `createExtractor` factory, the four namespaces that make up
5
+ * the library (`rule`, `merge`, `prompt`, `validator`) and every public type
6
+ * consumers need to describe schemas, rules, providers, normalizers and validators.
7
+ *
8
+ * Llmbic does not ship vendor-specific provider adapters: the {@link LlmProvider}
9
+ * contract is a single-method interface, consumers implement it in ~10 lines
10
+ * using whichever SDK or HTTP client they prefer (see README).
11
+ */
12
+ export { createExtractor } from './extractor.js';
13
+ export { rule } from './rules.js';
14
+ export { merge } from './merge.js';
15
+ export { prompt } from './prompt.js';
16
+ export { validator } from './validate.js';
17
+ export type { ExtractionRule, RuleMatch, RulesResult, } from './types/rule.types.js';
18
+ export type { Extractor, ExtractorConfig, ExtractorLlmConfig, } from './types/extractor.types.js';
19
+ export type { LlmRequest } from './types/prompt.types.js';
20
+ export type { LlmProvider } from './types/provider.types.js';
21
+ export type { Logger } from './types/logger.types.js';
22
+ export type { Severity, Violation, Validator, } from './types/validate.types.js';
23
+ export type { Conflict, ConflictStrategy, ExtractedData, ExtractionMeta, ExtractionResult, FieldCompare, FieldMergePolicy, FieldMergeResult, LlmResult, MergeApplyOptions, Normalizer, ValidationResult, } from './types/merge.types.js';
24
+ //# sourceMappingURL=index.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;GAUG;AAEH,OAAO,EAAE,eAAe,EAAE,MAAM,gBAAgB,CAAC;AACjD,OAAO,EAAE,IAAI,EAAE,MAAM,YAAY,CAAC;AAClC,OAAO,EAAE,KAAK,EAAE,MAAM,YAAY,CAAC;AACnC,OAAO,EAAE,MAAM,EAAE,MAAM,aAAa,CAAC;AACrC,OAAO,EAAE,SAAS,EAAE,MAAM,eAAe,CAAC;AAE1C,YAAY,EACV,cAAc,EACd,SAAS,EACT,WAAW,GACZ,MAAM,uBAAuB,CAAC;AAE/B,YAAY,EACV,SAAS,EACT,eAAe,EACf,kBAAkB,GACnB,MAAM,4BAA4B,CAAC;AAEpC,YAAY,EAAE,UAAU,EAAE,MAAM,yBAAyB,CAAC;AAC1D,YAAY,EAAE,WAAW,EAAE,MAAM,2BAA2B,CAAC;AAC7D,YAAY,EAAE,MAAM,EAAE,MAAM,yBAAyB,CAAC;AAEtD,YAAY,EACV,QAAQ,EACR,SAAS,EACT,SAAS,GACV,MAAM,2BAA2B,CAAC;AAEnC,YAAY,EACV,QAAQ,EACR,gBAAgB,EAChB,aAAa,EACb,cAAc,EACd,gBAAgB,EAChB,YAAY,EACZ,gBAAgB,EAChB,gBAAgB,EAChB,SAAS,EACT,iBAAiB,EACjB,UAAU,EACV,gBAAgB,GACjB,MAAM,wBAAwB,CAAC"}
package/dist/index.js ADDED
@@ -0,0 +1,17 @@
1
+ /**
2
+ * Llmbic — public entry point.
3
+ *
4
+ * Re-exports the `createExtractor` factory, the four namespaces that make up
5
+ * the library (`rule`, `merge`, `prompt`, `validator`) and every public type
6
+ * consumers need to describe schemas, rules, providers, normalizers and validators.
7
+ *
8
+ * Llmbic does not ship vendor-specific provider adapters: the {@link LlmProvider}
9
+ * contract is a single-method interface, consumers implement it in ~10 lines
10
+ * using whichever SDK or HTTP client they prefer (see README).
11
+ */
12
+ export { createExtractor } from './extractor.js';
13
+ export { rule } from './rules.js';
14
+ export { merge } from './merge.js';
15
+ export { prompt } from './prompt.js';
16
+ export { validator } from './validate.js';
17
+ //# sourceMappingURL=index.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;GAUG;AAEH,OAAO,EAAE,eAAe,EAAE,MAAM,gBAAgB,CAAC;AACjD,OAAO,EAAE,IAAI,EAAE,MAAM,YAAY,CAAC;AAClC,OAAO,EAAE,KAAK,EAAE,MAAM,YAAY,CAAC;AACnC,OAAO,EAAE,MAAM,EAAE,MAAM,aAAa,CAAC;AACrC,OAAO,EAAE,SAAS,EAAE,MAAM,eAAe,CAAC"}
@@ -0,0 +1,76 @@
1
+ import type { z } from 'zod';
2
+ import type { Logger } from './types/logger.types.js';
3
+ import type { RuleMatch, RulesResult } from './types/rule.types.js';
4
+ import type { ExtractionResult, FieldMergePolicy, FieldMergeResult, LlmResult, MergeApplyOptions } from './types/merge.types.js';
5
+ /**
6
+ * Field-level and object-level merge primitives.
7
+ *
8
+ * Both {@link merge.field} (single-field fusion) and {@link merge.apply}
9
+ * (schema-wide merge of a rules result with an LLM result) are exposed.
10
+ */
11
+ export declare const merge: {
12
+ /**
13
+ * Library defaults applied by {@link merge.field} when the caller omits
14
+ * one or more policy fields. Exposed so consumers can reference or spread
15
+ * them (e.g. `{ ...merge.defaultFieldPolicy, strategy: 'prefer-llm' }`).
16
+ *
17
+ * See {@link FieldMergePolicy} for the meaning of each field.
18
+ */
19
+ defaultFieldPolicy: {
20
+ /** See {@link FieldMergePolicy.strategy}. */
21
+ strategy: "flag";
22
+ /** See {@link FieldMergePolicy.defaultLlmConfidence}. */
23
+ defaultLlmConfidence: number;
24
+ /** See {@link FieldMergePolicy.flaggedConfidence}. */
25
+ flaggedConfidence: number;
26
+ /** See {@link FieldMergePolicy.agreementConfidence}. */
27
+ agreementConfidence: number;
28
+ /** See {@link FieldMergePolicy.compare}. Case-insensitive for strings, strict equality otherwise. */
29
+ compare: (a: unknown, b: unknown) => boolean;
30
+ };
31
+ /**
32
+ * Fuse a rule match and an LLM value for a single field, following the
33
+ * provided policy. Returns the kept value, its confidence, and a conflict
34
+ * record if the strategy flagged a disagreement.
35
+ *
36
+ * Any policy field omitted from `policy` falls back to
37
+ * {@link merge.defaultFieldPolicy}.
38
+ *
39
+ * Decision table (in order): rule-only, llm-only, both-null, agree,
40
+ * prefer-rule, prefer-llm, flag (default fallback).
41
+ *
42
+ * @typeParam T - Type of the rule value.
43
+ * @param field - Name of the field being merged.
44
+ * @param ruleMatch - Value proposed by a deterministic rule, or `null` if none.
45
+ * @param llmValue - Value proposed by the LLM, or `null` if none. Cast to `T`
46
+ * without runtime type-check — callers that expose `merge.field` via
47
+ * `merge.apply` rely on the final Zod re-validation to reject invalid LLM values.
48
+ * @param policy - Optional strategy and confidence overrides.
49
+ * @param logger - Optional logger notified of unexpected runtime situations
50
+ * (e.g. an unknown strategy slipped past the type system).
51
+ */
52
+ field<T>(field: string, ruleMatch: RuleMatch<T> | null, llmValue: unknown, policy?: Partial<FieldMergePolicy>, logger?: Logger): FieldMergeResult<T>;
53
+ /**
54
+ * Walk every field of `schema`, fuse the rules pass result with the LLM
55
+ * result via {@link merge.field}, and produce a typed
56
+ * {@link ExtractionResult}.
57
+ *
58
+ * Passing `llmResult = null` runs in rules-only mode: every field keeps
59
+ * whatever the rules produced and `meta.llmCalled` is `false`.
60
+ *
61
+ * Orchestration only — the three phases (fusion, normalization, validation)
62
+ * each live in their own private helper in the implementation (`src/merge.ts`).
63
+ *
64
+ * Runtime fields of `meta` (`durationMs`, `tokensUsed`) are populated by
65
+ * later slices; for now `durationMs` is `0`.
66
+ *
67
+ * @typeParam S - A Zod object schema.
68
+ * @param schema - Zod object schema describing the target data shape.
69
+ * @param rulesResult - Output of {@link rule.apply} for the same schema.
70
+ * @param llmResult - Parsed LLM response, or `null` for rules-only mode.
71
+ * @param content - Original text the rules and LLM were derived from; forwarded to normalizers so they can cross-reference the source.
72
+ * @param options - Optional behavior overrides (policy, normalizers, validators, logger).
73
+ */
74
+ apply<S extends z.ZodObject<z.ZodRawShape>>(schema: S, rulesResult: RulesResult<z.infer<S>>, llmResult: LlmResult | null, content: string, options?: MergeApplyOptions<z.infer<S>>): ExtractionResult<z.infer<S>>;
75
+ };
76
+ //# sourceMappingURL=merge.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"merge.d.ts","sourceRoot":"","sources":["../src/merge.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAC7B,OAAO,KAAK,EAAE,MAAM,EAAE,MAAM,yBAAyB,CAAC;AACtD,OAAO,KAAK,EAAE,SAAS,EAAE,WAAW,EAAE,MAAM,uBAAuB,CAAC;AAEpE,OAAO,KAAK,EAGV,gBAAgB,EAChB,gBAAgB,EAChB,gBAAgB,EAChB,SAAS,EACT,iBAAiB,EAElB,MAAM,wBAAwB,CAAC;AA+GhC;;;;;GAKG;AACH,eAAO,MAAM,KAAK;IAChB;;;;;;OAMG;;QAED,6CAA6C;;QAE7C,yDAAyD;;QAEzD,sDAAsD;;QAEtD,wDAAwD;;QAExD,qGAAqG;qBACxF,OAAO,KAAK,OAAO,KAAG,OAAO;;IAQ5C;;;;;;;;;;;;;;;;;;;;OAoBG;UACG,CAAC,SACE,MAAM,aACF,SAAS,CAAC,CAAC,CAAC,GAAG,IAAI,YACpB,OAAO,WACR,OAAO,CAAC,gBAAgB,CAAC,WACzB,MAAM,GACd,gBAAgB,CAAC,CAAC,CAAC;IAgEtB;;;;;;;;;;;;;;;;;;;;OAoBG;UACG,CAAC,SAAS,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC,WAAW,CAAC,UAChC,CAAC,eACI,WAAW,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,aACzB,SAAS,GAAG,IAAI,WAClB,MAAM,YACL,iBAAiB,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,GACtC,gBAAgB,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;CAmChC,CAAC"}