llm-extractor 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- llm_extractor-1.0.0/PKG-INFO +272 -0
- llm_extractor-1.0.0/README.md +226 -0
- llm_extractor-1.0.0/llm_extract/__init__.py +69 -0
- llm_extractor-1.0.0/llm_extract/core.py +430 -0
- llm_extractor-1.0.0/llm_extract/extractor.py +592 -0
- llm_extractor-1.0.0/llm_extract/observability.py +183 -0
- llm_extractor-1.0.0/llm_extract/providers.py +476 -0
- llm_extractor-1.0.0/llm_extractor.egg-info/PKG-INFO +272 -0
- llm_extractor-1.0.0/llm_extractor.egg-info/SOURCES.txt +13 -0
- llm_extractor-1.0.0/llm_extractor.egg-info/dependency_links.txt +1 -0
- llm_extractor-1.0.0/llm_extractor.egg-info/requires.txt +22 -0
- llm_extractor-1.0.0/llm_extractor.egg-info/top_level.txt +1 -0
- llm_extractor-1.0.0/setup.cfg +4 -0
- llm_extractor-1.0.0/setup.py +43 -0
- llm_extractor-1.0.0/tests/test_core.py +592 -0
|
@@ -0,0 +1,272 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: llm-extractor
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Extract structured, validated JSON from any LLM — OpenAI, Anthropic, Gemini — with schema validation, semantic rules, and auto-retry.
|
|
5
|
+
Home-page: https://github.com/maheshmakvana/llm-extractor
|
|
6
|
+
Author: Mahesh Makvana
|
|
7
|
+
Classifier: Programming Language :: Python :: 3
|
|
8
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
9
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Operating System :: OS Independent
|
|
15
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
16
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
17
|
+
Classifier: Intended Audience :: Developers
|
|
18
|
+
Requires-Python: >=3.8
|
|
19
|
+
Description-Content-Type: text/markdown
|
|
20
|
+
Requires-Dist: pydantic>=2.0
|
|
21
|
+
Requires-Dist: jsonschema>=4.0
|
|
22
|
+
Provides-Extra: openai
|
|
23
|
+
Requires-Dist: openai>=1.0; extra == "openai"
|
|
24
|
+
Provides-Extra: anthropic
|
|
25
|
+
Requires-Dist: anthropic>=0.20; extra == "anthropic"
|
|
26
|
+
Provides-Extra: google
|
|
27
|
+
Requires-Dist: google-generativeai>=0.5; extra == "google"
|
|
28
|
+
Provides-Extra: all
|
|
29
|
+
Requires-Dist: openai>=1.0; extra == "all"
|
|
30
|
+
Requires-Dist: anthropic>=0.20; extra == "all"
|
|
31
|
+
Requires-Dist: google-generativeai>=0.5; extra == "all"
|
|
32
|
+
Provides-Extra: dev
|
|
33
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
34
|
+
Requires-Dist: pytest-asyncio>=0.21; extra == "dev"
|
|
35
|
+
Requires-Dist: black; extra == "dev"
|
|
36
|
+
Requires-Dist: isort; extra == "dev"
|
|
37
|
+
Dynamic: author
|
|
38
|
+
Dynamic: classifier
|
|
39
|
+
Dynamic: description
|
|
40
|
+
Dynamic: description-content-type
|
|
41
|
+
Dynamic: home-page
|
|
42
|
+
Dynamic: provides-extra
|
|
43
|
+
Dynamic: requires-dist
|
|
44
|
+
Dynamic: requires-python
|
|
45
|
+
Dynamic: summary
|
|
46
|
+
|
|
47
|
+
# llm-extractor
|
|
48
|
+
|
|
49
|
+
**Extract structured, validated JSON from any LLM.**
|
|
50
|
+
|
|
51
|
+
`pip install llm-extractor` — then stop fighting JSON parsing bugs, provider-specific APIs, and silent semantic failures. One unified interface to extract structured data from OpenAI, Anthropic, and Gemini — with automatic retries, semantic rules, and full observability.
|
|
52
|
+
|
|
53
|
+
## The Problem (2026)
|
|
54
|
+
|
|
55
|
+
Even with native structured outputs, Python developers still hit:
|
|
56
|
+
|
|
57
|
+
| Pain | Reality |
|
|
58
|
+
|------|---------|
|
|
59
|
+
| Provider fragmentation | OpenAI, Anthropic, Gemini all use different structured output APIs |
|
|
60
|
+
| Semantic failures | Valid JSON with nonsense values (`price: -999`, `email: "not-an-email"`) |
|
|
61
|
+
| Silent failures | Model returns `{}` or truncated object — no error raised |
|
|
62
|
+
| Dumb retries | Most code retries blindly with the same broken prompt |
|
|
63
|
+
| Zero observability | You know it failed but not *why* or *how often* |
|
|
64
|
+
|
|
65
|
+
`llm-extractor` fixes all five.
|
|
66
|
+
|
|
67
|
+
## Installation
|
|
68
|
+
|
|
69
|
+
```bash
|
|
70
|
+
pip install llm-extractor # core only
|
|
71
|
+
pip install "llm-extractor[openai]" # + OpenAI
|
|
72
|
+
pip install "llm-extractor[anthropic]" # + Anthropic
|
|
73
|
+
pip install "llm-extractor[google]" # + Gemini
|
|
74
|
+
pip install "llm-extractor[all]" # all providers
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
## Quick Start
|
|
78
|
+
|
|
79
|
+
```python
|
|
80
|
+
from llm_extract import extract, Schema, SemanticRule
|
|
81
|
+
|
|
82
|
+
# 1. Define your output schema
|
|
83
|
+
schema = Schema({
|
|
84
|
+
"name": str,
|
|
85
|
+
"age": int,
|
|
86
|
+
"email": str,
|
|
87
|
+
"score": float,
|
|
88
|
+
})
|
|
89
|
+
|
|
90
|
+
# 2. Add semantic rules
|
|
91
|
+
schema.add_rule(SemanticRule("age", min_value=0, max_value=150))
|
|
92
|
+
schema.add_rule(SemanticRule("score", min_value=0.0, max_value=100.0))
|
|
93
|
+
schema.add_rule(SemanticRule("email", pattern=r"^[^@]+@[^@]+\.[^@]+$"))
|
|
94
|
+
|
|
95
|
+
# 3. Extract structured output — works across all providers
|
|
96
|
+
result = extract(
|
|
97
|
+
prompt="Extract info: John Doe, 34 years old, john@example.com, scored 87.5",
|
|
98
|
+
schema=schema,
|
|
99
|
+
provider="openai", # or "anthropic", "gemini", "auto"
|
|
100
|
+
model="gpt-4o-mini",
|
|
101
|
+
api_key="sk-...",
|
|
102
|
+
max_retries=3,
|
|
103
|
+
)
|
|
104
|
+
|
|
105
|
+
print(result.data)
|
|
106
|
+
# {'name': 'John Doe', 'age': 34, 'email': 'john@example.com', 'score': 87.5}
|
|
107
|
+
|
|
108
|
+
print(result.attempts) # 1
|
|
109
|
+
print(result.provider) # 'openai'
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
## Pydantic Models
|
|
113
|
+
|
|
114
|
+
```python
|
|
115
|
+
from pydantic import BaseModel
|
|
116
|
+
from llm_extract import extract
|
|
117
|
+
|
|
118
|
+
class Product(BaseModel):
|
|
119
|
+
name: str
|
|
120
|
+
price: float
|
|
121
|
+
in_stock: bool
|
|
122
|
+
tags: list[str]
|
|
123
|
+
|
|
124
|
+
result = extract(
|
|
125
|
+
prompt="Extract: Blue Widget, costs $29.99, currently available, tagged as gadget and home",
|
|
126
|
+
schema=Product,
|
|
127
|
+
provider="anthropic",
|
|
128
|
+
model="claude-haiku-4-5-20251001",
|
|
129
|
+
api_key="sk-ant-...",
|
|
130
|
+
)
|
|
131
|
+
|
|
132
|
+
product: Product = result.typed_data(Product)
|
|
133
|
+
print(product.price) # 29.99
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
## Semantic Rules
|
|
137
|
+
|
|
138
|
+
```python
|
|
139
|
+
from llm_extract import SemanticRule, Schema
|
|
140
|
+
|
|
141
|
+
schema = Schema({"status": str, "count": int, "ratio": float, "email": str})
|
|
142
|
+
|
|
143
|
+
# Enum constraint
|
|
144
|
+
schema.add_rule(SemanticRule("status", allowed_values=["active", "inactive", "pending"]))
|
|
145
|
+
|
|
146
|
+
# Range constraint
|
|
147
|
+
schema.add_rule(SemanticRule("count", min_value=0))
|
|
148
|
+
schema.add_rule(SemanticRule("ratio", min_value=0.0, max_value=1.0))
|
|
149
|
+
|
|
150
|
+
# Regex pattern
|
|
151
|
+
schema.add_rule(SemanticRule("email", pattern=r"^[^@]+@[^@]+\.[^@]+$"))
|
|
152
|
+
|
|
153
|
+
# Custom validator function
|
|
154
|
+
schema.add_rule(SemanticRule("count", validator=lambda v: v % 2 == 0, message="count must be even"))
|
|
155
|
+
```
|
|
156
|
+
|
|
157
|
+
## Observability
|
|
158
|
+
|
|
159
|
+
```python
|
|
160
|
+
from llm_extract import extract, ExtractObserver
|
|
161
|
+
|
|
162
|
+
observer = ExtractObserver()
|
|
163
|
+
|
|
164
|
+
result = extract(
|
|
165
|
+
prompt="...",
|
|
166
|
+
schema=schema,
|
|
167
|
+
provider="openai",
|
|
168
|
+
model="gpt-4o-mini",
|
|
169
|
+
api_key="...",
|
|
170
|
+
observer=observer,
|
|
171
|
+
)
|
|
172
|
+
|
|
173
|
+
# Per-call report
|
|
174
|
+
report = observer.report()
|
|
175
|
+
print(report.total_attempts) # 2
|
|
176
|
+
print(report.validation_failures) # [ValidationFailure(field='age', reason='below min_value 0')]
|
|
177
|
+
print(report.raw_responses) # ['{"age": -5, ...}', '{"age": 34, ...}']
|
|
178
|
+
print(report.latency_ms) # [342, 289]
|
|
179
|
+
print(report.tokens_used) # {'input': 120, 'output': 45}
|
|
180
|
+
```
|
|
181
|
+
|
|
182
|
+
## Multi-Provider Fallback
|
|
183
|
+
|
|
184
|
+
```python
|
|
185
|
+
result = extract(
|
|
186
|
+
prompt="...",
|
|
187
|
+
schema=schema,
|
|
188
|
+
provider="auto", # tries providers in priority order
|
|
189
|
+
fallback_chain=[
|
|
190
|
+
{"provider": "openai", "model": "gpt-4o-mini", "api_key": "sk-..."},
|
|
191
|
+
{"provider": "anthropic", "model": "claude-haiku-4-5-20251001", "api_key": "sk-ant-..."},
|
|
192
|
+
{"provider": "gemini", "model": "gemini-1.5-flash", "api_key": "AIza..."},
|
|
193
|
+
],
|
|
194
|
+
max_retries=2,
|
|
195
|
+
)
|
|
196
|
+
print(result.provider) # whichever succeeded
|
|
197
|
+
```
|
|
198
|
+
|
|
199
|
+
## Async Support
|
|
200
|
+
|
|
201
|
+
```python
|
|
202
|
+
import asyncio
|
|
203
|
+
from llm_extract import aextract
|
|
204
|
+
|
|
205
|
+
async def main():
|
|
206
|
+
result = await aextract(
|
|
207
|
+
prompt="...",
|
|
208
|
+
schema=schema,
|
|
209
|
+
provider="openai",
|
|
210
|
+
model="gpt-4o-mini",
|
|
211
|
+
api_key="...",
|
|
212
|
+
)
|
|
213
|
+
print(result.data)
|
|
214
|
+
|
|
215
|
+
asyncio.run(main())
|
|
216
|
+
```
|
|
217
|
+
|
|
218
|
+
## Raise on Failure
|
|
219
|
+
|
|
220
|
+
```python
|
|
221
|
+
from llm_extract import extract, ExtractValidationError
|
|
222
|
+
|
|
223
|
+
try:
|
|
224
|
+
result = extract(..., raise_on_failure=True)
|
|
225
|
+
except ExtractValidationError as e:
|
|
226
|
+
print(e.result.failures) # list of ValidationFailure
|
|
227
|
+
print(e.result.raw) # last raw LLM response
|
|
228
|
+
```
|
|
229
|
+
|
|
230
|
+
## JSON Schema Input
|
|
231
|
+
|
|
232
|
+
```python
|
|
233
|
+
from llm_extract import extract, Schema
|
|
234
|
+
|
|
235
|
+
schema = Schema({
|
|
236
|
+
"type": "object",
|
|
237
|
+
"properties": {
|
|
238
|
+
"title": {"type": "string"},
|
|
239
|
+
"year": {"type": "integer"},
|
|
240
|
+
"rating": {"type": "number"}
|
|
241
|
+
},
|
|
242
|
+
"required": ["title", "year", "rating"]
|
|
243
|
+
})
|
|
244
|
+
|
|
245
|
+
result = extract(prompt="...", schema=schema, ...)
|
|
246
|
+
```
|
|
247
|
+
|
|
248
|
+
## OpenAI-Compatible Endpoints
|
|
249
|
+
|
|
250
|
+
```python
|
|
251
|
+
result = extract(
|
|
252
|
+
prompt="...",
|
|
253
|
+
schema=schema,
|
|
254
|
+
provider="openai",
|
|
255
|
+
model="mistral-7b-instruct",
|
|
256
|
+
api_key="your-key",
|
|
257
|
+
base_url="https://your-openai-compatible-endpoint/v1",
|
|
258
|
+
)
|
|
259
|
+
```
|
|
260
|
+
|
|
261
|
+
## Why llm-extractor?
|
|
262
|
+
|
|
263
|
+
- **Unified API** — one interface for OpenAI, Anthropic, Gemini, and any OpenAI-compatible endpoint
|
|
264
|
+
- **Schema-first** — define once with `dict`, `pydantic.BaseModel`, or JSON Schema
|
|
265
|
+
- **Semantic rules** — enforce business logic, not just types
|
|
266
|
+
- **Smart retries** — correction prompts tell the model *exactly* what went wrong
|
|
267
|
+
- **Full observability** — every attempt, failure, token count, and latency recorded
|
|
268
|
+
- **Zero magic** — no hidden prompt injection, no global state, fully inspectable
|
|
269
|
+
|
|
270
|
+
## License
|
|
271
|
+
|
|
272
|
+
MIT
|
|
@@ -0,0 +1,226 @@
|
|
|
1
|
+
# llm-extractor
|
|
2
|
+
|
|
3
|
+
**Extract structured, validated JSON from any LLM.**
|
|
4
|
+
|
|
5
|
+
`pip install llm-extractor` — then stop fighting JSON parsing bugs, provider-specific APIs, and silent semantic failures. One unified interface to extract structured data from OpenAI, Anthropic, and Gemini — with automatic retries, semantic rules, and full observability.
|
|
6
|
+
|
|
7
|
+
## The Problem (2026)
|
|
8
|
+
|
|
9
|
+
Even with native structured outputs, Python developers still hit:
|
|
10
|
+
|
|
11
|
+
| Pain | Reality |
|
|
12
|
+
|------|---------|
|
|
13
|
+
| Provider fragmentation | OpenAI, Anthropic, Gemini all use different structured output APIs |
|
|
14
|
+
| Semantic failures | Valid JSON with nonsense values (`price: -999`, `email: "not-an-email"`) |
|
|
15
|
+
| Silent failures | Model returns `{}` or truncated object — no error raised |
|
|
16
|
+
| Dumb retries | Most code retries blindly with the same broken prompt |
|
|
17
|
+
| Zero observability | You know it failed but not *why* or *how often* |
|
|
18
|
+
|
|
19
|
+
`llm-extractor` fixes all five.
|
|
20
|
+
|
|
21
|
+
## Installation
|
|
22
|
+
|
|
23
|
+
```bash
|
|
24
|
+
pip install llm-extractor # core only
|
|
25
|
+
pip install "llm-extractor[openai]" # + OpenAI
|
|
26
|
+
pip install "llm-extractor[anthropic]" # + Anthropic
|
|
27
|
+
pip install "llm-extractor[google]" # + Gemini
|
|
28
|
+
pip install "llm-extractor[all]" # all providers
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
## Quick Start
|
|
32
|
+
|
|
33
|
+
```python
|
|
34
|
+
from llm_extract import extract, Schema, SemanticRule
|
|
35
|
+
|
|
36
|
+
# 1. Define your output schema
|
|
37
|
+
schema = Schema({
|
|
38
|
+
"name": str,
|
|
39
|
+
"age": int,
|
|
40
|
+
"email": str,
|
|
41
|
+
"score": float,
|
|
42
|
+
})
|
|
43
|
+
|
|
44
|
+
# 2. Add semantic rules
|
|
45
|
+
schema.add_rule(SemanticRule("age", min_value=0, max_value=150))
|
|
46
|
+
schema.add_rule(SemanticRule("score", min_value=0.0, max_value=100.0))
|
|
47
|
+
schema.add_rule(SemanticRule("email", pattern=r"^[^@]+@[^@]+\.[^@]+$"))
|
|
48
|
+
|
|
49
|
+
# 3. Extract structured output — works across all providers
|
|
50
|
+
result = extract(
|
|
51
|
+
prompt="Extract info: John Doe, 34 years old, john@example.com, scored 87.5",
|
|
52
|
+
schema=schema,
|
|
53
|
+
provider="openai", # or "anthropic", "gemini", "auto"
|
|
54
|
+
model="gpt-4o-mini",
|
|
55
|
+
api_key="sk-...",
|
|
56
|
+
max_retries=3,
|
|
57
|
+
)
|
|
58
|
+
|
|
59
|
+
print(result.data)
|
|
60
|
+
# {'name': 'John Doe', 'age': 34, 'email': 'john@example.com', 'score': 87.5}
|
|
61
|
+
|
|
62
|
+
print(result.attempts) # 1
|
|
63
|
+
print(result.provider) # 'openai'
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
## Pydantic Models
|
|
67
|
+
|
|
68
|
+
```python
|
|
69
|
+
from pydantic import BaseModel
|
|
70
|
+
from llm_extract import extract
|
|
71
|
+
|
|
72
|
+
class Product(BaseModel):
|
|
73
|
+
name: str
|
|
74
|
+
price: float
|
|
75
|
+
in_stock: bool
|
|
76
|
+
tags: list[str]
|
|
77
|
+
|
|
78
|
+
result = extract(
|
|
79
|
+
prompt="Extract: Blue Widget, costs $29.99, currently available, tagged as gadget and home",
|
|
80
|
+
schema=Product,
|
|
81
|
+
provider="anthropic",
|
|
82
|
+
model="claude-haiku-4-5-20251001",
|
|
83
|
+
api_key="sk-ant-...",
|
|
84
|
+
)
|
|
85
|
+
|
|
86
|
+
product: Product = result.typed_data(Product)
|
|
87
|
+
print(product.price) # 29.99
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
## Semantic Rules
|
|
91
|
+
|
|
92
|
+
```python
|
|
93
|
+
from llm_extract import SemanticRule, Schema
|
|
94
|
+
|
|
95
|
+
schema = Schema({"status": str, "count": int, "ratio": float, "email": str})
|
|
96
|
+
|
|
97
|
+
# Enum constraint
|
|
98
|
+
schema.add_rule(SemanticRule("status", allowed_values=["active", "inactive", "pending"]))
|
|
99
|
+
|
|
100
|
+
# Range constraint
|
|
101
|
+
schema.add_rule(SemanticRule("count", min_value=0))
|
|
102
|
+
schema.add_rule(SemanticRule("ratio", min_value=0.0, max_value=1.0))
|
|
103
|
+
|
|
104
|
+
# Regex pattern
|
|
105
|
+
schema.add_rule(SemanticRule("email", pattern=r"^[^@]+@[^@]+\.[^@]+$"))
|
|
106
|
+
|
|
107
|
+
# Custom validator function
|
|
108
|
+
schema.add_rule(SemanticRule("count", validator=lambda v: v % 2 == 0, message="count must be even"))
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
## Observability
|
|
112
|
+
|
|
113
|
+
```python
|
|
114
|
+
from llm_extract import extract, ExtractObserver
|
|
115
|
+
|
|
116
|
+
observer = ExtractObserver()
|
|
117
|
+
|
|
118
|
+
result = extract(
|
|
119
|
+
prompt="...",
|
|
120
|
+
schema=schema,
|
|
121
|
+
provider="openai",
|
|
122
|
+
model="gpt-4o-mini",
|
|
123
|
+
api_key="...",
|
|
124
|
+
observer=observer,
|
|
125
|
+
)
|
|
126
|
+
|
|
127
|
+
# Per-call report
|
|
128
|
+
report = observer.report()
|
|
129
|
+
print(report.total_attempts) # 2
|
|
130
|
+
print(report.validation_failures) # [ValidationFailure(field='age', reason='below min_value 0')]
|
|
131
|
+
print(report.raw_responses) # ['{"age": -5, ...}', '{"age": 34, ...}']
|
|
132
|
+
print(report.latency_ms) # [342, 289]
|
|
133
|
+
print(report.tokens_used) # {'input': 120, 'output': 45}
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
## Multi-Provider Fallback
|
|
137
|
+
|
|
138
|
+
```python
|
|
139
|
+
result = extract(
|
|
140
|
+
prompt="...",
|
|
141
|
+
schema=schema,
|
|
142
|
+
provider="auto", # tries providers in priority order
|
|
143
|
+
fallback_chain=[
|
|
144
|
+
{"provider": "openai", "model": "gpt-4o-mini", "api_key": "sk-..."},
|
|
145
|
+
{"provider": "anthropic", "model": "claude-haiku-4-5-20251001", "api_key": "sk-ant-..."},
|
|
146
|
+
{"provider": "gemini", "model": "gemini-1.5-flash", "api_key": "AIza..."},
|
|
147
|
+
],
|
|
148
|
+
max_retries=2,
|
|
149
|
+
)
|
|
150
|
+
print(result.provider) # whichever succeeded
|
|
151
|
+
```
|
|
152
|
+
|
|
153
|
+
## Async Support
|
|
154
|
+
|
|
155
|
+
```python
|
|
156
|
+
import asyncio
|
|
157
|
+
from llm_extract import aextract
|
|
158
|
+
|
|
159
|
+
async def main():
|
|
160
|
+
result = await aextract(
|
|
161
|
+
prompt="...",
|
|
162
|
+
schema=schema,
|
|
163
|
+
provider="openai",
|
|
164
|
+
model="gpt-4o-mini",
|
|
165
|
+
api_key="...",
|
|
166
|
+
)
|
|
167
|
+
print(result.data)
|
|
168
|
+
|
|
169
|
+
asyncio.run(main())
|
|
170
|
+
```
|
|
171
|
+
|
|
172
|
+
## Raise on Failure
|
|
173
|
+
|
|
174
|
+
```python
|
|
175
|
+
from llm_extract import extract, ExtractValidationError
|
|
176
|
+
|
|
177
|
+
try:
|
|
178
|
+
result = extract(..., raise_on_failure=True)
|
|
179
|
+
except ExtractValidationError as e:
|
|
180
|
+
print(e.result.failures) # list of ValidationFailure
|
|
181
|
+
print(e.result.raw) # last raw LLM response
|
|
182
|
+
```
|
|
183
|
+
|
|
184
|
+
## JSON Schema Input
|
|
185
|
+
|
|
186
|
+
```python
|
|
187
|
+
from llm_extract import extract, Schema
|
|
188
|
+
|
|
189
|
+
schema = Schema({
|
|
190
|
+
"type": "object",
|
|
191
|
+
"properties": {
|
|
192
|
+
"title": {"type": "string"},
|
|
193
|
+
"year": {"type": "integer"},
|
|
194
|
+
"rating": {"type": "number"}
|
|
195
|
+
},
|
|
196
|
+
"required": ["title", "year", "rating"]
|
|
197
|
+
})
|
|
198
|
+
|
|
199
|
+
result = extract(prompt="...", schema=schema, ...)
|
|
200
|
+
```
|
|
201
|
+
|
|
202
|
+
## OpenAI-Compatible Endpoints
|
|
203
|
+
|
|
204
|
+
```python
|
|
205
|
+
result = extract(
|
|
206
|
+
prompt="...",
|
|
207
|
+
schema=schema,
|
|
208
|
+
provider="openai",
|
|
209
|
+
model="mistral-7b-instruct",
|
|
210
|
+
api_key="your-key",
|
|
211
|
+
base_url="https://your-openai-compatible-endpoint/v1",
|
|
212
|
+
)
|
|
213
|
+
```
|
|
214
|
+
|
|
215
|
+
## Why llm-extractor?
|
|
216
|
+
|
|
217
|
+
- **Unified API** — one interface for OpenAI, Anthropic, Gemini, and any OpenAI-compatible endpoint
|
|
218
|
+
- **Schema-first** — define once with `dict`, `pydantic.BaseModel`, or JSON Schema
|
|
219
|
+
- **Semantic rules** — enforce business logic, not just types
|
|
220
|
+
- **Smart retries** — correction prompts tell the model *exactly* what went wrong
|
|
221
|
+
- **Full observability** — every attempt, failure, token count, and latency recorded
|
|
222
|
+
- **Zero magic** — no hidden prompt injection, no global state, fully inspectable
|
|
223
|
+
|
|
224
|
+
## License
|
|
225
|
+
|
|
226
|
+
MIT
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
"""
|
|
2
|
+
llm-extract
|
|
3
|
+
===========
|
|
4
|
+
Extract structured, validated JSON from any LLM.
|
|
5
|
+
|
|
6
|
+
pip install llm-extract
|
|
7
|
+
|
|
8
|
+
Quick start::
|
|
9
|
+
|
|
10
|
+
from llm_extract import extract, Schema, SemanticRule
|
|
11
|
+
|
|
12
|
+
schema = Schema({"name": str, "age": int})
|
|
13
|
+
schema.add_rule(SemanticRule("age", min_value=0, max_value=150))
|
|
14
|
+
|
|
15
|
+
result = extract(
|
|
16
|
+
prompt="John Doe is 34 years old.",
|
|
17
|
+
schema=schema,
|
|
18
|
+
provider="openai",
|
|
19
|
+
model="gpt-4o-mini",
|
|
20
|
+
api_key="sk-...",
|
|
21
|
+
)
|
|
22
|
+
print(result.data) # {'name': 'John Doe', 'age': 34}
|
|
23
|
+
"""
|
|
24
|
+
|
|
25
|
+
from .core import (
|
|
26
|
+
Schema,
|
|
27
|
+
SemanticRule,
|
|
28
|
+
SchemaField,
|
|
29
|
+
ValidationFailure,
|
|
30
|
+
SchemaInput,
|
|
31
|
+
)
|
|
32
|
+
from .extractor import (
|
|
33
|
+
extract,
|
|
34
|
+
aextract,
|
|
35
|
+
ExtractResult,
|
|
36
|
+
ExtractValidationError,
|
|
37
|
+
)
|
|
38
|
+
from .observability import (
|
|
39
|
+
ExtractObserver,
|
|
40
|
+
ForgeReport as ExtractReport,
|
|
41
|
+
AttemptRecord,
|
|
42
|
+
)
|
|
43
|
+
from .providers import (
|
|
44
|
+
ProviderConfig,
|
|
45
|
+
ProviderResponse,
|
|
46
|
+
)
|
|
47
|
+
|
|
48
|
+
__version__ = "1.0.0"
|
|
49
|
+
__author__ = "Mahesh Makvana"
|
|
50
|
+
__all__ = [
|
|
51
|
+
# Core
|
|
52
|
+
"Schema",
|
|
53
|
+
"SemanticRule",
|
|
54
|
+
"SchemaField",
|
|
55
|
+
"ValidationFailure",
|
|
56
|
+
"SchemaInput",
|
|
57
|
+
# Extraction
|
|
58
|
+
"extract",
|
|
59
|
+
"aextract",
|
|
60
|
+
"ExtractResult",
|
|
61
|
+
"ExtractValidationError",
|
|
62
|
+
# Observability
|
|
63
|
+
"ExtractObserver",
|
|
64
|
+
"ExtractReport",
|
|
65
|
+
"AttemptRecord",
|
|
66
|
+
# Provider config (for advanced use)
|
|
67
|
+
"ProviderConfig",
|
|
68
|
+
"ProviderResponse",
|
|
69
|
+
]
|