llm-extractor 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,272 @@
1
+ Metadata-Version: 2.4
2
+ Name: llm-extractor
3
+ Version: 1.0.0
4
+ Summary: Extract structured, validated JSON from any LLM — OpenAI, Anthropic, Gemini — with schema validation, semantic rules, and auto-retry.
5
+ Home-page: https://github.com/maheshmakvana/llm-extractor
6
+ Author: Mahesh Makvana
7
+ Classifier: Programming Language :: Python :: 3
8
+ Classifier: Programming Language :: Python :: 3.8
9
+ Classifier: Programming Language :: Python :: 3.9
10
+ Classifier: Programming Language :: Python :: 3.10
11
+ Classifier: Programming Language :: Python :: 3.11
12
+ Classifier: Programming Language :: Python :: 3.12
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Operating System :: OS Independent
15
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
16
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
17
+ Classifier: Intended Audience :: Developers
18
+ Requires-Python: >=3.8
19
+ Description-Content-Type: text/markdown
20
+ Requires-Dist: pydantic>=2.0
21
+ Requires-Dist: jsonschema>=4.0
22
+ Provides-Extra: openai
23
+ Requires-Dist: openai>=1.0; extra == "openai"
24
+ Provides-Extra: anthropic
25
+ Requires-Dist: anthropic>=0.20; extra == "anthropic"
26
+ Provides-Extra: google
27
+ Requires-Dist: google-generativeai>=0.5; extra == "google"
28
+ Provides-Extra: all
29
+ Requires-Dist: openai>=1.0; extra == "all"
30
+ Requires-Dist: anthropic>=0.20; extra == "all"
31
+ Requires-Dist: google-generativeai>=0.5; extra == "all"
32
+ Provides-Extra: dev
33
+ Requires-Dist: pytest>=7.0; extra == "dev"
34
+ Requires-Dist: pytest-asyncio>=0.21; extra == "dev"
35
+ Requires-Dist: black; extra == "dev"
36
+ Requires-Dist: isort; extra == "dev"
37
+ Dynamic: author
38
+ Dynamic: classifier
39
+ Dynamic: description
40
+ Dynamic: description-content-type
41
+ Dynamic: home-page
42
+ Dynamic: provides-extra
43
+ Dynamic: requires-dist
44
+ Dynamic: requires-python
45
+ Dynamic: summary
46
+
47
+ # llm-extractor
48
+
49
+ **Extract structured, validated JSON from any LLM.**
50
+
51
+ `pip install llm-extractor` — then stop fighting JSON parsing bugs, provider-specific APIs, and silent semantic failures. One unified interface to extract structured data from OpenAI, Anthropic, and Gemini — with automatic retries, semantic rules, and full observability.
52
+
53
+ ## The Problem (2026)
54
+
55
+ Even with native structured outputs, Python developers still hit:
56
+
57
+ | Pain | Reality |
58
+ |------|---------|
59
+ | Provider fragmentation | OpenAI, Anthropic, Gemini all use different structured output APIs |
60
+ | Semantic failures | Valid JSON with nonsense values (`price: -999`, `email: "not-an-email"`) |
61
+ | Silent failures | Model returns `{}` or truncated object — no error raised |
62
+ | Dumb retries | Most code retries blindly with the same broken prompt |
63
+ | Zero observability | You know it failed but not *why* or *how often* |
64
+
65
+ `llm-extractor` fixes all five.
66
+
67
+ ## Installation
68
+
69
+ ```bash
70
+ pip install llm-extractor # core only
71
+ pip install "llm-extractor[openai]" # + OpenAI
72
+ pip install "llm-extractor[anthropic]" # + Anthropic
73
+ pip install "llm-extractor[google]" # + Gemini
74
+ pip install "llm-extractor[all]" # all providers
75
+ ```
76
+
77
+ ## Quick Start
78
+
79
+ ```python
80
+ from llm_extract import extract, Schema, SemanticRule
81
+
82
+ # 1. Define your output schema
83
+ schema = Schema({
84
+ "name": str,
85
+ "age": int,
86
+ "email": str,
87
+ "score": float,
88
+ })
89
+
90
+ # 2. Add semantic rules
91
+ schema.add_rule(SemanticRule("age", min_value=0, max_value=150))
92
+ schema.add_rule(SemanticRule("score", min_value=0.0, max_value=100.0))
93
+ schema.add_rule(SemanticRule("email", pattern=r"^[^@]+@[^@]+\.[^@]+$"))
94
+
95
+ # 3. Extract structured output — works across all providers
96
+ result = extract(
97
+ prompt="Extract info: John Doe, 34 years old, john@example.com, scored 87.5",
98
+ schema=schema,
99
+ provider="openai", # or "anthropic", "gemini", "auto"
100
+ model="gpt-4o-mini",
101
+ api_key="sk-...",
102
+ max_retries=3,
103
+ )
104
+
105
+ print(result.data)
106
+ # {'name': 'John Doe', 'age': 34, 'email': 'john@example.com', 'score': 87.5}
107
+
108
+ print(result.attempts) # 1
109
+ print(result.provider) # 'openai'
110
+ ```
111
+
112
+ ## Pydantic Models
113
+
114
+ ```python
115
+ from pydantic import BaseModel
116
+ from llm_extract import extract
117
+
118
+ class Product(BaseModel):
119
+ name: str
120
+ price: float
121
+ in_stock: bool
122
+ tags: list[str]
123
+
124
+ result = extract(
125
+ prompt="Extract: Blue Widget, costs $29.99, currently available, tagged as gadget and home",
126
+ schema=Product,
127
+ provider="anthropic",
128
+ model="claude-haiku-4-5-20251001",
129
+ api_key="sk-ant-...",
130
+ )
131
+
132
+ product: Product = result.typed_data(Product)
133
+ print(product.price) # 29.99
134
+ ```
135
+
136
+ ## Semantic Rules
137
+
138
+ ```python
139
+ from llm_extract import SemanticRule, Schema
140
+
141
+ schema = Schema({"status": str, "count": int, "ratio": float})
142
+
143
+ # Enum constraint
144
+ schema.add_rule(SemanticRule("status", allowed_values=["active", "inactive", "pending"]))
145
+
146
+ # Range constraint
147
+ schema.add_rule(SemanticRule("count", min_value=0))
148
+ schema.add_rule(SemanticRule("ratio", min_value=0.0, max_value=1.0))
149
+
150
+ # Regex pattern
151
+ schema.add_rule(SemanticRule("email", pattern=r"^[^@]+@[^@]+\.[^@]+$"))
152
+
153
+ # Custom validator function
154
+ schema.add_rule(SemanticRule("count", validator=lambda v: v % 2 == 0, message="count must be even"))
155
+ ```
156
+
157
+ ## Observability
158
+
159
+ ```python
160
+ from llm_extract import extract, ExtractObserver
161
+
162
+ observer = ExtractObserver()
163
+
164
+ result = extract(
165
+ prompt="...",
166
+ schema=schema,
167
+ provider="openai",
168
+ model="gpt-4o-mini",
169
+ api_key="...",
170
+ observer=observer,
171
+ )
172
+
173
+ # Per-call report
174
+ report = observer.report()
175
+ print(report.total_attempts) # 2
176
+ print(report.validation_failures) # [ValidationFailure(field='age', reason='below min_value 0')]
177
+ print(report.raw_responses) # ['{"age": -5, ...}', '{"age": 34, ...}']
178
+ print(report.latency_ms) # [342, 289]
179
+ print(report.tokens_used) # {'input': 120, 'output': 45}
180
+ ```
181
+
182
+ ## Multi-Provider Fallback
183
+
184
+ ```python
185
+ result = extract(
186
+ prompt="...",
187
+ schema=schema,
188
+ provider="auto", # tries providers in priority order
189
+ fallback_chain=[
190
+ {"provider": "openai", "model": "gpt-4o-mini", "api_key": "sk-..."},
191
+ {"provider": "anthropic", "model": "claude-haiku-4-5-20251001", "api_key": "sk-ant-..."},
192
+ {"provider": "gemini", "model": "gemini-1.5-flash", "api_key": "AIza..."},
193
+ ],
194
+ max_retries=2,
195
+ )
196
+ print(result.provider) # whichever succeeded
197
+ ```
198
+
199
+ ## Async Support
200
+
201
+ ```python
202
+ import asyncio
203
+ from llm_extract import aextract
204
+
205
+ async def main():
206
+ result = await aextract(
207
+ prompt="...",
208
+ schema=schema,
209
+ provider="openai",
210
+ model="gpt-4o-mini",
211
+ api_key="...",
212
+ )
213
+ print(result.data)
214
+
215
+ asyncio.run(main())
216
+ ```
217
+
218
+ ## Raise on Failure
219
+
220
+ ```python
221
+ from llm_extract import extract, ExtractValidationError
222
+
223
+ try:
224
+ result = extract(..., raise_on_failure=True)
225
+ except ExtractValidationError as e:
226
+ print(e.result.failures) # list of ValidationFailure
227
+ print(e.result.raw) # last raw LLM response
228
+ ```
229
+
230
+ ## JSON Schema Input
231
+
232
+ ```python
233
+ from llm_extract import extract, Schema
234
+
235
+ schema = Schema({
236
+ "type": "object",
237
+ "properties": {
238
+ "title": {"type": "string"},
239
+ "year": {"type": "integer"},
240
+ "rating": {"type": "number"}
241
+ },
242
+ "required": ["title", "year", "rating"]
243
+ })
244
+
245
+ result = extract(prompt="...", schema=schema, ...)
246
+ ```
247
+
248
+ ## OpenAI-Compatible Endpoints
249
+
250
+ ```python
251
+ result = extract(
252
+ prompt="...",
253
+ schema=schema,
254
+ provider="openai",
255
+ model="mistral-7b-instruct",
256
+ api_key="your-key",
257
+ base_url="https://your-openai-compatible-endpoint/v1",
258
+ )
259
+ ```
260
+
261
+ ## Why llm-extractor?
262
+
263
+ - **Unified API** — one interface for OpenAI, Anthropic, Gemini, and any OpenAI-compatible endpoint
264
+ - **Schema-first** — define once with `dict`, `pydantic.BaseModel`, or JSON Schema
265
+ - **Semantic rules** — enforce business logic, not just types
266
+ - **Smart retries** — correction prompts tell the model *exactly* what went wrong
267
+ - **Full observability** — every attempt, failure, token count, and latency recorded
268
+ - **Zero magic** — no hidden prompt injection, no global state, fully inspectable
269
+
270
+ ## License
271
+
272
+ MIT
@@ -0,0 +1,226 @@
1
+ # llm-extractor
2
+
3
+ **Extract structured, validated JSON from any LLM.**
4
+
5
+ `pip install llm-extractor` — then stop fighting JSON parsing bugs, provider-specific APIs, and silent semantic failures. One unified interface to extract structured data from OpenAI, Anthropic, and Gemini — with automatic retries, semantic rules, and full observability.
6
+
7
+ ## The Problem (2026)
8
+
9
+ Even with native structured outputs, Python developers still hit:
10
+
11
+ | Pain | Reality |
12
+ |------|---------|
13
+ | Provider fragmentation | OpenAI, Anthropic, Gemini all use different structured output APIs |
14
+ | Semantic failures | Valid JSON with nonsense values (`price: -999`, `email: "not-an-email"`) |
15
+ | Silent failures | Model returns `{}` or truncated object — no error raised |
16
+ | Dumb retries | Most code retries blindly with the same broken prompt |
17
+ | Zero observability | You know it failed but not *why* or *how often* |
18
+
19
+ `llm-extractor` fixes all five.
20
+
21
+ ## Installation
22
+
23
+ ```bash
24
+ pip install llm-extractor # core only
25
+ pip install "llm-extractor[openai]" # + OpenAI
26
+ pip install "llm-extractor[anthropic]" # + Anthropic
27
+ pip install "llm-extractor[google]" # + Gemini
28
+ pip install "llm-extractor[all]" # all providers
29
+ ```
30
+
31
+ ## Quick Start
32
+
33
+ ```python
34
+ from llm_extract import extract, Schema, SemanticRule
35
+
36
+ # 1. Define your output schema
37
+ schema = Schema({
38
+ "name": str,
39
+ "age": int,
40
+ "email": str,
41
+ "score": float,
42
+ })
43
+
44
+ # 2. Add semantic rules
45
+ schema.add_rule(SemanticRule("age", min_value=0, max_value=150))
46
+ schema.add_rule(SemanticRule("score", min_value=0.0, max_value=100.0))
47
+ schema.add_rule(SemanticRule("email", pattern=r"^[^@]+@[^@]+\.[^@]+$"))
48
+
49
+ # 3. Extract structured output — works across all providers
50
+ result = extract(
51
+ prompt="Extract info: John Doe, 34 years old, john@example.com, scored 87.5",
52
+ schema=schema,
53
+ provider="openai", # or "anthropic", "gemini", "auto"
54
+ model="gpt-4o-mini",
55
+ api_key="sk-...",
56
+ max_retries=3,
57
+ )
58
+
59
+ print(result.data)
60
+ # {'name': 'John Doe', 'age': 34, 'email': 'john@example.com', 'score': 87.5}
61
+
62
+ print(result.attempts) # 1
63
+ print(result.provider) # 'openai'
64
+ ```
65
+
66
+ ## Pydantic Models
67
+
68
+ ```python
69
+ from pydantic import BaseModel
70
+ from llm_extract import extract
71
+
72
+ class Product(BaseModel):
73
+ name: str
74
+ price: float
75
+ in_stock: bool
76
+ tags: list[str]
77
+
78
+ result = extract(
79
+ prompt="Extract: Blue Widget, costs $29.99, currently available, tagged as gadget and home",
80
+ schema=Product,
81
+ provider="anthropic",
82
+ model="claude-haiku-4-5-20251001",
83
+ api_key="sk-ant-...",
84
+ )
85
+
86
+ product: Product = result.typed_data(Product)
87
+ print(product.price) # 29.99
88
+ ```
89
+
90
+ ## Semantic Rules
91
+
92
+ ```python
93
+ from llm_extract import SemanticRule, Schema
94
+
95
+ schema = Schema({"status": str, "count": int, "ratio": float})
96
+
97
+ # Enum constraint
98
+ schema.add_rule(SemanticRule("status", allowed_values=["active", "inactive", "pending"]))
99
+
100
+ # Range constraint
101
+ schema.add_rule(SemanticRule("count", min_value=0))
102
+ schema.add_rule(SemanticRule("ratio", min_value=0.0, max_value=1.0))
103
+
104
+ # Regex pattern
105
+ schema.add_rule(SemanticRule("email", pattern=r"^[^@]+@[^@]+\.[^@]+$"))
106
+
107
+ # Custom validator function
108
+ schema.add_rule(SemanticRule("count", validator=lambda v: v % 2 == 0, message="count must be even"))
109
+ ```
110
+
111
+ ## Observability
112
+
113
+ ```python
114
+ from llm_extract import extract, ExtractObserver
115
+
116
+ observer = ExtractObserver()
117
+
118
+ result = extract(
119
+ prompt="...",
120
+ schema=schema,
121
+ provider="openai",
122
+ model="gpt-4o-mini",
123
+ api_key="...",
124
+ observer=observer,
125
+ )
126
+
127
+ # Per-call report
128
+ report = observer.report()
129
+ print(report.total_attempts) # 2
130
+ print(report.validation_failures) # [ValidationFailure(field='age', reason='below min_value 0')]
131
+ print(report.raw_responses) # ['{"age": -5, ...}', '{"age": 34, ...}']
132
+ print(report.latency_ms) # [342, 289]
133
+ print(report.tokens_used) # {'input': 120, 'output': 45}
134
+ ```
135
+
136
+ ## Multi-Provider Fallback
137
+
138
+ ```python
139
+ result = extract(
140
+ prompt="...",
141
+ schema=schema,
142
+ provider="auto", # tries providers in priority order
143
+ fallback_chain=[
144
+ {"provider": "openai", "model": "gpt-4o-mini", "api_key": "sk-..."},
145
+ {"provider": "anthropic", "model": "claude-haiku-4-5-20251001", "api_key": "sk-ant-..."},
146
+ {"provider": "gemini", "model": "gemini-1.5-flash", "api_key": "AIza..."},
147
+ ],
148
+ max_retries=2,
149
+ )
150
+ print(result.provider) # whichever succeeded
151
+ ```
152
+
153
+ ## Async Support
154
+
155
+ ```python
156
+ import asyncio
157
+ from llm_extract import aextract
158
+
159
+ async def main():
160
+ result = await aextract(
161
+ prompt="...",
162
+ schema=schema,
163
+ provider="openai",
164
+ model="gpt-4o-mini",
165
+ api_key="...",
166
+ )
167
+ print(result.data)
168
+
169
+ asyncio.run(main())
170
+ ```
171
+
172
+ ## Raise on Failure
173
+
174
+ ```python
175
+ from llm_extract import extract, ExtractValidationError
176
+
177
+ try:
178
+ result = extract(..., raise_on_failure=True)
179
+ except ExtractValidationError as e:
180
+ print(e.result.failures) # list of ValidationFailure
181
+ print(e.result.raw) # last raw LLM response
182
+ ```
183
+
184
+ ## JSON Schema Input
185
+
186
+ ```python
187
+ from llm_extract import extract, Schema
188
+
189
+ schema = Schema({
190
+ "type": "object",
191
+ "properties": {
192
+ "title": {"type": "string"},
193
+ "year": {"type": "integer"},
194
+ "rating": {"type": "number"}
195
+ },
196
+ "required": ["title", "year", "rating"]
197
+ })
198
+
199
+ result = extract(prompt="...", schema=schema, ...)
200
+ ```
201
+
202
+ ## OpenAI-Compatible Endpoints
203
+
204
+ ```python
205
+ result = extract(
206
+ prompt="...",
207
+ schema=schema,
208
+ provider="openai",
209
+ model="mistral-7b-instruct",
210
+ api_key="your-key",
211
+ base_url="https://your-openai-compatible-endpoint/v1",
212
+ )
213
+ ```
214
+
215
+ ## Why llm-extractor?
216
+
217
+ - **Unified API** — one interface for OpenAI, Anthropic, Gemini, and any OpenAI-compatible endpoint
218
+ - **Schema-first** — define once with `dict`, `pydantic.BaseModel`, or JSON Schema
219
+ - **Semantic rules** — enforce business logic, not just types
220
+ - **Smart retries** — correction prompts tell the model *exactly* what went wrong
221
+ - **Full observability** — every attempt, failure, token count, and latency recorded
222
+ - **Zero magic** — no hidden prompt injection, no global state, fully inspectable
223
+
224
+ ## License
225
+
226
+ MIT
@@ -0,0 +1,69 @@
1
+ """
2
+ llm-extract
3
+ ===========
4
+ Extract structured, validated JSON from any LLM.
5
+
6
+ pip install llm-extractor
7
+
8
+ Quick start::
9
+
10
+ from llm_extract import extract, Schema, SemanticRule
11
+
12
+ schema = Schema({"name": str, "age": int})
13
+ schema.add_rule(SemanticRule("age", min_value=0, max_value=150))
14
+
15
+ result = extract(
16
+ prompt="John Doe is 34 years old.",
17
+ schema=schema,
18
+ provider="openai",
19
+ model="gpt-4o-mini",
20
+ api_key="sk-...",
21
+ )
22
+ print(result.data) # {'name': 'John Doe', 'age': 34}
23
+ """
24
+
25
+ from .core import (
26
+ Schema,
27
+ SemanticRule,
28
+ SchemaField,
29
+ ValidationFailure,
30
+ SchemaInput,
31
+ )
32
+ from .extractor import (
33
+ extract,
34
+ aextract,
35
+ ExtractResult,
36
+ ExtractValidationError,
37
+ )
38
+ from .observability import (
39
+ ExtractObserver,
40
+ ForgeReport as ExtractReport,
41
+ AttemptRecord,
42
+ )
43
+ from .providers import (
44
+ ProviderConfig,
45
+ ProviderResponse,
46
+ )
47
+
48
+ __version__ = "1.0.0"
49
+ __author__ = "Mahesh Makvana"
50
+ __all__ = [
51
+ # Core
52
+ "Schema",
53
+ "SemanticRule",
54
+ "SchemaField",
55
+ "ValidationFailure",
56
+ "SchemaInput",
57
+ # Extraction
58
+ "extract",
59
+ "aextract",
60
+ "ExtractResult",
61
+ "ExtractValidationError",
62
+ # Observability
63
+ "ExtractObserver",
64
+ "ExtractReport",
65
+ "AttemptRecord",
66
+ # Provider config (for advanced use)
67
+ "ProviderConfig",
68
+ "ProviderResponse",
69
+ ]