prompture 0.0.29.dev8__py3-none-any.whl → 0.0.38.dev2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- prompture/__init__.py +264 -23
- prompture/_version.py +34 -0
- prompture/agent.py +924 -0
- prompture/agent_types.py +156 -0
- prompture/aio/__init__.py +74 -0
- prompture/async_agent.py +880 -0
- prompture/async_conversation.py +789 -0
- prompture/async_core.py +803 -0
- prompture/async_driver.py +193 -0
- prompture/async_groups.py +551 -0
- prompture/cache.py +469 -0
- prompture/callbacks.py +55 -0
- prompture/cli.py +63 -4
- prompture/conversation.py +826 -0
- prompture/core.py +894 -263
- prompture/cost_mixin.py +51 -0
- prompture/discovery.py +187 -0
- prompture/driver.py +206 -5
- prompture/drivers/__init__.py +175 -67
- prompture/drivers/airllm_driver.py +109 -0
- prompture/drivers/async_airllm_driver.py +26 -0
- prompture/drivers/async_azure_driver.py +123 -0
- prompture/drivers/async_claude_driver.py +113 -0
- prompture/drivers/async_google_driver.py +316 -0
- prompture/drivers/async_grok_driver.py +97 -0
- prompture/drivers/async_groq_driver.py +90 -0
- prompture/drivers/async_hugging_driver.py +61 -0
- prompture/drivers/async_lmstudio_driver.py +148 -0
- prompture/drivers/async_local_http_driver.py +44 -0
- prompture/drivers/async_ollama_driver.py +135 -0
- prompture/drivers/async_openai_driver.py +102 -0
- prompture/drivers/async_openrouter_driver.py +102 -0
- prompture/drivers/async_registry.py +133 -0
- prompture/drivers/azure_driver.py +42 -9
- prompture/drivers/claude_driver.py +257 -34
- prompture/drivers/google_driver.py +295 -42
- prompture/drivers/grok_driver.py +35 -32
- prompture/drivers/groq_driver.py +33 -26
- prompture/drivers/hugging_driver.py +6 -6
- prompture/drivers/lmstudio_driver.py +97 -19
- prompture/drivers/local_http_driver.py +6 -6
- prompture/drivers/ollama_driver.py +168 -23
- prompture/drivers/openai_driver.py +184 -9
- prompture/drivers/openrouter_driver.py +37 -25
- prompture/drivers/registry.py +306 -0
- prompture/drivers/vision_helpers.py +153 -0
- prompture/field_definitions.py +106 -96
- prompture/group_types.py +147 -0
- prompture/groups.py +530 -0
- prompture/image.py +180 -0
- prompture/logging.py +80 -0
- prompture/model_rates.py +217 -0
- prompture/persistence.py +254 -0
- prompture/persona.py +482 -0
- prompture/runner.py +49 -47
- prompture/scaffold/__init__.py +1 -0
- prompture/scaffold/generator.py +84 -0
- prompture/scaffold/templates/Dockerfile.j2 +12 -0
- prompture/scaffold/templates/README.md.j2 +41 -0
- prompture/scaffold/templates/config.py.j2 +21 -0
- prompture/scaffold/templates/env.example.j2 +8 -0
- prompture/scaffold/templates/main.py.j2 +86 -0
- prompture/scaffold/templates/models.py.j2 +40 -0
- prompture/scaffold/templates/requirements.txt.j2 +5 -0
- prompture/serialization.py +218 -0
- prompture/server.py +183 -0
- prompture/session.py +117 -0
- prompture/settings.py +19 -1
- prompture/tools.py +219 -267
- prompture/tools_schema.py +254 -0
- prompture/validator.py +3 -3
- prompture-0.0.38.dev2.dist-info/METADATA +369 -0
- prompture-0.0.38.dev2.dist-info/RECORD +77 -0
- {prompture-0.0.29.dev8.dist-info → prompture-0.0.38.dev2.dist-info}/WHEEL +1 -1
- prompture-0.0.29.dev8.dist-info/METADATA +0 -368
- prompture-0.0.29.dev8.dist-info/RECORD +0 -27
- {prompture-0.0.29.dev8.dist-info → prompture-0.0.38.dev2.dist-info}/entry_points.txt +0 -0
- {prompture-0.0.29.dev8.dist-info → prompture-0.0.38.dev2.dist-info}/licenses/LICENSE +0 -0
- {prompture-0.0.29.dev8.dist-info → prompture-0.0.38.dev2.dist-info}/top_level.txt +0 -0
prompture/core.py
CHANGED
@@ -1,46 +1,70 @@
-"""Core utilities: Helpers for requesting JSON from LLM.
-
+"""Core utilities: Helpers for requesting JSON from LLM."""
+
 from __future__ import annotations
+
 import json
-import
-import requests
+import logging
 import sys
-import
-from datetime import datetime, date
+from datetime import date, datetime
 from decimal import Decimal
-from typing import Any,
+from typing import Any, Literal, Union
+
+import requests
+
+try:
+    import toon
+except ImportError:
+    toon = None

-from pydantic import BaseModel
+from pydantic import BaseModel

-from .drivers import get_driver, get_driver_for_model
 from .driver import Driver
-from .
+from .drivers import get_driver_for_model
 from .field_definitions import get_registry_snapshot
+from .image import ImageInput, make_image
+from .tools import (
+    clean_json_text,
+    convert_value,
+    get_field_default,
+)
+
+logger = logging.getLogger("prompture.core")
+

+def _build_content_with_images(text: str, images: list[ImageInput] | None = None) -> str | list[dict[str, Any]]:
+    """Return plain string when no images, or a list of content blocks."""
+    if not images:
+        return text
+    blocks: list[dict[str, Any]] = [{"type": "text", "text": text}]
+    for img in images:
+        ic = make_image(img)
+        blocks.append({"type": "image", "source": ic})
+    return blocks

-def normalize_field_value(value: Any, field_type: Type, field_def: Dict[str, Any
+
+def normalize_field_value(value: Any, field_type: type, field_def: dict[str, Any]) -> Any:
     """Normalize invalid values for fields based on their type and nullable status.
-
+
     This function handles post-processing of extracted values BEFORE Pydantic validation,
     converting invalid values (like empty strings for booleans) to proper defaults.
-
+
     Args:
         value: The extracted value from the LLM
         field_type: The expected Python type for this field
         field_def: The field definition dict containing nullable, default, etc.
-
+
     Returns:
         A normalized value suitable for the field type
     """
     nullable = field_def.get("nullable", True)
     default_value = field_def.get("default")
-
+
     # Special handling for boolean fields
-    if field_type is bool or (hasattr(field_type,
+    if field_type is bool or (hasattr(field_type, "__origin__") and field_type.__origin__ is bool):
         # If value is already a boolean, return it as-is
         if isinstance(value, bool):
             return value
-
+
         # For non-nullable booleans
         if not nullable:
             # Any non-empty string should be True, empty/None should be default
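
For reference, a minimal sketch of the contract of the new `_build_content_with_images` helper added above. The exact image payload shape comes from `make_image` in `prompture/image.py`, which this diff does not show, so the block structure below is only what the hunk itself guarantees:

```python
from prompture.core import _build_content_with_images

# With no images the prompt passes through unchanged as a plain string.
assert _build_content_with_images("Describe this chart") == "Describe this chart"

# With images it becomes a list of content blocks: one text block first,
# then one {"type": "image", "source": ...} block per input image.
blocks = _build_content_with_images("Describe this chart", images=["chart.png"])
assert blocks[0] == {"type": "text", "text": "Describe this chart"}
assert blocks[1]["type"] == "image"
```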
@@ -57,37 +81,39 @@ def normalize_field_value(value: Any, field_type: Type, field_def: Dict[str, Any
         if isinstance(value, str):
             return bool(value.strip()) if value.strip() else None
         return bool(value) if value else None
-
+
     # If the field is nullable and value is None, that's acceptable
     if nullable and value is None:
         return value
-
+
     # For non-nullable fields with invalid values, use the default
     if not nullable:
         # Check for invalid values that should be replaced
         invalid_values = (None, "", [], {})
-
+
         if value in invalid_values or (isinstance(value, str) and not value.strip()):
             # Use the default value if provided, otherwise use type-appropriate default
             if default_value is not None:
                 return default_value
-
+
             # Type-specific defaults for non-nullable fields
-            if field_type is int or (hasattr(field_type,
+            if field_type is int or (hasattr(field_type, "__origin__") and field_type.__origin__ is int):
                 return 0
-            elif field_type is float or (hasattr(field_type,
+            elif field_type is float or (hasattr(field_type, "__origin__") and field_type.__origin__ is float):
                 return 0.0
-            elif field_type is str or (hasattr(field_type,
+            elif field_type is str or (hasattr(field_type, "__origin__") and field_type.__origin__ is str):
                 return ""
-            elif field_type is list or (hasattr(field_type,
+            elif field_type is list or (hasattr(field_type, "__origin__") and field_type.__origin__ is list):
                 return []
-            elif field_type is dict or (hasattr(field_type,
+            elif field_type is dict or (hasattr(field_type, "__origin__") and field_type.__origin__ is dict):
                 return {}
-
+
     return value


-def clean_json_text_with_ai(
+def clean_json_text_with_ai(
+    driver: Driver, text: str, model_name: str = "", options: dict[str, Any] | None = None
+) -> str:
     """Use LLM to fix malformed JSON strings.

     Generates a specialized prompt instructing the LLM to correct the
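
A short behavior sketch for `normalize_field_value`, based only on the branches visible in this hunk (the boolean branch sits partly outside the diff context):

```python
from prompture.core import normalize_field_value

# A nullable field may keep None as-is.
assert normalize_field_value(None, str, {"nullable": True}) is None

# A non-nullable int holding an invalid value with no explicit default
# falls through to the type-specific default, 0.
assert normalize_field_value("", int, {"nullable": False}) == 0

# An explicit default wins over the type-specific one.
assert normalize_field_value("", str, {"nullable": False, "default": "n/a"}) == "n/a"
```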
@@ -102,12 +128,14 @@ def clean_json_text_with_ai(driver: Driver, text: str, model_name: str = "", opt
         A cleaned string that should contain valid JSON.
     """
     # Check if JSON is already valid - if so, return unchanged
+    if options is None:
+        options = {}
     try:
         json.loads(text)
         return text  # Already valid, no need for LLM correction
     except json.JSONDecodeError:
         pass  # Invalid, proceed with LLM correction
-
+
     prompt = (
         "The following text is supposed to be a single JSON object, but it is malformed. "
         "Please correct it and return only the valid JSON object. Do not add any explanations or markdown. "
@@ -118,26 +146,122 @@ def clean_json_text_with_ai(driver: Driver, text: str, model_name: str = "", opt
     cleaned = clean_json_text(raw)
     return cleaned

+
+def render_output(
+    driver: Driver,
+    content_prompt: str,
+    output_format: Literal["text", "html", "markdown"] = "text",
+    model_name: str = "",
+    options: dict[str, Any] | None = None,
+    system_prompt: str | None = None,
+    images: list[ImageInput] | None = None,
+) -> dict[str, Any]:
+    """Sends a prompt to the driver and returns the raw output in the requested format.
+
+    This function is designed for "no fluff" output, instructing the LLM to return
+    only the requested content without conversational filler or markdown fences
+    (unless markdown is requested).
+
+    Args:
+        driver: Adapter that implements generate(prompt, options).
+        content_prompt: Main prompt content.
+        output_format: Desired format ("text", "html", "markdown").
+        model_name: Optional model identifier used in usage metadata.
+        options: Additional options to pass to the driver.
+
+    Returns:
+        A dictionary containing:
+        - text: the raw text output.
+        - usage: token usage and cost information from the driver's meta object.
+        - output_format: the format of the output.
+
+    Raises:
+        ValueError: If an unsupported output format is provided.
+    """
+    if options is None:
+        options = {}
+    if output_format not in ("text", "html", "markdown"):
+        raise ValueError(f"Unsupported output_format '{output_format}'. Use 'text', 'html', or 'markdown'.")
+
+    instruct = ""
+    if output_format == "text":
+        instruct = (
+            "Return ONLY the raw text content. Do not use markdown formatting, "
+            "code fences, or conversational filler. Just the text."
+        )
+    elif output_format == "html":
+        instruct = (
+            "Return ONLY valid HTML code. Do not wrap it in markdown code fences "
+            "(like ```html ... ```). Do not include conversational filler."
+        )
+    elif output_format == "markdown":
+        instruct = "Return valid markdown content. You may use standard markdown formatting."
+
+    full_prompt = f"{content_prompt}\n\nSYSTEM INSTRUCTION: {instruct}"
+
+    # Use generate_messages when system_prompt or images are provided
+    user_content = _build_content_with_images(full_prompt, images)
+    if system_prompt is not None or images:
+        messages = [{"role": "user", "content": user_content}]
+        if system_prompt is not None:
+            messages.insert(0, {"role": "system", "content": system_prompt})
+        resp = driver.generate_messages(messages, options)
+    else:
+        resp = driver.generate(full_prompt, options)
+    raw = resp.get("text", "")
+
+    # Clean up potential markdown fences if the model disobeyed for text/html
+    if output_format in ("text", "html"):
+        # Simple cleanup for common fences if they appear despite instructions
+        cleaned = raw.strip()
+        if cleaned.startswith("```") and cleaned.endswith("```"):
+            # Remove first line (fence + optional language) and last line (fence)
+            lines = cleaned.splitlines()
+            if len(lines) >= 2:
+                cleaned = "\n".join(lines[1:-1])
+        raw = cleaned
+
+    usage = {
+        **resp.get("meta", {}),
+        "raw_response": resp,
+        "total_tokens": resp.get("meta", {}).get("total_tokens", 0),
+        "prompt_tokens": resp.get("meta", {}).get("prompt_tokens", 0),
+        "completion_tokens": resp.get("meta", {}).get("completion_tokens", 0),
+        "cost": resp.get("meta", {}).get("cost", 0.0),
+        "model_name": model_name or getattr(driver, "model", ""),
+    }
+
+    return {"text": raw, "usage": usage, "output_format": output_format}
+
+
 def ask_for_json(
     driver: Driver,
     content_prompt: str,
-    json_schema:
+    json_schema: dict[str, Any],
     ai_cleanup: bool = True,
     model_name: str = "",
-    options:
-
-
+    options: dict[str, Any] | None = None,
+    output_format: Literal["json", "toon"] = "json",
+    cache: bool | None = None,
+    json_mode: Literal["auto", "on", "off"] = "auto",
+    system_prompt: str | None = None,
+    images: list[ImageInput] | None = None,
+) -> dict[str, Any]:
+    """Sends a prompt to the driver and returns structured output plus usage metadata.

     This function enforces a schema-first approach by requiring a json_schema parameter
-    and automatically generating instructions for the LLM to return
+    and automatically generating instructions for the LLM to return data that matches it.

     Args:
         driver: Adapter that implements generate(prompt, options).
         content_prompt: Main prompt content (may include examples).
         json_schema: Required JSON schema dictionary defining the expected structure.
         ai_cleanup: Whether to attempt AI-based cleanup if JSON parsing fails.
+        model_name: Optional model identifier used in usage metadata.
         options: Additional options to pass to the driver.
-
+        output_format: Response serialization format ("json" or "toon").
+        cache: Override for response caching. ``True`` forces caching on,
+            ``False`` forces it off, ``None`` defers to the global setting.

     Returns:
         A dictionary containing:
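
A usage sketch for the new `render_output`. The model id is illustrative; any "provider/model" string accepted by `get_driver_for_model` works:

```python
from prompture.core import render_output
from prompture.drivers import get_driver_for_model

driver = get_driver_for_model("openai/gpt-4o-mini")  # illustrative model id

result = render_output(
    driver,
    "Summarize the Q3 report in two sentences.",
    output_format="html",
)
result["text"]           # raw HTML; stray ``` fences are stripped for text/html
result["usage"]["cost"]  # token counts and cost copied from the driver meta
```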
@@ -146,22 +270,96 @@ def ask_for_json(
         - usage: token usage and cost information from the driver's meta object.

     Raises:
-
+        ValueError: If an unsupported output format is provided.
+        RuntimeError: When TOON is requested but the dependency is missing.
+        json.JSONDecodeError: If JSON parsing fails and ai_cleanup is False.
+        ValueError: If TOON parsing fails.
     """
+    if options is None:
+        options = {}
+    if output_format not in ("json", "toon"):
+        raise ValueError(f"Unsupported output_format '{output_format}'. Use 'json' or 'toon'.")
+
+    # --- cache lookup ---
+    from .cache import get_cache, make_cache_key
+
+    _cache = get_cache()
+    use_cache = cache if cache is not None else _cache.enabled
+    _force = cache is True  # explicit per-call override
+    cache_key: str | None = None
+    if use_cache:
+        cache_key = make_cache_key(
+            prompt=content_prompt,
+            model_name=model_name,
+            schema=json_schema,
+            options=options,
+            output_format=output_format,
+        )
+        cached = _cache.get(cache_key, force=_force)
+        if cached is not None:
+            cached["usage"]["cache_hit"] = True
+            return cached

     schema_string = json.dumps(json_schema, indent=2)
-
-
-
-
-
+    if output_format == "toon" and toon is None:
+        raise RuntimeError(
+            "TOON requested but 'python-toon' is not installed. Install it with 'pip install python-toon'."
+        )
+
+    # Determine whether to use native JSON mode
+    use_json_mode = False
+    if json_mode == "on":
+        use_json_mode = True
+    elif json_mode == "auto":
+        use_json_mode = getattr(driver, "supports_json_mode", False)
+
+    if use_json_mode:
+        options = {**options, "json_mode": True}
+        if getattr(driver, "supports_json_schema", False):
+            options["json_schema"] = json_schema
+
+    # Adjust instruction prompt based on JSON mode capabilities
+    if use_json_mode and getattr(driver, "supports_json_schema", False):
+        # Schema enforced by API — minimal instruction
+        instruct = "Extract data matching the requested schema.\nIf a value is unknown use null."
+    elif use_json_mode:
+        # JSON guaranteed but schema not enforced by API
+        instruct = (
+            "Return a JSON object that validates against this schema:\n"
+            f"{schema_string}\n\n"
+            "If a value is unknown use null."
+        )
+    else:
+        # Existing prompt-based enforcement
+        instruct = (
+            "Return only a single JSON object (no markdown, no extra text) that validates against this JSON schema:\n"
+            f"{schema_string}\n\n"
+            "If a value is unknown use null. Use double quotes for keys and strings."
+        )
+    if output_format == "toon":
+        instruct += "\n\n(Respond with JSON only; Prompture will convert to TOON.)"

     full_prompt = f"{content_prompt}\n\n{instruct}"
-
+
+    # Use generate_messages when system_prompt or images are provided
+    user_content = _build_content_with_images(full_prompt, images)
+    if system_prompt is not None or images:
+        messages = [{"role": "user", "content": user_content}]
+        if system_prompt is not None:
+            messages.insert(0, {"role": "system", "content": system_prompt})
+        resp = driver.generate_messages(messages, options)
+    else:
+        resp = driver.generate(full_prompt, options)
     raw = resp.get("text", "")
     cleaned = clean_json_text(raw)

     try:
         json_obj = json.loads(cleaned)
+        json_string = cleaned
+        toon_string = None
+        if output_format == "toon":
+            toon_string = toon.encode(json_obj)
+
         usage = {
             **resp.get("meta", {}),
             "raw_response": resp,
@@ -169,19 +367,27 @@ def ask_for_json(
             "prompt_tokens": resp.get("meta", {}).get("prompt_tokens", 0),
             "completion_tokens": resp.get("meta", {}).get("completion_tokens", 0),
             "cost": resp.get("meta", {}).get("cost", 0.0),
-            "model_name": model_name or getattr(driver, "model", "")
-        }
-        return {
-            "json_string": cleaned,
-            "json_object": json_obj,
-            "usage": usage
+            "model_name": model_name or getattr(driver, "model", ""),
         }
+        result = {"json_string": json_string, "json_object": json_obj, "usage": usage}
+        if toon_string is not None:
+            result["toon_string"] = toon_string
+            result["output_format"] = "toon"
+        else:
+            result["output_format"] = "json"
+
+        # --- cache store ---
+        if use_cache and cache_key is not None:
+            cached_copy = {**result, "usage": {**result["usage"], "raw_response": {}}}
+            _cache.set(cache_key, cached_copy, force=_force)
+
+        return result
     except json.JSONDecodeError as e:
         if ai_cleanup:
             cleaned_fixed = clean_json_text_with_ai(driver, cleaned, model_name, options)
             try:
                 json_obj = json.loads(cleaned_fixed)
-
+                result = {
                     "json_string": cleaned_fixed,
                     "json_object": json_obj,
                     "usage": {
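
A sketch of calling the reworked `ask_for_json` with the new caching and JSON-mode switches, reusing the `driver` from the previous sketch. The extracted values in the comments are what a model would typically return, not a guarantee:

```python
schema = {
    "type": "object",
    "properties": {"name": {"type": "string"}, "age": {"type": "integer"}},
}

result = ask_for_json(
    driver,
    "Juan is a 28-year-old engineer.",
    schema,
    cache=True,        # force the response cache on for this call
    json_mode="auto",  # use native JSON mode when the driver advertises support
)
result["json_object"]             # e.g. {"name": "Juan", "age": 28}
result["output_format"]           # "json", or "toon" when requested
result["usage"].get("cache_hit")  # True only when served from the cache
```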
@@ -190,46 +396,63 @@ def ask_for_json(
                         "total_tokens": 0,
                         "cost": 0.0,
                         "model_name": options.get("model", getattr(driver, "model", "")),
-                        "raw_response": {}
+                        "raw_response": {},
                     },
+                    "output_format": "json" if output_format != "toon" else "toon",
                 }
+                if output_format == "toon":
+                    result["toon_string"] = toon.encode(json_obj)
+
+                # --- cache store (ai cleanup path) ---
+                if use_cache and cache_key is not None:
+                    _cache.set(cache_key, result, force=_force)
+
+                return result
             except json.JSONDecodeError:
-
-                raise e
+                raise e from None
         else:
-            # Explicitly re-raise the original JSONDecodeError
             raise e

+
 def extract_and_jsonify(
     text: Union[str, Driver],  # Can be either text or driver for backward compatibility
-    json_schema:
+    json_schema: dict[str, Any],
     *,  # Force keyword arguments for remaining params
-    model_name: Union[str,
+    model_name: Union[str, dict[str, Any]] = "",  # Can be schema (old) or model name (new)
     instruction_template: str = "Extract information from the following text:",
     ai_cleanup: bool = True,
-
-
+    output_format: Literal["json", "toon"] = "json",
+    options: dict[str, Any] | None = None,
+    json_mode: Literal["auto", "on", "off"] = "auto",
+    system_prompt: str | None = None,
+    images: list[ImageInput] | None = None,
+) -> dict[str, Any]:
     """Extracts structured information using automatic driver selection based on model name.
-
+
     Args:
         text: The raw text to extract information from.
         json_schema: JSON schema dictionary defining the expected structure.
         model_name: Model identifier in format "provider/model" (e.g., "openai/gpt-4-turbo-preview").
         instruction_template: Instructional text to prepend to the content.
         ai_cleanup: Whether to attempt AI-based cleanup if JSON parsing fails.
+        output_format: Response serialization format ("json" or "toon").
         options: Additional options to pass to the driver.
-
+
     Returns:
         A dictionary containing:
         - json_string: the JSON string output.
         - json_object: the parsed JSON object.
         - usage: token usage and cost information from the driver's meta object.
-
+
     Raises:
         ValueError: If text is empty or None, or if model_name format is invalid.
         json.JSONDecodeError: If the response cannot be parsed as JSON and ai_cleanup is False.
         pytest.skip: If a ConnectionError occurs during testing (when pytest is running).
     """
+    if options is None:
+        options = {}
+    actual_template = instruction_template
+    actual_output_format = output_format
     # Handle legacy format where first argument is driver
     # Validate text input first
     if isinstance(text, Driver):
@@ -246,7 +469,6 @@ def extract_and_jsonify(
             raise ValueError("Text input cannot be empty")
         actual_text = text
         actual_schema = json_schema
-        actual_template = instruction_template
         actual_model = model_name or options.get("model", "")
         driver = options.pop("driver", None)

@@ -254,18 +476,18 @@ def extract_and_jsonify(
     if driver is None:
         if not actual_model:
             raise ValueError("Model name cannot be empty")
-
+
         # First validate model format
         if "/" not in actual_model:
             raise ValueError("Invalid model string format. Expected format: 'provider/model'")
-
+
         try:
             driver = get_driver_for_model(actual_model)
         except ValueError as e:
             if "Unsupported provider" in str(e):
-                raise ValueError(f"Unsupported provider in model name: {actual_model}")
+                raise ValueError(f"Unsupported provider in model name: {actual_model}") from e
             raise  # Re-raise any other ValueError
-
+
     # Extract model parts for other validation
     try:
         provider, model_id = actual_model.split("/", 1)
@@ -274,29 +496,44 @@ def extract_and_jsonify(
     except ValueError:
         # If no "/" in model string, use entire string as both provider and model_id
        provider = model_id = actual_model
-
+
     opts = {**options, "model": model_id}
-
+
     content_prompt = f"{actual_template} {actual_text}"
-
+
     try:
-        return ask_for_json(
+        return ask_for_json(
+            driver,
+            content_prompt,
+            actual_schema,
+            ai_cleanup,
+            model_id,
+            opts,
+            output_format=actual_output_format,
+            json_mode=json_mode,
+            system_prompt=system_prompt,
+            images=images,
+        )
     except (requests.exceptions.ConnectionError, requests.exceptions.HTTPError) as e:
         if "pytest" in sys.modules:
             import pytest
+
             pytest.skip(f"Connection error occurred: {e}")
-        raise ConnectionError(f"Connection error occurred: {e}")
+        raise ConnectionError(f"Connection error occurred: {e}") from e
+

 def manual_extract_and_jsonify(
     driver: Driver,
     text: str,
-    json_schema:
+    json_schema: dict[str, Any],
     model_name: str = "",
     instruction_template: str = "Extract information from the following text:",
     ai_cleanup: bool = True,
-
-
-
+    output_format: Literal["json", "toon"] = "json",
+    options: dict[str, Any] | None = None,
+    json_mode: Literal["auto", "on", "off"] = "auto",
+    system_prompt: str | None = None,
+) -> dict[str, Any]:
     """Extracts structured information using an explicitly provided driver.

     This variant is useful when you want to directly control which driver
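
A sketch of the "provider/model" convention that `extract_and_jsonify` enforces when it resolves a driver (model id illustrative):

```python
result = extract_and_jsonify(
    "Order #123 shipped to Lisbon on 2024-05-01.",
    {"type": "object", "properties": {"order_id": {"type": "string"}}},
    model_name="ollama/llama3",  # provider selects the driver; the rest is the model id
)

# A model string without a slash fails fast:
#   ValueError: Invalid model string format. Expected format: 'provider/model'
```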
@@ -310,8 +547,8 @@ def manual_extract_and_jsonify(
         model_name: Optional override of the model name.
         instruction_template: Instructional text to prepend to the content.
         ai_cleanup: Whether to attempt AI-based cleanup if JSON parsing fails.
+        output_format: Response serialization format ("json" or "toon").
         options: Additional options to pass to the driver.
-        verbose_level: Logging level for debug output (LogLevel.OFF by default).

     Returns:
         A dictionary containing:
@@ -323,46 +560,59 @@ def manual_extract_and_jsonify(
         ValueError: If text is empty or None.
         json.JSONDecodeError: If the response cannot be parsed as JSON and ai_cleanup is False.
     """
+    if options is None:
+        options = {}
     if not isinstance(text, str):
         raise ValueError("Text input must be a string")
-
+
     if not text or not text.strip():
         raise ValueError("Text input cannot be empty")
-
-
-
-        "text_length"
-
-
-
+
+    logger.info("[manual] Starting manual extraction")
+    logger.debug(
+        "[manual] text_length=%d model_name=%s schema_keys=%s",
+        len(text),
+        model_name,
+        list(json_schema.keys()) if json_schema else [],
+    )

     opts = dict(options)
     if model_name:
         opts["model"] = model_name

-    # Generate the content prompt
     content_prompt = f"{instruction_template} {text}"
-
-
-
-
-
-
-
-
-
-
+
+    logger.debug("[manual] Generated prompt for extraction")
+
+    result = ask_for_json(
+        driver,
+        content_prompt,
+        json_schema,
+        ai_cleanup,
+        model_name,
+        opts,
+        output_format=output_format,
+        json_mode=json_mode,
+        system_prompt=system_prompt,
+    )
+    logger.debug("[manual] Manual extraction completed successfully")
+
     return result

+
 def extract_with_model(
-    model_cls: Union[
-    text: Union[str,
-    model_name: Union[str,
+    model_cls: Union[type[BaseModel], str],  # Can be model class or model name string for legacy support
+    text: Union[str, dict[str, Any]],  # Can be text or schema for legacy support
+    model_name: Union[str, dict[str, Any]],  # Can be model name or text for legacy support
     instruction_template: str = "Extract information from the following text:",
     ai_cleanup: bool = True,
-
-
-
+    output_format: Literal["json", "toon"] = "json",
+    options: dict[str, Any] | None = None,
+    cache: bool | None = None,
+    json_mode: Literal["auto", "on", "off"] = "auto",
+    system_prompt: str | None = None,
+    images: list[ImageInput] | None = None,
+) -> dict[str, Any]:
     """Extracts structured information into a Pydantic model instance.

     Converts the Pydantic model to its JSON schema and uses auto-resolved driver based on model_name
@@ -374,8 +624,10 @@ def extract_with_model(
         model_name: Model identifier in format "provider/model" (e.g., "openai/gpt-4-turbo-preview").
         instruction_template: Instructional text to prepend to the content.
         ai_cleanup: Whether to attempt AI-based cleanup if JSON parsing fails.
+        output_format: Response serialization format ("json" or "toon").
         options: Additional options to pass to the driver.
-
+        cache: Override for response caching. ``True`` forces caching on,
+            ``False`` forces it off, ``None`` defers to the global setting.

     Returns:
         A validated instance of the Pydantic model.
@@ -385,6 +637,8 @@ def extract_with_model(
         ValidationError: If the extracted data doesn't match the model schema.
     """
     # Handle legacy format where first arg is model class
+    if options is None:
+        options = {}
     if isinstance(model_cls, type) and issubclass(model_cls, BaseModel):
         actual_cls = model_cls
         actual_text = text
@@ -397,81 +651,123 @@ def extract_with_model(

     if not isinstance(actual_text, str) or not actual_text.strip():
         raise ValueError("Text input cannot be empty")
-
-    #
-
-
-
-
-
-
+
+    # --- cache lookup ---
+    from .cache import get_cache, make_cache_key
+
+    _cache = get_cache()
+    use_cache = cache if cache is not None else _cache.enabled
+    _force = cache is True
+    cache_key: str | None = None
+    if use_cache:
+        schema_for_key = actual_cls.model_json_schema()
+        cache_key = make_cache_key(
+            prompt=f"{instruction_template} {actual_text}",
+            model_name=actual_model if isinstance(actual_model, str) else "",
+            schema=schema_for_key,
+            options=options,
+            output_format=output_format,
+            pydantic_qualname=actual_cls.__qualname__,
+        )
+        cached = _cache.get(cache_key, force=_force)
+        if cached is not None:
+            cached["usage"]["cache_hit"] = True
+            # Reconstruct Pydantic model instance from cached JSON
+            cached["model"] = actual_cls(**cached["json_object"])
+            return type(
+                "ExtractResult",
+                (dict,),
+                {"__getattr__": lambda self, key: self.get(key), "__call__": lambda self: self["model"]},
+            )(cached)
+
+    logger.info("[extract] Starting extract_with_model")
+    logger.debug(
+        "[extract] model_cls=%s text_length=%d model_name=%s",
+        actual_cls.__name__,
+        len(actual_text),
+        actual_model,
+    )

     schema = actual_cls.model_json_schema()
-
-
-
+    logger.debug("[extract] Generated JSON schema")
+
     result = extract_and_jsonify(
         text=actual_text,
         json_schema=schema,
         model_name=actual_model,
         instruction_template=instruction_template,
         ai_cleanup=ai_cleanup,
-
+        output_format=output_format,
+        options=options,
+        json_mode=json_mode,
+        system_prompt=system_prompt,
+        images=images,
     )
-
-
-
+    logger.debug("[extract] Extraction completed successfully")
+
     # Post-process the extracted JSON object to normalize invalid values
     json_object = result["json_object"]
     schema_properties = schema.get("properties", {})
-
+
     for field_name, field_info in actual_cls.model_fields.items():
         if field_name in json_object and field_name in schema_properties:
-
+            schema_properties[field_name]
             field_def = {
-                "nullable": not schema_properties[field_name].get("type")
-
-
+                "nullable": not schema_properties[field_name].get("type")
+                or "null"
+                in (
+                    schema_properties[field_name].get("anyOf", [])
+                    if isinstance(schema_properties[field_name].get("anyOf"), list)
+                    else []
+                ),
+                "default": field_info.default
+                if hasattr(field_info, "default") and field_info.default is not ...
+                else None,
             }
-
+
             # Normalize the value
-            json_object[field_name] = normalize_field_value(
-
-                field_info.annotation,
-                field_def
-            )
-
+            json_object[field_name] = normalize_field_value(json_object[field_name], field_info.annotation, field_def)
+
     # Create model instance for validation
     model_instance = actual_cls(**json_object)
-
+
     # Return dictionary with all required fields and backwards compatibility
-    result_dict = {
-
-
-
-
-
+    result_dict = {"json_string": result["json_string"], "json_object": result["json_object"], "usage": result["usage"]}
+
+    # --- cache store ---
+    if use_cache and cache_key is not None:
+        cached_copy = {
+            "json_string": result_dict["json_string"],
+            "json_object": result_dict["json_object"],
+            "usage": {**result_dict["usage"], "raw_response": {}},
+        }
+        _cache.set(cache_key, cached_copy, force=_force)
+
     # Add backwards compatibility property
     result_dict["model"] = model_instance
-
+
     # Return value can be used both as a dict and accessed as model directly
-    return type(
-        "
-
-
+    return type(
+        "ExtractResult",
+        (dict,),
+        {"__getattr__": lambda self, key: self.get(key), "__call__": lambda self: self["model"]},
+    )(result_dict)
+

 def stepwise_extract_with_model(
-    model_cls:
+    model_cls: type[BaseModel],
     text: str,
     *,  # Force keyword arguments for remaining params
     model_name: str,
     instruction_template: str = "Extract the {field_name} from the following text:",
     ai_cleanup: bool = True,
-    fields:
-    field_definitions:
-    options:
-
-
+    fields: list[str] | None = None,
+    field_definitions: dict[str, Any] | None = None,
+    options: dict[str, Any] | None = None,
+    json_mode: Literal["auto", "on", "off"] = "auto",
+    system_prompt: str | None = None,
+    share_context: bool = False,
+) -> dict[str, Union[str, dict[str, Any]]]:
     """Extracts structured information into a Pydantic model by processing each field individually.

     For each field in the model, makes a separate LLM call to extract that specific field,
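
A sketch of the `ExtractResult` wrapper that `extract_with_model` now returns on both the cached and uncached paths: a `dict` subclass whose attributes proxy keys and which yields the validated Pydantic instance when called (model id illustrative):

```python
from pydantic import BaseModel

class Person(BaseModel):
    name: str
    age: int

res = extract_with_model(Person, "Ana is 31.", model_name="openai/gpt-4o-mini")

res["json_object"]  # plain dict access
res.usage           # attribute access via the generated __getattr__
person = res()      # calling the result returns res["model"], the Person instance
```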
@@ -489,7 +785,6 @@ def stepwise_extract_with_model(
         field_definitions: Optional field definitions dict for enhanced default handling.
             If None, automatically uses the global field registry.
         options: Additional options to pass to the driver.
-        verbose_level: Logging level for debug output (LogLevel.OFF by default).

     Returns:
         A dictionary containing:
@@ -500,7 +795,7 @@ def stepwise_extract_with_model(
     Raises:
         ValueError: If text is empty or None, or if model_name format is invalid.
         KeyError: If a requested field doesn't exist in the model.
-
+
     Note:
         This function now gracefully handles extraction failures by falling back to default
         values rather than failing completely. Individual field errors are logged and
@@ -508,25 +803,40 @@ def stepwise_extract_with_model(
     """
     if not text or not text.strip():
         raise ValueError("Text input cannot be empty")
-
-
-
-
-
-
-
+
+    # When share_context=True, delegate to Conversation-based extraction
+    if share_context:
+        from .conversation import Conversation
+
+        conv = Conversation(model_name=model_name, system_prompt=system_prompt, options=options)
+        return conv._stepwise_extract(
+            model_cls=model_cls,
+            text=text,
+            instruction_template=instruction_template,
+            ai_cleanup=ai_cleanup,
+            fields=fields,
+            field_definitions=field_definitions,
+            json_mode=json_mode,
+        )
+
+    logger.info("[stepwise] Starting stepwise extraction")
+    logger.debug(
+        "[stepwise] model_cls=%s text_length=%d fields=%s",
+        model_cls.__name__,
+        len(text),
+        fields,
+    )

     # Auto-use global field registry if no field_definitions provided
     if field_definitions is None:
         field_definitions = get_registry_snapshot()
-
-        log_debug(LogLevel.TRACE, verbose_level, {"registry_fields": list(field_definitions.keys())}, prefix="[stepwise]")
+        logger.debug("[stepwise] Using global field registry")

     data = {}
     validation_errors = []
     field_results = {}  # Track success/failure per field
     options = options or {}
-
+
     # Initialize usage accumulator
     accumulated_usage = {
         "prompt_tokens": 0,
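
A sketch of the new `share_context` switch, with `Person` as in the previous sketch. Instead of one independent `extract_and_jsonify` call per field, extraction is delegated to a single `Conversation`, so each field's prompt sees the context accumulated so far:

```python
res = stepwise_extract_with_model(
    Person,
    "Ana is 31 and lives in Porto.",
    model_name="openai/gpt-4o-mini",  # illustrative
    share_context=True,               # delegates to Conversation._stepwise_extract
)
```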
@@ -534,7 +844,7 @@ def stepwise_extract_with_model(
         "total_tokens": 0,
         "cost": 0.0,
         "model_name": model_name,  # Use provided model_name directly
-        "field_usages": {}
+        "field_usages": {},
     }

     # Get valid field names from the model
@@ -550,28 +860,16 @@ def stepwise_extract_with_model(
         field_items = model_cls.model_fields.items()

     for field_name, field_info in field_items:
-
-        log_debug(LogLevel.DEBUG, verbose_level, f"Extracting field: {field_name}", prefix="[stepwise]")
-        log_debug(LogLevel.TRACE, verbose_level, {
-            "field_name": field_name,
-            "field_info": str(field_info),
-            "field_type": str(field_info.annotation)
-        }, prefix="[stepwise]")
+        logger.debug("[stepwise] Extracting field: %s", field_name)

         # Create field schema that expects a direct value rather than a dict
         field_schema = {
             "value": {
-                "type": "integer" if field_info.annotation
-                "description": field_info.description or f"Value for {field_name}"
+                "type": "integer" if field_info.annotation is int else "string",
+                "description": field_info.description or f"Value for {field_name}",
             }
         }

-        # Add structured logging for field schema and prompt
-        log_debug(LogLevel.TRACE, verbose_level, {
-            "field_schema": field_schema,
-            "prompt_template": instruction_template.format(field_name=field_name)
-        }, prefix="[stepwise]")
-
         try:
             result = extract_and_jsonify(
                 text=text,
@@ -579,12 +877,12 @@ def stepwise_extract_with_model(
                 model_name=model_name,
                 instruction_template=instruction_template.format(field_name=field_name),
                 ai_cleanup=ai_cleanup,
-                options=options
+                options=options,
+                json_mode=json_mode,
+                system_prompt=system_prompt,
             )

-
-            log_debug(LogLevel.DEBUG, verbose_level, f"Raw extraction result for {field_name}", prefix="[stepwise]")
-            log_debug(LogLevel.TRACE, verbose_level, {"result": result}, prefix="[stepwise]")
+            logger.debug("[stepwise] Raw extraction result for %s", field_name)

             # Accumulate usage data from this field extraction
             field_usage = result.get("usage", {})
@@ -596,139 +894,125 @@ def stepwise_extract_with_model(

             # Extract the raw value from the response - handle both dict and direct value formats
             extracted_value = result["json_object"]["value"]
-
-
-
+            logger.debug("[stepwise] Raw extracted value for %s: %s", field_name, extracted_value)
+
             if isinstance(extracted_value, dict) and "value" in extracted_value:
                 raw_value = extracted_value["value"]
-
+                logger.debug("[stepwise] Extracted inner value from dict for %s", field_name)
             else:
                 raw_value = extracted_value
-
-
-            log_debug(LogLevel.DEBUG, verbose_level, {"field_name": field_name, "raw_value": raw_value}, prefix="[stepwise]")
+                logger.debug("[stepwise] Using direct value for %s", field_name)

             # Post-process the raw value to normalize invalid values for non-nullable fields
             field_def = {}
             if field_definitions and field_name in field_definitions:
                 field_def = field_definitions[field_name] if isinstance(field_definitions[field_name], dict) else {}
-
+
             # Determine nullable status and default value
             nullable = field_def.get("nullable", True)
             default_value = field_def.get("default")
-            if
-
-
-
+            if (
+                default_value is None
+                and hasattr(field_info, "default")
+                and field_info.default is not ...
+                and str(field_info.default) != "PydanticUndefined"
+            ):
+                default_value = field_info.default
+
             # Create field_def for normalize_field_value
-            normalize_def = {
-
-                "default": default_value
-            }
-
+            normalize_def = {"nullable": nullable, "default": default_value}
+
             # Normalize the raw value before conversion
             raw_value = normalize_field_value(raw_value, field_info.annotation, normalize_def)
-
+            logger.debug("[stepwise] Normalized value for %s: %s", field_name, raw_value)

             # Convert value using tools.convert_value with logging
             try:
-                converted_value = convert_value(
-                    raw_value,
-                    field_info.annotation,
-                    allow_shorthand=True
-                )
+                converted_value = convert_value(raw_value, field_info.annotation, allow_shorthand=True)
                 data[field_name] = converted_value
                 field_results[field_name] = {"status": "success", "used_default": False}

-
-
-                log_debug(LogLevel.TRACE, verbose_level, {
-                    "field_name": field_name,
-                    "converted_value": converted_value
-                }, prefix="[stepwise]")
-
+                logger.debug("[stepwise] Successfully converted %s", field_name)
+
             except ValueError as e:
-                error_msg = f"Type conversion failed for {field_name}: {
-
+                error_msg = f"Type conversion failed for {field_name}: {e!s}"
+
                 # Check if field has a default value (either explicit or from field_definitions)
                 has_default = False
                 if field_definitions and field_name in field_definitions:
                     field_def = field_definitions[field_name]
-                    if isinstance(field_def, dict) and
+                    if isinstance(field_def, dict) and "default" in field_def:
                         has_default = True
-
-                if not has_default and hasattr(field_info,
+
+                if not has_default and hasattr(field_info, "default"):
                     default_val = field_info.default
                     # Field has default if it's not PydanticUndefined or Ellipsis
-                    if default_val is not ... and str(default_val) !=
+                    if default_val is not ... and str(default_val) != "PydanticUndefined":
                         has_default = True
-
+
                 # Only add to validation_errors if field is required (no default)
                 if not has_default:
                     validation_errors.append(error_msg)
-
+
                 # Use default value (type-appropriate if no explicit default)
                 default_value = get_field_default(field_name, field_info, field_definitions)
                 data[field_name] = default_value
                 field_results[field_name] = {"status": "conversion_failed", "error": error_msg, "used_default": True}
-
-
-
-
-
+
+                logger.error("[stepwise] %s", error_msg)
+                logger.info("[stepwise] Using default value for %s: %s", field_name, default_value)
+
             except Exception as e:
-                error_msg = f"Extraction failed for {field_name}: {
-
+                error_msg = f"Extraction failed for {field_name}: {e!s}"
+
                 # Check if field has a default value (either explicit or from field_definitions)
                 has_default = False
                 if field_definitions and field_name in field_definitions:
                     field_def = field_definitions[field_name]
-                    if isinstance(field_def, dict) and
+                    if isinstance(field_def, dict) and "default" in field_def:
                         has_default = True
-
-                if not has_default and hasattr(field_info,
+
+                if not has_default and hasattr(field_info, "default"):
                     default_val = field_info.default
                     # Field has default if it's not PydanticUndefined or Ellipsis
-                    if default_val is not ... and str(default_val) !=
+                    if default_val is not ... and str(default_val) != "PydanticUndefined":
                         has_default = True
-
+
                 # Only add to validation_errors if field is required (no default)
                 if not has_default:
                     validation_errors.append(error_msg)
-
+
                 # Use default value (type-appropriate if no explicit default)
                 default_value = get_field_default(field_name, field_info, field_definitions)
                 data[field_name] = default_value
                 field_results[field_name] = {"status": "extraction_failed", "error": error_msg, "used_default": True}
-
-
-
-
-
+
+                logger.error("[stepwise] %s", error_msg)
+                logger.info("[stepwise] Using default value for %s: %s", field_name, default_value)
+
                 # Store error details in field_usages
                 accumulated_usage["field_usages"][field_name] = {
                     "error": str(e),
                     "status": "failed",
                     "used_default": True,
-                    "default_value": default_value
+                    "default_value": default_value,
                 }
-
-    # Add structured logging for validation errors
+
     if validation_errors:
-
+        logger.warning("[stepwise] Found %d validation errors", len(validation_errors))
         for error in validation_errors:
-
-
+            logger.error("[stepwise] %s", error)
+
     # If there are validation errors, include them in the result
     if validation_errors:
         accumulated_usage["validation_errors"] = validation_errors
-
+
     try:
         # Create model instance with collected data
         # Create model instance with collected data
         model_instance = model_cls(**data)
         model_dict = model_instance.model_dump()
-
+
         # Enhanced DateTimeEncoder to handle both datetime and date objects
         class ExtendedJSONEncoder(json.JSONEncoder):
             def default(self, obj):
@@ -737,14 +1021,14 @@ def stepwise_extract_with_model(
                 if isinstance(obj, Decimal):
                     return str(obj)
                 return super().default(obj)
-
+
         # Use enhanced encoder for JSON serialization
         json_string = json.dumps(model_dict, cls=ExtendedJSONEncoder)

         # Also modify return value to use ExtendedJSONEncoder
-        if
-            result[
-
+        if "json_string" in result:
+            result["json_string"] = json.dumps(result["json_object"], cls=ExtendedJSONEncoder)
+
         # Define ExtendedJSONEncoder for handling special types
         class ExtendedJSONEncoder(json.JSONEncoder):
             def default(self, obj):
@@ -753,10 +1037,10 @@ def stepwise_extract_with_model(
                 if isinstance(obj, Decimal):
                     return str(obj)
                 return super().default(obj)
-
+
         # Create json string with custom encoder
         json_string = json.dumps(model_dict, cls=ExtendedJSONEncoder)
-
+
         # Create result matching extract_with_model format
         result = {
             "json_string": json_string,
@@ -764,32 +1048,379 @@ def stepwise_extract_with_model(
|
|
|
764
1048
|
"usage": accumulated_usage,
|
|
765
1049
|
"field_results": field_results,
|
|
766
1050
|
}
|
|
767
|
-
|
|
1051
|
+
|
|
768
1052
|
# Add model instance as property and make callable
|
|
769
1053
|
result["model"] = model_instance
|
|
770
|
-
return type(
|
|
771
|
-
"
|
|
772
|
-
|
|
773
|
-
|
|
1054
|
+
return type(
|
|
1055
|
+
"ExtractResult",
|
|
1056
|
+
(dict,),
|
|
1057
|
+
{"__getattr__": lambda self, key: self.get(key), "__call__": lambda self: self["model"]},
|
|
1058
|
+
)(result)
|
|
774
1059
|
except Exception as e:
|
|
775
|
-
error_msg = f"Model validation error: {
|
|
1060
|
+
error_msg = f"Model validation error: {e!s}"
|
|
776
1061
|
# Add validation error to accumulated usage
|
|
777
1062
|
if "validation_errors" not in accumulated_usage:
|
|
778
1063
|
accumulated_usage["validation_errors"] = []
|
|
779
1064
|
accumulated_usage["validation_errors"].append(error_msg)
|
|
780
|
-
|
|
781
|
-
|
|
782
|
-
|
|
783
|
-
|
|
1065
|
+
|
|
1066
|
+
logger.error("[stepwise] %s", error_msg)
|
|
1067
|
+
|
|
784
1068
|
# Create error result with partial data
|
|
785
1069
|
error_result = {
|
|
786
1070
|
"json_string": "{}",
|
|
787
1071
|
"json_object": {},
|
|
788
1072
|
"usage": accumulated_usage,
|
|
789
1073
|
"field_results": field_results,
|
|
790
|
-
"error": error_msg
|
|
1074
|
+
"error": error_msg,
|
|
791
1075
|
}
|
|
792
|
-
return type(
|
|
793
|
-
"
|
|
794
|
-
|
|
795
|
-
|
|
1076
|
+
return type(
|
|
1077
|
+
"ExtractResult",
|
|
1078
|
+
(dict,),
|
|
1079
|
+
{
|
|
1080
|
+
"__getattr__": lambda self, key: self.get(key),
|
|
1081
|
+
"__call__": lambda self: None, # Return None when called if validation failed
|
|
1082
|
+
},
|
|
1083
|
+
)(error_result)
|
|
1084
|
+
|
|
1085
|
+
|
|
+def _json_to_toon(data: Union[list[dict[str, Any]], dict[str, Any]], data_key: str | None = None) -> str:
+    """Convert JSON array or dict containing array to TOON format.
+
+    Args:
+        data: List of dicts (uniform array) or dict containing array under a key
+        data_key: If data is a dict, the key containing the array
+
+    Returns:
+        TOON formatted string
+
+    Raises:
+        ValueError: If TOON conversion fails or data format is invalid
+        RuntimeError: If python-toon is not installed
+    """
+    if toon is None:
+        raise RuntimeError(
+            "TOON conversion requested but 'python-toon' is not installed. Install it with 'pip install python-toon'."
+        )
+
+    # Handle different data formats
+    if isinstance(data, list):
+        array_data = data
+    elif isinstance(data, dict):
+        if data_key:
+            if data_key not in data:
+                raise ValueError(f"Key '{data_key}' not found in data")
+            array_data = data[data_key]
+        else:
+            # Try to find the first array value in the dict
+            array_data = None
+            for _key, value in data.items():
+                if isinstance(value, list) and value:
+                    array_data = value
+                    break
+            if array_data is None:
+                raise ValueError("No array found in data. Specify data_key or provide a list directly.")
+    else:
+        raise ValueError("Data must be a list of dicts or a dict containing an array")
+
+    if not isinstance(array_data, list):
+        raise ValueError("Array data must be a list")
+
+    if not array_data:
+        raise ValueError("Array data cannot be empty")
+
+    # Validate that all items in array are dicts (uniform structure)
+    if not all(isinstance(item, dict) for item in array_data):
+        raise ValueError("All items in array must be dictionaries for TOON conversion")
+
+    try:
+        return toon.encode(array_data)
+    except Exception as e:
+        raise ValueError(f"Failed to convert data to TOON format: {e}") from e
+
+
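For orientation, the heart of `_json_to_toon` is the `toon.encode` call; a hedged sketch assuming python-toon is installed and importable at module level as `toon` (the output shown is illustrative, since exact TOON formatting depends on the library version):

    import toon  # provided by the optional python-toon dependency

    rows = [
        {"id": 1, "name": "Laptop", "price": 999.99},
        {"id": 2, "name": "Book", "price": 19.99},
    ]
    print(toon.encode(rows))
    # Illustrative output: a header with row count and field names,
    # followed by one compact CSV-like line per record, e.g.
    # [2]{id,name,price}:
    #   1,Laptop,999.99
    #   2,Book,19.99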
+def _dataframe_to_toon(df) -> str:
+    """Convert Pandas DataFrame to TOON format.
+
+    Args:
+        df: Pandas DataFrame to convert
+
+    Returns:
+        TOON formatted string
+
+    Raises:
+        ValueError: If DataFrame conversion fails
+        RuntimeError: If pandas or python-toon is not installed
+    """
+    try:
+        import pandas as pd
+    except ImportError:
+        raise RuntimeError(
+            "Pandas DataFrame conversion requested but 'pandas' is not installed. "
+            "Install it with 'pip install pandas' or 'pip install prompture[pandas]'."
+        ) from None
+
+    if toon is None:
+        raise RuntimeError(
+            "TOON conversion requested but 'python-toon' is not installed. Install it with 'pip install python-toon'."
+        )
+
+    dataframe_type = getattr(pd, "DataFrame", None)
+    if isinstance(dataframe_type, type):
+        if not isinstance(df, dataframe_type):
+            raise ValueError("Input must be a pandas DataFrame")
+    else:
+        # Duck-type fallback for tests that provide a lightweight mock
+        if not hasattr(df, "to_dict") or not hasattr(df, "empty"):
+            raise ValueError("Input must be a pandas DataFrame")
+
+    if df.empty:
+        raise ValueError("DataFrame cannot be empty")
+
+    try:
+        # Convert DataFrame to list of dicts
+        data = df.to_dict("records")
+        return toon.encode(data)
+    except Exception as e:
+        raise ValueError(f"Failed to convert DataFrame to TOON format: {e}") from e
+
+
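The DataFrame path is the same encoder behind a `to_dict("records")` flattening step; a small sketch of that intermediate representation:

    import pandas as pd

    df = pd.DataFrame([
        {"id": 1, "name": "Laptop"},
        {"id": 2, "name": "Book"},
    ])
    records = df.to_dict("records")
    # records == [{'id': 1, 'name': 'Laptop'}, {'id': 2, 'name': 'Book'}]
    # _dataframe_to_toon passes this list to toon.encode(), exactly the
    # shape _json_to_toon accepts for plain Python data.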
+def _calculate_token_savings(json_text: str, toon_text: str) -> dict[str, Any]:
+    """Calculate estimated token savings between JSON and TOON formats.
+
+    This is a rough estimation based on character count ratios.
+    Actual token counts may vary by model and tokenizer.
+
+    Args:
+        json_text: JSON formatted text
+        toon_text: TOON formatted text
+
+    Returns:
+        Dict containing savings statistics
+    """
+    json_chars = len(json_text)
+    toon_chars = len(toon_text)
+
+    # Rough estimation: 4 characters ≈ 1 token (varies by model)
+    json_tokens_est = json_chars // 4
+    toon_tokens_est = toon_chars // 4
+
+    savings_chars = json_chars - toon_chars
+    savings_tokens_est = json_tokens_est - toon_tokens_est
+
+    percentage_saved = (savings_chars / json_chars * 100) if json_chars > 0 else 0
+
+    return {
+        "json_characters": json_chars,
+        "toon_characters": toon_chars,
+        "saved_characters": savings_chars,
+        "estimated_json_tokens": json_tokens_est,
+        "estimated_toon_tokens": toon_tokens_est,
+        "estimated_saved_tokens": savings_tokens_est,
+        "percentage_saved": round(percentage_saved, 1),
+    }
+
+
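The savings figures are plain integer arithmetic over character counts; a worked example under the function's own 4-characters-per-token heuristic:

    # Suppose the JSON rendering is 400 characters and the TOON rendering 240.
    json_tokens_est = 400 // 4         # 100
    toon_tokens_est = 240 // 4         # 60
    saved_characters = 400 - 240       # 160
    estimated_saved_tokens = 100 - 60  # 40
    percentage_saved = round(160 / 400 * 100, 1)  # 40.0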
+def extract_from_data(
+    data: Union[list[dict[str, Any]], dict[str, Any]],
+    question: str,
+    json_schema: dict[str, Any],
+    *,
+    model_name: str,
+    data_key: str | None = None,
+    instruction_template: str = "Analyze the following data and answer: {question}",
+    ai_cleanup: bool = True,
+    options: dict[str, Any] | None = None,
+    system_prompt: str | None = None,
+) -> dict[str, Any]:
+    """Extract information from structured data by converting to TOON format for token efficiency.
+
+    This function takes JSON array data, converts it to TOON format to reduce tokens,
+    sends it to the LLM with a question, and returns the JSON response.
+
+    Args:
+        data: List of dicts (uniform array) or dict containing array under a key
+        question: The question to ask about the data
+        json_schema: Expected JSON schema for the response
+        model_name: Model identifier in format "provider/model" (e.g., "openai/gpt-4")
+        data_key: If data is a dict, the key containing the array (e.g., "products")
+        instruction_template: Template with {question} placeholder
+        ai_cleanup: Whether to attempt AI-based cleanup if JSON parsing fails
+        options: Additional options to pass to the driver
+
+    Returns:
+        Dict containing:
+        - json_object: The parsed JSON response
+        - json_string: The JSON string response
+        - usage: Token usage and cost information (includes token_savings)
+        - toon_data: The TOON formatted input data
+        - token_savings: Statistics about token savings vs JSON input
+
+    Raises:
+        ValueError: If data format is invalid or conversion fails
+        RuntimeError: If required dependencies are missing
+
+    Example:
+        >>> products = [
+        ...     {"id": 1, "name": "Laptop", "price": 999.99, "category": "electronics"},
+        ...     {"id": 2, "name": "Book", "price": 19.99, "category": "books"}
+        ... ]
+        >>> schema = {
+        ...     "type": "object",
+        ...     "properties": {
+        ...         "average_price": {"type": "number"},
+        ...         "total_items": {"type": "integer"}
+        ...     }
+        ... }
+        >>> result = extract_from_data(
+        ...     data=products,
+        ...     question="What is the average price and total number of items?",
+        ...     json_schema=schema,
+        ...     model_name="openai/gpt-4"
+        ... )
+        >>> print(result["json_object"])
+        {'average_price': 509.99, 'total_items': 2}
+    """
+    if not question or not question.strip():
+        raise ValueError("Question cannot be empty")
+
+    if not json_schema:
+        raise ValueError("JSON schema cannot be empty")
+
+    if options is None:
+        options = {}
+
+    # Convert data to TOON format
+    toon_data = _json_to_toon(data, data_key)
+
+    # Calculate token savings (for comparison with JSON)
+    json_data = json.dumps(data if isinstance(data, list) else data.get(data_key, data), indent=2)
+    token_savings = _calculate_token_savings(json_data, toon_data)
+
+    # Build the prompt with TOON data
+    content_prompt = instruction_template.format(question=question)
+    full_prompt = f"{content_prompt}\n\nData (in TOON format):\n{toon_data}"
+
+    # Call the LLM
+    result = ask_for_json(
+        driver=get_driver_for_model(model_name),
+        content_prompt=full_prompt,
+        json_schema=json_schema,
+        ai_cleanup=ai_cleanup,
+        model_name=model_name.split("/")[-1] if "/" in model_name else model_name,
+        options=options,
+        output_format="json",  # Always return JSON, not TOON
+        system_prompt=system_prompt,
+    )
+
+    # Add our additional data to the result
+    result["toon_data"] = toon_data
+    result["token_savings"] = token_savings
+
+    return result
+
+
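When the array sits under a key in a larger payload, `data_key` selects it before conversion; a hedged usage sketch (the model id is a placeholder, and the savings value is illustrative):

    payload = {"products": [
        {"id": 1, "name": "Laptop", "price": 999.99},
        {"id": 2, "name": "Book", "price": 19.99},
    ]}
    result = extract_from_data(
        data=payload,
        data_key="products",  # omit it and the first non-empty list value is used
        question="How many items cost less than 100?",
        json_schema={"type": "object", "properties": {"count": {"type": "integer"}}},
        model_name="openai/gpt-4",
    )
    result["token_savings"]["percentage_saved"]  # e.g. 40.0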
+def extract_from_pandas(
+    df,  # pandas.DataFrame - optional import
+    question: str,
+    json_schema: dict[str, Any],
+    *,
+    model_name: str,
+    instruction_template: str = "Analyze the following data and answer: {question}",
+    ai_cleanup: bool = True,
+    options: dict[str, Any] | None = None,
+    system_prompt: str | None = None,
+) -> dict[str, Any]:
+    """Extract information from Pandas DataFrame by converting to TOON format for token efficiency.
+
+    This function takes a Pandas DataFrame, converts it to TOON format to reduce tokens,
+    sends it to the LLM with a question, and returns the JSON response.
+
+    Args:
+        df: Pandas DataFrame to analyze
+        question: The question to ask about the data
+        json_schema: Expected JSON schema for the response
+        model_name: Model identifier in format "provider/model" (e.g., "openai/gpt-4")
+        instruction_template: Template with {question} placeholder
+        ai_cleanup: Whether to attempt AI-based cleanup if JSON parsing fails
+        options: Additional options to pass to the driver
+
+    Returns:
+        Dict containing:
+        - json_object: The parsed JSON response
+        - json_string: The JSON string response
+        - usage: Token usage and cost information (includes token_savings)
+        - toon_data: The TOON formatted input data
+        - token_savings: Statistics about token savings vs JSON input
+        - dataframe_info: Basic info about the original DataFrame
+
+    Raises:
+        ValueError: If DataFrame is invalid or conversion fails
+        RuntimeError: If required dependencies are missing
+
+    Example:
+        >>> import pandas as pd
+        >>> df = pd.DataFrame([
+        ...     {"id": 1, "name": "Laptop", "price": 999.99, "category": "electronics"},
+        ...     {"id": 2, "name": "Book", "price": 19.99, "category": "books"}
+        ... ])
+        >>> schema = {
+        ...     "type": "object",
+        ...     "properties": {
+        ...         "highest_priced_item": {"type": "string"},
+        ...         "price_range": {"type": "number"}
+        ...     }
+        ... }
+        >>> result = extract_from_pandas(
+        ...     df=df,
+        ...     question="What is the highest priced item and price range?",
+        ...     json_schema=schema,
+        ...     model_name="openai/gpt-4"
+        ... )
+        >>> print(result["json_object"])
+        {'highest_priced_item': 'Laptop', 'price_range': 980.0}
+    """
+    if not question or not question.strip():
+        raise ValueError("Question cannot be empty")
+
+    if not json_schema:
+        raise ValueError("JSON schema cannot be empty")
+
+    if options is None:
+        options = {}
+
+    # Convert DataFrame to TOON format
+    toon_data = _dataframe_to_toon(df)
+
+    # Calculate token savings (for comparison with JSON)
+    json_data = df.to_json(indent=2, orient="records")
+    token_savings = _calculate_token_savings(json_data, toon_data)
+
+    # Get basic DataFrame info
+    dataframe_info = {
+        "shape": df.shape,
+        "columns": list(df.columns),
+        "dtypes": {col: str(dtype) for col, dtype in df.dtypes.items()},
+    }
+
+    # Build the prompt with TOON data
+    content_prompt = instruction_template.format(question=question)
+    full_prompt = f"{content_prompt}\n\nData (in TOON format):\n{toon_data}"
+
+    # Call the LLM
+    result = ask_for_json(
+        driver=get_driver_for_model(model_name),
+        content_prompt=full_prompt,
+        json_schema=json_schema,
+        ai_cleanup=ai_cleanup,
+        model_name=model_name.split("/")[-1] if "/" in model_name else model_name,
+        options=options,
+        output_format="json",  # Always return JSON, not TOON
+        system_prompt=system_prompt,
+    )
+
+    # Add our additional data to the result
+    result["toon_data"] = toon_data
+    result["token_savings"] = token_savings
+    result["dataframe_info"] = dataframe_info
+
+    return result
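Beyond the usual keys, the pandas variant also returns `dataframe_info`; a sketch of what the extra fields hold for the docstring's two-row example (values illustrative, and pandas dtypes may differ by platform):

    result["dataframe_info"]
    # {'shape': (2, 4),
    #  'columns': ['id', 'name', 'price', 'category'],
    #  'dtypes': {'id': 'int64', 'name': 'object',
    #             'price': 'float64', 'category': 'object'}}
    result["token_savings"]["estimated_saved_tokens"]  # int, from _calculate_token_savings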