prompture 0.0.29.dev8__py3-none-any.whl → 0.0.35__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
- prompture/__init__.py +146 -23
- prompture/_version.py +34 -0
- prompture/aio/__init__.py +74 -0
- prompture/async_conversation.py +607 -0
- prompture/async_core.py +803 -0
- prompture/async_driver.py +169 -0
- prompture/cache.py +469 -0
- prompture/callbacks.py +55 -0
- prompture/cli.py +63 -4
- prompture/conversation.py +631 -0
- prompture/core.py +876 -263
- prompture/cost_mixin.py +51 -0
- prompture/discovery.py +164 -0
- prompture/driver.py +168 -5
- prompture/drivers/__init__.py +173 -69
- prompture/drivers/airllm_driver.py +109 -0
- prompture/drivers/async_airllm_driver.py +26 -0
- prompture/drivers/async_azure_driver.py +117 -0
- prompture/drivers/async_claude_driver.py +107 -0
- prompture/drivers/async_google_driver.py +132 -0
- prompture/drivers/async_grok_driver.py +91 -0
- prompture/drivers/async_groq_driver.py +84 -0
- prompture/drivers/async_hugging_driver.py +61 -0
- prompture/drivers/async_lmstudio_driver.py +79 -0
- prompture/drivers/async_local_http_driver.py +44 -0
- prompture/drivers/async_ollama_driver.py +125 -0
- prompture/drivers/async_openai_driver.py +96 -0
- prompture/drivers/async_openrouter_driver.py +96 -0
- prompture/drivers/async_registry.py +129 -0
- prompture/drivers/azure_driver.py +36 -9
- prompture/drivers/claude_driver.py +251 -34
- prompture/drivers/google_driver.py +107 -38
- prompture/drivers/grok_driver.py +29 -32
- prompture/drivers/groq_driver.py +27 -26
- prompture/drivers/hugging_driver.py +6 -6
- prompture/drivers/lmstudio_driver.py +26 -13
- prompture/drivers/local_http_driver.py +6 -6
- prompture/drivers/ollama_driver.py +157 -23
- prompture/drivers/openai_driver.py +178 -9
- prompture/drivers/openrouter_driver.py +31 -25
- prompture/drivers/registry.py +306 -0
- prompture/field_definitions.py +106 -96
- prompture/logging.py +80 -0
- prompture/model_rates.py +217 -0
- prompture/runner.py +49 -47
- prompture/scaffold/__init__.py +1 -0
- prompture/scaffold/generator.py +84 -0
- prompture/scaffold/templates/Dockerfile.j2 +12 -0
- prompture/scaffold/templates/README.md.j2 +41 -0
- prompture/scaffold/templates/config.py.j2 +21 -0
- prompture/scaffold/templates/env.example.j2 +8 -0
- prompture/scaffold/templates/main.py.j2 +86 -0
- prompture/scaffold/templates/models.py.j2 +40 -0
- prompture/scaffold/templates/requirements.txt.j2 +5 -0
- prompture/server.py +183 -0
- prompture/session.py +117 -0
- prompture/settings.py +18 -1
- prompture/tools.py +219 -267
- prompture/tools_schema.py +254 -0
- prompture/validator.py +3 -3
- {prompture-0.0.29.dev8.dist-info → prompture-0.0.35.dist-info}/METADATA +117 -21
- prompture-0.0.35.dist-info/RECORD +66 -0
- {prompture-0.0.29.dev8.dist-info → prompture-0.0.35.dist-info}/WHEEL +1 -1
- prompture-0.0.29.dev8.dist-info/RECORD +0 -27
- {prompture-0.0.29.dev8.dist-info → prompture-0.0.35.dist-info}/entry_points.txt +0 -0
- {prompture-0.0.29.dev8.dist-info → prompture-0.0.35.dist-info}/licenses/LICENSE +0 -0
- {prompture-0.0.29.dev8.dist-info → prompture-0.0.35.dist-info}/top_level.txt +0 -0
prompture/core.py
CHANGED

```diff
@@ -1,46 +1,58 @@
-"""Core utilities: Helpers for requesting JSON from LLM.
-
+"""Core utilities: Helpers for requesting JSON from LLM."""
+
 from __future__ import annotations
+
 import json
-import
-import requests
+import logging
 import sys
-import
-from datetime import datetime, date
+from datetime import date, datetime
 from decimal import Decimal
-from typing import Any,
+from typing import Any, Literal, Union
+
+import requests
+
+try:
+    import toon
+except ImportError:
+    toon = None
 
-from pydantic import BaseModel
+from pydantic import BaseModel
 
-from .drivers import get_driver, get_driver_for_model
 from .driver import Driver
-from .
+from .drivers import get_driver_for_model
 from .field_definitions import get_registry_snapshot
+from .tools import (
+    clean_json_text,
+    convert_value,
+    get_field_default,
+)
 
+logger = logging.getLogger("prompture.core")
 
-
+
+def normalize_field_value(value: Any, field_type: type, field_def: dict[str, Any]) -> Any:
     """Normalize invalid values for fields based on their type and nullable status.
-
+
     This function handles post-processing of extracted values BEFORE Pydantic validation,
     converting invalid values (like empty strings for booleans) to proper defaults.
-
+
     Args:
         value: The extracted value from the LLM
         field_type: The expected Python type for this field
         field_def: The field definition dict containing nullable, default, etc.
-
+
     Returns:
         A normalized value suitable for the field type
     """
     nullable = field_def.get("nullable", True)
     default_value = field_def.get("default")
-
+
     # Special handling for boolean fields
-    if field_type is bool or (hasattr(field_type,
+    if field_type is bool or (hasattr(field_type, "__origin__") and field_type.__origin__ is bool):
         # If value is already a boolean, return it as-is
         if isinstance(value, bool):
             return value
-
+
         # For non-nullable booleans
         if not nullable:
            # Any non-empty string should be True, empty/None should be default
@@ -57,37 +69,39 @@ def normalize_field_value(value: Any, field_type: Type, field_def: Dict[str, Any
             if isinstance(value, str):
                 return bool(value.strip()) if value.strip() else None
             return bool(value) if value else None
-
+
     # If the field is nullable and value is None, that's acceptable
     if nullable and value is None:
         return value
-
+
     # For non-nullable fields with invalid values, use the default
     if not nullable:
         # Check for invalid values that should be replaced
         invalid_values = (None, "", [], {})
-
+
         if value in invalid_values or (isinstance(value, str) and not value.strip()):
             # Use the default value if provided, otherwise use type-appropriate default
             if default_value is not None:
                 return default_value
-
+
             # Type-specific defaults for non-nullable fields
-            if field_type is int or (hasattr(field_type,
+            if field_type is int or (hasattr(field_type, "__origin__") and field_type.__origin__ is int):
                 return 0
-            elif field_type is float or (hasattr(field_type,
+            elif field_type is float or (hasattr(field_type, "__origin__") and field_type.__origin__ is float):
                 return 0.0
-            elif field_type is str or (hasattr(field_type,
+            elif field_type is str or (hasattr(field_type, "__origin__") and field_type.__origin__ is str):
                 return ""
-            elif field_type is list or (hasattr(field_type,
+            elif field_type is list or (hasattr(field_type, "__origin__") and field_type.__origin__ is list):
                 return []
-            elif field_type is dict or (hasattr(field_type,
+            elif field_type is dict or (hasattr(field_type, "__origin__") and field_type.__origin__ is dict):
                 return {}
-
+
     return value
 
 
-def clean_json_text_with_ai(
+def clean_json_text_with_ai(
+    driver: Driver, text: str, model_name: str = "", options: dict[str, Any] | None = None
+) -> str:
     """Use LLM to fix malformed JSON strings.
 
     Generates a specialized prompt instructing the LLM to correct the
@@ -102,12 +116,14 @@ def clean_json_text_with_ai(driver: Driver, text: str, model_name: str = "", opt
         A cleaned string that should contain valid JSON.
     """
     # Check if JSON is already valid - if so, return unchanged
+    if options is None:
+        options = {}
     try:
         json.loads(text)
         return text  # Already valid, no need for LLM correction
     except json.JSONDecodeError:
         pass  # Invalid, proceed with LLM correction
-
+
     prompt = (
         "The following text is supposed to be a single JSON object, but it is malformed. "
         "Please correct it and return only the valid JSON object. Do not add any explanations or markdown. "
```
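The normalization rules above are easiest to see on concrete inputs. A minimal sketch, not taken from the package's tests; the return values follow only the branches visible in this diff:

```python
# Non-nullable fields trade invalid values for defaults; nullable ones pass None through.
normalize_field_value(True, bool, {"nullable": False})   # -> True (already a bool, returned as-is)
normalize_field_value(None, str, {"nullable": True})     # -> None (nullable passes through)
normalize_field_value(None, int, {"nullable": False})    # -> 0 (type-appropriate default)
normalize_field_value([], list, {"nullable": False, "default": ["x"]})  # -> ["x"] (explicit default wins)
```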
```diff
@@ -118,26 +134,120 @@ def clean_json_text_with_ai(driver: Driver, text: str, model_name: str = "", opt
     cleaned = clean_json_text(raw)
     return cleaned
 
+
+def render_output(
+    driver: Driver,
+    content_prompt: str,
+    output_format: Literal["text", "html", "markdown"] = "text",
+    model_name: str = "",
+    options: dict[str, Any] | None = None,
+    system_prompt: str | None = None,
+) -> dict[str, Any]:
+    """Sends a prompt to the driver and returns the raw output in the requested format.
+
+    This function is designed for "no fluff" output, instructing the LLM to return
+    only the requested content without conversational filler or markdown fences
+    (unless markdown is requested).
+
+    Args:
+        driver: Adapter that implements generate(prompt, options).
+        content_prompt: Main prompt content.
+        output_format: Desired format ("text", "html", "markdown").
+        model_name: Optional model identifier used in usage metadata.
+        options: Additional options to pass to the driver.
+
+    Returns:
+        A dictionary containing:
+        - text: the raw text output.
+        - usage: token usage and cost information from the driver's meta object.
+        - output_format: the format of the output.
+
+    Raises:
+        ValueError: If an unsupported output format is provided.
+    """
+    if options is None:
+        options = {}
+    if output_format not in ("text", "html", "markdown"):
+        raise ValueError(f"Unsupported output_format '{output_format}'. Use 'text', 'html', or 'markdown'.")
+
+    instruct = ""
+    if output_format == "text":
+        instruct = (
+            "Return ONLY the raw text content. Do not use markdown formatting, "
+            "code fences, or conversational filler. Just the text."
+        )
+    elif output_format == "html":
+        instruct = (
+            "Return ONLY valid HTML code. Do not wrap it in markdown code fences "
+            "(like ```html ... ```). Do not include conversational filler."
+        )
+    elif output_format == "markdown":
+        instruct = "Return valid markdown content. You may use standard markdown formatting."
+
+    full_prompt = f"{content_prompt}\n\nSYSTEM INSTRUCTION: {instruct}"
+
+    # Use generate_messages when system_prompt is provided
+    if system_prompt is not None:
+        messages = [
+            {"role": "system", "content": system_prompt},
+            {"role": "user", "content": full_prompt},
+        ]
+        resp = driver.generate_messages(messages, options)
+    else:
+        resp = driver.generate(full_prompt, options)
+    raw = resp.get("text", "")
+
+    # Clean up potential markdown fences if the model disobeyed for text/html
+    if output_format in ("text", "html"):
+        # Simple cleanup for common fences if they appear despite instructions
+        cleaned = raw.strip()
+        if cleaned.startswith("```") and cleaned.endswith("```"):
+            # Remove first line (fence + optional language) and last line (fence)
+            lines = cleaned.splitlines()
+            if len(lines) >= 2:
+                cleaned = "\n".join(lines[1:-1])
+            raw = cleaned
+
+    usage = {
+        **resp.get("meta", {}),
+        "raw_response": resp,
+        "total_tokens": resp.get("meta", {}).get("total_tokens", 0),
+        "prompt_tokens": resp.get("meta", {}).get("prompt_tokens", 0),
+        "completion_tokens": resp.get("meta", {}).get("completion_tokens", 0),
+        "cost": resp.get("meta", {}).get("cost", 0.0),
+        "model_name": model_name or getattr(driver, "model", ""),
+    }
+
+    return {"text": raw, "usage": usage, "output_format": output_format}
+
+
 def ask_for_json(
     driver: Driver,
     content_prompt: str,
-    json_schema:
+    json_schema: dict[str, Any],
     ai_cleanup: bool = True,
     model_name: str = "",
-    options:
-
-
+    options: dict[str, Any] | None = None,
+    output_format: Literal["json", "toon"] = "json",
+    cache: bool | None = None,
+    json_mode: Literal["auto", "on", "off"] = "auto",
+    system_prompt: str | None = None,
+) -> dict[str, Any]:
+    """Sends a prompt to the driver and returns structured output plus usage metadata.
 
     This function enforces a schema-first approach by requiring a json_schema parameter
-    and automatically generating instructions for the LLM to return
+    and automatically generating instructions for the LLM to return data that matches it.
 
     Args:
         driver: Adapter that implements generate(prompt, options).
         content_prompt: Main prompt content (may include examples).
         json_schema: Required JSON schema dictionary defining the expected structure.
         ai_cleanup: Whether to attempt AI-based cleanup if JSON parsing fails.
+        model_name: Optional model identifier used in usage metadata.
         options: Additional options to pass to the driver.
-
+        output_format: Response serialization format ("json" or "toon").
+        cache: Override for response caching. ``True`` forces caching on,
+            ``False`` forces it off, ``None`` defers to the global setting.
 
     Returns:
         A dictionary containing:
```
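The new `render_output` helper above returns the raw text plus the usual usage block. A minimal usage sketch; the `openai/gpt-4o-mini` model string is a placeholder and the import paths are assumptions, not taken from the package docs:

```python
from prompture.core import render_output            # assumed import path
from prompture.drivers import get_driver_for_model

driver = get_driver_for_model("openai/gpt-4o-mini")  # hypothetical provider/model string
out = render_output(
    driver,
    "Write a two-sentence summary of the 0.0.35 release.",
    output_format="markdown",
    system_prompt="You are a terse release-notes assistant.",
)
print(out["text"])           # raw output; fences are stripped for text/html modes
print(out["usage"]["cost"])  # cost propagated from the driver's meta block
```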
```diff
@@ -146,22 +256,96 @@ def ask_for_json(
         - usage: token usage and cost information from the driver's meta object.
 
     Raises:
-
+        ValueError: If an unsupported output format is provided.
+        RuntimeError: When TOON is requested but the dependency is missing.
+        json.JSONDecodeError: If JSON parsing fails and ai_cleanup is False.
+        ValueError: If TOON parsing fails.
     """
+    if options is None:
+        options = {}
+    if output_format not in ("json", "toon"):
+        raise ValueError(f"Unsupported output_format '{output_format}'. Use 'json' or 'toon'.")
+
+    # --- cache lookup ---
+    from .cache import get_cache, make_cache_key
+
+    _cache = get_cache()
+    use_cache = cache if cache is not None else _cache.enabled
+    _force = cache is True  # explicit per-call override
+    cache_key: str | None = None
+    if use_cache:
+        cache_key = make_cache_key(
+            prompt=content_prompt,
+            model_name=model_name,
+            schema=json_schema,
+            options=options,
+            output_format=output_format,
+        )
+        cached = _cache.get(cache_key, force=_force)
+        if cached is not None:
+            cached["usage"]["cache_hit"] = True
+            return cached
 
     schema_string = json.dumps(json_schema, indent=2)
-
-
-
-
-
+    if output_format == "toon" and toon is None:
+        raise RuntimeError(
+            "TOON requested but 'python-toon' is not installed. Install it with 'pip install python-toon'."
+        )
+
+    # Determine whether to use native JSON mode
+    use_json_mode = False
+    if json_mode == "on":
+        use_json_mode = True
+    elif json_mode == "auto":
+        use_json_mode = getattr(driver, "supports_json_mode", False)
+
+    if use_json_mode:
+        options = {**options, "json_mode": True}
+        if getattr(driver, "supports_json_schema", False):
+            options["json_schema"] = json_schema
+
+    # Adjust instruction prompt based on JSON mode capabilities
+    if use_json_mode and getattr(driver, "supports_json_schema", False):
+        # Schema enforced by API — minimal instruction
+        instruct = "Extract data matching the requested schema.\nIf a value is unknown use null."
+    elif use_json_mode:
+        # JSON guaranteed but schema not enforced by API
+        instruct = (
+            "Return a JSON object that validates against this schema:\n"
+            f"{schema_string}\n\n"
+            "If a value is unknown use null."
+        )
+    else:
+        # Existing prompt-based enforcement
+        instruct = (
+            "Return only a single JSON object (no markdown, no extra text) that validates against this JSON schema:\n"
+            f"{schema_string}\n\n"
+            "If a value is unknown use null. Use double quotes for keys and strings."
+        )
+    if output_format == "toon":
+        instruct += "\n\n(Respond with JSON only; Prompture will convert to TOON.)"
 
     full_prompt = f"{content_prompt}\n\n{instruct}"
-
+
+    # Use generate_messages when system_prompt is provided
+    if system_prompt is not None:
+        messages = [
+            {"role": "system", "content": system_prompt},
+            {"role": "user", "content": full_prompt},
+        ]
+        resp = driver.generate_messages(messages, options)
+    else:
+        resp = driver.generate(full_prompt, options)
     raw = resp.get("text", "")
     cleaned = clean_json_text(raw)
 
     try:
         json_obj = json.loads(cleaned)
+        json_string = cleaned
+        toon_string = None
+        if output_format == "toon":
+            toon_string = toon.encode(json_obj)
+
         usage = {
             **resp.get("meta", {}),
             "raw_response": resp,
@@ -169,19 +353,27 @@ def ask_for_json(
             "prompt_tokens": resp.get("meta", {}).get("prompt_tokens", 0),
             "completion_tokens": resp.get("meta", {}).get("completion_tokens", 0),
             "cost": resp.get("meta", {}).get("cost", 0.0),
-            "model_name": model_name or getattr(driver, "model", "")
-        }
-        return {
-            "json_string": cleaned,
-            "json_object": json_obj,
-            "usage": usage
+            "model_name": model_name or getattr(driver, "model", ""),
         }
+        result = {"json_string": json_string, "json_object": json_obj, "usage": usage}
+        if toon_string is not None:
+            result["toon_string"] = toon_string
+            result["output_format"] = "toon"
+        else:
+            result["output_format"] = "json"
+
+        # --- cache store ---
+        if use_cache and cache_key is not None:
+            cached_copy = {**result, "usage": {**result["usage"], "raw_response": {}}}
+            _cache.set(cache_key, cached_copy, force=_force)
+
+        return result
     except json.JSONDecodeError as e:
         if ai_cleanup:
             cleaned_fixed = clean_json_text_with_ai(driver, cleaned, model_name, options)
             try:
                 json_obj = json.loads(cleaned_fixed)
-
+                result = {
                     "json_string": cleaned_fixed,
                     "json_object": json_obj,
                     "usage": {
@@ -190,46 +382,62 @@ def ask_for_json(
                         "total_tokens": 0,
                         "cost": 0.0,
                         "model_name": options.get("model", getattr(driver, "model", "")),
-                        "raw_response": {}
+                        "raw_response": {},
                     },
+                    "output_format": "json" if output_format != "toon" else "toon",
                 }
+                if output_format == "toon":
+                    result["toon_string"] = toon.encode(json_obj)
+
+                # --- cache store (ai cleanup path) ---
+                if use_cache and cache_key is not None:
+                    _cache.set(cache_key, result, force=_force)
+
+                return result
             except json.JSONDecodeError:
-
-                raise e
+                raise e from None
         else:
-            # Explicitly re-raise the original JSONDecodeError
             raise e
 
+
 def extract_and_jsonify(
     text: Union[str, Driver],  # Can be either text or driver for backward compatibility
-    json_schema:
+    json_schema: dict[str, Any],
     *,  # Force keyword arguments for remaining params
-    model_name: Union[str,
+    model_name: Union[str, dict[str, Any]] = "",  # Can be schema (old) or model name (new)
     instruction_template: str = "Extract information from the following text:",
     ai_cleanup: bool = True,
-
-
+    output_format: Literal["json", "toon"] = "json",
+    options: dict[str, Any] | None = None,
+    json_mode: Literal["auto", "on", "off"] = "auto",
+    system_prompt: str | None = None,
+) -> dict[str, Any]:
     """Extracts structured information using automatic driver selection based on model name.
-
+
     Args:
         text: The raw text to extract information from.
         json_schema: JSON schema dictionary defining the expected structure.
         model_name: Model identifier in format "provider/model" (e.g., "openai/gpt-4-turbo-preview").
         instruction_template: Instructional text to prepend to the content.
         ai_cleanup: Whether to attempt AI-based cleanup if JSON parsing fails.
+        output_format: Response serialization format ("json" or "toon").
         options: Additional options to pass to the driver.
-
+
     Returns:
         A dictionary containing:
         - json_string: the JSON string output.
         - json_object: the parsed JSON object.
         - usage: token usage and cost information from the driver's meta object.
-
+
     Raises:
         ValueError: If text is empty or None, or if model_name format is invalid.
         json.JSONDecodeError: If the response cannot be parsed as JSON and ai_cleanup is False.
         pytest.skip: If a ConnectionError occurs during testing (when pytest is running).
     """
+    if options is None:
+        options = {}
+    actual_template = instruction_template
+    actual_output_format = output_format
     # Handle legacy format where first argument is driver
     # Validate text input first
     if isinstance(text, Driver):
```
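The caching and JSON-mode plumbing added to `ask_for_json` above composes like this. A sketch against the signature shown in the diff, reusing the driver from the earlier sketch with an illustrative schema and prompt:

```python
schema = {"type": "object", "properties": {"name": {"type": "string"}}}

first = ask_for_json(driver, "Extract the name: Ada Lovelace", schema,
                     model_name="gpt-4o-mini", cache=True, json_mode="auto")
second = ask_for_json(driver, "Extract the name: Ada Lovelace", schema,
                      model_name="gpt-4o-mini", cache=True, json_mode="auto")

# The repeat call is served from the response cache and flagged as such;
# cached entries are stored with an emptied raw_response to keep them small.
assert second["usage"]["cache_hit"] is True
assert second["usage"]["raw_response"] == {}
```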
```diff
@@ -246,7 +454,6 @@ def extract_and_jsonify(
         raise ValueError("Text input cannot be empty")
     actual_text = text
     actual_schema = json_schema
-    actual_template = instruction_template
     actual_model = model_name or options.get("model", "")
     driver = options.pop("driver", None)
 
@@ -254,18 +461,18 @@ def extract_and_jsonify(
     if driver is None:
         if not actual_model:
             raise ValueError("Model name cannot be empty")
-
+
         # First validate model format
         if "/" not in actual_model:
             raise ValueError("Invalid model string format. Expected format: 'provider/model'")
-
+
         try:
             driver = get_driver_for_model(actual_model)
         except ValueError as e:
             if "Unsupported provider" in str(e):
-                raise ValueError(f"Unsupported provider in model name: {actual_model}")
+                raise ValueError(f"Unsupported provider in model name: {actual_model}") from e
             raise  # Re-raise any other ValueError
-
+
     # Extract model parts for other validation
     try:
         provider, model_id = actual_model.split("/", 1)
@@ -274,29 +481,43 @@ def extract_and_jsonify(
     except ValueError:
         # If no "/" in model string, use entire string as both provider and model_id
         provider = model_id = actual_model
-
+
     opts = {**options, "model": model_id}
-
+
     content_prompt = f"{actual_template} {actual_text}"
-
+
     try:
-        return ask_for_json(
+        return ask_for_json(
+            driver,
+            content_prompt,
+            actual_schema,
+            ai_cleanup,
+            model_id,
+            opts,
+            output_format=actual_output_format,
+            json_mode=json_mode,
+            system_prompt=system_prompt,
+        )
     except (requests.exceptions.ConnectionError, requests.exceptions.HTTPError) as e:
         if "pytest" in sys.modules:
             import pytest
+
             pytest.skip(f"Connection error occurred: {e}")
-        raise ConnectionError(f"Connection error occurred: {e}")
+        raise ConnectionError(f"Connection error occurred: {e}") from e
+
 
 def manual_extract_and_jsonify(
     driver: Driver,
     text: str,
-    json_schema:
+    json_schema: dict[str, Any],
     model_name: str = "",
     instruction_template: str = "Extract information from the following text:",
     ai_cleanup: bool = True,
-
-
-
+    output_format: Literal["json", "toon"] = "json",
+    options: dict[str, Any] | None = None,
+    json_mode: Literal["auto", "on", "off"] = "auto",
+    system_prompt: str | None = None,
+) -> dict[str, Any]:
     """Extracts structured information using an explicitly provided driver.
 
     This variant is useful when you want to directly control which driver
@@ -310,8 +531,8 @@ def manual_extract_and_jsonify(
         model_name: Optional override of the model name.
         instruction_template: Instructional text to prepend to the content.
         ai_cleanup: Whether to attempt AI-based cleanup if JSON parsing fails.
+        output_format: Response serialization format ("json" or "toon").
         options: Additional options to pass to the driver.
-        verbose_level: Logging level for debug output (LogLevel.OFF by default).
 
     Returns:
         A dictionary containing:
@@ -323,46 +544,58 @@ def manual_extract_and_jsonify(
         ValueError: If text is empty or None.
         json.JSONDecodeError: If the response cannot be parsed as JSON and ai_cleanup is False.
     """
+    if options is None:
+        options = {}
     if not isinstance(text, str):
         raise ValueError("Text input must be a string")
-
+
     if not text or not text.strip():
         raise ValueError("Text input cannot be empty")
-
-
-
-        "text_length"
-
-
-
+
+    logger.info("[manual] Starting manual extraction")
+    logger.debug(
+        "[manual] text_length=%d model_name=%s schema_keys=%s",
+        len(text),
+        model_name,
+        list(json_schema.keys()) if json_schema else [],
+    )
 
     opts = dict(options)
     if model_name:
         opts["model"] = model_name
 
-    # Generate the content prompt
     content_prompt = f"{instruction_template} {text}"
-
-
-
-
-
-
-
-
-
-
+
+    logger.debug("[manual] Generated prompt for extraction")
+
+    result = ask_for_json(
+        driver,
+        content_prompt,
+        json_schema,
+        ai_cleanup,
+        model_name,
+        opts,
+        output_format=output_format,
+        json_mode=json_mode,
+        system_prompt=system_prompt,
+    )
+    logger.debug("[manual] Manual extraction completed successfully")
+
     return result
 
+
 def extract_with_model(
-    model_cls: Union[
-    text: Union[str,
-    model_name: Union[str,
+    model_cls: Union[type[BaseModel], str],  # Can be model class or model name string for legacy support
+    text: Union[str, dict[str, Any]],  # Can be text or schema for legacy support
+    model_name: Union[str, dict[str, Any]],  # Can be model name or text for legacy support
     instruction_template: str = "Extract information from the following text:",
     ai_cleanup: bool = True,
-
-
-
+    output_format: Literal["json", "toon"] = "json",
+    options: dict[str, Any] | None = None,
+    cache: bool | None = None,
+    json_mode: Literal["auto", "on", "off"] = "auto",
+    system_prompt: str | None = None,
+) -> dict[str, Any]:
     """Extracts structured information into a Pydantic model instance.
 
     Converts the Pydantic model to its JSON schema and uses auto-resolved driver based on model_name
@@ -374,8 +607,10 @@ def extract_with_model(
         model_name: Model identifier in format "provider/model" (e.g., "openai/gpt-4-turbo-preview").
         instruction_template: Instructional text to prepend to the content.
         ai_cleanup: Whether to attempt AI-based cleanup if JSON parsing fails.
+        output_format: Response serialization format ("json" or "toon").
         options: Additional options to pass to the driver.
-
+        cache: Override for response caching. ``True`` forces caching on,
+            ``False`` forces it off, ``None`` defers to the global setting.
 
     Returns:
         A validated instance of the Pydantic model.
@@ -385,6 +620,8 @@ def extract_with_model(
         ValidationError: If the extracted data doesn't match the model schema.
     """
     # Handle legacy format where first arg is model class
+    if options is None:
+        options = {}
     if isinstance(model_cls, type) and issubclass(model_cls, BaseModel):
         actual_cls = model_cls
         actual_text = text
```
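`extract_and_jsonify` keeps its legacy call shapes but grows the same keyword-only switches. A sketch of the auto-routed path (the model string is a placeholder):

```python
result = extract_and_jsonify(
    "Ada Lovelace published the first computer algorithm in 1843.",
    {"type": "object", "properties": {"year": {"type": "integer"}}},
    model_name="openai/gpt-4o-mini",  # "provider/model" picks the driver automatically
    output_format="json",
    json_mode="auto",                 # native JSON mode when the driver supports it
    system_prompt="Return data only.",
)
print(result["json_object"], result["usage"]["total_tokens"])
```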
```diff
@@ -397,81 +634,122 @@ def extract_with_model(
 
     if not isinstance(actual_text, str) or not actual_text.strip():
         raise ValueError("Text input cannot be empty")
-
-    #
-
-
-
-
-
+
+    # --- cache lookup ---
+    from .cache import get_cache, make_cache_key
+
+    _cache = get_cache()
+    use_cache = cache if cache is not None else _cache.enabled
+    _force = cache is True
+    cache_key: str | None = None
+    if use_cache:
+        schema_for_key = actual_cls.model_json_schema()
+        cache_key = make_cache_key(
+            prompt=f"{instruction_template} {actual_text}",
+            model_name=actual_model if isinstance(actual_model, str) else "",
+            schema=schema_for_key,
+            options=options,
+            output_format=output_format,
+            pydantic_qualname=actual_cls.__qualname__,
+        )
+        cached = _cache.get(cache_key, force=_force)
+        if cached is not None:
+            cached["usage"]["cache_hit"] = True
+            # Reconstruct Pydantic model instance from cached JSON
+            cached["model"] = actual_cls(**cached["json_object"])
+            return type(
+                "ExtractResult",
+                (dict,),
+                {"__getattr__": lambda self, key: self.get(key), "__call__": lambda self: self["model"]},
+            )(cached)
+
+    logger.info("[extract] Starting extract_with_model")
+    logger.debug(
+        "[extract] model_cls=%s text_length=%d model_name=%s",
+        actual_cls.__name__,
+        len(actual_text),
+        actual_model,
+    )
 
     schema = actual_cls.model_json_schema()
-
-
-
+    logger.debug("[extract] Generated JSON schema")
+
     result = extract_and_jsonify(
         text=actual_text,
         json_schema=schema,
         model_name=actual_model,
         instruction_template=instruction_template,
         ai_cleanup=ai_cleanup,
-
+        output_format=output_format,
+        options=options,
+        json_mode=json_mode,
+        system_prompt=system_prompt,
     )
-
-
-
+    logger.debug("[extract] Extraction completed successfully")
+
     # Post-process the extracted JSON object to normalize invalid values
     json_object = result["json_object"]
     schema_properties = schema.get("properties", {})
-
+
     for field_name, field_info in actual_cls.model_fields.items():
         if field_name in json_object and field_name in schema_properties:
-
+            schema_properties[field_name]
             field_def = {
-                "nullable": not schema_properties[field_name].get("type")
-
-
+                "nullable": not schema_properties[field_name].get("type")
+                or "null"
+                in (
+                    schema_properties[field_name].get("anyOf", [])
+                    if isinstance(schema_properties[field_name].get("anyOf"), list)
+                    else []
+                ),
+                "default": field_info.default
+                if hasattr(field_info, "default") and field_info.default is not ...
+                else None,
             }
-
+
             # Normalize the value
-            json_object[field_name] = normalize_field_value(
-
-                field_info.annotation,
-                field_def
-            )
-
+            json_object[field_name] = normalize_field_value(json_object[field_name], field_info.annotation, field_def)
+
     # Create model instance for validation
     model_instance = actual_cls(**json_object)
-
+
     # Return dictionary with all required fields and backwards compatibility
-    result_dict = {
-
-
-
-
-
+    result_dict = {"json_string": result["json_string"], "json_object": result["json_object"], "usage": result["usage"]}
+
+    # --- cache store ---
+    if use_cache and cache_key is not None:
+        cached_copy = {
+            "json_string": result_dict["json_string"],
+            "json_object": result_dict["json_object"],
+            "usage": {**result_dict["usage"], "raw_response": {}},
+        }
+        _cache.set(cache_key, cached_copy, force=_force)
+
     # Add backwards compatibility property
     result_dict["model"] = model_instance
-
+
     # Return value can be used both as a dict and accessed as model directly
-    return type(
-        "
-
-
+    return type(
+        "ExtractResult",
+        (dict,),
+        {"__getattr__": lambda self, key: self.get(key), "__call__": lambda self: self["model"]},
+    )(result_dict)
+
 
 def stepwise_extract_with_model(
-    model_cls:
+    model_cls: type[BaseModel],
     text: str,
     *,  # Force keyword arguments for remaining params
     model_name: str,
     instruction_template: str = "Extract the {field_name} from the following text:",
     ai_cleanup: bool = True,
-    fields:
-    field_definitions:
-    options:
-
-
+    fields: list[str] | None = None,
+    field_definitions: dict[str, Any] | None = None,
+    options: dict[str, Any] | None = None,
+    json_mode: Literal["auto", "on", "off"] = "auto",
+    system_prompt: str | None = None,
+    share_context: bool = False,
+) -> dict[str, Union[str, dict[str, Any]]]:
     """Extracts structured information into a Pydantic model by processing each field individually.
 
     For each field in the model, makes a separate LLM call to extract that specific field,
@@ -489,7 +767,6 @@ def stepwise_extract_with_model(
         field_definitions: Optional field definitions dict for enhanced default handling.
             If None, automatically uses the global field registry.
         options: Additional options to pass to the driver.
-        verbose_level: Logging level for debug output (LogLevel.OFF by default).
 
     Returns:
         A dictionary containing:
@@ -500,7 +777,7 @@ def stepwise_extract_with_model(
     Raises:
         ValueError: If text is empty or None, or if model_name format is invalid.
         KeyError: If a requested field doesn't exist in the model.
-
+
     Note:
         This function now gracefully handles extraction failures by falling back to default
         values rather than failing completely. Individual field errors are logged and
```
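The `ExtractResult` object built above with `type("ExtractResult", (dict,), ...)` is a dict that also answers attribute access and is callable. A sketch with a trivial Pydantic model (the model string is a placeholder):

```python
from pydantic import BaseModel

class Person(BaseModel):
    name: str
    age: int

res = extract_with_model(
    Person,
    "Ada Lovelace died at age 36.",
    model_name="openai/gpt-4o-mini",
    cache=True,
)
person = res()      # __call__ returns the validated Person instance
print(person.age)   # 36, assuming the extraction succeeded
print(res.usage)    # __getattr__ falls back to the dict keys
```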
```diff
@@ -508,25 +785,40 @@ def stepwise_extract_with_model(
     """
     if not text or not text.strip():
         raise ValueError("Text input cannot be empty")
-
-
-
-
-
-
-
+
+    # When share_context=True, delegate to Conversation-based extraction
+    if share_context:
+        from .conversation import Conversation
+
+        conv = Conversation(model_name=model_name, system_prompt=system_prompt, options=options)
+        return conv._stepwise_extract(
+            model_cls=model_cls,
+            text=text,
+            instruction_template=instruction_template,
+            ai_cleanup=ai_cleanup,
+            fields=fields,
+            field_definitions=field_definitions,
+            json_mode=json_mode,
+        )
+
+    logger.info("[stepwise] Starting stepwise extraction")
+    logger.debug(
+        "[stepwise] model_cls=%s text_length=%d fields=%s",
+        model_cls.__name__,
+        len(text),
+        fields,
+    )
 
     # Auto-use global field registry if no field_definitions provided
     if field_definitions is None:
         field_definitions = get_registry_snapshot()
-
-        log_debug(LogLevel.TRACE, verbose_level, {"registry_fields": list(field_definitions.keys())}, prefix="[stepwise]")
+        logger.debug("[stepwise] Using global field registry")
 
     data = {}
     validation_errors = []
     field_results = {}  # Track success/failure per field
     options = options or {}
-
+
     # Initialize usage accumulator
     accumulated_usage = {
         "prompt_tokens": 0,
@@ -534,7 +826,7 @@ def stepwise_extract_with_model(
         "total_tokens": 0,
         "cost": 0.0,
         "model_name": model_name,  # Use provided model_name directly
-        "field_usages": {}
+        "field_usages": {},
     }
 
     # Get valid field names from the model
@@ -550,28 +842,16 @@ def stepwise_extract_with_model(
     field_items = model_cls.model_fields.items()
 
     for field_name, field_info in field_items:
-
-        log_debug(LogLevel.DEBUG, verbose_level, f"Extracting field: {field_name}", prefix="[stepwise]")
-        log_debug(LogLevel.TRACE, verbose_level, {
-            "field_name": field_name,
-            "field_info": str(field_info),
-            "field_type": str(field_info.annotation)
-        }, prefix="[stepwise]")
+        logger.debug("[stepwise] Extracting field: %s", field_name)
 
         # Create field schema that expects a direct value rather than a dict
         field_schema = {
             "value": {
-                "type": "integer" if field_info.annotation
-                "description": field_info.description or f"Value for {field_name}"
+                "type": "integer" if field_info.annotation is int else "string",
+                "description": field_info.description or f"Value for {field_name}",
             }
         }
 
-        # Add structured logging for field schema and prompt
-        log_debug(LogLevel.TRACE, verbose_level, {
-            "field_schema": field_schema,
-            "prompt_template": instruction_template.format(field_name=field_name)
-        }, prefix="[stepwise]")
-
         try:
             result = extract_and_jsonify(
                 text=text,
@@ -579,12 +859,12 @@ def stepwise_extract_with_model(
                 model_name=model_name,
                 instruction_template=instruction_template.format(field_name=field_name),
                 ai_cleanup=ai_cleanup,
-                options=options
+                options=options,
+                json_mode=json_mode,
+                system_prompt=system_prompt,
             )
 
-
-            log_debug(LogLevel.DEBUG, verbose_level, f"Raw extraction result for {field_name}", prefix="[stepwise]")
-            log_debug(LogLevel.TRACE, verbose_level, {"result": result}, prefix="[stepwise]")
+            logger.debug("[stepwise] Raw extraction result for %s", field_name)
 
             # Accumulate usage data from this field extraction
             field_usage = result.get("usage", {})
@@ -596,139 +876,125 @@ def stepwise_extract_with_model(
 
             # Extract the raw value from the response - handle both dict and direct value formats
             extracted_value = result["json_object"]["value"]
-
-
-
+            logger.debug("[stepwise] Raw extracted value for %s: %s", field_name, extracted_value)
+
             if isinstance(extracted_value, dict) and "value" in extracted_value:
                 raw_value = extracted_value["value"]
-
+                logger.debug("[stepwise] Extracted inner value from dict for %s", field_name)
             else:
                 raw_value = extracted_value
-
-
-            log_debug(LogLevel.DEBUG, verbose_level, {"field_name": field_name, "raw_value": raw_value}, prefix="[stepwise]")
+                logger.debug("[stepwise] Using direct value for %s", field_name)
 
             # Post-process the raw value to normalize invalid values for non-nullable fields
             field_def = {}
             if field_definitions and field_name in field_definitions:
                 field_def = field_definitions[field_name] if isinstance(field_definitions[field_name], dict) else {}
-
+
             # Determine nullable status and default value
             nullable = field_def.get("nullable", True)
             default_value = field_def.get("default")
-            if
-
-
-
+            if (
+                default_value is None
+                and hasattr(field_info, "default")
+                and field_info.default is not ...
+                and str(field_info.default) != "PydanticUndefined"
+            ):
+                default_value = field_info.default
+
             # Create field_def for normalize_field_value
-            normalize_def = {
-
-                "default": default_value
-            }
-
+            normalize_def = {"nullable": nullable, "default": default_value}
+
             # Normalize the raw value before conversion
             raw_value = normalize_field_value(raw_value, field_info.annotation, normalize_def)
-
+            logger.debug("[stepwise] Normalized value for %s: %s", field_name, raw_value)
 
             # Convert value using tools.convert_value with logging
             try:
-                converted_value = convert_value(
-                    raw_value,
-                    field_info.annotation,
-                    allow_shorthand=True
-                )
+                converted_value = convert_value(raw_value, field_info.annotation, allow_shorthand=True)
                 data[field_name] = converted_value
                 field_results[field_name] = {"status": "success", "used_default": False}
 
-
-                log_debug(LogLevel.TRACE, verbose_level, {
-                    "field_name": field_name,
-                    "converted_value": converted_value
-                }, prefix="[stepwise]")
-
+                logger.debug("[stepwise] Successfully converted %s", field_name)
+
             except ValueError as e:
-                error_msg = f"Type conversion failed for {field_name}: {
-
+                error_msg = f"Type conversion failed for {field_name}: {e!s}"
+
                 # Check if field has a default value (either explicit or from field_definitions)
                 has_default = False
                 if field_definitions and field_name in field_definitions:
                     field_def = field_definitions[field_name]
-                    if isinstance(field_def, dict) and
+                    if isinstance(field_def, dict) and "default" in field_def:
                         has_default = True
-
-                if not has_default and hasattr(field_info,
+
+                if not has_default and hasattr(field_info, "default"):
                     default_val = field_info.default
                     # Field has default if it's not PydanticUndefined or Ellipsis
-                    if default_val is not ... and str(default_val) !=
+                    if default_val is not ... and str(default_val) != "PydanticUndefined":
                         has_default = True
-
+
                 # Only add to validation_errors if field is required (no default)
                 if not has_default:
                     validation_errors.append(error_msg)
-
+
                 # Use default value (type-appropriate if no explicit default)
                 default_value = get_field_default(field_name, field_info, field_definitions)
                 data[field_name] = default_value
                 field_results[field_name] = {"status": "conversion_failed", "error": error_msg, "used_default": True}
-
-
-
-
+
+                logger.error("[stepwise] %s", error_msg)
+                logger.info("[stepwise] Using default value for %s: %s", field_name, default_value)
+
             except Exception as e:
-                error_msg = f"Extraction failed for {field_name}: {
-
+                error_msg = f"Extraction failed for {field_name}: {e!s}"
+
                 # Check if field has a default value (either explicit or from field_definitions)
                 has_default = False
                 if field_definitions and field_name in field_definitions:
                     field_def = field_definitions[field_name]
-                    if isinstance(field_def, dict) and
+                    if isinstance(field_def, dict) and "default" in field_def:
                         has_default = True
-
-                if not has_default and hasattr(field_info,
+
+                if not has_default and hasattr(field_info, "default"):
                     default_val = field_info.default
                     # Field has default if it's not PydanticUndefined or Ellipsis
-                    if default_val is not ... and str(default_val) !=
+                    if default_val is not ... and str(default_val) != "PydanticUndefined":
                         has_default = True
-
+
                 # Only add to validation_errors if field is required (no default)
                 if not has_default:
                     validation_errors.append(error_msg)
-
+
                 # Use default value (type-appropriate if no explicit default)
                 default_value = get_field_default(field_name, field_info, field_definitions)
                 data[field_name] = default_value
                 field_results[field_name] = {"status": "extraction_failed", "error": error_msg, "used_default": True}
-
-
-
-
+
+                logger.error("[stepwise] %s", error_msg)
+                logger.info("[stepwise] Using default value for %s: %s", field_name, default_value)
+
                 # Store error details in field_usages
                 accumulated_usage["field_usages"][field_name] = {
                     "error": str(e),
                     "status": "failed",
                     "used_default": True,
-                    "default_value": default_value
+                    "default_value": default_value,
                 }
-
-    # Add structured logging for validation errors
+
     if validation_errors:
-
+        logger.warning("[stepwise] Found %d validation errors", len(validation_errors))
         for error in validation_errors:
-
-
+            logger.error("[stepwise] %s", error)
+
     # If there are validation errors, include them in the result
     if validation_errors:
         accumulated_usage["validation_errors"] = validation_errors
-
+
     try:
         # Create model instance with collected data
         # Create model instance with collected data
         model_instance = model_cls(**data)
         model_dict = model_instance.model_dump()
-
+
         # Enhanced DateTimeEncoder to handle both datetime and date objects
         class ExtendedJSONEncoder(json.JSONEncoder):
             def default(self, obj):
@@ -737,14 +1003,14 @@ def stepwise_extract_with_model(
                 if isinstance(obj, Decimal):
                     return str(obj)
                 return super().default(obj)
-
+
         # Use enhanced encoder for JSON serialization
         json_string = json.dumps(model_dict, cls=ExtendedJSONEncoder)
 
         # Also modify return value to use ExtendedJSONEncoder
-        if
-            result[
-
+        if "json_string" in result:
+            result["json_string"] = json.dumps(result["json_object"], cls=ExtendedJSONEncoder)
+
         # Define ExtendedJSONEncoder for handling special types
         class ExtendedJSONEncoder(json.JSONEncoder):
             def default(self, obj):
@@ -753,10 +1019,10 @@ def stepwise_extract_with_model(
                 if isinstance(obj, Decimal):
                     return str(obj)
                 return super().default(obj)
-
+
         # Create json string with custom encoder
         json_string = json.dumps(model_dict, cls=ExtendedJSONEncoder)
-
+
         # Create result matching extract_with_model format
         result = {
             "json_string": json_string,
```
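The stepwise path above makes one LLM call per field; the new `share_context` flag instead delegates to a `Conversation` so all fields are extracted inside one chat context. A sketch reusing the `Person` model from the previous example (model string is a placeholder):

```python
# Default mode: one call per field, usage accumulated under field_usages.
res = stepwise_extract_with_model(
    Person,
    "Ada Lovelace died at age 36.",
    model_name="openai/gpt-4o-mini",
)
print(res["usage"]["field_usages"])   # per-field token/cost bookkeeping

# Shared-context mode: routed through Conversation._stepwise_extract.
res_shared = stepwise_extract_with_model(
    Person,
    "Ada Lovelace died at age 36.",
    model_name="openai/gpt-4o-mini",
    share_context=True,
)
```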
@@ -764,32 +1030,379 @@ def stepwise_extract_with_model(
|
|
|
764
1030
|
"usage": accumulated_usage,
|
|
765
1031
|
"field_results": field_results,
|
|
766
1032
|
}
|
|
767
|
-
|
|
1033
|
+
|
|
768
1034
|
# Add model instance as property and make callable
|
|
769
1035
|
result["model"] = model_instance
|
|
770
|
-
return type(
|
|
771
|
-
"
|
|
772
|
-
|
|
773
|
-
|
|
1036
|
+
return type(
|
|
1037
|
+
"ExtractResult",
|
|
1038
|
+
(dict,),
|
|
1039
|
+
{"__getattr__": lambda self, key: self.get(key), "__call__": lambda self: self["model"]},
|
|
1040
|
+
)(result)
|
|
774
1041
|
except Exception as e:
|
|
775
|
-
error_msg = f"Model validation error: {
|
|
1042
|
+
error_msg = f"Model validation error: {e!s}"
|
|
776
1043
|
# Add validation error to accumulated usage
|
|
777
1044
|
if "validation_errors" not in accumulated_usage:
|
|
778
1045
|
accumulated_usage["validation_errors"] = []
|
|
779
1046
|
accumulated_usage["validation_errors"].append(error_msg)
|
|
780
|
-
|
|
781
|
-
|
|
782
|
-
|
|
783
|
-
|
|
1047
|
+
|
|
1048
|
+
logger.error("[stepwise] %s", error_msg)
|
|
1049
|
+
|
|
784
1050
|
# Create error result with partial data
|
|
785
1051
|
error_result = {
|
|
786
1052
|
"json_string": "{}",
|
|
787
1053
|
"json_object": {},
|
|
788
1054
|
"usage": accumulated_usage,
|
|
789
1055
|
"field_results": field_results,
|
|
790
|
-
"error": error_msg
|
|
1056
|
+
"error": error_msg,
|
|
791
1057
|
}
|
|
792
|
-
return type(
|
|
793
|
-
"
|
|
794
|
-
|
|
795
|
-
|
|
1058
|
+
return type(
|
|
1059
|
+
"ExtractResult",
|
|
1060
|
+
(dict,),
|
|
1061
|
+
{
|
|
1062
|
+
"__getattr__": lambda self, key: self.get(key),
|
|
1063
|
+
"__call__": lambda self: None, # Return None when called if validation failed
|
|
1064
|
+
},
|
|
1065
|
+
)(error_result)
|
|
1066
|
+
|
|
1067
|
+
|
|
+def _json_to_toon(data: Union[list[dict[str, Any]], dict[str, Any]], data_key: str | None = None) -> str:
+    """Convert JSON array or dict containing array to TOON format.
+
+    Args:
+        data: List of dicts (uniform array) or dict containing array under a key
+        data_key: If data is a dict, the key containing the array
+
+    Returns:
+        TOON formatted string
+
+    Raises:
+        ValueError: If TOON conversion fails or data format is invalid
+        RuntimeError: If python-toon is not installed
+    """
+    if toon is None:
+        raise RuntimeError(
+            "TOON conversion requested but 'python-toon' is not installed. Install it with 'pip install python-toon'."
+        )
+
+    # Handle different data formats
+    if isinstance(data, list):
+        array_data = data
+    elif isinstance(data, dict):
+        if data_key:
+            if data_key not in data:
+                raise ValueError(f"Key '{data_key}' not found in data")
+            array_data = data[data_key]
+        else:
+            # Try to find the first array value in the dict
+            array_data = None
+            for _key, value in data.items():
+                if isinstance(value, list) and value:
+                    array_data = value
+                    break
+            if array_data is None:
+                raise ValueError("No array found in data. Specify data_key or provide a list directly.")
+    else:
+        raise ValueError("Data must be a list of dicts or a dict containing an array")
+
+    if not isinstance(array_data, list):
+        raise ValueError("Array data must be a list")
+
+    if not array_data:
+        raise ValueError("Array data cannot be empty")
+
+    # Validate that all items in array are dicts (uniform structure)
+    if not all(isinstance(item, dict) for item in array_data):
+        raise ValueError("All items in array must be dictionaries for TOON conversion")
+
+    try:
+        return toon.encode(array_data)
+    except Exception as e:
+        raise ValueError(f"Failed to convert data to TOON format: {e}") from e
+
+
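As a usage sketch for `_json_to_toon` (assuming the optional `python-toon` dependency is installed so `toon.encode` is available), the two accepted input shapes produce the same output; the rows below are illustrative:

rows = [
    {"id": 1, "name": "Laptop", "price": 999.99},
    {"id": 2, "name": "Book", "price": 19.99},
]
toon_a = _json_to_toon(rows)                                     # uniform list of dicts
toon_b = _json_to_toon({"products": rows}, data_key="products")  # array nested under a key
assert toon_a == toon_b
# Omitting data_key makes the helper scan the dict for its first non-empty list value.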
+
def _dataframe_to_toon(df) -> str:
|
|
1124
|
+
"""Convert Pandas DataFrame to TOON format.
|
|
1125
|
+
|
|
1126
|
+
Args:
|
|
1127
|
+
df: Pandas DataFrame to convert
|
|
1128
|
+
|
|
1129
|
+
Returns:
|
|
1130
|
+
TOON formatted string
|
|
1131
|
+
|
|
1132
|
+
Raises:
|
|
1133
|
+
ValueError: If DataFrame conversion fails
|
|
1134
|
+
RuntimeError: If pandas or python-toon is not installed
|
|
1135
|
+
"""
|
|
1136
|
+
try:
|
|
1137
|
+
import pandas as pd
|
|
1138
|
+
except ImportError:
|
|
1139
|
+
raise RuntimeError(
|
|
1140
|
+
"Pandas DataFrame conversion requested but 'pandas' is not installed. "
|
|
1141
|
+
"Install it with 'pip install pandas' or 'pip install prompture[pandas]'."
|
|
1142
|
+
) from None
|
|
1143
|
+
|
|
1144
|
+
if toon is None:
|
|
1145
|
+
raise RuntimeError(
|
|
1146
|
+
"TOON conversion requested but 'python-toon' is not installed. Install it with 'pip install python-toon'."
|
|
1147
|
+
)
|
|
1148
|
+
|
|
1149
|
+
dataframe_type = getattr(pd, "DataFrame", None)
|
|
1150
|
+
if isinstance(dataframe_type, type):
|
|
1151
|
+
if not isinstance(df, dataframe_type):
|
|
1152
|
+
raise ValueError("Input must be a pandas DataFrame")
|
|
1153
|
+
else:
|
|
1154
|
+
# Duck-type fallback for tests that provide a lightweight mock
|
|
1155
|
+
if not hasattr(df, "to_dict") or not hasattr(df, "empty"):
|
|
1156
|
+
raise ValueError("Input must be a pandas DataFrame")
|
|
1157
|
+
|
|
1158
|
+
if df.empty:
|
|
1159
|
+
raise ValueError("DataFrame cannot be empty")
|
|
1160
|
+
|
|
1161
|
+
try:
|
|
1162
|
+
# Convert DataFrame to list of dicts
|
|
1163
|
+
data = df.to_dict("records")
|
|
1164
|
+
return toon.encode(data)
|
|
1165
|
+
except Exception as e:
|
|
1166
|
+
raise ValueError(f"Failed to convert DataFrame to TOON format: {e}") from e
|
|
1167
|
+
|
|
1168
|
+
|
|
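A corresponding sketch for `_dataframe_to_toon` (assuming both pandas and python-toon are installed); the frame is flattened row-wise via `to_dict("records")` before encoding, with the data below purely illustrative:

import pandas as pd

df = pd.DataFrame([{"id": 1, "price": 999.99}, {"id": 2, "price": 19.99}])
toon_text = _dataframe_to_toon(df)  # rows flattened to records, then toon.encode(...)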
+def _calculate_token_savings(json_text: str, toon_text: str) -> dict[str, Any]:
+    """Calculate estimated token savings between JSON and TOON formats.
+
+    This is a rough estimation based on character count ratios.
+    Actual token counts may vary by model and tokenizer.
+
+    Args:
+        json_text: JSON formatted text
+        toon_text: TOON formatted text
+
+    Returns:
+        Dict containing savings statistics
+    """
+    json_chars = len(json_text)
+    toon_chars = len(toon_text)
+
+    # Rough estimation: 4 characters ≈ 1 token (varies by model)
+    json_tokens_est = json_chars // 4
+    toon_tokens_est = toon_chars // 4
+
+    savings_chars = json_chars - toon_chars
+    savings_tokens_est = json_tokens_est - toon_tokens_est
+
+    percentage_saved = (savings_chars / json_chars * 100) if json_chars > 0 else 0
+
+    return {
+        "json_characters": json_chars,
+        "toon_characters": toon_chars,
+        "saved_characters": savings_chars,
+        "estimated_json_tokens": json_tokens_est,
+        "estimated_toon_tokens": toon_tokens_est,
+        "estimated_saved_tokens": savings_tokens_est,
+        "percentage_saved": round(percentage_saved, 1),
+    }
+
+
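Because the estimate is simply characters divided by four, the returned statistics are easy to verify by hand; for example:

stats = _calculate_token_savings(json_text="x" * 400, toon_text="x" * 300)
assert stats["saved_characters"] == 100       # 400 - 300
assert stats["estimated_saved_tokens"] == 25  # 100 - 75 estimated tokens
assert stats["percentage_saved"] == 25.0      # 100 / 400 * 100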
+def extract_from_data(
+    data: Union[list[dict[str, Any]], dict[str, Any]],
+    question: str,
+    json_schema: dict[str, Any],
+    *,
+    model_name: str,
+    data_key: str | None = None,
+    instruction_template: str = "Analyze the following data and answer: {question}",
+    ai_cleanup: bool = True,
+    options: dict[str, Any] | None = None,
+    system_prompt: str | None = None,
+) -> dict[str, Any]:
+    """Extract information from structured data by converting to TOON format for token efficiency.
+
+    This function takes JSON array data, converts it to TOON format to reduce tokens,
+    sends it to the LLM with a question, and returns the JSON response.
+
+    Args:
+        data: List of dicts (uniform array) or dict containing array under a key
+        question: The question to ask about the data
+        json_schema: Expected JSON schema for the response
+        model_name: Model identifier in format "provider/model" (e.g., "openai/gpt-4")
+        data_key: If data is a dict, the key containing the array (e.g., "products")
+        instruction_template: Template with {question} placeholder
+        ai_cleanup: Whether to attempt AI-based cleanup if JSON parsing fails
+        options: Additional options to pass to the driver
+        system_prompt: Optional system prompt forwarded to the driver
+
+    Returns:
+        Dict containing:
+        - json_object: The parsed JSON response
+        - json_string: The JSON string response
+        - usage: Token usage and cost information (includes token_savings)
+        - toon_data: The TOON formatted input data
+        - token_savings: Statistics about token savings vs JSON input
+
+    Raises:
+        ValueError: If data format is invalid or conversion fails
+        RuntimeError: If required dependencies are missing
+
+    Example:
+        >>> products = [
+        ...     {"id": 1, "name": "Laptop", "price": 999.99, "category": "electronics"},
+        ...     {"id": 2, "name": "Book", "price": 19.99, "category": "books"}
+        ... ]
+        >>> schema = {
+        ...     "type": "object",
+        ...     "properties": {
+        ...         "average_price": {"type": "number"},
+        ...         "total_items": {"type": "integer"}
+        ...     }
+        ... }
+        >>> result = extract_from_data(
+        ...     data=products,
+        ...     question="What is the average price and total number of items?",
+        ...     json_schema=schema,
+        ...     model_name="openai/gpt-4"
+        ... )
+        >>> print(result["json_object"])
+        {'average_price': 509.99, 'total_items': 2}
+    """
+    if not question or not question.strip():
+        raise ValueError("Question cannot be empty")
+
+    if not json_schema:
+        raise ValueError("JSON schema cannot be empty")
+
+    if options is None:
+        options = {}
+
+    # Convert data to TOON format
+    toon_data = _json_to_toon(data, data_key)
+
+    # Calculate token savings (for comparison with JSON)
+    json_data = json.dumps(data if isinstance(data, list) else data.get(data_key, data), indent=2)
+    token_savings = _calculate_token_savings(json_data, toon_data)
+
+    # Build the prompt with TOON data
+    content_prompt = instruction_template.format(question=question)
+    full_prompt = f"{content_prompt}\n\nData (in TOON format):\n{toon_data}"
+
+    # Call the LLM
+    result = ask_for_json(
+        driver=get_driver_for_model(model_name),
+        content_prompt=full_prompt,
+        json_schema=json_schema,
+        ai_cleanup=ai_cleanup,
+        model_name=model_name.split("/")[-1] if "/" in model_name else model_name,
+        options=options,
+        output_format="json",  # Always return JSON, not TOON
+        system_prompt=system_prompt,
+    )
+
+    # Add our additional data to the result
+    result["toon_data"] = toon_data
+    result["token_savings"] = token_savings
+
+    return result
+
+
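Beyond the docstring example above, the returned mapping also carries the TOON payload and the savings estimate; a brief sketch, assuming a configured provider and using an illustrative model name:

result = extract_from_data(
    data={"products": [{"id": 1, "price": 9.99}, {"id": 2, "price": 19.99}]},
    question="How many products are listed?",
    json_schema={"type": "object", "properties": {"count": {"type": "integer"}}},
    model_name="openai/gpt-4",
    data_key="products",
)
print(result["toon_data"])                          # the TOON text actually sent to the model
print(result["token_savings"]["percentage_saved"])  # estimated input-size reduction vs. JSON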
+def extract_from_pandas(
+    df,  # pandas.DataFrame - optional import
+    question: str,
+    json_schema: dict[str, Any],
+    *,
+    model_name: str,
+    instruction_template: str = "Analyze the following data and answer: {question}",
+    ai_cleanup: bool = True,
+    options: dict[str, Any] | None = None,
+    system_prompt: str | None = None,
+) -> dict[str, Any]:
+    """Extract information from Pandas DataFrame by converting to TOON format for token efficiency.
+
+    This function takes a Pandas DataFrame, converts it to TOON format to reduce tokens,
+    sends it to the LLM with a question, and returns the JSON response.
+
+    Args:
+        df: Pandas DataFrame to analyze
+        question: The question to ask about the data
+        json_schema: Expected JSON schema for the response
+        model_name: Model identifier in format "provider/model" (e.g., "openai/gpt-4")
+        instruction_template: Template with {question} placeholder
+        ai_cleanup: Whether to attempt AI-based cleanup if JSON parsing fails
+        options: Additional options to pass to the driver
+        system_prompt: Optional system prompt forwarded to the driver
+
+    Returns:
+        Dict containing:
+        - json_object: The parsed JSON response
+        - json_string: The JSON string response
+        - usage: Token usage and cost information (includes token_savings)
+        - toon_data: The TOON formatted input data
+        - token_savings: Statistics about token savings vs JSON input
+        - dataframe_info: Basic info about the original DataFrame
+
+    Raises:
+        ValueError: If DataFrame is invalid or conversion fails
+        RuntimeError: If required dependencies are missing
+
+    Example:
+        >>> import pandas as pd
+        >>> df = pd.DataFrame([
+        ...     {"id": 1, "name": "Laptop", "price": 999.99, "category": "electronics"},
+        ...     {"id": 2, "name": "Book", "price": 19.99, "category": "books"}
+        ... ])
+        >>> schema = {
+        ...     "type": "object",
+        ...     "properties": {
+        ...         "highest_priced_item": {"type": "string"},
+        ...         "price_range": {"type": "number"}
+        ...     }
+        ... }
+        >>> result = extract_from_pandas(
+        ...     df=df,
+        ...     question="What is the highest priced item and price range?",
+        ...     json_schema=schema,
+        ...     model_name="openai/gpt-4"
+        ... )
+        >>> print(result["json_object"])
+        {'highest_priced_item': 'Laptop', 'price_range': 980.0}
+    """
+    if not question or not question.strip():
+        raise ValueError("Question cannot be empty")
+
+    if not json_schema:
+        raise ValueError("JSON schema cannot be empty")
+
+    if options is None:
+        options = {}
+
+    # Convert DataFrame to TOON format
+    toon_data = _dataframe_to_toon(df)
+
+    # Calculate token savings (for comparison with JSON)
+    json_data = df.to_json(indent=2, orient="records")
+    token_savings = _calculate_token_savings(json_data, toon_data)
+
+    # Get basic DataFrame info
+    dataframe_info = {
+        "shape": df.shape,
+        "columns": list(df.columns),
+        "dtypes": {col: str(dtype) for col, dtype in df.dtypes.items()},
+    }
+
+    # Build the prompt with TOON data
+    content_prompt = instruction_template.format(question=question)
+    full_prompt = f"{content_prompt}\n\nData (in TOON format):\n{toon_data}"
+
+    # Call the LLM
+    result = ask_for_json(
+        driver=get_driver_for_model(model_name),
+        content_prompt=full_prompt,
+        json_schema=json_schema,
+        ai_cleanup=ai_cleanup,
+        model_name=model_name.split("/")[-1] if "/" in model_name else model_name,
+        options=options,
+        output_format="json",  # Always return JSON, not TOON
+        system_prompt=system_prompt,
+    )
+
+    # Add our additional data to the result
+    result["toon_data"] = toon_data
+    result["token_savings"] = token_savings
+    result["dataframe_info"] = dataframe_info
+
+    return result
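The DataFrame variant additionally reports the input frame's shape, columns, and dtypes; a sketch under the same assumptions (pandas installed, provider configured, illustrative model name):

import pandas as pd

df = pd.DataFrame([{"name": "Laptop", "price": 999.99}, {"name": "Book", "price": 19.99}])
result = extract_from_pandas(
    df=df,
    question="Which item costs the most?",
    json_schema={"type": "object", "properties": {"highest_priced_item": {"type": "string"}}},
    model_name="openai/gpt-4",
)
print(result["dataframe_info"])  # e.g. {'shape': (2, 2), 'columns': ['name', 'price'], 'dtypes': {...}}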