prompture 0.0.33.dev1__py3-none-any.whl → 0.0.34__py3-none-any.whl
This diff shows the contents of two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only.
- prompture/__init__.py +133 -49
- prompture/_version.py +34 -0
- prompture/aio/__init__.py +74 -0
- prompture/async_conversation.py +484 -0
- prompture/async_core.py +803 -0
- prompture/async_driver.py +131 -0
- prompture/cache.py +469 -0
- prompture/callbacks.py +50 -0
- prompture/cli.py +7 -3
- prompture/conversation.py +504 -0
- prompture/core.py +475 -352
- prompture/cost_mixin.py +51 -0
- prompture/discovery.py +50 -35
- prompture/driver.py +125 -5
- prompture/drivers/__init__.py +171 -73
- prompture/drivers/airllm_driver.py +13 -20
- prompture/drivers/async_airllm_driver.py +26 -0
- prompture/drivers/async_azure_driver.py +117 -0
- prompture/drivers/async_claude_driver.py +107 -0
- prompture/drivers/async_google_driver.py +132 -0
- prompture/drivers/async_grok_driver.py +91 -0
- prompture/drivers/async_groq_driver.py +84 -0
- prompture/drivers/async_hugging_driver.py +61 -0
- prompture/drivers/async_lmstudio_driver.py +79 -0
- prompture/drivers/async_local_http_driver.py +44 -0
- prompture/drivers/async_ollama_driver.py +125 -0
- prompture/drivers/async_openai_driver.py +96 -0
- prompture/drivers/async_openrouter_driver.py +96 -0
- prompture/drivers/async_registry.py +129 -0
- prompture/drivers/azure_driver.py +36 -9
- prompture/drivers/claude_driver.py +86 -34
- prompture/drivers/google_driver.py +87 -51
- prompture/drivers/grok_driver.py +29 -32
- prompture/drivers/groq_driver.py +27 -26
- prompture/drivers/hugging_driver.py +6 -6
- prompture/drivers/lmstudio_driver.py +26 -13
- prompture/drivers/local_http_driver.py +6 -6
- prompture/drivers/ollama_driver.py +90 -23
- prompture/drivers/openai_driver.py +36 -9
- prompture/drivers/openrouter_driver.py +31 -25
- prompture/drivers/registry.py +306 -0
- prompture/field_definitions.py +106 -96
- prompture/logging.py +80 -0
- prompture/model_rates.py +217 -0
- prompture/runner.py +49 -47
- prompture/session.py +117 -0
- prompture/settings.py +14 -1
- prompture/tools.py +172 -265
- prompture/validator.py +3 -3
- {prompture-0.0.33.dev1.dist-info → prompture-0.0.34.dist-info}/METADATA +18 -20
- prompture-0.0.34.dist-info/RECORD +55 -0
- prompture-0.0.33.dev1.dist-info/RECORD +0 -29
- {prompture-0.0.33.dev1.dist-info → prompture-0.0.34.dist-info}/WHEEL +0 -0
- {prompture-0.0.33.dev1.dist-info → prompture-0.0.34.dist-info}/entry_points.txt +0 -0
- {prompture-0.0.33.dev1.dist-info → prompture-0.0.34.dist-info}/licenses/LICENSE +0 -0
- {prompture-0.0.33.dev1.dist-info → prompture-0.0.34.dist-info}/top_level.txt +0 -0
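The headline changes in this release, visible in the core.py diff below, are new keyword arguments on the extraction helpers: `json_mode` ("auto"/"on"/"off") to use a driver's native JSON mode when available, `system_prompt` to send instructions as a proper system message, and, on `ask_for_json` and `extract_with_model`, a `cache` override where `True` forces caching on, `False` forces it off, and `None` defers to the global setting. A minimal sketch of how the new arguments compose; the provider/model string and schema are illustrative assumptions, not values shipped with the package:

```python
# Sketch only: exercises the new 0.0.34 keyword arguments from the
# core.py diff below. The "openai/gpt-4o-mini" model string and the
# schema are assumed examples, not values taken from the package.
from prompture.core import extract_and_jsonify

schema = {
    "type": "object",
    "properties": {
        "name": {"type": "string"},
        "age": {"type": "integer"},
    },
}

result = extract_and_jsonify(
    "Alice is 30 years old.",
    schema,
    model_name="openai/gpt-4o-mini",  # expects "provider/model" format
    json_mode="auto",  # use the driver's native JSON mode when supported
    system_prompt="You are a precise extraction assistant.",  # sent via generate_messages
)
print(result["json_object"])
```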
prompture/core.py
CHANGED
|
@@ -1,57 +1,58 @@
|
|
|
1
|
-
"""Core utilities: Helpers for requesting JSON from LLM.
|
|
2
|
-
|
|
1
|
+
"""Core utilities: Helpers for requesting JSON from LLM."""
|
|
2
|
+
|
|
3
3
|
from __future__ import annotations
|
|
4
|
+
|
|
4
5
|
import json
|
|
5
|
-
import
|
|
6
|
+
import logging
|
|
6
7
|
import sys
|
|
7
|
-
import
|
|
8
|
-
from datetime import datetime, date
|
|
8
|
+
from datetime import date, datetime
|
|
9
9
|
from decimal import Decimal
|
|
10
|
-
from typing import Any,
|
|
10
|
+
from typing import Any, Literal, Union
|
|
11
|
+
|
|
12
|
+
import requests
|
|
11
13
|
|
|
12
14
|
try:
|
|
13
15
|
import toon
|
|
14
16
|
except ImportError:
|
|
15
17
|
toon = None
|
|
16
18
|
|
|
17
|
-
from pydantic import BaseModel
|
|
19
|
+
from pydantic import BaseModel
|
|
18
20
|
|
|
19
|
-
from .drivers import get_driver, get_driver_for_model
|
|
20
21
|
from .driver import Driver
|
|
22
|
+
from .drivers import get_driver_for_model
|
|
23
|
+
from .field_definitions import get_registry_snapshot
|
|
21
24
|
from .tools import (
|
|
22
|
-
create_field_schema,
|
|
23
|
-
convert_value,
|
|
24
|
-
log_debug,
|
|
25
25
|
clean_json_text,
|
|
26
|
-
|
|
26
|
+
convert_value,
|
|
27
27
|
get_field_default,
|
|
28
28
|
)
|
|
29
|
-
from .field_definitions import get_registry_snapshot
|
|
30
29
|
|
|
30
|
+
logger = logging.getLogger("prompture.core")
|
|
31
31
|
|
|
32
|
-
|
|
32
|
+
|
|
33
|
+
def normalize_field_value(value: Any, field_type: type, field_def: dict[str, Any]) -> Any:
|
|
33
34
|
"""Normalize invalid values for fields based on their type and nullable status.
|
|
34
|
-
|
|
35
|
+
|
|
35
36
|
This function handles post-processing of extracted values BEFORE Pydantic validation,
|
|
36
37
|
converting invalid values (like empty strings for booleans) to proper defaults.
|
|
37
|
-
|
|
38
|
+
|
|
38
39
|
Args:
|
|
39
40
|
value: The extracted value from the LLM
|
|
40
41
|
field_type: The expected Python type for this field
|
|
41
42
|
field_def: The field definition dict containing nullable, default, etc.
|
|
42
|
-
|
|
43
|
+
|
|
43
44
|
Returns:
|
|
44
45
|
A normalized value suitable for the field type
|
|
45
46
|
"""
|
|
46
47
|
nullable = field_def.get("nullable", True)
|
|
47
48
|
default_value = field_def.get("default")
|
|
48
|
-
|
|
49
|
+
|
|
49
50
|
# Special handling for boolean fields
|
|
50
|
-
if field_type is bool or (hasattr(field_type,
|
|
51
|
+
if field_type is bool or (hasattr(field_type, "__origin__") and field_type.__origin__ is bool):
|
|
51
52
|
# If value is already a boolean, return it as-is
|
|
52
53
|
if isinstance(value, bool):
|
|
53
54
|
return value
|
|
54
|
-
|
|
55
|
+
|
|
55
56
|
# For non-nullable booleans
|
|
56
57
|
if not nullable:
|
|
57
58
|
# Any non-empty string should be True, empty/None should be default
|
|
@@ -68,37 +69,39 @@ def normalize_field_value(value: Any, field_type: Type, field_def: Dict[str, Any
|
|
|
68
69
|
if isinstance(value, str):
|
|
69
70
|
return bool(value.strip()) if value.strip() else None
|
|
70
71
|
return bool(value) if value else None
|
|
71
|
-
|
|
72
|
+
|
|
72
73
|
# If the field is nullable and value is None, that's acceptable
|
|
73
74
|
if nullable and value is None:
|
|
74
75
|
return value
|
|
75
|
-
|
|
76
|
+
|
|
76
77
|
# For non-nullable fields with invalid values, use the default
|
|
77
78
|
if not nullable:
|
|
78
79
|
# Check for invalid values that should be replaced
|
|
79
80
|
invalid_values = (None, "", [], {})
|
|
80
|
-
|
|
81
|
+
|
|
81
82
|
if value in invalid_values or (isinstance(value, str) and not value.strip()):
|
|
82
83
|
# Use the default value if provided, otherwise use type-appropriate default
|
|
83
84
|
if default_value is not None:
|
|
84
85
|
return default_value
|
|
85
|
-
|
|
86
|
+
|
|
86
87
|
# Type-specific defaults for non-nullable fields
|
|
87
|
-
if field_type is int or (hasattr(field_type,
|
|
88
|
+
if field_type is int or (hasattr(field_type, "__origin__") and field_type.__origin__ is int):
|
|
88
89
|
return 0
|
|
89
|
-
elif field_type is float or (hasattr(field_type,
|
|
90
|
+
elif field_type is float or (hasattr(field_type, "__origin__") and field_type.__origin__ is float):
|
|
90
91
|
return 0.0
|
|
91
|
-
elif field_type is str or (hasattr(field_type,
|
|
92
|
+
elif field_type is str or (hasattr(field_type, "__origin__") and field_type.__origin__ is str):
|
|
92
93
|
return ""
|
|
93
|
-
elif field_type is list or (hasattr(field_type,
|
|
94
|
+
elif field_type is list or (hasattr(field_type, "__origin__") and field_type.__origin__ is list):
|
|
94
95
|
return []
|
|
95
|
-
elif field_type is dict or (hasattr(field_type,
|
|
96
|
+
elif field_type is dict or (hasattr(field_type, "__origin__") and field_type.__origin__ is dict):
|
|
96
97
|
return {}
|
|
97
|
-
|
|
98
|
+
|
|
98
99
|
return value
|
|
99
100
|
|
|
100
101
|
|
|
101
|
-
def clean_json_text_with_ai(
|
|
102
|
+
def clean_json_text_with_ai(
|
|
103
|
+
driver: Driver, text: str, model_name: str = "", options: dict[str, Any] | None = None
|
|
104
|
+
) -> str:
|
|
102
105
|
"""Use LLM to fix malformed JSON strings.
|
|
103
106
|
|
|
104
107
|
Generates a specialized prompt instructing the LLM to correct the
|
|
@@ -113,12 +116,14 @@ def clean_json_text_with_ai(driver: Driver, text: str, model_name: str = "", opt
|
|
|
113
116
|
A cleaned string that should contain valid JSON.
|
|
114
117
|
"""
|
|
115
118
|
# Check if JSON is already valid - if so, return unchanged
|
|
119
|
+
if options is None:
|
|
120
|
+
options = {}
|
|
116
121
|
try:
|
|
117
122
|
json.loads(text)
|
|
118
123
|
return text # Already valid, no need for LLM correction
|
|
119
124
|
except json.JSONDecodeError:
|
|
120
125
|
pass # Invalid, proceed with LLM correction
|
|
121
|
-
|
|
126
|
+
|
|
122
127
|
prompt = (
|
|
123
128
|
"The following text is supposed to be a single JSON object, but it is malformed. "
|
|
124
129
|
"Please correct it and return only the valid JSON object. Do not add any explanations or markdown. "
|
|
@@ -135,8 +140,9 @@ def render_output(
|
|
|
135
140
|
content_prompt: str,
|
|
136
141
|
output_format: Literal["text", "html", "markdown"] = "text",
|
|
137
142
|
model_name: str = "",
|
|
138
|
-
options:
|
|
139
|
-
|
|
143
|
+
options: dict[str, Any] | None = None,
|
|
144
|
+
system_prompt: str | None = None,
|
|
145
|
+
) -> dict[str, Any]:
|
|
140
146
|
"""Sends a prompt to the driver and returns the raw output in the requested format.
|
|
141
147
|
|
|
142
148
|
This function is designed for "no fluff" output, instructing the LLM to return
|
|
@@ -159,6 +165,8 @@ def render_output(
|
|
|
159
165
|
Raises:
|
|
160
166
|
ValueError: If an unsupported output format is provided.
|
|
161
167
|
"""
|
|
168
|
+
if options is None:
|
|
169
|
+
options = {}
|
|
162
170
|
if output_format not in ("text", "html", "markdown"):
|
|
163
171
|
raise ValueError(f"Unsupported output_format '{output_format}'. Use 'text', 'html', or 'markdown'.")
|
|
164
172
|
|
|
@@ -174,18 +182,21 @@ def render_output(
|
|
|
174
182
|
"(like ```html ... ```). Do not include conversational filler."
|
|
175
183
|
)
|
|
176
184
|
elif output_format == "markdown":
|
|
177
|
-
instruct =
|
|
178
|
-
"Return valid markdown content. You may use standard markdown formatting."
|
|
179
|
-
)
|
|
185
|
+
instruct = "Return valid markdown content. You may use standard markdown formatting."
|
|
180
186
|
|
|
181
187
|
full_prompt = f"{content_prompt}\n\nSYSTEM INSTRUCTION: {instruct}"
|
|
182
|
-
|
|
183
|
-
#
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
188
|
+
|
|
189
|
+
# Use generate_messages when system_prompt is provided
|
|
190
|
+
if system_prompt is not None:
|
|
191
|
+
messages = [
|
|
192
|
+
{"role": "system", "content": system_prompt},
|
|
193
|
+
{"role": "user", "content": full_prompt},
|
|
194
|
+
]
|
|
195
|
+
resp = driver.generate_messages(messages, options)
|
|
196
|
+
else:
|
|
197
|
+
resp = driver.generate(full_prompt, options)
|
|
187
198
|
raw = resp.get("text", "")
|
|
188
|
-
|
|
199
|
+
|
|
189
200
|
# Clean up potential markdown fences if the model disobeyed for text/html
|
|
190
201
|
if output_format in ("text", "html"):
|
|
191
202
|
# Simple cleanup for common fences if they appear despite instructions
|
|
@@ -204,24 +215,24 @@ def render_output(
|
|
|
204
215
|
"prompt_tokens": resp.get("meta", {}).get("prompt_tokens", 0),
|
|
205
216
|
"completion_tokens": resp.get("meta", {}).get("completion_tokens", 0),
|
|
206
217
|
"cost": resp.get("meta", {}).get("cost", 0.0),
|
|
207
|
-
"model_name": model_name or getattr(driver, "model", "")
|
|
208
|
-
}
|
|
209
|
-
|
|
210
|
-
return {
|
|
211
|
-
"text": raw,
|
|
212
|
-
"usage": usage,
|
|
213
|
-
"output_format": output_format
|
|
218
|
+
"model_name": model_name or getattr(driver, "model", ""),
|
|
214
219
|
}
|
|
215
220
|
|
|
221
|
+
return {"text": raw, "usage": usage, "output_format": output_format}
|
|
222
|
+
|
|
223
|
+
|
|
216
224
|
def ask_for_json(
|
|
217
225
|
driver: Driver,
|
|
218
226
|
content_prompt: str,
|
|
219
|
-
json_schema:
|
|
227
|
+
json_schema: dict[str, Any],
|
|
220
228
|
ai_cleanup: bool = True,
|
|
221
229
|
model_name: str = "",
|
|
222
|
-
options:
|
|
230
|
+
options: dict[str, Any] | None = None,
|
|
223
231
|
output_format: Literal["json", "toon"] = "json",
|
|
224
|
-
|
|
232
|
+
cache: bool | None = None,
|
|
233
|
+
json_mode: Literal["auto", "on", "off"] = "auto",
|
|
234
|
+
system_prompt: str | None = None,
|
|
235
|
+
) -> dict[str, Any]:
|
|
225
236
|
"""Sends a prompt to the driver and returns structured output plus usage metadata.
|
|
226
237
|
|
|
227
238
|
This function enforces a schema-first approach by requiring a json_schema parameter
|
|
@@ -235,6 +246,8 @@ def ask_for_json(
|
|
|
235
246
|
model_name: Optional model identifier used in usage metadata.
|
|
236
247
|
options: Additional options to pass to the driver.
|
|
237
248
|
output_format: Response serialization format ("json" or "toon").
|
|
249
|
+
cache: Override for response caching. ``True`` forces caching on,
|
|
250
|
+
``False`` forces it off, ``None`` defers to the global setting.
|
|
238
251
|
|
|
239
252
|
Returns:
|
|
240
253
|
A dictionary containing:
|
|
@@ -248,26 +261,81 @@ def ask_for_json(
|
|
|
248
261
|
json.JSONDecodeError: If JSON parsing fails and ai_cleanup is False.
|
|
249
262
|
ValueError: If TOON parsing fails.
|
|
250
263
|
"""
|
|
264
|
+
if options is None:
|
|
265
|
+
options = {}
|
|
251
266
|
if output_format not in ("json", "toon"):
|
|
252
267
|
raise ValueError(f"Unsupported output_format '{output_format}'. Use 'json' or 'toon'.")
|
|
253
268
|
|
|
269
|
+
# --- cache lookup ---
|
|
270
|
+
from .cache import get_cache, make_cache_key
|
|
271
|
+
|
|
272
|
+
_cache = get_cache()
|
|
273
|
+
use_cache = cache if cache is not None else _cache.enabled
|
|
274
|
+
_force = cache is True # explicit per-call override
|
|
275
|
+
cache_key: str | None = None
|
|
276
|
+
if use_cache:
|
|
277
|
+
cache_key = make_cache_key(
|
|
278
|
+
prompt=content_prompt,
|
|
279
|
+
model_name=model_name,
|
|
280
|
+
schema=json_schema,
|
|
281
|
+
options=options,
|
|
282
|
+
output_format=output_format,
|
|
283
|
+
)
|
|
284
|
+
cached = _cache.get(cache_key, force=_force)
|
|
285
|
+
if cached is not None:
|
|
286
|
+
cached["usage"]["cache_hit"] = True
|
|
287
|
+
return cached
|
|
288
|
+
|
|
254
289
|
schema_string = json.dumps(json_schema, indent=2)
|
|
255
290
|
if output_format == "toon" and toon is None:
|
|
256
291
|
raise RuntimeError(
|
|
257
|
-
"TOON requested but 'python-toon' is not installed. "
|
|
258
|
-
"Install it with 'pip install python-toon'."
|
|
292
|
+
"TOON requested but 'python-toon' is not installed. Install it with 'pip install python-toon'."
|
|
259
293
|
)
|
|
260
294
|
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
295
|
+
# Determine whether to use native JSON mode
|
|
296
|
+
use_json_mode = False
|
|
297
|
+
if json_mode == "on":
|
|
298
|
+
use_json_mode = True
|
|
299
|
+
elif json_mode == "auto":
|
|
300
|
+
use_json_mode = getattr(driver, "supports_json_mode", False)
|
|
301
|
+
|
|
302
|
+
if use_json_mode:
|
|
303
|
+
options = {**options, "json_mode": True}
|
|
304
|
+
if getattr(driver, "supports_json_schema", False):
|
|
305
|
+
options["json_schema"] = json_schema
|
|
306
|
+
|
|
307
|
+
# Adjust instruction prompt based on JSON mode capabilities
|
|
308
|
+
if use_json_mode and getattr(driver, "supports_json_schema", False):
|
|
309
|
+
# Schema enforced by API — minimal instruction
|
|
310
|
+
instruct = "Extract data matching the requested schema.\nIf a value is unknown use null."
|
|
311
|
+
elif use_json_mode:
|
|
312
|
+
# JSON guaranteed but schema not enforced by API
|
|
313
|
+
instruct = (
|
|
314
|
+
"Return a JSON object that validates against this schema:\n"
|
|
315
|
+
f"{schema_string}\n\n"
|
|
316
|
+
"If a value is unknown use null."
|
|
317
|
+
)
|
|
318
|
+
else:
|
|
319
|
+
# Existing prompt-based enforcement
|
|
320
|
+
instruct = (
|
|
321
|
+
"Return only a single JSON object (no markdown, no extra text) that validates against this JSON schema:\n"
|
|
322
|
+
f"{schema_string}\n\n"
|
|
323
|
+
"If a value is unknown use null. Use double quotes for keys and strings."
|
|
324
|
+
)
|
|
266
325
|
if output_format == "toon":
|
|
267
326
|
instruct += "\n\n(Respond with JSON only; Prompture will convert to TOON.)"
|
|
268
327
|
|
|
269
328
|
full_prompt = f"{content_prompt}\n\n{instruct}"
|
|
270
|
-
|
|
329
|
+
|
|
330
|
+
# Use generate_messages when system_prompt is provided
|
|
331
|
+
if system_prompt is not None:
|
|
332
|
+
messages = [
|
|
333
|
+
{"role": "system", "content": system_prompt},
|
|
334
|
+
{"role": "user", "content": full_prompt},
|
|
335
|
+
]
|
|
336
|
+
resp = driver.generate_messages(messages, options)
|
|
337
|
+
else:
|
|
338
|
+
resp = driver.generate(full_prompt, options)
|
|
271
339
|
raw = resp.get("text", "")
|
|
272
340
|
cleaned = clean_json_text(raw)
|
|
273
341
|
|
|
@@ -285,18 +353,20 @@ def ask_for_json(
|
|
|
285
353
|
"prompt_tokens": resp.get("meta", {}).get("prompt_tokens", 0),
|
|
286
354
|
"completion_tokens": resp.get("meta", {}).get("completion_tokens", 0),
|
|
287
355
|
"cost": resp.get("meta", {}).get("cost", 0.0),
|
|
288
|
-
"model_name": model_name or getattr(driver, "model", "")
|
|
289
|
-
}
|
|
290
|
-
result = {
|
|
291
|
-
"json_string": json_string,
|
|
292
|
-
"json_object": json_obj,
|
|
293
|
-
"usage": usage
|
|
356
|
+
"model_name": model_name or getattr(driver, "model", ""),
|
|
294
357
|
}
|
|
358
|
+
result = {"json_string": json_string, "json_object": json_obj, "usage": usage}
|
|
295
359
|
if toon_string is not None:
|
|
296
360
|
result["toon_string"] = toon_string
|
|
297
361
|
result["output_format"] = "toon"
|
|
298
362
|
else:
|
|
299
363
|
result["output_format"] = "json"
|
|
364
|
+
|
|
365
|
+
# --- cache store ---
|
|
366
|
+
if use_cache and cache_key is not None:
|
|
367
|
+
cached_copy = {**result, "usage": {**result["usage"], "raw_response": {}}}
|
|
368
|
+
_cache.set(cache_key, cached_copy, force=_force)
|
|
369
|
+
|
|
300
370
|
return result
|
|
301
371
|
except json.JSONDecodeError as e:
|
|
302
372
|
if ai_cleanup:
|
|
@@ -312,30 +382,38 @@ def ask_for_json(
|
|
|
312
382
|
"total_tokens": 0,
|
|
313
383
|
"cost": 0.0,
|
|
314
384
|
"model_name": options.get("model", getattr(driver, "model", "")),
|
|
315
|
-
"raw_response": {}
|
|
385
|
+
"raw_response": {},
|
|
316
386
|
},
|
|
317
387
|
"output_format": "json" if output_format != "toon" else "toon",
|
|
318
388
|
}
|
|
319
389
|
if output_format == "toon":
|
|
320
390
|
result["toon_string"] = toon.encode(json_obj)
|
|
391
|
+
|
|
392
|
+
# --- cache store (ai cleanup path) ---
|
|
393
|
+
if use_cache and cache_key is not None:
|
|
394
|
+
_cache.set(cache_key, result, force=_force)
|
|
395
|
+
|
|
321
396
|
return result
|
|
322
397
|
except json.JSONDecodeError:
|
|
323
|
-
raise e
|
|
398
|
+
raise e from None
|
|
324
399
|
else:
|
|
325
400
|
raise e
|
|
326
401
|
|
|
402
|
+
|
|
327
403
|
def extract_and_jsonify(
|
|
328
404
|
text: Union[str, Driver], # Can be either text or driver for backward compatibility
|
|
329
|
-
json_schema:
|
|
405
|
+
json_schema: dict[str, Any],
|
|
330
406
|
*, # Force keyword arguments for remaining params
|
|
331
|
-
model_name: Union[str,
|
|
407
|
+
model_name: Union[str, dict[str, Any]] = "", # Can be schema (old) or model name (new)
|
|
332
408
|
instruction_template: str = "Extract information from the following text:",
|
|
333
409
|
ai_cleanup: bool = True,
|
|
334
410
|
output_format: Literal["json", "toon"] = "json",
|
|
335
|
-
options:
|
|
336
|
-
|
|
411
|
+
options: dict[str, Any] | None = None,
|
|
412
|
+
json_mode: Literal["auto", "on", "off"] = "auto",
|
|
413
|
+
system_prompt: str | None = None,
|
|
414
|
+
) -> dict[str, Any]:
|
|
337
415
|
"""Extracts structured information using automatic driver selection based on model name.
|
|
338
|
-
|
|
416
|
+
|
|
339
417
|
Args:
|
|
340
418
|
text: The raw text to extract information from.
|
|
341
419
|
json_schema: JSON schema dictionary defining the expected structure.
|
|
@@ -344,18 +422,20 @@ def extract_and_jsonify(
|
|
|
344
422
|
ai_cleanup: Whether to attempt AI-based cleanup if JSON parsing fails.
|
|
345
423
|
output_format: Response serialization format ("json" or "toon").
|
|
346
424
|
options: Additional options to pass to the driver.
|
|
347
|
-
|
|
425
|
+
|
|
348
426
|
Returns:
|
|
349
427
|
A dictionary containing:
|
|
350
428
|
- json_string: the JSON string output.
|
|
351
429
|
- json_object: the parsed JSON object.
|
|
352
430
|
- usage: token usage and cost information from the driver's meta object.
|
|
353
|
-
|
|
431
|
+
|
|
354
432
|
Raises:
|
|
355
433
|
ValueError: If text is empty or None, or if model_name format is invalid.
|
|
356
434
|
json.JSONDecodeError: If the response cannot be parsed as JSON and ai_cleanup is False.
|
|
357
435
|
pytest.skip: If a ConnectionError occurs during testing (when pytest is running).
|
|
358
436
|
"""
|
|
437
|
+
if options is None:
|
|
438
|
+
options = {}
|
|
359
439
|
actual_template = instruction_template
|
|
360
440
|
actual_output_format = output_format
|
|
361
441
|
# Handle legacy format where first argument is driver
|
|
@@ -381,18 +461,18 @@ def extract_and_jsonify(
|
|
|
381
461
|
if driver is None:
|
|
382
462
|
if not actual_model:
|
|
383
463
|
raise ValueError("Model name cannot be empty")
|
|
384
|
-
|
|
464
|
+
|
|
385
465
|
# First validate model format
|
|
386
466
|
if "/" not in actual_model:
|
|
387
467
|
raise ValueError("Invalid model string format. Expected format: 'provider/model'")
|
|
388
|
-
|
|
468
|
+
|
|
389
469
|
try:
|
|
390
470
|
driver = get_driver_for_model(actual_model)
|
|
391
471
|
except ValueError as e:
|
|
392
472
|
if "Unsupported provider" in str(e):
|
|
393
|
-
raise ValueError(f"Unsupported provider in model name: {actual_model}")
|
|
473
|
+
raise ValueError(f"Unsupported provider in model name: {actual_model}") from e
|
|
394
474
|
raise # Re-raise any other ValueError
|
|
395
|
-
|
|
475
|
+
|
|
396
476
|
# Extract model parts for other validation
|
|
397
477
|
try:
|
|
398
478
|
provider, model_id = actual_model.split("/", 1)
|
|
@@ -401,11 +481,11 @@ def extract_and_jsonify(
|
|
|
401
481
|
except ValueError:
|
|
402
482
|
# If no "/" in model string, use entire string as both provider and model_id
|
|
403
483
|
provider = model_id = actual_model
|
|
404
|
-
|
|
484
|
+
|
|
405
485
|
opts = {**options, "model": model_id}
|
|
406
|
-
|
|
486
|
+
|
|
407
487
|
content_prompt = f"{actual_template} {actual_text}"
|
|
408
|
-
|
|
488
|
+
|
|
409
489
|
try:
|
|
410
490
|
return ask_for_json(
|
|
411
491
|
driver,
|
|
@@ -415,24 +495,29 @@ def extract_and_jsonify(
|
|
|
415
495
|
model_id,
|
|
416
496
|
opts,
|
|
417
497
|
output_format=actual_output_format,
|
|
498
|
+
json_mode=json_mode,
|
|
499
|
+
system_prompt=system_prompt,
|
|
418
500
|
)
|
|
419
501
|
except (requests.exceptions.ConnectionError, requests.exceptions.HTTPError) as e:
|
|
420
502
|
if "pytest" in sys.modules:
|
|
421
503
|
import pytest
|
|
504
|
+
|
|
422
505
|
pytest.skip(f"Connection error occurred: {e}")
|
|
423
|
-
raise ConnectionError(f"Connection error occurred: {e}")
|
|
506
|
+
raise ConnectionError(f"Connection error occurred: {e}") from e
|
|
507
|
+
|
|
424
508
|
|
|
425
509
|
def manual_extract_and_jsonify(
|
|
426
510
|
driver: Driver,
|
|
427
511
|
text: str,
|
|
428
|
-
json_schema:
|
|
512
|
+
json_schema: dict[str, Any],
|
|
429
513
|
model_name: str = "",
|
|
430
514
|
instruction_template: str = "Extract information from the following text:",
|
|
431
515
|
ai_cleanup: bool = True,
|
|
432
516
|
output_format: Literal["json", "toon"] = "json",
|
|
433
|
-
options:
|
|
434
|
-
|
|
435
|
-
|
|
517
|
+
options: dict[str, Any] | None = None,
|
|
518
|
+
json_mode: Literal["auto", "on", "off"] = "auto",
|
|
519
|
+
system_prompt: str | None = None,
|
|
520
|
+
) -> dict[str, Any]:
|
|
436
521
|
"""Extracts structured information using an explicitly provided driver.
|
|
437
522
|
|
|
438
523
|
This variant is useful when you want to directly control which driver
|
|
@@ -448,7 +533,6 @@ def manual_extract_and_jsonify(
|
|
|
448
533
|
ai_cleanup: Whether to attempt AI-based cleanup if JSON parsing fails.
|
|
449
534
|
output_format: Response serialization format ("json" or "toon").
|
|
450
535
|
options: Additional options to pass to the driver.
|
|
451
|
-
verbose_level: Logging level for debug output (LogLevel.OFF by default).
|
|
452
536
|
|
|
453
537
|
Returns:
|
|
454
538
|
A dictionary containing:
|
|
@@ -460,31 +544,30 @@ def manual_extract_and_jsonify(
|
|
|
460
544
|
ValueError: If text is empty or None.
|
|
461
545
|
json.JSONDecodeError: If the response cannot be parsed as JSON and ai_cleanup is False.
|
|
462
546
|
"""
|
|
547
|
+
if options is None:
|
|
548
|
+
options = {}
|
|
463
549
|
if not isinstance(text, str):
|
|
464
550
|
raise ValueError("Text input must be a string")
|
|
465
|
-
|
|
551
|
+
|
|
466
552
|
if not text or not text.strip():
|
|
467
553
|
raise ValueError("Text input cannot be empty")
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
"text_length"
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
|
|
554
|
+
|
|
555
|
+
logger.info("[manual] Starting manual extraction")
|
|
556
|
+
logger.debug(
|
|
557
|
+
"[manual] text_length=%d model_name=%s schema_keys=%s",
|
|
558
|
+
len(text),
|
|
559
|
+
model_name,
|
|
560
|
+
list(json_schema.keys()) if json_schema else [],
|
|
561
|
+
)
|
|
475
562
|
|
|
476
563
|
opts = dict(options)
|
|
477
564
|
if model_name:
|
|
478
565
|
opts["model"] = model_name
|
|
479
566
|
|
|
480
|
-
# Generate the content prompt
|
|
481
567
|
content_prompt = f"{instruction_template} {text}"
|
|
482
|
-
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
log_debug(LogLevel.TRACE, verbose_level, {"content_prompt": content_prompt}, prefix="[manual]")
|
|
486
|
-
|
|
487
|
-
# Call ask_for_json and log the result
|
|
568
|
+
|
|
569
|
+
logger.debug("[manual] Generated prompt for extraction")
|
|
570
|
+
|
|
488
571
|
result = ask_for_json(
|
|
489
572
|
driver,
|
|
490
573
|
content_prompt,
|
|
@@ -493,22 +576,26 @@ def manual_extract_and_jsonify(
|
|
|
493
576
|
model_name,
|
|
494
577
|
opts,
|
|
495
578
|
output_format=output_format,
|
|
579
|
+
json_mode=json_mode,
|
|
580
|
+
system_prompt=system_prompt,
|
|
496
581
|
)
|
|
497
|
-
|
|
498
|
-
|
|
499
|
-
|
|
582
|
+
logger.debug("[manual] Manual extraction completed successfully")
|
|
583
|
+
|
|
500
584
|
return result
|
|
501
585
|
|
|
586
|
+
|
|
502
587
|
def extract_with_model(
|
|
503
|
-
model_cls: Union[
|
|
504
|
-
text: Union[str,
|
|
505
|
-
model_name: Union[str,
|
|
588
|
+
model_cls: Union[type[BaseModel], str], # Can be model class or model name string for legacy support
|
|
589
|
+
text: Union[str, dict[str, Any]], # Can be text or schema for legacy support
|
|
590
|
+
model_name: Union[str, dict[str, Any]], # Can be model name or text for legacy support
|
|
506
591
|
instruction_template: str = "Extract information from the following text:",
|
|
507
592
|
ai_cleanup: bool = True,
|
|
508
593
|
output_format: Literal["json", "toon"] = "json",
|
|
509
|
-
options:
|
|
510
|
-
|
|
511
|
-
|
|
594
|
+
options: dict[str, Any] | None = None,
|
|
595
|
+
cache: bool | None = None,
|
|
596
|
+
json_mode: Literal["auto", "on", "off"] = "auto",
|
|
597
|
+
system_prompt: str | None = None,
|
|
598
|
+
) -> dict[str, Any]:
|
|
512
599
|
"""Extracts structured information into a Pydantic model instance.
|
|
513
600
|
|
|
514
601
|
Converts the Pydantic model to its JSON schema and uses auto-resolved driver based on model_name
|
|
@@ -522,7 +609,8 @@ def extract_with_model(
|
|
|
522
609
|
ai_cleanup: Whether to attempt AI-based cleanup if JSON parsing fails.
|
|
523
610
|
output_format: Response serialization format ("json" or "toon").
|
|
524
611
|
options: Additional options to pass to the driver.
|
|
525
|
-
|
|
612
|
+
cache: Override for response caching. ``True`` forces caching on,
|
|
613
|
+
``False`` forces it off, ``None`` defers to the global setting.
|
|
526
614
|
|
|
527
615
|
Returns:
|
|
528
616
|
A validated instance of the Pydantic model.
|
|
@@ -532,6 +620,8 @@ def extract_with_model(
|
|
|
532
620
|
ValidationError: If the extracted data doesn't match the model schema.
|
|
533
621
|
"""
|
|
534
622
|
# Handle legacy format where first arg is model class
|
|
623
|
+
if options is None:
|
|
624
|
+
options = {}
|
|
535
625
|
if isinstance(model_cls, type) and issubclass(model_cls, BaseModel):
|
|
536
626
|
actual_cls = model_cls
|
|
537
627
|
actual_text = text
|
|
@@ -544,19 +634,46 @@ def extract_with_model(
|
|
|
544
634
|
|
|
545
635
|
if not isinstance(actual_text, str) or not actual_text.strip():
|
|
546
636
|
raise ValueError("Text input cannot be empty")
|
|
547
|
-
|
|
548
|
-
#
|
|
549
|
-
|
|
550
|
-
|
|
551
|
-
|
|
552
|
-
|
|
553
|
-
|
|
554
|
-
|
|
637
|
+
|
|
638
|
+
# --- cache lookup ---
|
|
639
|
+
from .cache import get_cache, make_cache_key
|
|
640
|
+
|
|
641
|
+
_cache = get_cache()
|
|
642
|
+
use_cache = cache if cache is not None else _cache.enabled
|
|
643
|
+
_force = cache is True
|
|
644
|
+
cache_key: str | None = None
|
|
645
|
+
if use_cache:
|
|
646
|
+
schema_for_key = actual_cls.model_json_schema()
|
|
647
|
+
cache_key = make_cache_key(
|
|
648
|
+
prompt=f"{instruction_template} {actual_text}",
|
|
649
|
+
model_name=actual_model if isinstance(actual_model, str) else "",
|
|
650
|
+
schema=schema_for_key,
|
|
651
|
+
options=options,
|
|
652
|
+
output_format=output_format,
|
|
653
|
+
pydantic_qualname=actual_cls.__qualname__,
|
|
654
|
+
)
|
|
655
|
+
cached = _cache.get(cache_key, force=_force)
|
|
656
|
+
if cached is not None:
|
|
657
|
+
cached["usage"]["cache_hit"] = True
|
|
658
|
+
# Reconstruct Pydantic model instance from cached JSON
|
|
659
|
+
cached["model"] = actual_cls(**cached["json_object"])
|
|
660
|
+
return type(
|
|
661
|
+
"ExtractResult",
|
|
662
|
+
(dict,),
|
|
663
|
+
{"__getattr__": lambda self, key: self.get(key), "__call__": lambda self: self["model"]},
|
|
664
|
+
)(cached)
|
|
665
|
+
|
|
666
|
+
logger.info("[extract] Starting extract_with_model")
|
|
667
|
+
logger.debug(
|
|
668
|
+
"[extract] model_cls=%s text_length=%d model_name=%s",
|
|
669
|
+
actual_cls.__name__,
|
|
670
|
+
len(actual_text),
|
|
671
|
+
actual_model,
|
|
672
|
+
)
|
|
555
673
|
|
|
556
674
|
schema = actual_cls.model_json_schema()
|
|
557
|
-
|
|
558
|
-
|
|
559
|
-
|
|
675
|
+
logger.debug("[extract] Generated JSON schema")
|
|
676
|
+
|
|
560
677
|
result = extract_and_jsonify(
|
|
561
678
|
text=actual_text,
|
|
562
679
|
json_schema=schema,
|
|
@@ -564,62 +681,75 @@ def extract_with_model(
|
|
|
564
681
|
instruction_template=instruction_template,
|
|
565
682
|
ai_cleanup=ai_cleanup,
|
|
566
683
|
output_format=output_format,
|
|
567
|
-
options=options
|
|
684
|
+
options=options,
|
|
685
|
+
json_mode=json_mode,
|
|
686
|
+
system_prompt=system_prompt,
|
|
568
687
|
)
|
|
569
|
-
|
|
570
|
-
|
|
571
|
-
|
|
688
|
+
logger.debug("[extract] Extraction completed successfully")
|
|
689
|
+
|
|
572
690
|
# Post-process the extracted JSON object to normalize invalid values
|
|
573
691
|
json_object = result["json_object"]
|
|
574
692
|
schema_properties = schema.get("properties", {})
|
|
575
|
-
|
|
693
|
+
|
|
576
694
|
for field_name, field_info in actual_cls.model_fields.items():
|
|
577
695
|
if field_name in json_object and field_name in schema_properties:
|
|
578
|
-
|
|
696
|
+
schema_properties[field_name]
|
|
579
697
|
field_def = {
|
|
580
|
-
"nullable": not schema_properties[field_name].get("type")
|
|
581
|
-
|
|
582
|
-
|
|
698
|
+
"nullable": not schema_properties[field_name].get("type")
|
|
699
|
+
or "null"
|
|
700
|
+
in (
|
|
701
|
+
schema_properties[field_name].get("anyOf", [])
|
|
702
|
+
if isinstance(schema_properties[field_name].get("anyOf"), list)
|
|
703
|
+
else []
|
|
704
|
+
),
|
|
705
|
+
"default": field_info.default
|
|
706
|
+
if hasattr(field_info, "default") and field_info.default is not ...
|
|
707
|
+
else None,
|
|
583
708
|
}
|
|
584
|
-
|
|
709
|
+
|
|
585
710
|
# Normalize the value
|
|
586
|
-
json_object[field_name] = normalize_field_value(
|
|
587
|
-
|
|
588
|
-
field_info.annotation,
|
|
589
|
-
field_def
|
|
590
|
-
)
|
|
591
|
-
|
|
711
|
+
json_object[field_name] = normalize_field_value(json_object[field_name], field_info.annotation, field_def)
|
|
712
|
+
|
|
592
713
|
# Create model instance for validation
|
|
593
714
|
model_instance = actual_cls(**json_object)
|
|
594
|
-
|
|
715
|
+
|
|
595
716
|
# Return dictionary with all required fields and backwards compatibility
|
|
596
|
-
result_dict = {
|
|
597
|
-
|
|
598
|
-
|
|
599
|
-
|
|
600
|
-
|
|
601
|
-
|
|
717
|
+
result_dict = {"json_string": result["json_string"], "json_object": result["json_object"], "usage": result["usage"]}
|
|
718
|
+
|
|
719
|
+
# --- cache store ---
|
|
720
|
+
if use_cache and cache_key is not None:
|
|
721
|
+
cached_copy = {
|
|
722
|
+
"json_string": result_dict["json_string"],
|
|
723
|
+
"json_object": result_dict["json_object"],
|
|
724
|
+
"usage": {**result_dict["usage"], "raw_response": {}},
|
|
725
|
+
}
|
|
726
|
+
_cache.set(cache_key, cached_copy, force=_force)
|
|
727
|
+
|
|
602
728
|
# Add backwards compatibility property
|
|
603
729
|
result_dict["model"] = model_instance
|
|
604
|
-
|
|
730
|
+
|
|
605
731
|
# Return value can be used both as a dict and accessed as model directly
|
|
606
|
-
return type(
|
|
607
|
-
"
|
|
608
|
-
|
|
609
|
-
|
|
732
|
+
return type(
|
|
733
|
+
"ExtractResult",
|
|
734
|
+
(dict,),
|
|
735
|
+
{"__getattr__": lambda self, key: self.get(key), "__call__": lambda self: self["model"]},
|
|
736
|
+
)(result_dict)
|
|
737
|
+
|
|
610
738
|
|
|
611
739
|
def stepwise_extract_with_model(
|
|
612
|
-
model_cls:
|
|
740
|
+
model_cls: type[BaseModel],
|
|
613
741
|
text: str,
|
|
614
742
|
*, # Force keyword arguments for remaining params
|
|
615
743
|
model_name: str,
|
|
616
744
|
instruction_template: str = "Extract the {field_name} from the following text:",
|
|
617
745
|
ai_cleanup: bool = True,
|
|
618
|
-
fields:
|
|
619
|
-
field_definitions:
|
|
620
|
-
options:
|
|
621
|
-
|
|
622
|
-
|
|
746
|
+
fields: list[str] | None = None,
|
|
747
|
+
field_definitions: dict[str, Any] | None = None,
|
|
748
|
+
options: dict[str, Any] | None = None,
|
|
749
|
+
json_mode: Literal["auto", "on", "off"] = "auto",
|
|
750
|
+
system_prompt: str | None = None,
|
|
751
|
+
share_context: bool = False,
|
|
752
|
+
) -> dict[str, Union[str, dict[str, Any]]]:
|
|
623
753
|
"""Extracts structured information into a Pydantic model by processing each field individually.
|
|
624
754
|
|
|
625
755
|
For each field in the model, makes a separate LLM call to extract that specific field,
|
|
@@ -637,7 +767,6 @@ def stepwise_extract_with_model(
|
|
|
637
767
|
field_definitions: Optional field definitions dict for enhanced default handling.
|
|
638
768
|
If None, automatically uses the global field registry.
|
|
639
769
|
options: Additional options to pass to the driver.
|
|
640
|
-
verbose_level: Logging level for debug output (LogLevel.OFF by default).
|
|
641
770
|
|
|
642
771
|
Returns:
|
|
643
772
|
A dictionary containing:
|
|
@@ -648,7 +777,7 @@ def stepwise_extract_with_model(
|
|
|
648
777
|
Raises:
|
|
649
778
|
ValueError: If text is empty or None, or if model_name format is invalid.
|
|
650
779
|
KeyError: If a requested field doesn't exist in the model.
|
|
651
|
-
|
|
780
|
+
|
|
652
781
|
Note:
|
|
653
782
|
This function now gracefully handles extraction failures by falling back to default
|
|
654
783
|
values rather than failing completely. Individual field errors are logged and
|
|
@@ -656,25 +785,40 @@ def stepwise_extract_with_model(
|
|
|
656
785
|
"""
|
|
657
786
|
if not text or not text.strip():
|
|
658
787
|
raise ValueError("Text input cannot be empty")
|
|
659
|
-
|
|
660
|
-
|
|
661
|
-
|
|
662
|
-
|
|
663
|
-
|
|
664
|
-
|
|
665
|
-
|
|
788
|
+
|
|
789
|
+
# When share_context=True, delegate to Conversation-based extraction
|
|
790
|
+
if share_context:
|
|
791
|
+
from .conversation import Conversation
|
|
792
|
+
|
|
793
|
+
conv = Conversation(model_name=model_name, system_prompt=system_prompt, options=options)
|
|
794
|
+
return conv._stepwise_extract(
|
|
795
|
+
model_cls=model_cls,
|
|
796
|
+
text=text,
|
|
797
|
+
instruction_template=instruction_template,
|
|
798
|
+
ai_cleanup=ai_cleanup,
|
|
799
|
+
fields=fields,
|
|
800
|
+
field_definitions=field_definitions,
|
|
801
|
+
json_mode=json_mode,
|
|
802
|
+
)
|
|
803
|
+
|
|
804
|
+
logger.info("[stepwise] Starting stepwise extraction")
|
|
805
|
+
logger.debug(
|
|
806
|
+
"[stepwise] model_cls=%s text_length=%d fields=%s",
|
|
807
|
+
model_cls.__name__,
|
|
808
|
+
len(text),
|
|
809
|
+
fields,
|
|
810
|
+
)
|
|
666
811
|
|
|
667
812
|
# Auto-use global field registry if no field_definitions provided
|
|
668
813
|
if field_definitions is None:
|
|
669
814
|
field_definitions = get_registry_snapshot()
|
|
670
|
-
|
|
671
|
-
log_debug(LogLevel.TRACE, verbose_level, {"registry_fields": list(field_definitions.keys())}, prefix="[stepwise]")
|
|
815
|
+
logger.debug("[stepwise] Using global field registry")
|
|
672
816
|
|
|
673
817
|
data = {}
|
|
674
818
|
validation_errors = []
|
|
675
819
|
field_results = {} # Track success/failure per field
|
|
676
820
|
options = options or {}
|
|
677
|
-
|
|
821
|
+
|
|
678
822
|
# Initialize usage accumulator
|
|
679
823
|
accumulated_usage = {
|
|
680
824
|
"prompt_tokens": 0,
|
|
@@ -682,7 +826,7 @@ def stepwise_extract_with_model(
|
|
|
682
826
|
"total_tokens": 0,
|
|
683
827
|
"cost": 0.0,
|
|
684
828
|
"model_name": model_name, # Use provided model_name directly
|
|
685
|
-
"field_usages": {}
|
|
829
|
+
"field_usages": {},
|
|
686
830
|
}
|
|
687
831
|
|
|
688
832
|
# Get valid field names from the model
|
|
@@ -698,28 +842,16 @@ def stepwise_extract_with_model(
|
|
|
698
842
|
field_items = model_cls.model_fields.items()
|
|
699
843
|
|
|
700
844
|
for field_name, field_info in field_items:
|
|
701
|
-
|
|
702
|
-
log_debug(LogLevel.DEBUG, verbose_level, f"Extracting field: {field_name}", prefix="[stepwise]")
|
|
703
|
-
log_debug(LogLevel.TRACE, verbose_level, {
|
|
704
|
-
"field_name": field_name,
|
|
705
|
-
"field_info": str(field_info),
|
|
706
|
-
"field_type": str(field_info.annotation)
|
|
707
|
-
}, prefix="[stepwise]")
|
|
845
|
+
logger.debug("[stepwise] Extracting field: %s", field_name)
|
|
708
846
|
|
|
709
847
|
# Create field schema that expects a direct value rather than a dict
|
|
710
848
|
field_schema = {
|
|
711
849
|
"value": {
|
|
712
|
-
"type": "integer" if field_info.annotation
|
|
713
|
-
"description": field_info.description or f"Value for {field_name}"
|
|
850
|
+
"type": "integer" if field_info.annotation is int else "string",
|
|
851
|
+
"description": field_info.description or f"Value for {field_name}",
|
|
714
852
|
}
|
|
715
853
|
}
|
|
716
854
|
|
|
717
|
-
# Add structured logging for field schema and prompt
|
|
718
|
-
log_debug(LogLevel.TRACE, verbose_level, {
|
|
719
|
-
"field_schema": field_schema,
|
|
720
|
-
"prompt_template": instruction_template.format(field_name=field_name)
|
|
721
|
-
}, prefix="[stepwise]")
|
|
722
|
-
|
|
723
855
|
try:
|
|
724
856
|
result = extract_and_jsonify(
|
|
725
857
|
text=text,
|
|
@@ -727,12 +859,12 @@ def stepwise_extract_with_model(
|
|
|
727
859
|
model_name=model_name,
|
|
728
860
|
instruction_template=instruction_template.format(field_name=field_name),
|
|
729
861
|
ai_cleanup=ai_cleanup,
|
|
730
|
-
options=options
|
|
862
|
+
options=options,
|
|
863
|
+
json_mode=json_mode,
|
|
864
|
+
system_prompt=system_prompt,
|
|
731
865
|
)
|
|
732
866
|
|
|
733
|
-
|
|
734
|
-
log_debug(LogLevel.DEBUG, verbose_level, f"Raw extraction result for {field_name}", prefix="[stepwise]")
|
|
735
|
-
log_debug(LogLevel.TRACE, verbose_level, {"result": result}, prefix="[stepwise]")
|
|
867
|
+
logger.debug("[stepwise] Raw extraction result for %s", field_name)
|
|
736
868
|
|
|
737
869
|
# Accumulate usage data from this field extraction
|
|
738
870
|
field_usage = result.get("usage", {})
|
|
@@ -744,139 +876,125 @@ def stepwise_extract_with_model(
|
|
|
744
876
|
|
|
745
877
|
# Extract the raw value from the response - handle both dict and direct value formats
|
|
746
878
|
extracted_value = result["json_object"]["value"]
|
|
747
|
-
|
|
748
|
-
|
|
749
|
-
|
|
879
|
+
logger.debug("[stepwise] Raw extracted value for %s: %s", field_name, extracted_value)
|
|
880
|
+
|
|
750
881
|
if isinstance(extracted_value, dict) and "value" in extracted_value:
|
|
751
882
|
raw_value = extracted_value["value"]
|
|
752
|
-
|
|
883
|
+
logger.debug("[stepwise] Extracted inner value from dict for %s", field_name)
|
|
753
884
|
else:
|
|
754
885
|
raw_value = extracted_value
|
|
755
|
-
|
|
756
|
-
|
|
757
|
-
log_debug(LogLevel.DEBUG, verbose_level, {"field_name": field_name, "raw_value": raw_value}, prefix="[stepwise]")
|
|
886
|
+
logger.debug("[stepwise] Using direct value for %s", field_name)
|
|
758
887
|
|
|
759
888
|
# Post-process the raw value to normalize invalid values for non-nullable fields
|
|
760
889
|
field_def = {}
|
|
761
890
|
if field_definitions and field_name in field_definitions:
|
|
762
891
|
field_def = field_definitions[field_name] if isinstance(field_definitions[field_name], dict) else {}
|
|
763
|
-
|
|
892
|
+
|
|
764
893
|
# Determine nullable status and default value
|
|
765
894
|
nullable = field_def.get("nullable", True)
|
|
766
895
|
default_value = field_def.get("default")
|
|
767
|
-
if
|
|
768
|
-
|
|
769
|
-
|
|
770
|
-
|
|
896
|
+
if (
|
|
897
|
+
default_value is None
|
|
898
|
+
and hasattr(field_info, "default")
|
|
899
|
+
and field_info.default is not ...
|
|
900
|
+
and str(field_info.default) != "PydanticUndefined"
|
|
901
|
+
):
|
|
902
|
+
default_value = field_info.default
|
|
903
|
+
|
|
771
904
|
# Create field_def for normalize_field_value
|
|
772
|
-
normalize_def = {
|
|
773
|
-
|
|
774
|
-
"default": default_value
|
|
775
|
-
}
|
|
776
|
-
|
|
905
|
+
normalize_def = {"nullable": nullable, "default": default_value}
|
|
906
|
+
|
|
777
907
|
# Normalize the raw value before conversion
|
|
778
908
|
raw_value = normalize_field_value(raw_value, field_info.annotation, normalize_def)
|
|
779
|
-
|
|
909
|
+
logger.debug("[stepwise] Normalized value for %s: %s", field_name, raw_value)
|
|
780
910
|
|
|
781
911
|
# Convert value using tools.convert_value with logging
|
|
782
912
|
try:
|
|
783
|
-
converted_value = convert_value(
|
|
784
|
-
raw_value,
|
|
785
|
-
field_info.annotation,
|
|
786
|
-
allow_shorthand=True
|
|
787
|
-
)
|
|
913
|
+
converted_value = convert_value(raw_value, field_info.annotation, allow_shorthand=True)
|
|
788
914
|
data[field_name] = converted_value
|
|
789
915
|
field_results[field_name] = {"status": "success", "used_default": False}
|
|
790
916
|
|
|
791
|
-
|
|
792
|
-
|
|
793
|
-
log_debug(LogLevel.TRACE, verbose_level, {
|
|
794
|
-
"field_name": field_name,
|
|
795
|
-
"converted_value": converted_value
|
|
796
|
-
}, prefix="[stepwise]")
|
|
797
|
-
|
|
917
|
+
logger.debug("[stepwise] Successfully converted %s", field_name)
|
|
918
|
+
|
|
798
919
|
except ValueError as e:
|
|
799
|
-
error_msg = f"Type conversion failed for {field_name}: {
|
|
800
|
-
|
|
920
|
+
error_msg = f"Type conversion failed for {field_name}: {e!s}"
|
|
921
|
+
|
|
801
922
|
# Check if field has a default value (either explicit or from field_definitions)
|
|
802
923
|
has_default = False
|
|
803
924
|
if field_definitions and field_name in field_definitions:
|
|
804
925
|
field_def = field_definitions[field_name]
|
|
805
|
-
if isinstance(field_def, dict) and
|
|
926
|
+
if isinstance(field_def, dict) and "default" in field_def:
|
|
806
927
|
has_default = True
|
|
807
|
-
|
|
808
|
-
if not has_default and hasattr(field_info,
|
|
928
|
+
|
|
929
|
+
if not has_default and hasattr(field_info, "default"):
|
|
809
930
|
default_val = field_info.default
|
|
810
931
|
# Field has default if it's not PydanticUndefined or Ellipsis
|
|
811
|
-
if default_val is not ... and str(default_val) !=
|
|
932
|
+
if default_val is not ... and str(default_val) != "PydanticUndefined":
|
|
812
933
|
has_default = True
|
|
813
|
-
|
|
934
|
+
|
|
814
935
|
# Only add to validation_errors if field is required (no default)
|
|
815
936
|
if not has_default:
|
|
816
937
|
validation_errors.append(error_msg)
|
|
817
|
-
|
|
938
|
+
|
|
818
939
|
# Use default value (type-appropriate if no explicit default)
|
|
819
940
|
default_value = get_field_default(field_name, field_info, field_definitions)
|
|
820
941
|
data[field_name] = default_value
|
|
821
942
|
field_results[field_name] = {"status": "conversion_failed", "error": error_msg, "used_default": True}
|
|
822
|
-
|
|
823
|
-
|
|
824
|
-
|
|
825
|
-
|
|
826
|
-
|
|
943
|
+
|
|
944
|
+
logger.error("[stepwise] %s", error_msg)
|
|
945
|
+
logger.info("[stepwise] Using default value for %s: %s", field_name, default_value)
|
|
946
|
+
|
|
827
947
|
except Exception as e:
|
|
828
|
-
error_msg = f"Extraction failed for {field_name}: {
|
|
829
|
-
|
|
948
|
+
error_msg = f"Extraction failed for {field_name}: {e!s}"
|
|
949
|
+
|
|
830
950
|
# Check if field has a default value (either explicit or from field_definitions)
|
|
831
951
|
has_default = False
|
|
832
952
|
if field_definitions and field_name in field_definitions:
|
|
833
953
|
field_def = field_definitions[field_name]
|
|
834
|
-
if isinstance(field_def, dict) and
|
|
954
|
+
if isinstance(field_def, dict) and "default" in field_def:
|
|
835
955
|
has_default = True
|
|
836
|
-
|
|
837
|
-
if not has_default and hasattr(field_info,
|
|
956
|
+
|
|
957
|
+
if not has_default and hasattr(field_info, "default"):
|
|
838
958
|
default_val = field_info.default
|
|
839
959
|
# Field has default if it's not PydanticUndefined or Ellipsis
|
|
840
|
-
if default_val is not ... and str(default_val) !=
|
|
960
|
+
if default_val is not ... and str(default_val) != "PydanticUndefined":
|
|
841
961
|
has_default = True
|
|
842
|
-
|
|
962
|
+
|
|
843
963
|
# Only add to validation_errors if field is required (no default)
|
|
844
964
|
if not has_default:
|
|
845
965
|
validation_errors.append(error_msg)
|
|
846
|
-
|
|
966
|
+
|
|
847
967
|
# Use default value (type-appropriate if no explicit default)
|
|
848
968
|
default_value = get_field_default(field_name, field_info, field_definitions)
|
|
849
969
|
data[field_name] = default_value
|
|
850
970
|
field_results[field_name] = {"status": "extraction_failed", "error": error_msg, "used_default": True}
|
|
851
|
-
|
|
852
|
-
|
|
853
|
-
|
|
854
|
-
|
|
855
|
-
|
|
971
|
+
|
|
972
|
+
logger.error("[stepwise] %s", error_msg)
|
|
973
|
+
logger.info("[stepwise] Using default value for %s: %s", field_name, default_value)
|
|
974
|
+
|
|
856
975
|
# Store error details in field_usages
|
|
857
976
|
accumulated_usage["field_usages"][field_name] = {
|
|
858
977
|
"error": str(e),
|
|
859
978
|
"status": "failed",
|
|
860
979
|
"used_default": True,
|
|
861
|
-
"default_value": default_value
|
|
980
|
+
"default_value": default_value,
|
|
862
981
|
}
|
|
863
|
-
|
|
864
|
-
# Add structured logging for validation errors
|
|
982
|
+
|
|
865
983
|
if validation_errors:
|
|
866
|
-
|
|
984
|
+
logger.warning("[stepwise] Found %d validation errors", len(validation_errors))
|
|
867
985
|
for error in validation_errors:
|
|
868
|
-
|
|
869
|
-
|
|
986
|
+
logger.error("[stepwise] %s", error)
|
|
987
|
+
|
|
870
988
|
# If there are validation errors, include them in the result
|
|
871
989
|
if validation_errors:
|
|
872
990
|
accumulated_usage["validation_errors"] = validation_errors
|
|
873
|
-
|
|
991
|
+
|
|
874
992
|
try:
|
|
875
993
|
# Create model instance with collected data
|
|
876
994
|
# Create model instance with collected data
|
|
877
995
|
model_instance = model_cls(**data)
|
|
878
996
|
model_dict = model_instance.model_dump()
|
|
879
|
-
|
|
997
|
+
|
|
880
998
|
# Enhanced DateTimeEncoder to handle both datetime and date objects
|
|
881
999
|
class ExtendedJSONEncoder(json.JSONEncoder):
|
|
882
1000
|
def default(self, obj):
|
|
@@ -885,14 +1003,14 @@ def stepwise_extract_with_model(
|
|
|
885
1003
|
if isinstance(obj, Decimal):
|
|
886
1004
|
return str(obj)
|
|
887
1005
|
return super().default(obj)
|
|
888
|
-
|
|
1006
|
+
|
|
889
1007
|
# Use enhanced encoder for JSON serialization
|
|
890
1008
|
json_string = json.dumps(model_dict, cls=ExtendedJSONEncoder)
|
|
891
1009
|
|
|
892
1010
|
# Also modify return value to use ExtendedJSONEncoder
|
|
893
|
-
if
|
|
894
|
-
result[
|
|
895
|
-
|
|
1011
|
+
if "json_string" in result:
|
|
1012
|
+
result["json_string"] = json.dumps(result["json_object"], cls=ExtendedJSONEncoder)
|
|
1013
|
+
|
|
896
1014
|
# Define ExtendedJSONEncoder for handling special types
|
|
897
1015
|
class ExtendedJSONEncoder(json.JSONEncoder):
|
|
898
1016
|
def default(self, obj):
|
|
@@ -901,10 +1019,10 @@ def stepwise_extract_with_model(
|
|
|
901
1019
|
if isinstance(obj, Decimal):
|
|
902
1020
|
return str(obj)
|
|
903
1021
|
return super().default(obj)
|
|
904
|
-
|
|
1022
|
+
|
|
905
1023
|
# Create json string with custom encoder
|
|
906
1024
|
json_string = json.dumps(model_dict, cls=ExtendedJSONEncoder)
|
|
907
|
-
|
|
1025
|
+
|
|
908
1026
|
# Create result matching extract_with_model format
|
|
909
1027
|
result = {
|
|
910
1028
|
"json_string": json_string,
|
|
@@ -912,58 +1030,60 @@ def stepwise_extract_with_model(
|
|
|
912
1030
|
"usage": accumulated_usage,
|
|
913
1031
|
"field_results": field_results,
|
|
914
1032
|
}
|
|
915
|
-
|
|
1033
|
+
|
|
916
1034
|
# Add model instance as property and make callable
|
|
917
1035
|
result["model"] = model_instance
|
|
918
|
-
return type(
|
|
919
|
-
"
|
|
920
|
-
|
|
921
|
-
|
|
1036
|
+
return type(
|
|
1037
|
+
"ExtractResult",
|
|
1038
|
+
(dict,),
|
|
1039
|
+
{"__getattr__": lambda self, key: self.get(key), "__call__": lambda self: self["model"]},
|
|
1040
|
+
)(result)
|
|
922
1041
|
except Exception as e:
|
|
923
|
-
error_msg = f"Model validation error: {
|
|
1042
|
+
error_msg = f"Model validation error: {e!s}"
|
|
924
1043
|
# Add validation error to accumulated usage
|
|
925
1044
|
if "validation_errors" not in accumulated_usage:
|
|
926
1045
|
accumulated_usage["validation_errors"] = []
|
|
927
1046
|
accumulated_usage["validation_errors"].append(error_msg)
|
|
928
|
-
|
|
929
|
-
|
|
930
|
-
|
|
931
|
-
|
|
1047
|
+
|
|
1048
|
+
logger.error("[stepwise] %s", error_msg)
|
|
1049
|
+
|
|
932
1050
|
# Create error result with partial data
|
|
933
1051
|
error_result = {
|
|
934
1052
|
"json_string": "{}",
|
|
935
1053
|
"json_object": {},
|
|
936
1054
|
"usage": accumulated_usage,
|
|
937
1055
|
"field_results": field_results,
|
|
938
|
-
"error": error_msg
|
|
1056
|
+
"error": error_msg,
|
|
939
1057
|
}
|
|
940
|
-
return type(
|
|
941
|
-
"
|
|
942
|
-
|
|
943
|
-
|
|
944
|
-
|
|
1058
|
+
return type(
|
|
1059
|
+
"ExtractResult",
|
|
1060
|
+
(dict,),
|
|
1061
|
+
{
|
|
1062
|
+
"__getattr__": lambda self, key: self.get(key),
|
|
1063
|
+
"__call__": lambda self: None, # Return None when called if validation failed
|
|
1064
|
+
},
|
|
1065
|
+
)(error_result)
|
|
945
1066
|
|
|
946
1067
|
|
|
947
|
-
def _json_to_toon(data: Union[
|
|
1068
|
+
def _json_to_toon(data: Union[list[dict[str, Any]], dict[str, Any]], data_key: str | None = None) -> str:
|
|
948
1069
|
"""Convert JSON array or dict containing array to TOON format.
|
|
949
|
-
|
|
1070
|
+
|
|
950
1071
|
Args:
|
|
951
1072
|
data: List of dicts (uniform array) or dict containing array under a key
|
|
952
1073
|
data_key: If data is a dict, the key containing the array
|
|
953
|
-
|
|
1074
|
+
|
|
954
1075
|
Returns:
|
|
955
1076
|
TOON formatted string
|
|
956
|
-
|
|
1077
|
+
|
|
957
1078
|
Raises:
|
|
958
1079
|
ValueError: If TOON conversion fails or data format is invalid
|
|
959
1080
|
RuntimeError: If python-toon is not installed
|
|
960
1081
|
"""
|
|
961
1082
|
if toon is None:
|
|
962
1083
|
raise RuntimeError(
|
|
963
|
-
"TOON conversion requested but 'python-toon' is not installed. "
|
|
964
|
-
"Install it with 'pip install python-toon'."
|
|
1084
|
+
"TOON conversion requested but 'python-toon' is not installed. Install it with 'pip install python-toon'."
|
|
965
1085
|
)
|
|
966
|
-
|
|
1086
|
+
|
|
967
1087
|
# Handle different data formats
|
|
968
1088
|
if isinstance(data, list):
|
|
969
1089
|
array_data = data
|
|
@@ -975,7 +1095,7 @@ def _json_to_toon(data: Union[List[Dict[str, Any]], Dict[str, Any]], data_key: O
|
|
|
975
1095
|
else:
|
|
976
1096
|
# Try to find the first array value in the dict
|
|
977
1097
|
array_data = None
|
|
978
|
-
for
|
|
1098
|
+
for _key, value in data.items():
|
|
979
1099
|
if isinstance(value, list) and value:
|
|
980
1100
|
array_data = value
|
|
981
1101
|
break
|
|
@@ -983,32 +1103,32 @@ def _json_to_toon(data: Union[List[Dict[str, Any]], Dict[str, Any]], data_key: O
|
|
|
983
1103
|
raise ValueError("No array found in data. Specify data_key or provide a list directly.")
|
|
984
1104
|
else:
|
|
985
1105
|
raise ValueError("Data must be a list of dicts or a dict containing an array")
|
|
986
|
-
|
|
1106
|
+
|
|
987
1107
|
if not isinstance(array_data, list):
|
|
988
1108
|
raise ValueError("Array data must be a list")
|
|
989
|
-
|
|
1109
|
+
|
|
990
1110
|
if not array_data:
|
|
991
1111
|
raise ValueError("Array data cannot be empty")
|
|
992
|
-
|
|
1112
|
+
|
|
993
1113
|
# Validate that all items in array are dicts (uniform structure)
|
|
994
1114
|
if not all(isinstance(item, dict) for item in array_data):
|
|
995
1115
|
raise ValueError("All items in array must be dictionaries for TOON conversion")
|
|
996
|
-
|
|
1116
|
+
|
|
997
1117
|
try:
|
|
998
1118
|
return toon.encode(array_data)
|
|
999
1119
|
except Exception as e:
|
|
1000
|
-
raise ValueError(f"Failed to convert data to TOON format: {e}")
|
|
1120
|
+
raise ValueError(f"Failed to convert data to TOON format: {e}") from e
|
|
1001
1121
|
|
|
1002
1122
|
|
|
1003
1123
|
def _dataframe_to_toon(df) -> str:
|
|
1004
1124
|
"""Convert Pandas DataFrame to TOON format.
|
|
1005
|
-
|
|
1125
|
+
|
|
1006
1126
|
Args:
|
|
1007
1127
|
df: Pandas DataFrame to convert
|
|
1008
|
-
|
|
1128
|
+
|
|
1009
1129
|
Returns:
|
|
1010
1130
|
TOON formatted string
|
|
1011
|
-
|
|
1131
|
+
|
|
1012
1132
|
Raises:
|
|
1013
1133
|
ValueError: If DataFrame conversion fails
|
|
1014
1134
|
RuntimeError: If pandas or python-toon is not installed
|
|
@@ -1019,12 +1139,11 @@ def _dataframe_to_toon(df) -> str:
|
|
|
1019
1139
|
raise RuntimeError(
|
|
1020
1140
|
"Pandas DataFrame conversion requested but 'pandas' is not installed. "
|
|
1021
1141
|
"Install it with 'pip install pandas' or 'pip install prompture[pandas]'."
|
|
1022
|
-
)
|
|
1023
|
-
|
|
1142
|
+
) from None
|
|
1143
|
+
|
|
1024
1144
|
if toon is None:
|
|
1025
1145
|
raise RuntimeError(
|
|
1026
|
-
"TOON conversion requested but 'python-toon' is not installed. "
|
|
1027
|
-
"Install it with 'pip install python-toon'."
|
|
1146
|
+
"TOON conversion requested but 'python-toon' is not installed. Install it with 'pip install python-toon'."
|
|
1028
1147
|
)
|
|
1029
1148
|
|
|
1030
1149
|
dataframe_type = getattr(pd, "DataFrame", None)
|
|
@@ -1035,43 +1154,43 @@ def _dataframe_to_toon(df) -> str:
|
|
|
1035
1154
|
# Duck-type fallback for tests that provide a lightweight mock
|
|
1036
1155
|
if not hasattr(df, "to_dict") or not hasattr(df, "empty"):
|
|
1037
1156
|
raise ValueError("Input must be a pandas DataFrame")
|
|
1038
|
-
|
|
1157
|
+
|
|
1039
1158
|
if df.empty:
|
|
1040
1159
|
raise ValueError("DataFrame cannot be empty")
|
|
1041
|
-
|
|
1160
|
+
|
|
1042
1161
|
try:
|
|
1043
1162
|
# Convert DataFrame to list of dicts
|
|
1044
|
-
data = df.to_dict(
|
|
1163
|
+
data = df.to_dict("records")
|
|
1045
1164
|
return toon.encode(data)
|
|
1046
1165
|
except Exception as e:
|
|
1047
|
-
raise ValueError(f"Failed to convert DataFrame to TOON format: {e}")
|
|
1166
|
+
raise ValueError(f"Failed to convert DataFrame to TOON format: {e}") from e
|
|
1048
1167
|
|
|
1049
1168
|
|
|
-def _calculate_token_savings(json_text: str, toon_text: str) -> Dict[str, Any]:
+def _calculate_token_savings(json_text: str, toon_text: str) -> dict[str, Any]:
    """Calculate estimated token savings between JSON and TOON formats.
-
+
    This is a rough estimation based on character count ratios.
    Actual token counts may vary by model and tokenizer.
-
+
    Args:
        json_text: JSON formatted text
        toon_text: TOON formatted text
-
+
    Returns:
        Dict containing savings statistics
    """
    json_chars = len(json_text)
    toon_chars = len(toon_text)
-
+
    # Rough estimation: 4 characters ≈ 1 token (varies by model)
    json_tokens_est = json_chars // 4
    toon_tokens_est = toon_chars // 4
-
+
    savings_chars = json_chars - toon_chars
    savings_tokens_est = json_tokens_est - toon_tokens_est
-
+
    percentage_saved = (savings_chars / json_chars * 100) if json_chars > 0 else 0
-
+
    return {
        "json_characters": json_chars,
        "toon_characters": toon_chars,
@@ -1079,26 +1198,27 @@ def _calculate_token_savings(json_text: str, toon_text: str) -> Dict[str, Any]:
        "estimated_json_tokens": json_tokens_est,
        "estimated_toon_tokens": toon_tokens_est,
        "estimated_saved_tokens": savings_tokens_est,
-        "percentage_saved": round(percentage_saved, 1)
+        "percentage_saved": round(percentage_saved, 1),
    }


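The savings figures are pure arithmetic over character counts, with the 4-chars-per-token heuristic the docstring flags as rough. Worked through with hypothetical sizes of 1,200 JSON characters and 700 TOON characters:

json_chars, toon_chars = 1200, 700                        # hypothetical input sizes
json_tokens_est = json_chars // 4                         # 300
toon_tokens_est = toon_chars // 4                         # 175
savings_tokens_est = json_tokens_est - toon_tokens_est    # 125
percentage_saved = round((json_chars - toon_chars) / json_chars * 100, 1)  # 41.7
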
def extract_from_data(
-    data: Union[
+    data: Union[list[dict[str, Any]], dict[str, Any]],
    question: str,
-    json_schema:
+    json_schema: dict[str, Any],
    *,
    model_name: str,
-    data_key:
+    data_key: str | None = None,
    instruction_template: str = "Analyze the following data and answer: {question}",
    ai_cleanup: bool = True,
-    options:
-
+    options: dict[str, Any] | None = None,
+    system_prompt: str | None = None,
+) -> dict[str, Any]:
    """Extract information from structured data by converting to TOON format for token efficiency.
-
+
    This function takes JSON array data, converts it to TOON format to reduce tokens,
    sends it to the LLM with a question, and returns the JSON response.
-
+
    Args:
        data: List of dicts (uniform array) or dict containing array under a key
        question: The question to ask about the data
@@ -1108,7 +1228,7 @@ def extract_from_data(
        instruction_template: Template with {question} placeholder
        ai_cleanup: Whether to attempt AI-based cleanup if JSON parsing fails
        options: Additional options to pass to the driver
-
+
    Returns:
        Dict containing:
        - json_object: The parsed JSON response
@@ -1116,18 +1236,18 @@ def extract_from_data(
        - usage: Token usage and cost information (includes token_savings)
        - toon_data: The TOON formatted input data
        - token_savings: Statistics about token savings vs JSON input
-
+
    Raises:
        ValueError: If data format is invalid or conversion fails
        RuntimeError: If required dependencies are missing
-
+
    Example:
        >>> products = [
        ...     {"id": 1, "name": "Laptop", "price": 999.99, "category": "electronics"},
        ...     {"id": 2, "name": "Book", "price": 19.99, "category": "books"}
        ... ]
        >>> schema = {
-        ...     "type": "object",
+        ...     "type": "object",
        ...     "properties": {
        ...         "average_price": {"type": "number"},
        ...         "total_items": {"type": "integer"}
@@ -1144,57 +1264,59 @@ def extract_from_data(
    """
    if not question or not question.strip():
        raise ValueError("Question cannot be empty")
-
+
    if not json_schema:
        raise ValueError("JSON schema cannot be empty")
-
+
    if options is None:
        options = {}
-
+
    # Convert data to TOON format
    toon_data = _json_to_toon(data, data_key)
-
+
    # Calculate token savings (for comparison with JSON)
    json_data = json.dumps(data if isinstance(data, list) else data.get(data_key, data), indent=2)
    token_savings = _calculate_token_savings(json_data, toon_data)
-
+
    # Build the prompt with TOON data
    content_prompt = instruction_template.format(question=question)
    full_prompt = f"{content_prompt}\n\nData (in TOON format):\n{toon_data}"
-
+
    # Call the LLM
    result = ask_for_json(
        driver=get_driver_for_model(model_name),
        content_prompt=full_prompt,
        json_schema=json_schema,
        ai_cleanup=ai_cleanup,
-        model_name=model_name.split(
+        model_name=model_name.split("/")[-1] if "/" in model_name else model_name,
        options=options,
-        output_format="json"  # Always return JSON, not TOON
+        output_format="json",  # Always return JSON, not TOON
+        system_prompt=system_prompt,
    )
-
+
    # Add our additional data to the result
    result["toon_data"] = toon_data
    result["token_savings"] = token_savings
-
+
    return result


1304
|
def extract_from_pandas(
|
|
1184
1305
|
df, # pandas.DataFrame - optional import
|
|
1185
1306
|
question: str,
|
|
1186
|
-
json_schema:
|
|
1307
|
+
json_schema: dict[str, Any],
|
|
1187
1308
|
*,
|
|
1188
1309
|
model_name: str,
|
|
1189
1310
|
instruction_template: str = "Analyze the following data and answer: {question}",
|
|
1190
1311
|
ai_cleanup: bool = True,
|
|
1191
|
-
options:
|
|
1192
|
-
|
|
1312
|
+
options: dict[str, Any] | None = None,
|
|
1313
|
+
system_prompt: str | None = None,
|
|
1314
|
+
) -> dict[str, Any]:
|
|
1193
1315
|
"""Extract information from Pandas DataFrame by converting to TOON format for token efficiency.
|
|
1194
|
-
|
|
1316
|
+
|
|
1195
1317
|
This function takes a Pandas DataFrame, converts it to TOON format to reduce tokens,
|
|
1196
1318
|
sends it to the LLM with a question, and returns the JSON response.
|
|
1197
|
-
|
|
1319
|
+
|
|
1198
1320
|
Args:
|
|
1199
1321
|
df: Pandas DataFrame to analyze
|
|
1200
1322
|
question: The question to ask about the data
|
|
@@ -1203,7 +1325,7 @@ def extract_from_pandas(
|
|
|
1203
1325
|
instruction_template: Template with {question} placeholder
|
|
1204
1326
|
ai_cleanup: Whether to attempt AI-based cleanup if JSON parsing fails
|
|
1205
1327
|
options: Additional options to pass to the driver
|
|
1206
|
-
|
|
1328
|
+
|
|
1207
1329
|
Returns:
|
|
1208
1330
|
Dict containing:
|
|
1209
1331
|
- json_object: The parsed JSON response
|
|
@@ -1212,11 +1334,11 @@ def extract_from_pandas(
|
|
|
1212
1334
|
- toon_data: The TOON formatted input data
|
|
1213
1335
|
- token_savings: Statistics about token savings vs JSON input
|
|
1214
1336
|
- dataframe_info: Basic info about the original DataFrame
|
|
1215
|
-
|
|
1337
|
+
|
|
1216
1338
|
Raises:
|
|
1217
1339
|
ValueError: If DataFrame is invalid or conversion fails
|
|
1218
1340
|
RuntimeError: If required dependencies are missing
|
|
1219
|
-
|
|
1341
|
+
|
|
1220
1342
|
Example:
|
|
1221
1343
|
>>> import pandas as pd
|
|
1222
1344
|
>>> df = pd.DataFrame([
|
|
@@ -1241,45 +1363,46 @@ def extract_from_pandas(
|
|
|
1241
1363
|
"""
|
|
1242
1364
|
if not question or not question.strip():
|
|
1243
1365
|
raise ValueError("Question cannot be empty")
|
|
1244
|
-
|
|
1366
|
+
|
|
1245
1367
|
if not json_schema:
|
|
1246
1368
|
raise ValueError("JSON schema cannot be empty")
|
|
1247
|
-
|
|
1369
|
+
|
|
1248
1370
|
if options is None:
|
|
1249
1371
|
options = {}
|
|
1250
|
-
|
|
1372
|
+
|
|
1251
1373
|
# Convert DataFrame to TOON format
|
|
1252
1374
|
toon_data = _dataframe_to_toon(df)
|
|
1253
|
-
|
|
1375
|
+
|
|
1254
1376
|
# Calculate token savings (for comparison with JSON)
|
|
1255
|
-
json_data = df.to_json(indent=2, orient=
|
|
1377
|
+
json_data = df.to_json(indent=2, orient="records")
|
|
1256
1378
|
token_savings = _calculate_token_savings(json_data, toon_data)
|
|
1257
|
-
|
|
1379
|
+
|
|
1258
1380
|
# Get basic DataFrame info
|
|
1259
1381
|
dataframe_info = {
|
|
1260
1382
|
"shape": df.shape,
|
|
1261
1383
|
"columns": list(df.columns),
|
|
1262
|
-
"dtypes": {col: str(dtype) for col, dtype in df.dtypes.items()}
|
|
1384
|
+
"dtypes": {col: str(dtype) for col, dtype in df.dtypes.items()},
|
|
1263
1385
|
}
|
|
1264
|
-
|
|
1386
|
+
|
|
1265
1387
|
# Build the prompt with TOON data
|
|
1266
1388
|
content_prompt = instruction_template.format(question=question)
|
|
1267
1389
|
full_prompt = f"{content_prompt}\n\nData (in TOON format):\n{toon_data}"
|
|
1268
|
-
|
|
1390
|
+
|
|
1269
1391
|
# Call the LLM
|
|
1270
1392
|
result = ask_for_json(
|
|
1271
1393
|
driver=get_driver_for_model(model_name),
|
|
1272
1394
|
content_prompt=full_prompt,
|
|
1273
1395
|
json_schema=json_schema,
|
|
1274
1396
|
ai_cleanup=ai_cleanup,
|
|
1275
|
-
model_name=model_name.split(
|
|
1397
|
+
model_name=model_name.split("/")[-1] if "/" in model_name else model_name,
|
|
1276
1398
|
options=options,
|
|
1277
|
-
output_format="json" # Always return JSON, not TOON
|
|
1399
|
+
output_format="json", # Always return JSON, not TOON
|
|
1400
|
+
system_prompt=system_prompt,
|
|
1278
1401
|
)
|
|
1279
|
-
|
|
1402
|
+
|
|
1280
1403
|
# Add our additional data to the result
|
|
1281
1404
|
result["toon_data"] = toon_data
|
|
1282
1405
|
result["token_savings"] = token_savings
|
|
1283
1406
|
result["dataframe_info"] = dataframe_info
|
|
1284
|
-
|
|
1407
|
+
|
|
1285
1408
|
return result
|
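And the DataFrame variant, mirroring the call above. Again a hypothetical sketch: the model id is illustrative, pandas and python-toon are assumed installed, and the top-level re-export is assumed:

import pandas as pd
from prompture import extract_from_pandas

df = pd.DataFrame([
    {"name": "Laptop", "price": 999.99},
    {"name": "Book", "price": 19.99},
])
result = extract_from_pandas(
    df,
    "How many rows are there and what is the total price?",
    {"type": "object", "properties": {"rows": {"type": "integer"}, "total": {"type": "number"}}},
    model_name="openai/gpt-4o-mini",                 # illustrative model id
    system_prompt="Answer strictly from the data.",  # new keyword in 0.0.34
)
print(result["dataframe_info"]["shape"])  # (2, 2)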