prompture 0.0.30.dev1__py3-none-any.whl → 0.0.31.dev1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- prompture/__init__.py +6 -2
- prompture/core.py +391 -54
- {prompture-0.0.30.dev1.dist-info → prompture-0.0.31.dev1.dist-info}/METADATA +57 -4
- {prompture-0.0.30.dev1.dist-info → prompture-0.0.31.dev1.dist-info}/RECORD +8 -8
- {prompture-0.0.30.dev1.dist-info → prompture-0.0.31.dev1.dist-info}/WHEEL +0 -0
- {prompture-0.0.30.dev1.dist-info → prompture-0.0.31.dev1.dist-info}/entry_points.txt +0 -0
- {prompture-0.0.30.dev1.dist-info → prompture-0.0.31.dev1.dist-info}/licenses/LICENSE +0 -0
- {prompture-0.0.30.dev1.dist-info → prompture-0.0.31.dev1.dist-info}/top_level.txt +0 -0
prompture/__init__.py
CHANGED
|
@@ -6,12 +6,13 @@ from .core import (
|
|
|
6
6
|
extract_and_jsonify,
|
|
7
7
|
manual_extract_and_jsonify,
|
|
8
8
|
Driver,
|
|
9
|
-
clean_json_text,
|
|
10
|
-
clean_toon_text,
|
|
11
9
|
clean_json_text_with_ai,
|
|
12
10
|
extract_with_model,
|
|
13
11
|
stepwise_extract_with_model,
|
|
12
|
+
extract_from_data,
|
|
13
|
+
extract_from_pandas,
|
|
14
14
|
)
|
|
15
|
+
from .tools import clean_json_text, clean_toon_text
|
|
15
16
|
from .field_definitions import (
|
|
16
17
|
FIELD_DEFINITIONS, get_field_definition, get_required_fields, get_field_names,
|
|
17
18
|
field_from_registry, register_field, add_field_definition, add_field_definitions,
|
|
@@ -50,6 +51,9 @@ __all__ = [
|
|
|
50
51
|
"clean_json_text_with_ai",
|
|
51
52
|
"extract_with_model",
|
|
52
53
|
"stepwise_extract_with_model",
|
|
54
|
+
# TOON Data Extraction Functions
|
|
55
|
+
"extract_from_data",
|
|
56
|
+
"extract_from_pandas",
|
|
53
57
|
# Field Definitions
|
|
54
58
|
"FIELD_DEFINITIONS",
|
|
55
59
|
"get_field_definition",
|
prompture/core.py
CHANGED
|
@@ -23,7 +23,6 @@ from .tools import (
|
|
|
23
23
|
convert_value,
|
|
24
24
|
log_debug,
|
|
25
25
|
clean_json_text,
|
|
26
|
-
clean_toon_text,
|
|
27
26
|
LogLevel,
|
|
28
27
|
get_field_default,
|
|
29
28
|
)
|
|
@@ -168,41 +167,32 @@ def ask_for_json(
|
|
|
168
167
|
if output_format not in ("json", "toon"):
|
|
169
168
|
raise ValueError(f"Unsupported output_format '{output_format}'. Use 'json' or 'toon'.")
|
|
170
169
|
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
)
|
|
177
|
-
instruct = (
|
|
178
|
-
"Reply only in TOON (Token-Oriented Object Notation).\n"
|
|
179
|
-
"- Scalars: key: value\n"
|
|
180
|
-
"- Lists: list[count]: item1,item2 (comma separated, no semicolons or line breaks)\n"
|
|
181
|
-
"- Object arrays: name[count,]{f1,f2}:\n"
|
|
182
|
-
" value1,value2\n"
|
|
183
|
-
"Use two spaces before each table row, lowercase true/false/null, include every field from the schema (use null if unknown), and output no markdown, prose, or braces beyond the headers.\n"
|
|
184
|
-
f"Schema:\n{json.dumps(json_schema, separators=(',', ':'))}"
|
|
185
|
-
)
|
|
186
|
-
else:
|
|
187
|
-
schema_string = json.dumps(json_schema, indent=2)
|
|
188
|
-
instruct = (
|
|
189
|
-
"Return only a single JSON object (no markdown, no extra text) that validates against this JSON schema:\n"
|
|
190
|
-
f"{schema_string}\n\n"
|
|
191
|
-
"If a value is unknown use null. Use double quotes for keys and strings."
|
|
170
|
+
schema_string = json.dumps(json_schema, indent=2)
|
|
171
|
+
if output_format == "toon" and toon is None:
|
|
172
|
+
raise RuntimeError(
|
|
173
|
+
"TOON requested but 'python-toon' is not installed. "
|
|
174
|
+
"Install it with 'pip install python-toon'."
|
|
192
175
|
)
|
|
193
176
|
|
|
177
|
+
instruct = (
|
|
178
|
+
"Return only a single JSON object (no markdown, no extra text) that validates against this JSON schema:\n"
|
|
179
|
+
f"{schema_string}\n\n"
|
|
180
|
+
"If a value is unknown use null. Use double quotes for keys and strings."
|
|
181
|
+
)
|
|
182
|
+
if output_format == "toon":
|
|
183
|
+
instruct += "\n\n(Respond with JSON only; Prompture will convert to TOON.)"
|
|
184
|
+
|
|
194
185
|
full_prompt = f"{content_prompt}\n\n{instruct}"
|
|
195
186
|
resp = driver.generate(full_prompt, options)
|
|
196
187
|
raw = resp.get("text", "")
|
|
197
|
-
cleaned =
|
|
188
|
+
cleaned = clean_json_text(raw)
|
|
198
189
|
|
|
199
190
|
try:
|
|
191
|
+
json_obj = json.loads(cleaned)
|
|
192
|
+
json_string = cleaned
|
|
193
|
+
toon_string = None
|
|
200
194
|
if output_format == "toon":
|
|
201
|
-
|
|
202
|
-
json_string = json.dumps(json_obj)
|
|
203
|
-
else:
|
|
204
|
-
json_obj = json.loads(cleaned)
|
|
205
|
-
json_string = cleaned
|
|
195
|
+
toon_string = toon.encode(json_obj)
|
|
206
196
|
|
|
207
197
|
usage = {
|
|
208
198
|
**resp.get("meta", {}),
|
|
@@ -213,37 +203,42 @@ def ask_for_json(
|
|
|
213
203
|
"cost": resp.get("meta", {}).get("cost", 0.0),
|
|
214
204
|
"model_name": model_name or getattr(driver, "model", "")
|
|
215
205
|
}
|
|
216
|
-
|
|
206
|
+
result = {
|
|
217
207
|
"json_string": json_string,
|
|
218
208
|
"json_object": json_obj,
|
|
219
209
|
"usage": usage
|
|
220
210
|
}
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
211
|
+
if toon_string is not None:
|
|
212
|
+
result["toon_string"] = toon_string
|
|
213
|
+
result["output_format"] = "toon"
|
|
214
|
+
else:
|
|
215
|
+
result["output_format"] = "json"
|
|
216
|
+
return result
|
|
217
|
+
except json.JSONDecodeError as e:
|
|
218
|
+
if ai_cleanup:
|
|
219
|
+
cleaned_fixed = clean_json_text_with_ai(driver, cleaned, model_name, options)
|
|
220
|
+
try:
|
|
221
|
+
json_obj = json.loads(cleaned_fixed)
|
|
222
|
+
result = {
|
|
223
|
+
"json_string": cleaned_fixed,
|
|
224
|
+
"json_object": json_obj,
|
|
225
|
+
"usage": {
|
|
226
|
+
"prompt_tokens": 0,
|
|
227
|
+
"completion_tokens": 0,
|
|
228
|
+
"total_tokens": 0,
|
|
229
|
+
"cost": 0.0,
|
|
230
|
+
"model_name": options.get("model", getattr(driver, "model", "")),
|
|
231
|
+
"raw_response": {}
|
|
232
|
+
},
|
|
233
|
+
"output_format": "json" if output_format != "toon" else "toon",
|
|
234
|
+
}
|
|
235
|
+
if output_format == "toon":
|
|
236
|
+
result["toon_string"] = toon.encode(json_obj)
|
|
237
|
+
return result
|
|
238
|
+
except json.JSONDecodeError:
|
|
245
239
|
raise e
|
|
246
|
-
|
|
240
|
+
else:
|
|
241
|
+
raise e
|
|
247
242
|
|
|
248
243
|
def extract_and_jsonify(
|
|
249
244
|
text: Union[str, Driver], # Can be either text or driver for backward compatibility
|
|
@@ -862,3 +857,345 @@ def stepwise_extract_with_model(
|
|
|
862
857
|
"__getattr__": lambda self, key: self.get(key),
|
|
863
858
|
"__call__": lambda self: None # Return None when called if validation failed
|
|
864
859
|
})(error_result)
|
|
860
|
+
|
|
861
|
+
|
|
862
|
+
|
|
863
|
+
def _json_to_toon(data: Union[List[Dict[str, Any]], Dict[str, Any]], data_key: Optional[str] = None) -> str:
    """Convert a JSON array, or a dict containing an array, to TOON format.

    Args:
        data: List of dicts (uniform array) or dict containing array under a key
        data_key: If data is a dict, the key containing the array. When it is
            omitted (or falsy), the first non-empty list value in the dict is used.

    Returns:
        TOON formatted string

    Raises:
        ValueError: If TOON conversion fails or data format is invalid
        RuntimeError: If python-toon is not installed
    """
    # NOTE(review): `toon` is a module-level name defined outside this block;
    # presumably it is None when python-toon could not be imported.
    if toon is None:
        raise RuntimeError(
            "TOON conversion requested but 'python-toon' is not installed. "
            "Install it with 'pip install python-toon'."
        )

    # Resolve the array to encode from the supported input shapes
    if isinstance(data, list):
        array_data = data
    elif isinstance(data, dict):
        if data_key:
            if data_key not in data:
                raise ValueError(f"Key '{data_key}' not found in data")
            array_data = data[data_key]
        else:
            # No key given: fall back to the first non-empty list value
            array_data = next(
                (value for value in data.values() if isinstance(value, list) and value),
                None,
            )
            if array_data is None:
                raise ValueError("No array found in data. Specify data_key or provide a list directly.")
    else:
        raise ValueError("Data must be a list of dicts or a dict containing an array")

    if not isinstance(array_data, list):
        raise ValueError("Array data must be a list")

    if not array_data:
        raise ValueError("Array data cannot be empty")

    # Every item must be a dict before the array is handed to the encoder
    if not all(isinstance(item, dict) for item in array_data):
        raise ValueError("All items in array must be dictionaries for TOON conversion")

    try:
        return toon.encode(array_data)
    except Exception as e:
        # Chain explicitly so the encoder's own traceback is kept as the cause
        raise ValueError(f"Failed to convert data to TOON format: {e}") from e
|
|
917
|
+
|
|
918
|
+
|
|
919
|
+
def _dataframe_to_toon(df) -> str:
    """Convert Pandas DataFrame to TOON format.

    Args:
        df: Pandas DataFrame to convert

    Returns:
        TOON formatted string

    Raises:
        ValueError: If the input is not a DataFrame, is empty, or conversion fails
        RuntimeError: If pandas or python-toon is not installed
    """
    # pandas is imported lazily so the module itself loads without it
    try:
        import pandas as pd
    except ImportError as exc:
        raise RuntimeError(
            "Pandas DataFrame conversion requested but 'pandas' is not installed. "
            "Install it with 'pip install pandas' or 'pip install prompture[pandas]'."
        ) from exc

    # NOTE(review): `toon` is a module-level name defined outside this block;
    # presumably it is None when python-toon could not be imported.
    if toon is None:
        raise RuntimeError(
            "TOON conversion requested but 'python-toon' is not installed. "
            "Install it with 'pip install python-toon'."
        )

    dataframe_type = getattr(pd, "DataFrame", None)
    if isinstance(dataframe_type, type):
        if not isinstance(df, dataframe_type):
            raise ValueError("Input must be a pandas DataFrame")
    else:
        # Duck-type fallback for tests that provide a lightweight mock
        if not hasattr(df, "to_dict") or not hasattr(df, "empty"):
            raise ValueError("Input must be a pandas DataFrame")

    if df.empty:
        raise ValueError("DataFrame cannot be empty")

    try:
        # One dict per row, which is the list-of-dicts shape passed to the encoder
        data = df.to_dict('records')
        return toon.encode(data)
    except Exception as e:
        # Chain explicitly so the underlying failure is kept as the cause
        raise ValueError(f"Failed to convert DataFrame to TOON format: {e}") from e
|
|
964
|
+
|
|
965
|
+
|
|
966
|
+
def _calculate_token_savings(json_text: str, toon_text: str) -> Dict[str, Any]:
|
|
967
|
+
"""Calculate estimated token savings between JSON and TOON formats.
|
|
968
|
+
|
|
969
|
+
This is a rough estimation based on character count ratios.
|
|
970
|
+
Actual token counts may vary by model and tokenizer.
|
|
971
|
+
|
|
972
|
+
Args:
|
|
973
|
+
json_text: JSON formatted text
|
|
974
|
+
toon_text: TOON formatted text
|
|
975
|
+
|
|
976
|
+
Returns:
|
|
977
|
+
Dict containing savings statistics
|
|
978
|
+
"""
|
|
979
|
+
json_chars = len(json_text)
|
|
980
|
+
toon_chars = len(toon_text)
|
|
981
|
+
|
|
982
|
+
# Rough estimation: 4 characters ≈ 1 token (varies by model)
|
|
983
|
+
json_tokens_est = json_chars // 4
|
|
984
|
+
toon_tokens_est = toon_chars // 4
|
|
985
|
+
|
|
986
|
+
savings_chars = json_chars - toon_chars
|
|
987
|
+
savings_tokens_est = json_tokens_est - toon_tokens_est
|
|
988
|
+
|
|
989
|
+
percentage_saved = (savings_chars / json_chars * 100) if json_chars > 0 else 0
|
|
990
|
+
|
|
991
|
+
return {
|
|
992
|
+
"json_characters": json_chars,
|
|
993
|
+
"toon_characters": toon_chars,
|
|
994
|
+
"saved_characters": savings_chars,
|
|
995
|
+
"estimated_json_tokens": json_tokens_est,
|
|
996
|
+
"estimated_toon_tokens": toon_tokens_est,
|
|
997
|
+
"estimated_saved_tokens": savings_tokens_est,
|
|
998
|
+
"percentage_saved": round(percentage_saved, 1)
|
|
999
|
+
}
|
|
1000
|
+
|
|
1001
|
+
|
|
1002
|
+
def extract_from_data(
    data: Union[List[Dict[str, Any]], Dict[str, Any]],
    question: str,
    json_schema: Dict[str, Any],
    *,
    model_name: str,
    data_key: Optional[str] = None,
    instruction_template: str = "Analyze the following data and answer: {question}",
    ai_cleanup: bool = True,
    options: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
    """Extract information from structured data by converting to TOON format for token efficiency.

    This function takes JSON array data, converts it to TOON format to reduce tokens,
    sends it to the LLM with a question, and returns the JSON response.

    Args:
        data: List of dicts (uniform array) or dict containing array under a key
        question: The question to ask about the data
        json_schema: Expected JSON schema for the response
        model_name: Model identifier in format "provider/model" (e.g., "openai/gpt-4")
        data_key: If data is a dict, the key containing the array (e.g., "products")
        instruction_template: Template with {question} placeholder
        ai_cleanup: Whether to attempt AI-based cleanup if JSON parsing fails
        options: Additional options to pass to the driver

    Returns:
        The dict returned by ``ask_for_json`` with two keys added:
        - json_object: The parsed JSON response
        - json_string: The JSON string response
        - usage: Token usage and cost information
        - toon_data: The TOON formatted input data
        - token_savings: Statistics about token savings vs JSON input

    Raises:
        ValueError: If data format is invalid or conversion fails
        RuntimeError: If required dependencies are missing

    Example:
        >>> products = [
        ...     {"id": 1, "name": "Laptop", "price": 999.99, "category": "electronics"},
        ...     {"id": 2, "name": "Book", "price": 19.99, "category": "books"}
        ... ]
        >>> schema = {
        ...     "type": "object",
        ...     "properties": {
        ...         "average_price": {"type": "number"},
        ...         "total_items": {"type": "integer"}
        ...     }
        ... }
        >>> result = extract_from_data(
        ...     data=products,
        ...     question="What is the average price and total number of items?",
        ...     json_schema=schema,
        ...     model_name="openai/gpt-4"
        ... )
        >>> print(result["json_object"])
        {'average_price': 509.99, 'total_items': 2}
    """
    if not question or not question.strip():
        raise ValueError("Question cannot be empty")

    if not json_schema:
        raise ValueError("JSON schema cannot be empty")

    if options is None:
        options = {}

    # Convert data to TOON format (also validates the shape of `data`)
    toon_data = _json_to_toon(data, data_key)

    # Build the JSON baseline from the same array that was encoded as TOON.
    # Without a data_key the converter picks the first non-empty list in the
    # dict, so the comparison must use that list rather than the whole dict.
    if isinstance(data, list):
        baseline = data
    elif data_key:
        baseline = data[data_key]
    else:
        baseline = next(
            (value for value in data.values() if isinstance(value, list) and value),
            data,
        )
    json_data = json.dumps(baseline, indent=2)
    token_savings = _calculate_token_savings(json_data, toon_data)

    # Build the prompt with TOON data
    content_prompt = instruction_template.format(question=question)
    full_prompt = f"{content_prompt}\n\nData (in TOON format):\n{toon_data}"

    # Drop only the provider prefix; model ids may themselves contain '/'
    # (e.g. "lmstudio/deepseek/deepseek-r1-0528-qwen3-8b").
    _, separator, model_id = model_name.partition('/')

    # Call the LLM
    result = ask_for_json(
        driver=get_driver_for_model(model_name),
        content_prompt=full_prompt,
        json_schema=json_schema,
        ai_cleanup=ai_cleanup,
        model_name=model_id if separator else model_name,
        options=options,
        output_format="json"  # Always return JSON, not TOON
    )

    # Add our additional data to the result
    result["toon_data"] = toon_data
    result["token_savings"] = token_savings

    return result
|
|
1097
|
+
|
|
1098
|
+
|
|
1099
|
+
def extract_from_pandas(
    df,  # pandas.DataFrame - optional import
    question: str,
    json_schema: Dict[str, Any],
    *,
    model_name: str,
    instruction_template: str = "Analyze the following data and answer: {question}",
    ai_cleanup: bool = True,
    options: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
    """Extract information from Pandas DataFrame by converting to TOON format for token efficiency.

    This function takes a Pandas DataFrame, converts it to TOON format to reduce tokens,
    sends it to the LLM with a question, and returns the JSON response.

    Args:
        df: Pandas DataFrame to analyze
        question: The question to ask about the data
        json_schema: Expected JSON schema for the response
        model_name: Model identifier in format "provider/model" (e.g., "openai/gpt-4")
        instruction_template: Template with {question} placeholder
        ai_cleanup: Whether to attempt AI-based cleanup if JSON parsing fails
        options: Additional options to pass to the driver

    Returns:
        The dict returned by ``ask_for_json`` with three keys added:
        - json_object: The parsed JSON response
        - json_string: The JSON string response
        - usage: Token usage and cost information
        - toon_data: The TOON formatted input data
        - token_savings: Statistics about token savings vs JSON input
        - dataframe_info: Shape, column names and dtype names of the DataFrame

    Raises:
        ValueError: If DataFrame is invalid or conversion fails
        RuntimeError: If required dependencies are missing

    Example:
        >>> import pandas as pd
        >>> df = pd.DataFrame([
        ...     {"id": 1, "name": "Laptop", "price": 999.99, "category": "electronics"},
        ...     {"id": 2, "name": "Book", "price": 19.99, "category": "books"}
        ... ])
        >>> schema = {
        ...     "type": "object",
        ...     "properties": {
        ...         "highest_priced_item": {"type": "string"},
        ...         "price_range": {"type": "number"}
        ...     }
        ... }
        >>> result = extract_from_pandas(
        ...     df=df,
        ...     question="What is the highest priced item and price range?",
        ...     json_schema=schema,
        ...     model_name="openai/gpt-4"
        ... )
        >>> print(result["json_object"])
        {'highest_priced_item': 'Laptop', 'price_range': 980.0}
    """
    if not question or not question.strip():
        raise ValueError("Question cannot be empty")

    if not json_schema:
        raise ValueError("JSON schema cannot be empty")

    if options is None:
        options = {}

    # Convert DataFrame to TOON format (also validates `df`)
    toon_data = _dataframe_to_toon(df)

    # Calculate token savings (for comparison with JSON)
    json_data = df.to_json(indent=2, orient='records')
    token_savings = _calculate_token_savings(json_data, toon_data)

    # Get basic DataFrame info
    dataframe_info = {
        "shape": df.shape,
        "columns": list(df.columns),
        "dtypes": {col: str(dtype) for col, dtype in df.dtypes.items()}
    }

    # Build the prompt with TOON data
    content_prompt = instruction_template.format(question=question)
    full_prompt = f"{content_prompt}\n\nData (in TOON format):\n{toon_data}"

    # Drop only the provider prefix; model ids may themselves contain '/'
    # (e.g. "lmstudio/deepseek/deepseek-r1-0528-qwen3-8b").
    _, separator, model_id = model_name.partition('/')

    # Call the LLM
    result = ask_for_json(
        driver=get_driver_for_model(model_name),
        content_prompt=full_prompt,
        json_schema=json_schema,
        ai_cleanup=ai_cleanup,
        model_name=model_id if separator else model_name,
        options=options,
        output_format="json"  # Always return JSON, not TOON
    )

    # Add our additional data to the result
    result["toon_data"] = toon_data
    result["token_savings"] = token_savings
    result["dataframe_info"] = dataframe_info

    return result
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: prompture
|
|
3
|
-
Version: 0.0.
|
|
3
|
+
Version: 0.0.31.dev1
|
|
4
4
|
Summary: Ask LLMs to return structured JSON and run cross-model tests. API-first.
|
|
5
5
|
Home-page: https://github.com/jhd3197/prompture
|
|
6
6
|
Author: Juan Denis
|
|
@@ -18,9 +18,11 @@ Requires-Dist: groq>=0.4.0
|
|
|
18
18
|
Requires-Dist: httpx>=0.25.0
|
|
19
19
|
Requires-Dist: jsonschema>=4.0
|
|
20
20
|
Requires-Dist: openai>=1.0.0
|
|
21
|
+
Requires-Dist: pandas>=1.3.0
|
|
21
22
|
Requires-Dist: pydantic>=1.10
|
|
22
23
|
Requires-Dist: pydantic-settings>=2.0
|
|
23
24
|
Requires-Dist: python-dotenv>=0.19.0
|
|
25
|
+
Requires-Dist: python-toon>=0.1.0
|
|
24
26
|
Requires-Dist: requests>=2.28
|
|
25
27
|
Requires-Dist: python-dateutil>=2.9.0
|
|
26
28
|
Requires-Dist: tukuy>=0.0.6
|
|
@@ -53,6 +55,7 @@ Dynamic: summary
|
|
|
53
55
|
## ✨ Features
|
|
54
56
|
|
|
55
57
|
- ✅ **Structured output** → JSON schema enforcement, or direct **Pydantic** instances
|
|
58
|
+
- ✅ **TOON input conversion** → 45-60% token savings for structured data analysis with `extract_from_data()` and `extract_from_pandas()`
|
|
56
59
|
- ✅ **Stepwise extraction** → Per-field prompts, with smart type conversion (incl. shorthand numbers)
|
|
57
60
|
- ✅ **Multi-driver** → OpenAI, Azure, Claude, Ollama, LM Studio, Google, Groq, OpenRouter, Grok, HTTP, Mock, HuggingFace (via `get_driver()`)
|
|
58
61
|
- ✅ **Usage & cost** → Token + $ tracking on every call (`usage` from driver meta)
|
|
@@ -145,6 +148,55 @@ print(person.dict())
|
|
|
145
148
|
|
|
146
149
|
**Why start here?** It's fast (one call), cost-efficient, and returns a validated Pydantic instance.
|
|
147
150
|
|
|
151
|
+
|
|
152
|
+
## 🚀 TOON Input Conversion: 45-60% Token Savings
|
|
153
|
+
|
|
154
|
+
Analyze structured data with automatic TOON (Token-Oriented Object Notation) conversion for massive token savings.
|
|
155
|
+
|
|
156
|
+
```python
|
|
157
|
+
from prompture import extract_from_data, extract_from_pandas
|
|
158
|
+
|
|
159
|
+
# Your product data
|
|
160
|
+
products = [
|
|
161
|
+
{"id": 1, "name": "Laptop", "price": 999.99, "rating": 4.5},
|
|
162
|
+
{"id": 2, "name": "Book", "price": 19.99, "rating": 4.2},
|
|
163
|
+
{"id": 3, "name": "Headphones", "price": 149.99, "rating": 4.7}
|
|
164
|
+
]
|
|
165
|
+
|
|
166
|
+
# Ask questions about your data - automatically uses TOON format for 60%+ token savings
|
|
167
|
+
result = extract_from_data(
|
|
168
|
+
data=products,
|
|
169
|
+
question="What is the average price and highest rated product?",
|
|
170
|
+
json_schema={
|
|
171
|
+
"type": "object",
|
|
172
|
+
"properties": {
|
|
173
|
+
"average_price": {"type": "number"},
|
|
174
|
+
"highest_rated": {"type": "string"}
|
|
175
|
+
}
|
|
176
|
+
},
|
|
177
|
+
model_name="openai/gpt-4"
|
|
178
|
+
)
|
|
179
|
+
|
|
180
|
+
print(result["json_object"])
|
|
181
|
+
# {"average_price": 389.96, "highest_rated": "Headphones"}
|
|
182
|
+
|
|
183
|
+
print(f"Token savings: {result['token_savings']['percentage_saved']}%")
|
|
184
|
+
# Token savings: 62.3%
|
|
185
|
+
|
|
186
|
+
# Works with Pandas DataFrames too!
|
|
187
|
+
import pandas as pd
|
|
188
|
+
df = pd.DataFrame(products)
|
|
189
|
+
result = extract_from_pandas(df=df, question="...", json_schema=schema, model_name="openai/gpt-4")
|
|
190
|
+
```
|
|
191
|
+
|
|
192
|
+
**Preview token savings without LLM calls:**
|
|
193
|
+
```bash
|
|
194
|
+
python examples/token_comparison_utility.py
|
|
195
|
+
```
|
|
196
|
+
|
|
197
|
+
> **Note:** Both `python-toon` and `pandas` are now included by default when you install Prompture!
|
|
198
|
+
|
|
199
|
+
---
|
|
148
200
|
---
|
|
149
201
|
|
|
150
202
|
## 📋 Field Definitions
|
|
@@ -241,7 +293,7 @@ print(resp2["json_object"], resp2["usage"])
|
|
|
241
293
|
|
|
242
294
|
### Experimental TOON output
|
|
243
295
|
|
|
244
|
-
Prompture can ask for TOON (Token-Oriented Object Notation) instead of JSON by setting `output_format="toon"` on `ask_for_json`, `extract_and_jsonify`, `manual_extract_and_jsonify`, or `extract_with_model`.
|
|
296
|
+
Prompture can return TOON (Token-Oriented Object Notation) output alongside JSON when you set `output_format="toon"` on `ask_for_json`, `extract_and_jsonify`, `manual_extract_and_jsonify`, or `extract_with_model`. The LLM is still instructed to return JSON (for reliability); Prompture parses that response and generates the TOON string locally via `python-toon`.
|
|
245
297
|
|
|
246
298
|
```python
|
|
247
299
|
result = extract_and_jsonify(
|
|
@@ -250,8 +302,9 @@ result = extract_and_jsonify(
|
|
|
250
302
|
model_name="lmstudio/deepseek/deepseek-r1-0528-qwen3-8b",
|
|
251
303
|
output_format="toon",
|
|
252
304
|
)
|
|
253
|
-
print(result["
|
|
254
|
-
print(result["json_object"]) # regular dict
|
|
305
|
+
print(result["toon_string"]) # TOON text generated locally
|
|
306
|
+
print(result["json_object"]) # regular dict parsed from the JSON response
|
|
307
|
+
# result["json_string"] still contains the original JSON text
|
|
255
308
|
```
|
|
256
309
|
|
|
257
310
|
> [!IMPORTANT]
|
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
prompture/__init__.py,sha256=
|
|
1
|
+
prompture/__init__.py,sha256=aWlRheKlKjFh2v0FifG52hgEXthsFZLcMd7qlYzdnoc,2099
|
|
2
2
|
prompture/cli.py,sha256=vA86GNjtKSHz8eRMl5YDaT9HHIWuhkeJtfx8jqTaqtM,809
|
|
3
|
-
prompture/core.py,sha256=
|
|
3
|
+
prompture/core.py,sha256=Er08vBlkL7Ho4LmqR6-B9q7ILLNMJB059yx6TBg_Xu4,50448
|
|
4
4
|
prompture/driver.py,sha256=w8pdXHujImIGF3ee8rkG8f6-UD0h2jLHhucSPInRrYI,989
|
|
5
5
|
prompture/field_definitions.py,sha256=6kDMYNedccTK5l2L_I8_NI3_av-iYHqGPwkKDy8214c,21731
|
|
6
6
|
prompture/runner.py,sha256=5xwal3iBQQj4_q7l3Rjr0e3RrUMJPaPDLiEchO0mmHo,4192
|
|
@@ -19,9 +19,9 @@ prompture/drivers/local_http_driver.py,sha256=S2diikvtQOQHF7fB07zU2X0QWkej4Of__r
|
|
|
19
19
|
prompture/drivers/ollama_driver.py,sha256=fq_eFgwmCT3SK1D-ICHjxLjcm_An0suwkFIWC38xsS0,4681
|
|
20
20
|
prompture/drivers/openai_driver.py,sha256=9q9OjQslquRFvIl1Hd9JVmFFFVh6OBIWrFulw1mkYWg,3976
|
|
21
21
|
prompture/drivers/openrouter_driver.py,sha256=GKvLOFDhsyopH-k3iaD3VWllm7xbGuopRSA02MfCKoM,5031
|
|
22
|
-
prompture-0.0.
|
|
23
|
-
prompture-0.0.
|
|
24
|
-
prompture-0.0.
|
|
25
|
-
prompture-0.0.
|
|
26
|
-
prompture-0.0.
|
|
27
|
-
prompture-0.0.
|
|
22
|
+
prompture-0.0.31.dev1.dist-info/licenses/LICENSE,sha256=0HgDepH7aaHNFhHF-iXuW6_GqDfYPnVkjtiCAZ4yS8I,1060
|
|
23
|
+
prompture-0.0.31.dev1.dist-info/METADATA,sha256=jjKlX4Bjrf5zA3_QJcMwBcO0fVFn8eIVkjIcJZslTCI,17320
|
|
24
|
+
prompture-0.0.31.dev1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
25
|
+
prompture-0.0.31.dev1.dist-info/entry_points.txt,sha256=AFPG3lJR86g4IJMoWQUW5Ph7G6MLNWG3A2u2Tp9zkp8,48
|
|
26
|
+
prompture-0.0.31.dev1.dist-info/top_level.txt,sha256=to86zq_kjfdoLeAxQNr420UWqT0WzkKoZ509J7Qr2t4,10
|
|
27
|
+
prompture-0.0.31.dev1.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|