praisonaiagents 0.0.127__py3-none-any.whl → 0.0.129__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,7 @@
  import logging
  import os
  import warnings
+ import re
  from typing import Any, Dict, List, Optional, Union, Literal, Callable
  from pydantic import BaseModel
  import time
@@ -87,6 +88,10 @@ class LLM:
  "llama-3.2-90b-text-preview": 6144 # 8,192 actual
  }

+ # Ollama-specific prompt constants
+ OLLAMA_TOOL_USAGE_PROMPT = "Please analyze the request and use the available tools to help answer the question. Start by identifying what information you need."
+ OLLAMA_FINAL_ANSWER_PROMPT = "Based on the tool results above, please provide the final answer to the original question."
+
  def _log_llm_config(self, method_name: str, **config):
  """Centralized debug logging for LLM configuration and parameters.

@@ -277,15 +282,32 @@ class LLM:
  # Direct ollama/ prefix
  if self.model.startswith("ollama/"):
  return True
+
+ # Check base_url if provided
+ if self.base_url and "ollama" in self.base_url.lower():
+ return True

  # Check environment variables for Ollama base URL
  base_url = os.getenv("OPENAI_BASE_URL", "")
  api_base = os.getenv("OPENAI_API_BASE", "")

- # Common Ollama endpoints
- ollama_endpoints = ["localhost:11434", "127.0.0.1:11434", ":11434"]
+ # Common Ollama endpoints (including custom ports)
+ if any(url and ("ollama" in url.lower() or ":11434" in url)
+ for url in [base_url, api_base, self.base_url or ""]):
+ return True

- return any(endpoint in base_url or endpoint in api_base for endpoint in ollama_endpoints)
+ return False
+
+ def _format_ollama_tool_result_message(self, function_name: str, tool_result: Any) -> Dict[str, str]:
+ """
+ Format tool result message for Ollama provider.
+ Simplified approach without hardcoded regex extraction.
+ """
+ tool_result_str = str(tool_result)
+ return {
+ "role": "user",
+ "content": f"The {function_name} function returned: {tool_result_str}"
+ }

  def _process_stream_delta(self, delta, response_text: str, tool_calls: List[Dict], formatted_tools: Optional[List] = None) -> tuple:
  """
@@ -422,13 +444,22 @@ class LLM:
  """
  messages = []

+ # Check if this is a Gemini model that supports native structured outputs
+ is_gemini_with_structured_output = False
+ if output_json or output_pydantic:
+ from .model_capabilities import supports_structured_outputs
+ is_gemini_with_structured_output = (
+ self._is_gemini_model() and
+ supports_structured_outputs(self.model)
+ )
+
  # Handle system prompt
  if system_prompt:
- # Append JSON schema if needed
- if output_json:
- system_prompt += f"\nReturn ONLY a JSON object that matches this Pydantic model: {json.dumps(output_json.model_json_schema())}"
- elif output_pydantic:
- system_prompt += f"\nReturn ONLY a JSON object that matches this Pydantic model: {json.dumps(output_pydantic.model_json_schema())}"
+ # Only append JSON schema for non-Gemini models or Gemini models without structured output support
+ if (output_json or output_pydantic) and not is_gemini_with_structured_output:
+ schema_model = output_json or output_pydantic
+ if schema_model and hasattr(schema_model, 'model_json_schema'):
+ system_prompt += f"\nReturn ONLY a JSON object that matches this Pydantic model: {json.dumps(schema_model.model_json_schema())}"

  # Skip system messages for legacy o1 models as they don't support them
  if not self._needs_system_message_skip():
@@ -440,7 +471,8 @@ class LLM:

  # Handle prompt modifications for JSON output
  original_prompt = prompt
- if output_json or output_pydantic:
+ if (output_json or output_pydantic) and not is_gemini_with_structured_output:
+ # Only modify prompt for non-Gemini models
  if isinstance(prompt, str):
  prompt = prompt + "\nReturn ONLY a valid JSON object. No other text or explanation."
  elif isinstance(prompt, list):
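
As a rough illustration of the fallback path above, where the Pydantic JSON schema is appended to the system prompt for models without native structured-output support, here is a self-contained sketch; the `Answer` model is invented for the example and is not part of the package:

    import json
    from pydantic import BaseModel

    class Answer(BaseModel):  # example model, stands in for output_json / output_pydantic
        title: str
        score: float

    system_prompt = "You are a helpful assistant."
    schema_model = Answer
    if hasattr(schema_model, "model_json_schema"):
        system_prompt += (
            "\nReturn ONLY a JSON object that matches this Pydantic model: "
            f"{json.dumps(schema_model.model_json_schema())}"
        )
    print(system_prompt)

Gemini models that pass `supports_structured_outputs` skip this prompt mutation and rely on the native schema parameters instead.
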
@@ -660,6 +692,7 @@ class LLM:

  start_time = time.time()
  reflection_count = 0
+ interaction_displayed = False # Track if interaction has been displayed

  # Display initial instruction once
  if verbose:
@@ -695,6 +728,8 @@ class LLM:
  temperature=temperature,
  stream=False, # force non-streaming
  tools=formatted_tools,
+ output_json=output_json,
+ output_pydantic=output_pydantic,
  **{k:v for k,v in kwargs.items() if k != 'reasoning_steps'}
  )
  )
@@ -703,7 +738,7 @@ class LLM:
  final_response = resp

  # Optionally display reasoning if present
- if verbose and reasoning_content:
+ if verbose and reasoning_content and not interaction_displayed:
  display_interaction(
  original_prompt,
  f"Reasoning:\n{reasoning_content}\n\nAnswer:\n{response_text}",
@@ -711,7 +746,8 @@ class LLM:
  generation_time=time.time() - current_time,
  console=console
  )
- else:
+ interaction_displayed = True
+ elif verbose and not interaction_displayed:
  display_interaction(
  original_prompt,
  response_text,
@@ -719,6 +755,7 @@ class LLM:
  generation_time=time.time() - current_time,
  console=console
  )
+ interaction_displayed = True

  # Otherwise do the existing streaming approach
  else:
@@ -741,6 +778,8 @@ class LLM:
  tools=formatted_tools,
  temperature=temperature,
  stream=True,
+ output_json=output_json,
+ output_pydantic=output_pydantic,
  **kwargs
  )
  ):
@@ -760,6 +799,8 @@ class LLM:
  tools=formatted_tools,
  temperature=temperature,
  stream=True,
+ output_json=output_json,
+ output_pydantic=output_pydantic,
  **kwargs
  )
  ):
@@ -772,7 +813,7 @@ class LLM:
  if formatted_tools and self._supports_streaming_tools():
  tool_calls = self._process_tool_calls_from_stream(delta, tool_calls)

- response_text = response_text.strip() if response_text else "" if response_text else "" if response_text else "" if response_text else ""
+ response_text = response_text.strip() if response_text else ""

  # Create a mock final_response with the captured data
  final_response = {
@@ -791,12 +832,14 @@ class LLM:
  tools=formatted_tools,
  temperature=temperature,
  stream=False,
+ output_json=output_json,
+ output_pydantic=output_pydantic,
  **kwargs
  )
  )
  response_text = final_response["choices"][0]["message"]["content"]

- if verbose:
+ if verbose and not interaction_displayed:
  # Display the complete response at once
  display_interaction(
  original_prompt,
@@ -805,18 +848,37 @@ class LLM:
  generation_time=time.time() - current_time,
  console=console
  )
+ interaction_displayed = True

  tool_calls = final_response["choices"][0]["message"].get("tool_calls")

+ # For Ollama, if response is empty but we have tools, prompt for tool usage
+ if self._is_ollama_provider() and (not response_text or response_text.strip() == "") and formatted_tools and iteration_count == 0:
+ messages.append({
+ "role": "user",
+ "content": self.OLLAMA_TOOL_USAGE_PROMPT
+ })
+ iteration_count += 1
+ continue
+
  # Handle tool calls - Sequential tool calling logic
  if tool_calls and execute_tool_fn:
  # Convert tool_calls to a serializable format for all providers
  serializable_tool_calls = self._serialize_tool_calls(tool_calls)
- messages.append({
- "role": "assistant",
- "content": response_text,
- "tool_calls": serializable_tool_calls
- })
+ # Check if this is Ollama provider
+ if self._is_ollama_provider():
+ # For Ollama, only include role and content
+ messages.append({
+ "role": "assistant",
+ "content": response_text
+ })
+ else:
+ # For other providers, include tool_calls
+ messages.append({
+ "role": "assistant",
+ "content": response_text,
+ "tool_calls": serializable_tool_calls
+ })

  should_continue = False
  tool_results = [] # Store all tool results
@@ -842,11 +904,17 @@ class LLM:
  logging.debug(f"[TOOL_EXEC_DEBUG] About to display tool call with message: {display_message}")
  display_tool_call(display_message, console=console)

- messages.append({
- "role": "tool",
- "tool_call_id": tool_call_id,
- "content": json.dumps(tool_result) if tool_result is not None else "Function returned an empty output"
- })
+ # Check if this is Ollama provider
+ if self._is_ollama_provider():
+ # For Ollama, use user role and format as natural language
+ messages.append(self._format_ollama_tool_result_message(function_name, tool_result))
+ else:
+ # For other providers, use tool role with tool_call_id
+ messages.append({
+ "role": "tool",
+ "tool_call_id": tool_call_id,
+ "content": json.dumps(tool_result) if tool_result is not None else "Function returned an empty output"
+ })

  # Check if we should continue (for tools like sequential thinking)
  # This mimics the logic from agent.py lines 1004-1007
@@ -858,100 +926,12 @@ class LLM:
  iteration_count += 1
  continue

- # Special handling for Ollama models that don't automatically process tool results
- ollama_handled = False
- ollama_params = self._handle_ollama_model(response_text, tool_results, messages, original_prompt)
-
- if ollama_params:
- # Get response based on streaming mode
- if stream:
- # Streaming approach
- if verbose:
- with Live(display_generating("", start_time), console=console, refresh_per_second=4) as live:
- response_text = ""
- for chunk in litellm.completion(
- **self._build_completion_params(
- messages=ollama_params["follow_up_messages"],
- temperature=temperature,
- stream=True
- )
- ):
- if chunk and chunk.choices and chunk.choices[0].delta.content:
- content = chunk.choices[0].delta.content
- response_text += content
- live.update(display_generating(response_text, start_time))
- else:
- response_text = ""
- for chunk in litellm.completion(
- **self._build_completion_params(
- messages=ollama_params["follow_up_messages"],
- temperature=temperature,
- stream=True
- )
- ):
- if chunk and chunk.choices and chunk.choices[0].delta.content:
- response_text += chunk.choices[0].delta.content
- else:
- # Non-streaming approach
- resp = litellm.completion(
- **self._build_completion_params(
- messages=ollama_params["follow_up_messages"],
- temperature=temperature,
- stream=False
- )
- )
- response_text = resp.get("choices", [{}])[0].get("message", {}).get("content", "") or ""
-
- # Set flag to indicate Ollama was handled
- ollama_handled = True
- final_response_text = response_text.strip() if response_text else ""
- logging.debug(f"[OLLAMA_DEBUG] Ollama follow-up response: {final_response_text[:200]}...")
-
- # Display the response if we got one
- if final_response_text and verbose:
- display_interaction(
- ollama_params["original_prompt"],
- final_response_text,
- markdown=markdown,
- generation_time=time.time() - start_time,
- console=console
- )
-
- # Update messages and continue the loop instead of returning
- if final_response_text:
- # Update messages with the response to maintain conversation context
- messages.append({
- "role": "assistant",
- "content": final_response_text
- })
- # Continue the loop to check if more tools are needed
- iteration_count += 1
- continue
- else:
- logging.warning("[OLLAMA_DEBUG] Ollama follow-up returned empty response")
-
- # Handle reasoning_steps after tool execution if not already handled by Ollama
- if reasoning_steps and not ollama_handled:
- # Make a non-streaming call to capture reasoning content
- reasoning_resp = litellm.completion(
- **self._build_completion_params(
- messages=messages,
- temperature=temperature,
- stream=False, # force non-streaming
- **{k:v for k,v in kwargs.items() if k != 'reasoning_steps'}
- )
- )
- reasoning_content = reasoning_resp["choices"][0]["message"].get("provider_specific_fields", {}).get("reasoning_content")
- response_text = reasoning_resp["choices"][0]["message"]["content"]
-
- # Store reasoning content for later use
- if reasoning_content:
- stored_reasoning_content = reasoning_content
-
- # Update messages with the response
+ # For Ollama, add explicit prompt if we need a final answer
+ if self._is_ollama_provider() and iteration_count > 0:
+ # Add an explicit prompt for Ollama to generate the final answer
  messages.append({
- "role": "assistant",
- "content": response_text
+ "role": "user",
+ "content": self.OLLAMA_FINAL_ANSWER_PROMPT
  })

  # After tool execution, continue the loop to check if more tools are needed
@@ -974,7 +954,7 @@ class LLM:
  return final_response_text

  # No tool calls were made in this iteration, return the response
- if verbose:
+ if verbose and not interaction_displayed:
  # If we have stored reasoning content from tool execution, display it
  if stored_reasoning_content:
  display_interaction(
@@ -992,6 +972,7 @@ class LLM:
  generation_time=time.time() - start_time,
  console=console
  )
+ interaction_displayed = True

  response_text = response_text.strip() if response_text else ""

@@ -1003,15 +984,17 @@ class LLM:
  if output_json or output_pydantic:
  self.chat_history.append({"role": "user", "content": original_prompt})
  self.chat_history.append({"role": "assistant", "content": response_text})
- if verbose:
+ if verbose and not interaction_displayed:
  display_interaction(original_prompt, response_text, markdown=markdown,
  generation_time=time.time() - start_time, console=console)
+ interaction_displayed = True
  return response_text

  if not self_reflect:
- if verbose:
+ if verbose and not interaction_displayed:
  display_interaction(original_prompt, response_text, markdown=markdown,
  generation_time=time.time() - start_time, console=console)
+ interaction_displayed = True
  # Return reasoning content if reasoning_steps is True
  if reasoning_steps and stored_reasoning_content:
  return stored_reasoning_content
@@ -1040,6 +1023,8 @@ Output MUST be JSON with 'reflection' and 'satisfactory'.
  temperature=temperature,
  stream=False, # Force non-streaming
  response_format={"type": "json_object"},
+ output_json=output_json,
+ output_pydantic=output_pydantic,
  **{k:v for k,v in kwargs.items() if k != 'reasoning_steps'}
  )
  )
@@ -1075,6 +1060,8 @@ Output MUST be JSON with 'reflection' and 'satisfactory'.
  temperature=temperature,
  stream=stream,
  response_format={"type": "json_object"},
+ output_json=output_json,
+ output_pydantic=output_pydantic,
  **{k:v for k,v in kwargs.items() if k != 'reasoning_steps'}
  )
  ):
@@ -1090,6 +1077,8 @@ Output MUST be JSON with 'reflection' and 'satisfactory'.
  temperature=temperature,
  stream=stream,
  response_format={"type": "json_object"},
+ output_json=output_json,
+ output_pydantic=output_pydantic,
  **{k:v for k,v in kwargs.items() if k != 'reasoning_steps'}
  )
  ):
@@ -1107,15 +1096,17 @@ Output MUST be JSON with 'reflection' and 'satisfactory'.
  )

  if satisfactory and reflection_count >= min_reflect - 1:
- if verbose:
+ if verbose and not interaction_displayed:
  display_interaction(prompt, response_text, markdown=markdown,
  generation_time=time.time() - start_time, console=console)
+ interaction_displayed = True
  return response_text

  if reflection_count >= max_reflect - 1:
- if verbose:
+ if verbose and not interaction_displayed:
  display_interaction(prompt, response_text, markdown=markdown,
  generation_time=time.time() - start_time, console=console)
+ interaction_displayed = True
  return response_text

  reflection_count += 1
@@ -1135,6 +1126,8 @@ Output MUST be JSON with 'reflection' and 'satisfactory'.
  messages=messages,
  temperature=temperature,
  stream=True,
+ output_json=output_json,
+ output_pydantic=output_pydantic,
  **kwargs
  )
  ):
@@ -1149,21 +1142,24 @@ Output MUST be JSON with 'reflection' and 'satisfactory'.
  messages=messages,
  temperature=temperature,
  stream=True,
+ output_json=output_json,
+ output_pydantic=output_pydantic,
  **kwargs
  )
  ):
  if chunk and chunk.choices and chunk.choices[0].delta.content:
  response_text += chunk.choices[0].delta.content

- response_text = response_text.strip() if response_text else "" if response_text else ""
+ response_text = response_text.strip() if response_text else ""
  continue

  except json.JSONDecodeError:
  reflection_count += 1
  if reflection_count >= max_reflect:
- if verbose:
+ if verbose and not interaction_displayed:
  display_interaction(prompt, response_text, markdown=markdown,
  generation_time=time.time() - start_time, console=console)
+ interaction_displayed = True
  return response_text
  continue
  except Exception as e:
@@ -1171,9 +1167,10 @@ Output MUST be JSON with 'reflection' and 'satisfactory'.
  return None

  # If we've exhausted reflection attempts
- if verbose:
+ if verbose and not interaction_displayed:
  display_interaction(prompt, response_text, markdown=markdown,
  generation_time=time.time() - start_time, console=console)
+ interaction_displayed = True
  return response_text

  except Exception as error:
@@ -1185,6 +1182,12 @@ Output MUST be JSON with 'reflection' and 'satisfactory'.
  total_time = time.time() - start_time
  logging.debug(f"get_response completed in {total_time:.2f} seconds")

+ def _is_gemini_model(self) -> bool:
+ """Check if the model is a Gemini model."""
+ if not self.model:
+ return False
+ return any(prefix in self.model.lower() for prefix in ['gemini', 'gemini/', 'google/gemini'])
+
  async def get_response_async(
  self,
  prompt: Union[str, List[Dict]],
@@ -1273,6 +1276,7 @@ Output MUST be JSON with 'reflection' and 'satisfactory'.

  start_time = time.time()
  reflection_count = 0
+ interaction_displayed = False # Track if interaction has been displayed

  # Format tools for LiteLLM using the shared helper
  formatted_tools = self._format_tools_for_litellm(tools)
@@ -1293,15 +1297,17 @@ Output MUST be JSON with 'reflection' and 'satisfactory'.
  resp = await litellm.acompletion(
  **self._build_completion_params(
  messages=messages,
- temperature=temperature,
- stream=False, # force non-streaming
- **{k:v for k,v in kwargs.items() if k != 'reasoning_steps'}
- )
+ temperature=temperature,
+ stream=False, # force non-streaming
+ output_json=output_json,
+ output_pydantic=output_pydantic,
+ **{k:v for k,v in kwargs.items() if k != 'reasoning_steps'}
+ )
  )
  reasoning_content = resp["choices"][0]["message"].get("provider_specific_fields", {}).get("reasoning_content")
  response_text = resp["choices"][0]["message"]["content"]

- if verbose and reasoning_content:
+ if verbose and reasoning_content and not interaction_displayed:
  display_interaction(
  "Initial reasoning:",
  f"Reasoning:\n{reasoning_content}\n\nAnswer:\n{response_text}",
@@ -1309,7 +1315,8 @@ Output MUST be JSON with 'reflection' and 'satisfactory'.
  generation_time=time.time() - start_time,
  console=console
  )
- elif verbose:
+ interaction_displayed = True
+ elif verbose and not interaction_displayed:
  display_interaction(
  "Initial response:",
  response_text,
@@ -1317,6 +1324,7 @@ Output MUST be JSON with 'reflection' and 'satisfactory'.
  generation_time=time.time() - start_time,
  console=console
  )
+ interaction_displayed = True
  else:
  # Determine if we should use streaming based on tool support
  use_streaming = stream
@@ -1335,6 +1343,8 @@ Output MUST be JSON with 'reflection' and 'satisfactory'.
  temperature=temperature,
  stream=True,
  tools=formatted_tools,
+ output_json=output_json,
+ output_pydantic=output_pydantic,
  **kwargs
  )
  ):
@@ -1355,6 +1365,8 @@ Output MUST be JSON with 'reflection' and 'satisfactory'.
  temperature=temperature,
  stream=True,
  tools=formatted_tools,
+ output_json=output_json,
+ output_pydantic=output_pydantic,
  **kwargs
  )
  ):
@@ -1367,7 +1379,7 @@ Output MUST be JSON with 'reflection' and 'satisfactory'.
  if formatted_tools and self._supports_streaming_tools():
  tool_calls = self._process_tool_calls_from_stream(delta, tool_calls)

- response_text = response_text.strip() if response_text else "" if response_text else "" if response_text else ""
+ response_text = response_text.strip() if response_text else ""

  # We already have tool_calls from streaming if supported
  # No need for a second API call!
@@ -1379,13 +1391,15 @@ Output MUST be JSON with 'reflection' and 'satisfactory'.
  temperature=temperature,
  stream=False,
  tools=formatted_tools,
+ output_json=output_json,
+ output_pydantic=output_pydantic,
  **{k:v for k,v in kwargs.items() if k != 'reasoning_steps'}
  )
  )
  response_text = tool_response.choices[0].message.get("content", "")
  tool_calls = tool_response.choices[0].message.get("tool_calls", [])

- if verbose:
+ if verbose and not interaction_displayed:
  # Display the complete response at once
  display_interaction(
  original_prompt,
@@ -1394,16 +1408,35 @@ Output MUST be JSON with 'reflection' and 'satisfactory'.
  generation_time=time.time() - start_time,
  console=console
  )
+ interaction_displayed = True

+ # For Ollama, if response is empty but we have tools, prompt for tool usage
+ if self._is_ollama_provider() and (not response_text or response_text.strip() == "") and formatted_tools and iteration_count == 0:
+ messages.append({
+ "role": "user",
+ "content": self.OLLAMA_TOOL_USAGE_PROMPT
+ })
+ iteration_count += 1
+ continue
+
  # Now handle tools if we have them (either from streaming or non-streaming)
  if tools and execute_tool_fn and tool_calls:
  # Convert tool_calls to a serializable format for all providers
  serializable_tool_calls = self._serialize_tool_calls(tool_calls)
- messages.append({
- "role": "assistant",
- "content": response_text,
- "tool_calls": serializable_tool_calls
- })
+ # Check if it's Ollama provider
+ if self._is_ollama_provider():
+ # For Ollama, only include role and content
+ messages.append({
+ "role": "assistant",
+ "content": response_text
+ })
+ else:
+ # For other providers, include tool_calls
+ messages.append({
+ "role": "assistant",
+ "content": response_text,
+ "tool_calls": serializable_tool_calls
+ })

  tool_results = [] # Store all tool results
  for tool_call in tool_calls:
@@ -1421,77 +1454,31 @@ Output MUST be JSON with 'reflection' and 'satisfactory'.
  else:
  display_message += "Function returned no output"
  display_tool_call(display_message, console=console)
+ # Check if it's Ollama provider
+ if self._is_ollama_provider():
+ # For Ollama, use user role and format as natural language
+ messages.append(self._format_ollama_tool_result_message(function_name, tool_result))
+ else:
+ # For other providers, use tool role with tool_call_id
+ messages.append({
+ "role": "tool",
+ "tool_call_id": tool_call_id,
+ "content": json.dumps(tool_result) if tool_result is not None else "Function returned an empty output"
+ })
+
+ # For Ollama, add explicit prompt if we need a final answer
+ if self._is_ollama_provider() and iteration_count > 0:
+ # Add an explicit prompt for Ollama to generate the final answer
  messages.append({
- "role": "tool",
- "tool_call_id": tool_call_id,
- "content": json.dumps(tool_result) if tool_result is not None else "Function returned an empty output"
+ "role": "user",
+ "content": self.OLLAMA_FINAL_ANSWER_PROMPT
  })
-
+
  # Get response after tool calls
  response_text = ""

- # Special handling for Ollama models that don't automatically process tool results
- ollama_handled = False
- ollama_params = self._handle_ollama_model(response_text, tool_results, messages, original_prompt)
-
- if ollama_params:
- # Get response with streaming
- if verbose:
- response_text = ""
- async for chunk in await litellm.acompletion(
- **self._build_completion_params(
- messages=ollama_params["follow_up_messages"],
- temperature=temperature,
- stream=stream
- )
- ):
- if chunk and chunk.choices and chunk.choices[0].delta.content:
- content = chunk.choices[0].delta.content
- response_text += content
- print("\033[K", end="\r")
- print(f"Processing results... {time.time() - start_time:.1f}s", end="\r")
- else:
- response_text = ""
- async for chunk in await litellm.acompletion(
- **self._build_completion_params(
- messages=ollama_params["follow_up_messages"],
- temperature=temperature,
- stream=stream
- )
- ):
- if chunk and chunk.choices and chunk.choices[0].delta.content:
- response_text += chunk.choices[0].delta.content
-
- # Set flag to indicate Ollama was handled
- ollama_handled = True
- final_response_text = response_text.strip()
- logging.debug(f"[OLLAMA_DEBUG] Ollama follow-up response: {final_response_text[:200]}...")
-
- # Display the response if we got one
- if final_response_text and verbose:
- display_interaction(
- ollama_params["original_prompt"],
- final_response_text,
- markdown=markdown,
- generation_time=time.time() - start_time,
- console=console
- )
-
- # Store the response for potential final return
- if final_response_text:
- # Update messages with the response to maintain conversation context
- messages.append({
- "role": "assistant",
- "content": final_response_text
- })
- # Continue the loop to check if more tools are needed
- iteration_count += 1
- continue
- else:
- logging.warning("[OLLAMA_DEBUG] Ollama follow-up returned empty response")
-
- # If no special handling was needed or if it's not an Ollama model
- if reasoning_steps and not ollama_handled:
+ # If no special handling was needed
+ if reasoning_steps:
  # Non-streaming call to capture reasoning
  resp = await litellm.acompletion(
  **self._build_completion_params(
@@ -1499,13 +1486,15 @@ Output MUST be JSON with 'reflection' and 'satisfactory'.
  temperature=temperature,
  stream=False, # force non-streaming
  tools=formatted_tools, # Include tools
+ output_json=output_json,
+ output_pydantic=output_pydantic,
  **{k:v for k,v in kwargs.items() if k != 'reasoning_steps'}
  )
  )
  reasoning_content = resp["choices"][0]["message"].get("provider_specific_fields", {}).get("reasoning_content")
  response_text = resp["choices"][0]["message"]["content"]

- if verbose and reasoning_content:
+ if verbose and reasoning_content and not interaction_displayed:
  display_interaction(
  "Tool response reasoning:",
  f"Reasoning:\n{reasoning_content}\n\nAnswer:\n{response_text}",
@@ -1513,7 +1502,8 @@ Output MUST be JSON with 'reflection' and 'satisfactory'.
  generation_time=time.time() - start_time,
  console=console
  )
- elif verbose:
+ interaction_displayed = True
+ elif verbose and not interaction_displayed:
  display_interaction(
  "Tool response:",
  response_text,
@@ -1521,7 +1511,8 @@ Output MUST be JSON with 'reflection' and 'satisfactory'.
  generation_time=time.time() - start_time,
  console=console
  )
- elif not ollama_handled:
+ interaction_displayed = True
+ else:
  # Get response after tool calls with streaming if not already handled
  if verbose:
  async for chunk in await litellm.acompletion(
@@ -1530,6 +1521,8 @@ Output MUST be JSON with 'reflection' and 'satisfactory'.
  temperature=temperature,
  stream=stream,
  tools=formatted_tools,
+ output_json=output_json,
+ output_pydantic=output_pydantic,
  **{k:v for k,v in kwargs.items() if k != 'reasoning_steps'}
  )
  ):
@@ -1545,13 +1538,15 @@ Output MUST be JSON with 'reflection' and 'satisfactory'.
  messages=messages,
  temperature=temperature,
  stream=stream,
+ output_json=output_json,
+ output_pydantic=output_pydantic,
  **{k:v for k,v in kwargs.items() if k != 'reasoning_steps'}
  )
  ):
  if chunk and chunk.choices and chunk.choices[0].delta.content:
  response_text += chunk.choices[0].delta.content

- response_text = response_text.strip() if response_text else "" if response_text else ""
+ response_text = response_text.strip() if response_text else ""

  # After tool execution, update messages and continue the loop
  if response_text:
@@ -1578,9 +1573,10 @@ Output MUST be JSON with 'reflection' and 'satisfactory'.
  if output_json or output_pydantic:
  self.chat_history.append({"role": "user", "content": original_prompt})
  self.chat_history.append({"role": "assistant", "content": response_text})
- if verbose:
+ if verbose and not interaction_displayed:
  display_interaction(original_prompt, response_text, markdown=markdown,
  generation_time=time.time() - start_time, console=console)
+ interaction_displayed = True
  return response_text

  if not self_reflect:
@@ -1588,7 +1584,7 @@ Output MUST be JSON with 'reflection' and 'satisfactory'.
  display_text = final_response_text if final_response_text else response_text

  # Display with stored reasoning content if available
- if verbose:
+ if verbose and not interaction_displayed:
  if stored_reasoning_content:
  display_interaction(
  original_prompt,
@@ -1600,6 +1596,7 @@ Output MUST be JSON with 'reflection' and 'satisfactory'.
  else:
  display_interaction(original_prompt, display_text, markdown=markdown,
  generation_time=time.time() - start_time, console=console)
+ interaction_displayed = True

  # Return reasoning content if reasoning_steps is True and we have it
  if reasoning_steps and stored_reasoning_content:
@@ -1627,6 +1624,8 @@ Output MUST be JSON with 'reflection' and 'satisfactory'.
  temperature=temperature,
  stream=False, # Force non-streaming
  response_format={"type": "json_object"},
+ output_json=output_json,
+ output_pydantic=output_pydantic,
  **{k:v for k,v in kwargs.items() if k != 'reasoning_steps'}
  )
  )
@@ -1662,6 +1661,8 @@ Output MUST be JSON with 'reflection' and 'satisfactory'.
  temperature=temperature,
  stream=stream,
  response_format={"type": "json_object"},
+ output_json=output_json,
+ output_pydantic=output_pydantic,
  **{k:v for k,v in kwargs.items() if k != 'reasoning_steps'}
  )
  ):
@@ -1677,6 +1678,8 @@ Output MUST be JSON with 'reflection' and 'satisfactory'.
  temperature=temperature,
  stream=stream,
  response_format={"type": "json_object"},
+ output_json=output_json,
+ output_pydantic=output_pydantic,
  **{k:v for k,v in kwargs.items() if k != 'reasoning_steps'}
  )
  ):
@@ -1695,15 +1698,17 @@ Output MUST be JSON with 'reflection' and 'satisfactory'.
  )

  if satisfactory and reflection_count >= min_reflect - 1:
- if verbose:
+ if verbose and not interaction_displayed:
  display_interaction(prompt, response_text, markdown=markdown,
  generation_time=time.time() - start_time, console=console)
+ interaction_displayed = True
  return response_text

  if reflection_count >= max_reflect - 1:
- if verbose:
+ if verbose and not interaction_displayed:
  display_interaction(prompt, response_text, markdown=markdown,
  generation_time=time.time() - start_time, console=console)
+ interaction_displayed = True
  return response_text

  reflection_count += 1
@@ -1790,67 +1795,6 @@ Output MUST be JSON with 'reflection' and 'satisfactory'.

  litellm.callbacks = events

- def _handle_ollama_model(self, response_text: str, tool_results: List[Any], messages: List[Dict], original_prompt: Union[str, List[Dict]]) -> Optional[Dict[str, Any]]:
- """
- Handle special Ollama model requirements when processing tool results.
-
- Args:
- response_text: The initial response text from the model
- tool_results: List of tool execution results
- messages: The conversation messages list
- original_prompt: The original user prompt
-
- Returns:
- Dict with follow-up parameters if Ollama needs special handling, None otherwise
- """
- if not self._is_ollama_provider() or not tool_results:
- return None
-
- # Check if the response is just a JSON tool call
- try:
- json_response = json.loads(response_text.strip() if response_text else "{}")
- if not (('name' in json_response or 'function' in json_response) and
- not any(word in response_text.lower() for word in ['summary', 'option', 'result', 'found'])):
- return None
-
- logging.debug("Detected Ollama returning only tool call JSON, preparing follow-up call to process results")
-
- # Extract the original user query from messages
- original_query = ""
- for msg in reversed(messages): # Look from the end to find the most recent user message
- if msg.get("role") == "user":
- content = msg.get("content", "")
- # Handle list content (multimodal)
- if isinstance(content, list):
- for item in content:
- if isinstance(item, dict) and item.get("type") == "text":
- original_query = item.get("text", "")
- break
- else:
- original_query = content
- if original_query:
- break
-
- # Create a shorter follow-up prompt with all tool results
- # If there's only one result, use it directly; otherwise combine them
- if len(tool_results) == 1:
- results_text = json.dumps(tool_results[0], indent=2)
- else:
- results_text = json.dumps(tool_results, indent=2)
-
- follow_up_prompt = f"Results:\n{results_text}\nProvide Answer to this Original Question based on the above results: '{original_query}'"
- logging.debug(f"[OLLAMA_DEBUG] Original query extracted: {original_query}")
- logging.debug(f"[OLLAMA_DEBUG] Follow-up prompt: {follow_up_prompt[:200]}...")
-
- # Return parameters for follow-up call
- return {
- "follow_up_messages": [{"role": "user", "content": follow_up_prompt}],
- "original_prompt": original_prompt
- }
-
- except (json.JSONDecodeError, KeyError):
- # Not a JSON response or not a tool call format
- return None

  def _build_completion_params(self, **override_params) -> Dict[str, Any]:
  """Build parameters for litellm completion calls with all necessary config"""
@@ -1895,11 +1839,33 @@ Output MUST be JSON with 'reflection' and 'satisfactory'.
  # Override with any provided parameters
  params.update(override_params)

+ # Handle structured output parameters
+ output_json = override_params.get('output_json')
+ output_pydantic = override_params.get('output_pydantic')
+
+ if output_json or output_pydantic:
+ # Always remove these from params as they're not native litellm parameters
+ params.pop('output_json', None)
+ params.pop('output_pydantic', None)
+
+ # Check if this is a Gemini model that supports native structured outputs
+ if self._is_gemini_model():
+ from .model_capabilities import supports_structured_outputs
+ schema_model = output_json or output_pydantic
+
+ if schema_model and hasattr(schema_model, 'model_json_schema') and supports_structured_outputs(self.model):
+ schema = schema_model.model_json_schema()
+
+ # Gemini uses response_mime_type and response_schema
+ params['response_mime_type'] = 'application/json'
+ params['response_schema'] = schema
+
+ logging.debug(f"Using Gemini native structured output with schema: {json.dumps(schema, indent=2)}")
+
  # Add tool_choice="auto" when tools are provided (unless already specified)
  if 'tools' in params and params['tools'] and 'tool_choice' not in params:
  # For Gemini models, use tool_choice to encourage tool usage
- # More comprehensive Gemini model detection
- if any(prefix in self.model.lower() for prefix in ['gemini', 'gemini/', 'google/gemini']):
+ if self._is_gemini_model():
  try:
  import litellm
  # Check if model supports function calling before setting tool_choice
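
To tie the structured-output changes together, a hedged sketch of how the new branch in `_build_completion_params` maps a Pydantic model onto Gemini's `response_mime_type`/`response_schema` parameters; the `Answer` model and the `params` dict are illustrative only:

    import json
    from pydantic import BaseModel

    class Answer(BaseModel):  # illustrative model, not part of the package
        title: str
        score: float

    params = {"model": "gemini/gemini-1.5-flash", "output_pydantic": Answer}

    output_json = params.pop("output_json", None)
    output_pydantic = params.pop("output_pydantic", None)
    schema_model = output_json or output_pydantic
    if schema_model is not None and hasattr(schema_model, "model_json_schema"):
        schema = schema_model.model_json_schema()
        # Ask for JSON output and attach the schema, as the Gemini branch does
        params["response_mime_type"] = "application/json"
        params["response_schema"] = schema

    print(json.dumps(params["response_schema"], indent=2))

Non-Gemini models keep the existing behaviour: `output_json`/`output_pydantic` are stripped from the completion params and the JSON schema is injected into the prompt instead.
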