ziya 0.3.0-py3-none-any.whl → 0.3.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

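The comparison can be reproduced locally if needed. The sketch below is illustrative only and is not part of either release: it assumes both wheels have been fetched (for example with `pip download ziya==0.3.0 --no-deps` and `pip download ziya==0.3.2 --no-deps`, which should yield files named ziya-0.3.0-py3-none-any.whl and ziya-0.3.2-py3-none-any.whl) and uses only the standard-library zipfile and difflib modules to diff a single archive member such as app/server.py.

    import difflib
    import zipfile

    def wheel_member(path: str, member: str) -> list[str]:
        # Wheels are ordinary zip archives, so any member can be read directly.
        with zipfile.ZipFile(path) as wheel:
            return wheel.read(member).decode("utf-8", errors="replace").splitlines(keepends=True)

    # The wheel filenames below are assumptions based on the versions being compared.
    old = wheel_member("ziya-0.3.0-py3-none-any.whl", "app/server.py")
    new = wheel_member("ziya-0.3.2-py3-none-any.whl", "app/server.py")
    print("".join(difflib.unified_diff(old, new, "0.3.0/app/server.py", "0.3.2/app/server.py")))
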
Potentially problematic release: this version of ziya might be problematic.

Files changed (73)
  1. app/agents/agent.py +71 -73
  2. app/agents/direct_streaming.py +1 -1
  3. app/agents/prompts.py +1 -1
  4. app/agents/prompts_manager.py +14 -10
  5. app/agents/wrappers/google_direct.py +31 -1
  6. app/agents/wrappers/nova_tool_execution.py +2 -2
  7. app/agents/wrappers/nova_wrapper.py +1 -1
  8. app/agents/wrappers/ziya_bedrock.py +53 -31
  9. app/config/models_config.py +61 -20
  10. app/config/shell_config.py +5 -1
  11. app/extensions/prompt_extensions/claude_extensions.py +27 -5
  12. app/extensions/prompt_extensions/mcp_prompt_extensions.py +82 -56
  13. app/main.py +5 -3
  14. app/mcp/client.py +19 -10
  15. app/mcp/manager.py +68 -10
  16. app/mcp/tools.py +8 -9
  17. app/mcp_servers/shell_server.py +3 -3
  18. app/middleware/streaming.py +29 -41
  19. app/routes/file_validation.py +35 -0
  20. app/routes/mcp_routes.py +54 -8
  21. app/server.py +525 -614
  22. app/streaming_tool_executor.py +748 -137
  23. app/templates/asset-manifest.json +20 -20
  24. app/templates/index.html +1 -1
  25. app/templates/static/css/{main.0297bfee.css → main.e7109b49.css} +2 -2
  26. app/templates/static/css/main.e7109b49.css.map +1 -0
  27. app/templates/static/js/14386.65fcfe53.chunk.js +2 -0
  28. app/templates/static/js/14386.65fcfe53.chunk.js.map +1 -0
  29. app/templates/static/js/35589.0368973a.chunk.js +2 -0
  30. app/templates/static/js/35589.0368973a.chunk.js.map +1 -0
  31. app/templates/static/js/{50295.ab92f61b.chunk.js → 50295.90aca393.chunk.js} +3 -3
  32. app/templates/static/js/50295.90aca393.chunk.js.map +1 -0
  33. app/templates/static/js/55734.5f0fd567.chunk.js +2 -0
  34. app/templates/static/js/55734.5f0fd567.chunk.js.map +1 -0
  35. app/templates/static/js/58542.57fed736.chunk.js +2 -0
  36. app/templates/static/js/58542.57fed736.chunk.js.map +1 -0
  37. app/templates/static/js/{68418.2554bb1e.chunk.js → 68418.f7b4d2d9.chunk.js} +3 -3
  38. app/templates/static/js/68418.f7b4d2d9.chunk.js.map +1 -0
  39. app/templates/static/js/99948.b280eda0.chunk.js +2 -0
  40. app/templates/static/js/99948.b280eda0.chunk.js.map +1 -0
  41. app/templates/static/js/main.e075582c.js +3 -0
  42. app/templates/static/js/main.e075582c.js.map +1 -0
  43. app/utils/code_util.py +5 -2
  44. app/utils/context_cache.py +11 -0
  45. app/utils/conversation_filter.py +90 -0
  46. app/utils/custom_bedrock.py +43 -1
  47. app/utils/diff_utils/validation/validators.py +32 -22
  48. app/utils/file_cache.py +5 -3
  49. app/utils/precision_prompt_system.py +116 -0
  50. app/utils/streaming_optimizer.py +100 -0
  51. {ziya-0.3.0.dist-info → ziya-0.3.2.dist-info}/METADATA +3 -2
  52. {ziya-0.3.0.dist-info → ziya-0.3.2.dist-info}/RECORD +59 -55
  53. app/templates/static/css/main.0297bfee.css.map +0 -1
  54. app/templates/static/js/14386.567bf803.chunk.js +0 -2
  55. app/templates/static/js/14386.567bf803.chunk.js.map +0 -1
  56. app/templates/static/js/35589.278ecda2.chunk.js +0 -2
  57. app/templates/static/js/35589.278ecda2.chunk.js.map +0 -1
  58. app/templates/static/js/50295.ab92f61b.chunk.js.map +0 -1
  59. app/templates/static/js/55734.90d8bd52.chunk.js +0 -2
  60. app/templates/static/js/55734.90d8bd52.chunk.js.map +0 -1
  61. app/templates/static/js/58542.08fb5cf4.chunk.js +0 -2
  62. app/templates/static/js/58542.08fb5cf4.chunk.js.map +0 -1
  63. app/templates/static/js/68418.2554bb1e.chunk.js.map +0 -1
  64. app/templates/static/js/99948.71670e91.chunk.js +0 -2
  65. app/templates/static/js/99948.71670e91.chunk.js.map +0 -1
  66. app/templates/static/js/main.1d79eac2.js +0 -3
  67. app/templates/static/js/main.1d79eac2.js.map +0 -1
  68. /app/templates/static/js/{50295.ab92f61b.chunk.js.LICENSE.txt → 50295.90aca393.chunk.js.LICENSE.txt} +0 -0
  69. /app/templates/static/js/{68418.2554bb1e.chunk.js.LICENSE.txt → 68418.f7b4d2d9.chunk.js.LICENSE.txt} +0 -0
  70. /app/templates/static/js/{main.1d79eac2.js.LICENSE.txt → main.e075582c.js.LICENSE.txt} +0 -0
  71. {ziya-0.3.0.dist-info → ziya-0.3.2.dist-info}/WHEEL +0 -0
  72. {ziya-0.3.0.dist-info → ziya-0.3.2.dist-info}/entry_points.txt +0 -0
  73. {ziya-0.3.0.dist-info → ziya-0.3.2.dist-info}/licenses/LICENSE +0 -0
app/server.py CHANGED
@@ -39,7 +39,8 @@ from app.agents.agent import model, RetryingChatBedrock, initialize_langserve
  from app.agents.agent import get_or_create_agent, get_or_create_agent_executor, create_agent_chain, create_agent_executor
  from app.agents.agent import update_conversation_state, update_and_return, parse_output
  from langchain_google_genai.chat_models import ChatGoogleGenerativeAIError
- from fastapi.responses import FileResponse, StreamingResponse
+ from fastapi.responses import FileResponse
+ from starlette.responses import StreamingResponse
  from pydantic import BaseModel, Field
 
  # Direct streaming imports
@@ -81,7 +82,7 @@ from app.utils.diff_utils import apply_diff_pipeline
  from app.utils.custom_exceptions import ThrottlingException, ExpiredTokenException
  from app.utils.custom_exceptions import ValidationError
  from app.utils.file_utils import read_file_content
- from app.middleware import RequestSizeMiddleware, ModelSettingsMiddleware, ErrorHandlingMiddleware, HunkStatusMiddleware
+ from app.middleware import RequestSizeMiddleware, ModelSettingsMiddleware, ErrorHandlingMiddleware, HunkStatusMiddleware, StreamingMiddleware
  from app.utils.context_enhancer import initialize_ast_if_enabled
  from fastapi.websockets import WebSocketState
  from app.middleware.continuation import ContinuationMiddleware
@@ -91,141 +92,42 @@ def build_messages_for_streaming(question: str, chat_history: List, files: List,
  Build messages for streaming using the extended prompt template.
  This centralizes message construction to avoid duplication.
  """
-
- from app.agents.prompts_manager import get_extended_prompt, get_model_info_from_config
- from app.agents.agent import get_combined_docs_from_files, _format_chat_history
- from langchain_core.messages import SystemMessage, HumanMessage, AIMessage
-
+ logger.info(f"🔍 FUNCTION_START: build_messages_for_streaming called with {len(files)} files")
+
+ # Always use precision prompt system
+ from app.utils.precision_prompt_system import precision_system
+ from app.agents.prompts_manager import get_model_info_from_config
+
  model_info = get_model_info_from_config()
-
- # Get model_id for MCP guidelines exclusion
- from app.agents.models import ModelManager
- model_id = ModelManager.get_model_id()
-
- # Get MCP context, including endpoint and model_id for extensions
- mcp_context = {
- "model_id": model_id,
- "endpoint": model_info["endpoint"]
- }
- try:
- from app.mcp.manager import get_mcp_manager
- mcp_manager = get_mcp_manager()
- if mcp_manager.is_initialized:
- available_tools = [tool.name for tool in mcp_manager.get_all_tools()]
- mcp_context["mcp_tools_available"] = len(available_tools) > 0
- mcp_context["available_mcp_tools"] = available_tools
- except Exception as e:
- logger.warning(f"Could not get MCP tools: {e}")
-
- # Get file context
- from app.agents.agent import extract_codebase
- file_context = extract_codebase({"config": {"files": files}, "conversation_id": conversation_id})
-
- # Apply post-instructions to the question once here
- from app.utils.post_instructions import PostInstructionManager
- modified_question = PostInstructionManager.apply_post_instructions(
- query=question,
- model_name=model_info["model_name"],
- model_family=model_info["model_family"],
- endpoint=model_info["endpoint"]
- )
-
- # Get the extended prompt and format it properly
- extended_prompt = get_extended_prompt(
- model_name=model_info["model_name"],
- model_family=model_info["model_family"],
- endpoint=model_info["endpoint"],
- context=mcp_context
+ request_path = "/streaming_tools" # Default for streaming
+
+ # Use precision system for 100% equivalence
+ messages = precision_system.build_messages(
+ request_path=request_path,
+ model_info=model_info,
+ files=files,
+ question=question,
+ chat_history=chat_history
  )
-
- # Get available tools for the template
- tools_list = []
- try:
- from app.mcp.manager import get_mcp_manager
- mcp_manager = get_mcp_manager()
- if mcp_manager.is_initialized:
- tools_list = [f"- {tool.name}: {tool.description}" for tool in mcp_manager.get_all_tools()]
- except Exception as e:
- logger.warning(f"Could not get tools for template: {e}")
-
- # Build messages manually to ensure proper conversation history
- messages = []
-
- # Add system message with context
- system_content = extended_prompt.messages[0].prompt.template.format(
- codebase=file_context,
- ast_context="",
- tools="\n".join(tools_list) if tools_list else "No tools available",
- TOOL_SENTINEL_OPEN=TOOL_SENTINEL_OPEN,
- TOOL_SENTINEL_CLOSE=TOOL_SENTINEL_CLOSE
- )
-
- if use_langchain_format:
- messages.append(SystemMessage(content=system_content))
- else:
- messages.append({"role": "system", "content": system_content})
-
- # Add conversation history
- for item in chat_history:
- if isinstance(item, dict):
- role = item.get('type', item.get('role', 'human'))
- content = item.get('content', '')
- elif isinstance(item, (list, tuple)) and len(item) >= 2:
- role, content = item[0], item[1]
- else:
- continue
-
- if role in ['human', 'user']:
- if use_langchain_format:
- messages.append(HumanMessage(content=content))
- else:
- messages.append({"role": "user", "content": content})
- elif role in ['assistant', 'ai']:
- if use_langchain_format:
- messages.append(AIMessage(content=content))
- else:
- messages.append({"role": "assistant", "content": content})
-
- # Add current question
+
+ logger.info(f"🎯 PRECISION_SYSTEM: Built {len(messages)} messages with {len(files)} files preserved")
+
+ # Convert to LangChain format if needed
  if use_langchain_format:
- messages.append(HumanMessage(content=modified_question))
- else:
- messages.append({"role": "user", "content": modified_question})
-
+ from langchain_core.messages import SystemMessage, HumanMessage, AIMessage
+ langchain_messages = []
+ for msg in messages:
+ if isinstance(msg, dict) and "role" in msg:
+ if msg["role"] == "system":
+ langchain_messages.append(SystemMessage(content=msg["content"]))
+ elif msg["role"] == "user":
+ langchain_messages.append(HumanMessage(content=msg["content"]))
+ elif msg["role"] == "assistant":
+ langchain_messages.append(AIMessage(content=msg["content"]))
+ return langchain_messages
+
  return messages
- logger.info("CONTEXT CONSTRUCTION DETAILS:")
- logger.info(f"File context length: {len(file_context)} characters")
- logger.info(f"Modified question length: {len(modified_question)} characters")
- logger.info(f"Chat history items: {len(chat_history)}")
- logger.info(f"Available tools: {len(tools_list)}")
- logger.info(f"MCP tools available: {mcp_context.get('mcp_tools_available', False)}")
-
- # Debug: Check template substitution
- logger.debug("=== TEMPLATE SUBSTITUTION DEBUG ===")
- logger.debug("Template variables being substituted:")
- logger.debug(f"- codebase length: {len(file_context)}")
- logger.debug(f"- question length: {len(modified_question)}")
- logger.debug(f"- chat_history items: {len(_format_chat_history(chat_history))}")
- logger.debug(f"- tools count: {len(tools_list)}")
-
- formatted_messages = extended_prompt.format_messages(
- codebase=file_context,
- question=modified_question,
- chat_history=_format_chat_history(chat_history),
- ast_context="", # Will be enhanced if AST is enabled
- tools="\n".join(tools_list) if tools_list else "No tools available",
- TOOL_SENTINEL_OPEN=TOOL_SENTINEL_OPEN,
- TOOL_SENTINEL_CLOSE=TOOL_SENTINEL_CLOSE
- )
-
- # Debug: Check if template substitution caused duplication
- for i, msg in enumerate(formatted_messages):
- if hasattr(msg, 'content'):
- file_markers_count = msg.content.count('File: ')
- if file_markers_count > 0:
- logger.debug(f"Message {i} after template substitution has {file_markers_count} file markers")
-
- return formatted_messages
+
 
  # Dictionary to track active streaming tasks
  active_streams = {}
@@ -270,7 +172,7 @@ async def chat_endpoint(request: Request):
 
  # Extract data from the request
  messages = body.get('messages', [])
- question = body.get('question', '')
+ question = body.get('question', '') or body.get('message', '') # Check both question and message
  files = body.get('files', [])
  conversation_id = body.get('conversation_id')
 
@@ -285,6 +187,7 @@ async def chat_endpoint(request: Request):
  is_bedrock_deepseek = current_model and 'deepseek' in current_model.lower()
  is_bedrock_openai = current_model and 'openai' in current_model.lower()
  is_google_model = current_model and ('gemini' in current_model.lower() or 'google' in current_model.lower())
+ # Check if direct streaming is enabled globally - use direct streaming by default for Bedrock models like 0.3.1
  use_direct_streaming = is_bedrock_claude or is_bedrock_nova or is_bedrock_deepseek or is_bedrock_openai or is_google_model
 
  logger.info(f"🔍 CHAT_ENDPOINT: Current model = {current_model}, is_bedrock_claude = {is_bedrock_claude}")
@@ -329,18 +232,19 @@ async def chat_endpoint(request: Request):
  elif role in ['assistant', 'ai']:
  chat_history.append({'type': 'ai', 'content': content})
 
- # Format the data for stream_chunks
+ # Format the data for stream_chunks - LangChain expects files at top level
  formatted_body = {
  'question': question,
  'conversation_id': conversation_id,
  'chat_history': chat_history,
+ 'files': files, # LangChain expects files at top level
  'config': {
  'conversation_id': conversation_id,
- 'files': files
+ 'files': files # Also include in config for compatibility
  }
  }
 
- logger.info("[CHAT_ENDPOINT] Calling stream_chunks directly for Bedrock models")
+ logger.info("[CHAT_ENDPOINT] Using StreamingToolExecutor via stream_chunks for unified execution")
 
  return StreamingResponse(
  stream_chunks(formatted_body),
@@ -349,6 +253,10 @@ async def chat_endpoint(request: Request):
  "Cache-Control": "no-cache",
  "Connection": "keep-alive",
  "X-Accel-Buffering": "no",
+ "X-Content-Type-Options": "nosniff",
+ "Transfer-Encoding": "chunked",
+ "X-Nginx-Buffering": "no",
+ "Proxy-Buffering": "off",
  "Access-Control-Allow-Origin": "*",
  "Access-Control-Allow-Methods": "POST, OPTIONS",
  "Access-Control-Allow-Headers": "Content-Type"
@@ -424,6 +332,9 @@ app.add_middleware(
  allow_headers=["*"],
  )
 
+ # Add streaming middleware
+ app.add_middleware(StreamingMiddleware)
+
  # Add request size middleware
  app.add_middleware(
  RequestSizeMiddleware,
@@ -510,6 +421,9 @@ if os.path.exists(static_dir):
  app.mount("/static", StaticFiles(directory=static_dir), name="static")
  logger.info(f"Mounted static files from {static_dir}")
 
+ # Global flag to prevent multiple LangServe initializations
+ _langserve_initialized = False
+
  # Initialize MCP manager on startup
  @app.on_event("startup")
  async def startup_event():
@@ -534,7 +448,7 @@ async def startup_event():
  # Reinitialize the agent chain now that MCP is available
  # Invalidate agent chain cache since MCP tools are now available
  from app.agents.models import ModelManager
- ModelManager.invalidate_agent_chain_cache()
+ # ModelManager.invalidate_agent_chain_cache() # Method doesn't exist
 
  # Initialize secure MCP tools
  from app.mcp.enhanced_tools import get_connection_pool as get_secure_pool
@@ -548,9 +462,10 @@ async def startup_event():
  agent = create_agent_chain(model.get_model())
  agent_executor = create_agent_executor(agent)
 
- # Reinitialize langserve routes with the updated agent
- initialize_langserve(app, agent_executor)
- logger.info("Agent chain reinitialized with MCP tools")
+ # COMPLETELY DISABLED: LangServe routes cause duplicate execution with /api/chat
+ # initialize_langserve(app, agent_executor)
+ # _langserve_initialized = True
+ logger.info("LangServe completely disabled to prevent duplicate execution - using /api/chat only")
  else:
  logger.warning("MCP initialization failed or no servers configured")
  logger.info("MCP manager initialized successfully during startup")
@@ -599,104 +514,9 @@ logger.info("=== END /ziya ROUTES ===")
  # DISABLED: LangServe routes bypass custom streaming and extended context handling
  # add_routes(app, agent_executor, disabled_endpoints=["playground", "stream_log", "stream", "invoke"], path="/ziya")
 
- # Add custom stream_log endpoint for compatibility
- @app.post("/ziya/stream_log")
- async def stream_log_endpoint(request: Request, body: dict):
- """Stream log endpoint with proper diff parameter handling."""
- try:
- # Debug logging
- logger.info("Stream log endpoint request body:")
-
- # Extract and store diff parameter if present
- diff_content = None
- if 'diff' in body:
- diff_content = body['diff']
- # Create a copy of the body without the diff parameter
- body_copy = {k: v for k, v in body.items() if k != 'diff'}
- else:
- body_copy = body
-
- # Extract input from body if present
- if 'input' in body_copy:
- input_data = body_copy['input']
-
- # Get the question from input_data
- question = input_data.get('question', 'EMPTY')
- logger.info(f"Question from input: '{question}'")
-
- # Handle chat_history
- chat_history = input_data.get('chat_history', [])
- if not isinstance(chat_history, list):
- logger.warning(f"Chat history is not a list: {type(chat_history)}")
- chat_history = []
-
- # Log chat history details for debugging
- logger.info(f"Chat history length: {len(chat_history)}")
- for i, msg in enumerate(chat_history):
- if isinstance(msg, dict):
- logger.info(f"Input chat history item {i}: type={msg.get('type', 'unknown')}")
- else:
- logger.info(f"Input chat history item {i}: type={type(msg)}")
-
- input_data['chat_history'] = chat_history
-
- # Handle config and files
- config = input_data.get('config', {})
- files = []
- if isinstance(config, dict):
- files = config.get("files", [])
- elif isinstance(config, list):
- logger.warning("Config is a list, assuming it's the files list")
- files = config
-
- if not isinstance(files, list):
- logger.warning(f"Files is not a list: {type(files)}")
- files = []
-
- # Count string files for summary logging
- string_file_count = sum(1 for f in files if isinstance(f, str))
- if string_file_count > 0:
- logger.info(f"Files count: {len(files)} ({string_file_count} are strings)")
- else:
- logger.info(f"Files count: {len(files)}")
- # Don't log individual file details here - too verbose
-
- # Update input_data with normalized values
- input_data['chat_history'] = chat_history
- input_data['config'] = {'files': files} if isinstance(config, list) else config
-
- # Ensure we use the current question from input_data
- input_data['question'] = question
- body_copy = input_data
-
- # Use direct streaming with StreamingResponse
- return StreamingResponse(
- stream_chunks(body_copy),
- media_type="text/event-stream",
- headers={
- "Cache-Control": "no-cache",
- "Connection": "keep-alive",
- "Access-Control-Allow-Origin": "*",
- "Access-Control-Allow-Headers": "*",
- "Content-Type": "text/event-stream"
- }
- )
- except Exception as e:
- logger.error(f"Error in stream_log_endpoint: {str(e)}")
- # Return error as streaming response
- error_json = json.dumps({"error": str(e)})
- return StreamingResponse(
- (f"data: {error_json}\n\ndata: {json.dumps({'done': True})}\n\n" for _ in range(1)),
- media_type="text/event-stream",
- headers={
- "Cache-Control": "no-cache",
- "Connection": "keep-alive",
- "Access-Control-Allow-Origin": "*",
- "Access-Control-Allow-Headers": "*",
- "Content-Type": "text/event-stream"
- }
- )
-
+ # DISABLED: Manual /ziya endpoints conflict with /api/chat
+ # @app.post("/ziya/stream_log")
+ # async def stream_log_endpoint(request: Request, body: dict):
  async def cleanup_stream(conversation_id: str):
  """Clean up resources when a stream ends or is aborted."""
  if conversation_id in active_streams:
@@ -985,10 +805,11 @@ async def handle_continuation(continuation_state: Dict[str, Any]):
 
  # Add a marker for the continuation start
  continuation_start_marker = "**📝 Continuing from previous response...**\n\n"
- yield f"data: {json.dumps({'ops': [{'op': 'add', 'path': '/streamed_output_str/-', 'value': continuation_start_marker}]})}\n\n"
+ yield f"data: {json.dumps({'content': continuation_start_marker})}\n\n"
 
- # Add continuation prompt
- updated_messages.append(HumanMessage(content=continuation_prompt))
+ # Add continuation prompt with tool execution context
+ continuation_prompt_with_context = f"{continuation_prompt}\n\nIMPORTANT: Do not simulate or hallucinate tool calls. Only use actual tool execution when needed."
+ updated_messages.append(HumanMessage(content=continuation_prompt_with_context))
 
  # Stream continuation with clean buffer
  async for chunk in stream_continuation(updated_messages, continuation_state):
@@ -997,8 +818,7 @@ async def handle_continuation(continuation_state: Dict[str, Any]):
  except Exception as e:
  logger.error(f"🔄 CONTINUATION: Error in continuation {continuation_id}: {e}")
  # Yield error and complete the stream
- error_chunk = {"op": "add", "path": "/streamed_output_str/-", "value": f"\n\n*[Continuation error: {str(e)}]*"}
- yield f"data: {json.dumps({'ops': [error_chunk]})}\n\n"
+ yield f"data: {json.dumps({'error': f'Continuation error: {str(e)}'})}\n\n"
  finally:
  # Clean up continuation state
  with _continuation_lock:
@@ -1038,8 +858,7 @@ async def stream_continuation(messages: List, continuation_state: Dict[str, Any]
  content_str = str(content) if content else ""
 
  if content_str:
- ops = [{"op": "add", "path": "/streamed_output_str/-", "value": content_str}]
- yield f"data: {json.dumps({'ops': ops})}\n\n"
+ yield f"data: {json.dumps({'content': content_str})}\n\n"
 
  yield f"data: {json.dumps({'done': True})}\n\n"
 
@@ -1053,22 +872,126 @@ async def stream_chunks(body):
  logger.error("🔍 EXECUTION_TRACE: stream_chunks() called - ENTRY POINT")
  logger.info("🔍 STREAM_CHUNKS: Function called")
 
- # Dynamic check for direct streaming (not relying on import-time config)
- import os
- use_direct_streaming = os.getenv('ZIYA_USE_DIRECT_STREAMING', 'true').lower() == 'true'
+ # Temporarily reduce context to test tool execution
+ if body.get("question") and "distribution by file type" in body.get("question", "").lower():
+ logger.info("🔍 TEMP: Reducing context for tool execution test")
+ if "config" in body and "files" in body["config"]:
+ body["config"]["files"] = [] # Skip file context to avoid throttling
 
- # FORCE DIRECT STREAMING - Claude should use direct streaming, not LangChain
+ # Restore 0.3.0 direct streaming behavior
  use_direct_streaming = True
- logger.debug(f"🔍 STREAM_CHUNKS: FORCED use_direct_streaming = {use_direct_streaming}")
+
+ logger.debug(f"🔍 STREAM_CHUNKS: use_direct_streaming = {use_direct_streaming}")
 
  logger.info(f"🚀 DIRECT_STREAMING: Environment check = {use_direct_streaming}")
- logger.info(f"🚀 DIRECT_STREAMING: Import-time config = {USE_DIRECT_STREAMING}")
  logger.info(f"🚀 DIRECT_STREAMING: ZIYA_USE_DIRECT_STREAMING env var = '{os.getenv('ZIYA_USE_DIRECT_STREAMING', 'NOT_SET')}'")
 
  # Check if we should use direct streaming
  if use_direct_streaming:
- logger.info("🚀 DIRECT_STREAMING: Using DirectStreamingAgent")
- from app.agents.direct_streaming import DirectStreamingAgent
+ logger.info("🚀 DIRECT_STREAMING: Using StreamingToolExecutor for direct streaming")
+ logger.info(f"🔍 REQUEST_DEBUG: body keys = {list(body.keys())}")
+ logger.info(f"🔍 REQUEST_DEBUG: body = {body}")
+
+ # Extract data from body for StreamingToolExecutor
+ question = body.get("question", "")
+ chat_history = body.get("chat_history", [])
+ files = body.get("config", {}).get("files", [])
+ conversation_id = body.get("conversation_id")
+
+ logger.info(f"🔍 DIRECT_STREAMING_DEBUG: question='{question}', chat_history={len(chat_history)}, files={len(files)}")
+
+ if question:
+ try:
+ from app.streaming_tool_executor import StreamingToolExecutor
+ from app.agents.models import ModelManager
+
+ # Get current model state
+ state = ModelManager.get_state()
+ current_region = state.get('aws_region', 'us-east-1')
+ aws_profile = state.get('aws_profile', 'default')
+ endpoint = os.environ.get("ZIYA_ENDPOINT", "bedrock")
+
+ # Only use StreamingToolExecutor for Bedrock models
+ if endpoint != 'bedrock':
+ logger.info(f"🚀 DIRECT_STREAMING: Endpoint {endpoint} not supported by StreamingToolExecutor, falling back to LangChain")
+ raise ValueError(f"StreamingToolExecutor only supports bedrock endpoint, got {endpoint}")
+
+ logger.info(f"🔍 DIRECT_STREAMING_DEBUG: About to call build_messages_for_streaming with {len(files)} files")
+ # Build messages with full context using the same function as LangChain path - use langchain format like 0.3.0
+ logger.info(f"🔍 CALLING_BUILD_MESSAGES: About to call build_messages_for_streaming")
+ messages = build_messages_for_streaming(question, chat_history, files, conversation_id, use_langchain_format=True)
+ logger.info(f"🔍 DIRECT_STREAMING_PATH: Built {len(messages)} messages with full context")
+
+ # Debug the system message content
+ if messages and hasattr(messages[0], 'content'):
+ system_content_length = len(messages[0].content)
+ logger.info(f"🔍 DIRECT_STREAMING_DEBUG: System message length = {system_content_length}")
+ logger.info(f"🔍 DIRECT_STREAMING_DEBUG: System message preview = {messages[0].content[:200]}...")
+
+ executor = StreamingToolExecutor(profile_name=aws_profile, region=current_region)
+ logger.info(f"🚀 DIRECT_STREAMING: Created StreamingToolExecutor with profile={aws_profile}, region={current_region}")
+
+ # Send initial heartbeat
+ yield f"data: {json.dumps({'heartbeat': True, 'type': 'heartbeat'})}\n\n"
+
+ chunk_count = 0
+ async for chunk in executor.stream_with_tools(messages, conversation_id=conversation_id):
+ chunk_count += 1
+
+ # Convert to expected format and yield all chunk types
+ if chunk.get('type') == 'text':
+ content = chunk.get('content', '')
+ yield f"data: {json.dumps({'content': content})}\n\n"
+ elif chunk.get('type') == 'tool_start':
+ # Stream tool start notification
+ yield f"data: {json.dumps({'tool_start': chunk})}\n\n"
+ elif chunk.get('type') == 'tool_display':
+ logger.info(f"🔍 TOOL_DISPLAY: {chunk.get('tool_name')} completed")
+ # Stream tool result
+ yield f"data: {json.dumps({'tool_result': chunk})}\n\n"
+ elif chunk.get('type') == 'tool_execution': # Legacy support
+ logger.info(f"🔍 TOOL_EXECUTION (legacy): {chunk.get('tool_name')} completed")
+ elif chunk.get('type') == 'stream_end':
+ break
+ elif chunk.get('type') == 'error':
+ yield f"data: {json.dumps({'error': chunk.get('content', 'Unknown error')})}\n\n"
+ elif chunk.get('type') == 'tool_result_for_model':
+ # Don't stream to frontend - this is for model conversation only
+ logger.debug(f"Tool result for model conversation: {chunk.get('tool_use_id')}")
+ elif chunk.get('type') == 'iteration_continue':
+ # Send heartbeat to flush stream before next iteration
+ yield f"data: {json.dumps({'heartbeat': True, 'type': 'heartbeat'})}\n\n"
+ else:
+ logger.debug(f"Unknown chunk type: {chunk.get('type')}")
+
+ # Always send done message at the end
+ yield f"data: {json.dumps({'done': True})}\n\n"
+
+ logger.info(f"🚀 DIRECT_STREAMING: Completed streaming with {chunk_count} chunks")
+ return
+
+ except ValueError as ve:
+ # Expected error for non-Bedrock endpoints - fall through to LangChain silently
+ logger.info(f"🚀 DIRECT_STREAMING: {ve} - falling back to LangChain")
+ except Exception as e:
+ import traceback
+ error_details = traceback.format_exc()
+ logger.error(f"🚀 DIRECT_STREAMING: Error in StreamingToolExecutor: {e}")
+ logger.error(f"🚀 DIRECT_STREAMING: Full traceback:\n{error_details}")
+ # Fall through to LangChain path
+
+ logger.info("🚀 DIRECT_STREAMING: No question found or error occurred, falling back to LangChain")
+
+ # Build messages properly for non-Bedrock models
+ question = body.get("question", "")
+ chat_history = body.get("chat_history", [])
+ files = body.get("config", {}).get("files", [])
+ conversation_id = body.get("conversation_id")
+
+ if question:
+ messages = build_messages_for_streaming(question, chat_history, files, conversation_id, use_langchain_format=True)
+ logger.info(f"🔍 LANGCHAIN_PATH: Built {len(messages)} messages for non-Bedrock model")
+ else:
 
  # Extract messages from body
  messages = []
@@ -1134,7 +1057,27 @@ async def stream_chunks(body):
 
  # Format the system message
  formatted_system_content = system_content.replace('{codebase}', codebase_content)
- formatted_system_content = formatted_system_content.replace('{tools}', 'MCP tools available')
+
+ # Check if MCP is actually enabled and has tools
+ mcp_tools_text = "No tools available"
+ # Check if MCP is enabled before loading tools
+ if os.environ.get("ZIYA_ENABLE_MCP", "true").lower() in ("true", "1", "yes"):
+ try:
+ mcp_manager = get_mcp_manager()
+ if mcp_manager.is_initialized:
+ available_tools = mcp_manager.get_all_tools()
+ if available_tools:
+ mcp_tools_text = f"MCP tools available: {', '.join([tool.name for tool in available_tools])}"
+ else:
+ mcp_tools_text = "MCP initialized but no tools available"
+ else:
+ mcp_tools_text = "MCP tools disabled"
+ except Exception as e:
+ mcp_tools_text = "MCP tools unavailable"
+ else:
+ mcp_tools_text = "MCP tools disabled"
+
+ formatted_system_content = formatted_system_content.replace('{tools}', mcp_tools_text)
 
  messages.append({'type': 'system', 'content': formatted_system_content})
 
@@ -1164,66 +1107,72 @@ async def stream_chunks(body):
  logger.debug(f"First message type: {messages[0].get('type', 'unknown')}")
  logger.debug(f"System message length: {len(messages[0].get('content', '')) if messages[0].get('type') == 'system' else 'N/A'}")
  # Create DirectStreamingAgent and stream
- try:
- agent = DirectStreamingAgent()
-
- chunk_count = 0
- tool_results_attempted = 0
- total_data_sent = 0
-
- # Get available tools to pass to the agent
- from app.mcp.enhanced_tools import create_secure_mcp_tools
- mcp_tools = create_secure_mcp_tools()
- logger.info(f"🚀 DIRECT_STREAMING: Passing {len(mcp_tools)} tools to DirectStreamingAgent")
-
- async for chunk in agent.stream_with_tools(messages, tools=mcp_tools, conversation_id=body.get('conversation_id')):
- chunk_count += 1
-
- if chunk.get('type') == 'tool_execution':
- tool_results_attempted += 1
- logger.info(f"🔍 ATTEMPTING_TOOL_TRANSMISSION: #{tool_results_attempted} - {chunk.get('tool_name')}")
-
- # DEBUGGING: Test JSON serialization before transmission
- try:
- test_json = json.dumps(chunk)
- json_size = len(test_json)
- logger.info(f"🔍 JSON_SERIALIZATION: {chunk.get('tool_name')} serialized to {json_size} chars")
-
- if json_size > 100000: # 100KB
- logger.warning(f"🔍 LARGE_JSON_PAYLOAD: {chunk.get('tool_name')} JSON is {json_size} chars")
- if json_size > 1000000: # 1MB
- logger.error(f"🔍 JSON_TOO_LARGE: {chunk.get('tool_name')} JSON is {json_size} chars - may break transmission")
-
- except Exception as json_error:
- logger.error(f"🔍 JSON_SERIALIZATION_FAILED: {chunk.get('tool_name')} failed to serialize: {json_error}")
- continue # Skip this chunk
-
- sse_data = f"data: {json.dumps(chunk)}\n\n"
- chunk_size = len(sse_data)
- total_data_sent += chunk_size
-
- # Log large chunks or tool results
- if chunk.get('type') == 'tool_execution' or chunk_size > 1000:
- logger.info(f"🔍 CHUNK_TRANSMISSION: chunk #{chunk_count}, type={chunk.get('type')}, size={chunk_size}, total_sent={total_data_sent}")
- if chunk.get('type') == 'tool_execution':
- logger.info(f"🔍 TOOL_CHUNK: tool_name={chunk.get('tool_name')}, result_size={len(chunk.get('result', ''))}")
-
- yield sse_data
-
- yield "data: [DONE]\n\n"
- return
- except CredentialRetrievalError as e:
- # Handle credential errors (including mwinit failures) with proper SSE error response
- from app.utils.error_handlers import handle_streaming_error
- async for error_chunk in handle_streaming_error(None, e):
- yield error_chunk
- return
- except ValueError as e:
- if "OpenAI models should use LangChain path" in str(e):
- logger.info("🚀 DIRECT_STREAMING: OpenAI model detected, falling back to LangChain path")
- # Fall through to LangChain path below
- else:
- raise
+ # try:
+ # agent = DirectStreamingAgent()
+ #
+ # chunk_count = 0
+ # tool_results_attempted = 0
+ # total_data_sent = 0
+ #
+ # # Get available tools to pass to the agent
+ # from app.mcp.enhanced_tools import create_secure_mcp_tools
+ # mcp_tools = create_secure_mcp_tools()
+ # logger.info(f"🚀 DIRECT_STREAMING: Passing {len(mcp_tools)} tools to DirectStreamingAgent")
+ #
+ # async for chunk in agent.stream_with_tools(messages, tools=mcp_tools, conversation_id=body.get('conversation_id')):
+ # chunk_count += 1
+ #
+ # if chunk.get('type') == 'tool_execution':
+ # tool_results_attempted += 1
+ # logger.info(f"🔍 ATTEMPTING_TOOL_TRANSMISSION: #{tool_results_attempted} - {chunk.get('tool_name')}")
+ #
+ # # DEBUGGING: Test JSON serialization before transmission
+ # try:
+ # test_json = json.dumps(chunk)
+ # json_size = len(test_json)
+ # logger.info(f"🔍 JSON_SERIALIZATION: {chunk.get('tool_name')} serialized to {json_size} chars")
+ #
+ # if json_size > 100000: # 100KB
+ # logger.warning(f"🔍 LARGE_JSON_PAYLOAD: {chunk.get('tool_name')} JSON is {json_size} chars")
+ # if json_size > 1000000: # 1MB
+ # logger.error(f"🔍 JSON_TOO_LARGE: {chunk.get('tool_name')} JSON is {json_size} chars - may break transmission")
+ #
+ # except Exception as json_error:
+ # logger.error(f"🔍 JSON_SERIALIZATION_FAILED: {chunk.get('tool_name')} failed to serialize: {json_error}")
+ # continue # Skip this chunk
+ #
+ # sse_data = f"data: {json.dumps(chunk)}\n\n"
+ # chunk_size = len(sse_data)
+ # total_data_sent += chunk_size
+ #
+ # # Log large chunks or tool results
+ # if chunk.get('type') == 'tool_execution' or chunk_size > 1000:
+ # logger.info(f"🔍 CHUNK_TRANSMISSION: chunk #{chunk_count}, type={chunk.get('type')}, size={chunk_size}, total_sent={total_data_sent}")
+ # if chunk.get('type') == 'tool_execution':
+ # logger.info(f"🔍 TOOL_CHUNK: tool_name={chunk.get('tool_name')}, result_size={len(chunk.get('result', ''))}")
+ #
+ # yield sse_data
+ #
+ # # Force immediate delivery for tool results
+ # if chunk.get('type') == 'tool_execution':
+ # import sys
+ # sys.stdout.flush()
+ #
+ # yield "data: [DONE]\n\n"
+ # return
+ # except CredentialRetrievalError as e:
+ # # Handle credential errors (including mwinit failures) with proper SSE error response
+ # from app.utils.error_handlers import handle_streaming_error
+ # async for error_chunk in handle_streaming_error(None, e):
+ # yield error_chunk
+ # return
+ # except ValueError as e:
+ # if "OpenAI models should use LangChain path" in str(e):
+ # logger.info("🚀 DIRECT_STREAMING: OpenAI model detected, falling back to LangChain path")
+ # # Fall through to LangChain path below
+ # else:
+ # raise
+ pass # DirectStreamingAgent disabled
 
  # Check if model should use LangChain path instead of StreamingToolExecutor
  from app.agents.models import ModelManager
@@ -1288,8 +1237,7 @@ async def stream_chunks(body):
  if hasattr(chunk, 'content') and chunk.content:
  content_str = chunk.content
  if content_str:
- ops = [{"op": "add", "path": "/streamed_output_str/-", "value": content_str}]
- yield f"data: {json.dumps({'ops': ops})}\n\n"
+ yield f"data: {json.dumps({'content': content_str})}\n\n"
 
  yield f"data: {json.dumps({'done': True})}\n\n"
  return
@@ -1298,12 +1246,9 @@ async def stream_chunks(body):
  logger.error(f"🚀 DIRECT_STREAMING: Error in OpenAI message construction: {e}")
  # Fall through to regular LangChain path
  else:
- # Use StreamingToolExecutor for other models
- # Extract variables from request body
- question = body.get("question", "")
- chat_history = body.get("chat_history", [])
- config_data = body.get("config", {})
- files = config_data.get("files", [])
+ # DISABLED: Redundant StreamingToolExecutor path - causes duplicate execution
+ logger.info("🚀 DIRECT_STREAMING: Skipping redundant StreamingToolExecutor path - using primary path only")
+ pass
 
  # Debug: Log what we received
  logger.debug(f"Received question: '{question}'")
@@ -1344,43 +1289,42 @@ async def stream_chunks(body):
 
  # Get available tools including MCP tools
  tools = []
- try:
- from app.mcp.manager import get_mcp_manager
- mcp_manager = get_mcp_manager()
- logger.debug(f"MCP manager initialized: {mcp_manager.is_initialized}")
- if mcp_manager.is_initialized:
- # Convert MCP tools to Bedrock format
- mcp_tools = mcp_manager.get_all_tools()
- logger.debug(f"Found {len(mcp_tools)} MCP tools")
- for tool in mcp_tools:
- logger.debug(f"MCP tool: {tool.name}")
- tools.append({
- 'name': tool.name,
- 'description': tool.description,
- 'input_schema': getattr(tool, 'inputSchema', getattr(tool, 'input_schema', {}))
- })
- except Exception as e:
- logger.debug(f"MCP tool loading error: {e}")
- logger.warning(f"Could not get MCP tools: {e}")
+
+ # Check if MCP is enabled before loading tools
+ if not os.environ.get("ZIYA_ENABLE_MCP", "true").lower() in ("true", "1", "yes"):
+ logger.debug("MCP is disabled, no tools will be loaded")
+ else:
+ try:
+ from app.mcp.manager import get_mcp_manager
+ mcp_manager = get_mcp_manager()
+ logger.debug(f"MCP manager initialized: {mcp_manager.is_initialized}")
+ if mcp_manager.is_initialized:
+ # Convert MCP tools to Bedrock format
+ mcp_tools = mcp_manager.get_all_tools()
+ logger.debug(f"Found {len(mcp_tools)} MCP tools")
+ for tool in mcp_tools:
+ logger.debug(f"MCP tool: {tool.name}")
+ tools.append({
+ 'name': tool.name,
+ 'description': tool.description,
+ 'input_schema': getattr(tool, 'inputSchema', getattr(tool, 'input_schema', {}))
+ })
+ except Exception as e:
+ logger.debug(f"MCP tool loading error: {e}")
+ logger.warning(f"Could not get MCP tools: {e}")
 
  # Add shell tool if no MCP tools available
  if not tools:
  logger.debug("No MCP tools found, using shell tool")
- from app.agents.direct_streaming import get_shell_tool_schema
- tools = [get_shell_tool_schema()]
+ # from app.agents.direct_streaming import get_shell_tool_schema
+ # tools = [get_shell_tool_schema()]
+ logger.debug("Shell tool functionality not available")
  else:
  logger.debug(f"Using {len(tools)} tools: {[t['name'] for t in tools]}")
 
- # Stream with proper tool execution
- async for chunk in executor.stream_with_tools(messages, tools):
- # Debug: Log all chunks being yielded
- if chunk.get('type') == 'tool_start':
- logger.info(f"🔧 SERVER: Yielding tool_start chunk: {chunk}")
- elif chunk.get('type') == 'tool_execution':
- logger.info(f"🔧 SERVER: Yielding tool_execution chunk: {chunk.get('tool_name')}")
- yield f"data: {json.dumps(chunk)}\n\n"
-
- # Return after successful streaming
+ # DISABLED: Redundant StreamingToolExecutor call - causes duplicate execution
+ # async for chunk in executor.stream_with_tools(messages, tools):
+ logger.info("🚀 DIRECT_STREAMING: Skipping redundant StreamingToolExecutor call")
  return
 
  except Exception as e:
@@ -1460,16 +1404,9 @@ async def stream_chunks(body):
 
  logger.debug(f"Built {len(messages)} messages for Nova StreamingToolExecutor")
 
- # Use StreamingToolExecutor for Nova
- async for chunk in executor.stream_with_tools(messages):
- # Debug: Log all chunks being yielded
- if chunk.get('type') == 'tool_start':
- logger.info(f"🔧 SERVER_NOVA: Yielding tool_start chunk: {chunk}")
- elif chunk.get('type') == 'tool_execution':
- logger.info(f"🔧 SERVER_NOVA: Yielding tool_execution chunk: {chunk.get('tool_name')}")
- yield f"data: {json.dumps(chunk)}\n\n"
-
- yield f"data: {json.dumps({'done': True})}\n\n"
+ # DISABLED: Redundant Nova StreamingToolExecutor call - causes duplicate execution
+ # async for chunk in executor.stream_with_tools(messages):
+ logger.info("🚀 DIRECT_STREAMING: Skipping redundant Nova StreamingToolExecutor call")
  return
 
  except Exception as e:
@@ -1639,8 +1576,8 @@ async def stream_chunks(body):
  break
 
  if agent_chain:
- logger.info("🔍 STREAM_CHUNKS: Using Google function calling agent")
- # Use Google function calling agent directly
+ logger.info("🔍 STREAM_CHUNKS: Using agent chain with file context")
+ # Use agent chain with proper file context
  try:
  input_data = {
  "question": question,
@@ -1648,7 +1585,7 @@ async def stream_chunks(body):
  "chat_history": chat_history,
  "config": {
  "conversation_id": conversation_id,
- "files": []
+ "files": files # Include the actual files
  }
  }
 
@@ -1656,13 +1593,13 @@ async def stream_chunks(body):
  response_content = result.get("output", "")
 
  # Stream the response
- yield f"data: {json.dumps({'type': 'text', 'content': response_content})}\\n\\n"
- yield f"data: {json.dumps({'type': 'done'})}\\n\\n"
+ yield f"data: {json.dumps({'type': 'text', 'content': response_content})}\n\n"
+ yield f"data: {json.dumps({'type': 'done'})}\n\n"
  return
 
  except Exception as e:
- logger.error(f"Google function calling failed: {e}")
- # Fall back to XML approach
+ logger.error(f"Agent chain failed: {e}")
+ # Fall back to direct model approach
 
  # Use the messages that were already built correctly above with build_messages_for_streaming()
  # Don't rebuild them here - this was causing the context history loss for OpenAI models
@@ -1698,6 +1635,8 @@ async def stream_chunks(body):
 
  token_throttling_retries = 0
  max_token_throttling_retries = 2 # Allow 2 fresh connection attempts
+ within_stream_retries = 0
+ max_within_stream_retries = 3 # Quick retries within same stream first
 
  # Context overflow detection state
  overflow_checked = False
@@ -1714,6 +1653,8 @@ async def stream_chunks(body):
  logger.info(f"🔍 STREAM_CHUNKS: Created {len(mcp_tools)} MCP tools for iteration")
  except Exception as e:
  logger.warning(f"Failed to get MCP tools for iteration: {e}")
+ # Allow tool calls to complete - only stop at the END of tool calls
+ model_with_stop = model_instance.bind(stop=["</TOOL_SENTINEL>"])
  logger.info(f"🔍 STREAM_CHUNKS: model_with_stop type: {type(model_with_stop)}")
 
  # Agent iteration loop for tool execution
@@ -1732,10 +1673,11 @@ async def stream_chunks(body):
 
  current_response = ""
  tool_executed = False
+ tool_execution_completed = False # Initialize the variable
 
  try:
- # Use model with stop sequence for tool detection
- model_to_use = model_with_stop
+ # Use model instance for tool detection
+ model_to_use = model_instance
  logger.info(f"🔍 AGENT ITERATION {iteration}: Available tools: {[tool.name for tool in mcp_tools] if mcp_tools else 'No tools'}")
 
  # Track if we're currently inside a tool call across chunks
@@ -1743,12 +1685,16 @@ async def stream_chunks(body):
  tool_call_buffer = ""
  tool_call_detected = False # Flag to suppress ALL output after tool detection
  pending_tool_execution = False # Flag to indicate we need to execute tools
- buffered_content = "" # Buffer ALL content after tool call detection
- tool_execution_completed = False # Track if we've executed and need model to continue
 
- # Store stream reference for potential closure
- stream_generator = model_to_use.astream(messages, config=config)
- async for chunk in stream_generator:
+ # DISABLED for Bedrock: LangChain streaming path - causes duplicate execution with StreamingToolExecutor
+ # But ENABLED for non-Bedrock endpoints like Google
+ endpoint = os.environ.get("ZIYA_ENDPOINT", "bedrock")
+ if endpoint == "bedrock":
+ logger.info("🚀 DIRECT_STREAMING: LangChain path disabled for Bedrock - using StreamingToolExecutor only")
+ return
+
+ # Stream from model for non-Bedrock endpoints (use simple streaming like 0.3.0)
+ async for chunk in model_instance.astream(messages):
  # Log the actual messages being sent to model on first iteration
  if iteration == 1 and not hasattr(stream_chunks, '_logged_model_input'):
  stream_chunks._logged_model_input = True
@@ -1772,6 +1718,23 @@ async def stream_chunks(body):
  if not connection_active:
  logger.info("Connection lost during agent iteration")
  break
+
+ # Handle dict chunks from DirectGoogleModel
+ if isinstance(chunk, dict):
+ if chunk.get('type') == 'text':
+ content_str = chunk.get('content', '')
+ if content_str:
+ current_response += content_str
+ ops = [{"op": "add", "path": "/streamed_output_str/-", "value": content_str}]
+ yield f"data: {json.dumps({'ops': ops})}\n\n"
+ chunk_count += 1
+ elif chunk.get('type') == 'error':
+ error_msg = chunk.get('content', 'Unknown error')
+ yield f"data: {json.dumps({'error': error_msg})}\n\n"
+ yield f"data: {json.dumps({'done': True})}\n\n"
+ return
+ continue
+
  # Process chunk content - always process chunks, don't check for 'content' attribute first
 
  # Check if this is an error response chunk
@@ -1825,11 +1788,10 @@ async def stream_chunks(body):
  # Stream the completed part
 
  # Add visual marker that continuation is happening
- marker_ops = [{"op": "add", "path": "/streamed_output_str/-", "value": "\n\n---\n**⏳ Response is long, preparing continuation...**\n---\n\n"}]
- yield f"data: {json.dumps({'ops': marker_ops})}\n\n"
+ marker_msg = "\n\n---\\n**⏳ Response is long, preparing continuation...**\\n---\n\n"
+ yield f"data: {json.dumps({'content': marker_msg})}\n\n"
 
- completed_ops = [{"op": "add", "path": "/streamed_output_str/-", "value": overflow_info["completed_response"]}]
- yield f"data: {json.dumps({'ops': completed_ops})}\n\n"
+ yield f"data: {json.dumps({'content': overflow_info['completed_response']})}\n\n"
 
  # Start continuation
  async for continuation_chunk in handle_continuation(overflow_info):
@@ -1853,21 +1815,17 @@ async def stream_chunks(body):
  ops = [{"op": "add", "path": "/reasoning_content/-", "value": reasoning}]
  yield f"data: {json.dumps({'ops': ops})}\n\n"
 
- # AGGRESSIVE: If we see ANY hint of a tool call starting, stop everything
- # Check for tool markers in the content
- if any(marker in content_str for marker in ["<TOOL_SENTINEL>", "</TOOL_SENTINEL>", "<name>mcp_"]):
+ # Check for complete tool calls - need both opening and closing sentinels
+ # and proper structure with name and arguments
+ if ("<TOOL_SENTINEL>" in current_response and
+ "</TOOL_SENTINEL>" in current_response and
+ "<name>" in current_response and
+ "</name>" in current_response and
+ "<arguments>" in current_response and
+ "</arguments>" in current_response):
  tool_call_detected = True
- logger.info(f"🔍 STREAM: AGGRESSIVE STOP - detected tool marker in: {content_str[:50]}")
- # CRITICAL: Break immediately to stop accumulating more chunks
- # This prevents the model from generating multiple tool calls
- logger.info("🔍 STREAM: BREAKING IMMEDIATELY after detecting tool marker")
- # Force close the stream to prevent hanging
- if hasattr(stream_generator, 'aclose'):
- try:
- await stream_generator.aclose()
- except:
- pass
- break # Exit the streaming loop immediately
+ logger.info(f"🔍 STREAM: Complete tool call detected, stopping stream")
+ break
 
  # If we've just executed tools, the model should now be generating the response
  if tool_execution_completed:
  if tool_execution_completed:
@@ -1883,11 +1841,26 @@ async def stream_chunks(body):
1883
1841
  if TOOL_SENTINEL_OPEN in content_str:
1884
1842
  inside_tool_call = True
1885
1843
  tool_call_buffer = ""
1886
- # Stream any content before the tool call, but not the sentinel itself
1844
+ # Stream any content before the tool call
1887
1845
  before_tool = content_str[:content_str.find(TOOL_SENTINEL_OPEN)]
1888
1846
  if before_tool:
1889
- ops = [{"op": "add", "path": "/streamed_output_str/-", "value": before_tool}]
1890
- yield f"data: {json.dumps({'ops': ops})}\n\n"
1847
+ text_msg = {
1848
+ 'type': 'text',
1849
+ 'content': before_tool
1850
+ }
1851
+ yield f"data: {json.dumps(text_msg)}\n\n"
1852
+ import asyncio
1853
+ await asyncio.sleep(0.01) # Longer delay to prevent batching
1854
+
1855
+ # Send tool_start message
1856
+ tool_start_msg = {
1857
+ 'type': 'tool_start',
1858
+ 'message': 'Tool execution starting...'
1859
+ }
1860
+ yield f"data: {json.dumps(tool_start_msg)}\n\n"
1861
+ logger.info("🔍 STREAM: Sent tool_start message to frontend")
1862
+ await asyncio.sleep(0.01) # Delay after tool_start
1863
+
1891
1864
  tool_call_detected = True # Set flag to suppress all further output
1892
1865
  buffered_content = "" # Start buffering from tool call
1893
1866
  logger.info("🔍 STREAM: Entering tool call - suppressing all output")
@@ -1906,7 +1879,13 @@ async def stream_chunks(body):
  continue
 
  else:
- content_str = str(chunk)
+ # Extract content properly from LangChain chunks
+ if hasattr(chunk, 'content'):
+ content_str = chunk.content
+ elif hasattr(chunk, 'text'):
+ content_str = chunk.text
+ else:
+ content_str = ""
  if not content_str:
  continue
 
@@ -1950,73 +1929,40 @@ async def stream_chunks(body):
1950
1929
  # Check if the current code block is a diff block
1951
1930
  is_in_diff_block = in_code_block and '```diff' in current_response
1952
1931
 
1932
+ # Ultra-aggressive tool suppression - catch any fragment that could be part of a tool call
1953
1933
  should_suppress = (
1954
- # Only suppress if not in a diff block AND matches suppression patterns
1955
1934
  not is_in_diff_block and (
1956
1935
  inside_tool_call or
1957
- TOOL_SENTINEL_OPEN in content_str or
1958
- TOOL_SENTINEL_CLOSE in content_str or
1959
- content_str.strip().startswith('<TOOL') or
1960
- content_str.strip().endswith('_call') or
1961
- TOOL_SENTINEL_CLOSE.lstrip('<') in content_str or
1962
- # Only suppress internal tool sentinels, not frontend tool blocks
1963
- '<TOOL_' in content_str or
1964
- '<TOOL_SENTINEL' in content_str or
1965
- '</TOOL_SENTINEL' in content_str or
1966
- # Suppress partial tool sentinel fragments
1967
- 'SENTINEL>' in content_str or
1968
- '<SENTINEL' in content_str or
1969
- '_run' in content_str or # Catch _run fragments
1970
- '_shell' in content_str or # Catch _shell fragments
1971
- '_comman' in content_str or # Catch _command fragments
1972
- '_SENTINEL' in content_str or
1973
- # Suppress JSON fragments that commonly leak
1974
- ('"d": "' in content_str) or # Catch "d": "pwd"
1975
- ('": "' in content_str and current_response.count(TOOL_SENTINEL_OPEN) > 0) or # Catch ": "1"
1976
- ('"pwd"' in content_str and current_response.count(TOOL_SENTINEL_OPEN) > 0) or
1977
- "mcp_" in content_str and ("\"command\"" in content_str or "\"format\"" in content_str or "\"timeout\"" in content_str) or
1978
- # Enhanced tool call detection
1979
- content_str.strip().startswith("mcp_") or
1980
- "mcp_run" in content_str or
1981
- "mcp_get" in content_str or
1982
- # Catch argument patterns (only suppress if inside tool call)
1983
- ("\"comman" in content_str) or # Catch partial "command"
1984
- ("d\": \"" in content_str and tool_call_detected) or # Catch command value
1985
- ("pwd\"" in content_str and tool_call_detected) or # Catch pwd command
1986
- ("timeout" in content_str and tool_call_detected) or # Catch timeout
1987
- (": \"" in content_str and tool_call_detected) or # Catch JSON patterns
1988
- ("\"command\":" in content_str and TOOL_SENTINEL_OPEN in current_response and not tool_executed) or
1989
- ("\"format\":" in content_str and TOOL_SENTINEL_OPEN in current_response and not tool_executed) or
1990
- ("\"timeout\":" in content_str and TOOL_SENTINEL_OPEN in current_response and not tool_executed) or
1991
- # Catch partial tool fragments that leak through
1992
- content_str.strip().endswith(">mcp_") or
1993
- content_str.strip().endswith("1>") or
1994
- # Catch mixed tool content
1995
- ("mcp_" in content_str and any(char in content_str for char in ["\"", ":", "{", "}"])) or
1996
- any(marker in content_str for marker in ["<name>", "</name>", "<arguments>", "</arguments>"]) or
1997
- # NEW: Catch specific leaked fragments we're seeing in frontend
1998
- "_run_shell_command" in content_str or
1999
- "_run_" in content_str or
2000
- "shell_command" in content_str or
2001
- "</name" in content_str or
2002
- "command\":" in content_str or
2003
- "timeout\":" in content_str or
2004
- # Catch JSON-like fragments
2005
- (content_str.strip().startswith('\"') and ('command' in content_str or 'timeout' in content_str)) or
2006
- # Catch partial XML closing tags
2007
- content_str.strip().endswith('</') or
2008
- content_str.strip().startswith('</') or
2009
- # Catch comma-separated values that look like JSON
2010
- (content_str.strip().endswith(',') and ('command' in content_str or 'timeout' in content_str)) or
2011
- # CRITICAL: Suppress ALL content after tool calls but before execution
2012
- # This prevents hallucinated responses from leaking through
2013
- has_pending_tools
1936
+ TOOL_SENTINEL_OPEN in current_response or # If we've seen the start of a tool call anywhere
1937
+ '<TOOL' in content_str or # Catch partial tool sentinels
1938
+ 'TOOL_' in content_str or # Catch fragments like "_modules.\n\n<TOOL_"
1939
+ '</TOOL' in content_str or
1940
+ 'SENTINEL' in content_str or
1941
+ '<name>' in content_str or
1942
+ '</name>' in content_str or
1943
+ '<arguments>' in content_str or
1944
+ '</arguments>' in content_str or
1945
+ 'mcp_run_shell_command' in content_str or
1946
+ 'mcp_get_current_time' in content_str or
1947
+ ('"command"' in content_str and TOOL_SENTINEL_OPEN in current_response) or
1948
+ ('"timeout"' in content_str and TOOL_SENTINEL_OPEN in current_response) or
1949
+ ('find .' in content_str and TOOL_SENTINEL_OPEN in current_response) or
1950
+ # Catch split fragments
1951
+ content_str.strip().endswith('<TOOL') or
1952
+ content_str.strip().endswith('_modules.\n\n<TOOL') or
1953
+ content_str.strip().startswith('_') and TOOL_SENTINEL_OPEN in current_response
2014
1954
  )
2015
1955
  )
2016
-
1956
+
2017
1957
  if not should_suppress:
2018
- ops = [{"op": "add", "path": "/streamed_output_str/-", "value": content_str}]
2019
- yield f"data: {json.dumps({'ops': ops})}\n\n"
1958
+ text_msg = {
1959
+ 'type': 'text',
1960
+ 'content': content_str
1961
+ }
1962
+ yield f"data: {json.dumps(text_msg)}\n\n"
1963
+ # Force task scheduling to ensure individual processing
1964
+ import asyncio
1965
+ await asyncio.sleep(0)
2020
1966
  else:
2021
1967
  logger.debug(f"🔍 AGENT: Suppressed tool call content from frontend")
2022
1968
  # Check for tool calls and execute when model has finished generating them
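Editor's note: the rewritten predicate above collapses dozens of fragment checks into a broader rule: once a tool sentinel has appeared anywhere in the accumulated response, or the current chunk contains any sentinel or argument marker, the chunk is withheld from the frontend, except inside a ```diff block. A compact sketch of that shape, with the sentinel constants written out as assumptions since their real values live elsewhere in the package:

```python
# Editor's sketch of the suppression rule, not the package's exact predicate.
# TOOL_SENTINEL_OPEN is an assumed placeholder for the real constant.
TOOL_SENTINEL_OPEN = "<TOOL_SENTINEL>"

MARKERS = ("<TOOL", "</TOOL", "SENTINEL", "<name>", "</name>",
           "<arguments>", "</arguments>", "mcp_run_shell_command")


def should_suppress(chunk: str, accumulated: str, in_diff_block: bool) -> bool:
    """Hide chunks that belong to an in-flight tool call from the frontend."""
    if in_diff_block:
        return False  # never mangle ```diff blocks the user asked for
    if TOOL_SENTINEL_OPEN in accumulated:
        return True   # a tool call has started; buffer everything after it
    return any(marker in chunk for marker in MARKERS)


assert should_suppress("<TOOL_SENTINEL><name>mcp_run_shell_command</name>", "", False)
assert not should_suppress("Here is the plan:", "", False)
```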
@@ -2075,10 +2021,15 @@ async def stream_chunks(body):
  tool_blocks.append(tool_block)
  start_pos = tool_end + 3

- # Stream tool results to frontend
+ # Stream tool results to frontend with proper message type
  for tool_block in tool_blocks:
- tool_result = "\n" + tool_block + "\n"
- ops = [{"op": "add", "path": "/streamed_output_str/-", "value": tool_result}]
+ tool_result = "\\n" + tool_block + "\\n"
+ tool_execution_msg = {
+ 'type': 'tool_display',
+ 'content': tool_result,
+ 'tool_name': 'mcp_tool'
+ }
+ yield f"data: {json.dumps(tool_execution_msg)}\n\n"
  # Don't send markdown tool blocks when we're using structured tool_execution events
  # The structured events are already handled by the frontend
  logger.info(f"🔍 STREAM: Skipping markdown tool block (using structured events)")
@@ -2118,8 +2069,7 @@ async def stream_chunks(body):
  except Exception as tool_error:
  logger.error(f"🔍 STREAM: Tool execution error: {tool_error}")
  error_msg = f"**Tool Error:** {str(tool_error)}"
- ops = [{"op": "add", "path": "/streamed_output_str/-", "value": error_msg}]
- yield f"data: {json.dumps({'ops': ops})}\n\n"
+ yield f"data: {json.dumps({'content': error_msg})}\n\n"
  tool_executed = True
  tool_call_detected = False
  pending_tool_execution = False
@@ -2146,11 +2096,15 @@ async def stream_chunks(body):
  tool_blocks.append(tool_block)
  start_pos = tool_end + 3

- # Stream ALL tool results to frontend
+ # Stream ALL tool results to frontend with proper message type
  for tool_block in tool_blocks:
- tool_result = "\n" + tool_block + "\n"
- ops = [{"op": "add", "path": "/streamed_output_str/-", "value": tool_result}]
- yield f"data: {json.dumps({'ops': ops})}\n\n"
+ tool_result = "\\n" + tool_block + "\\n"
+ tool_execution_msg = {
+ 'type': 'tool_display',
+ 'content': tool_result,
+ 'tool_name': 'mcp_tool'
+ }
+ yield f"data: {json.dumps(tool_execution_msg)}\n\n"
  logger.info(f"🔍 STREAM: Tool result streamed: {tool_result[:50]}...")

  # Add ALL tool results to conversation context for model continuation
@@ -2195,8 +2149,7 @@ async def stream_chunks(body):
  except Exception as tool_error:
  logger.error(f"🔍 STREAM: Tool execution error: {tool_error}")
  error_msg = f"**Tool Error:** {str(tool_error)}"
- ops = [{"op": "add", "path": "/streamed_output_str/-", "value": error_msg}]
- yield f"data: {json.dumps({'ops': ops})}\n\n"
+ yield f"data: {json.dumps({'content': error_msg})}\n\n"
  tool_executed = True

  logger.info(f"🔍 AGENT: Finished streaming loop for iteration {iteration}")
@@ -2246,11 +2199,15 @@ async def stream_chunks(body):
  tool_blocks.append(tool_block)
  start_pos = tool_end + 3

- # Stream ALL tool results to frontend
+ # Stream ALL tool results to frontend with proper message type
  for tool_block in tool_blocks:
- tool_result = "\n" + tool_block + "\n"
- ops = [{"op": "add", "path": "/streamed_output_str/-", "value": tool_result}]
- yield f"data: {json.dumps({'ops': ops})}\n\n"
+ tool_result = "\\n" + tool_block + "\\n"
+ tool_execution_msg = {
+ 'type': 'tool_display',
+ 'content': tool_result,
+ 'tool_name': 'mcp_tool'
+ }
+ yield f"data: {json.dumps(tool_execution_msg)}\n\n"
  logger.info(f"🔍 STREAM: Tool result streamed: {tool_result[:50]}...")

  # Add ALL tool results to conversation context for model continuation
@@ -2336,13 +2293,79 @@ async def stream_chunks(body):
  logger.error(f"Error in agent iteration {iteration}: {str(e)}", exc_info=True)
  processed_response = current_response # Initialize before use

+ # Handle timeout errors with retry logic
+ error_str = str(e)
+ is_timeout_error = ("Read timeout" in error_str or
+ "ReadTimeoutError" in error_str or
+ "timeout" in error_str.lower())

  # Check for token-based throttling specifically
- error_str = str(e)
  is_token_throttling = ("Too many tokens" in error_str and
  "ThrottlingException" in error_str and
  "reached max retries" in error_str)

+ # Use two-tier retry: first within stream, then new stream
+ if (is_timeout_error or is_token_throttling):
+ # Tier 1: Quick retries within same stream
+ if within_stream_retries < max_within_stream_retries:
+ within_stream_retries += 1
+ wait_time = min(2 ** within_stream_retries, 8) # 2s, 4s, 8s
+ error_type = "timeout" if is_timeout_error else "token throttling"
+
+ logger.info(f"🔄 WITHIN-STREAM: {error_type} retry {within_stream_retries}/{max_within_stream_retries} in {wait_time}s")
+
+ retry_msg = f"\\n🔄 {error_type.title()} detected, retrying in {wait_time}s...\\n"
+ yield f"data: {json.dumps({'content': retry_msg})}\n\n"
+
+ await asyncio.sleep(wait_time)
+
+ # Retry same iteration within stream
+ iteration -= 1
+ if iteration < 1:
+ iteration = 1
+ continue
+
+ # Tier 2: Fresh connection/new stream
+ elif token_throttling_retries < max_token_throttling_retries:
+ token_throttling_retries += 1
+ within_stream_retries = 0 # Reset within-stream counter
+ wait_time = min(10 * (2 ** (token_throttling_retries - 1)), 30) # 10s, 20s, 30s
+ error_type = "timeout" if is_timeout_error else "token throttling"
+
+ logger.info(f"🔄 NEW-STREAM: {error_type} retry {token_throttling_retries}/{max_token_throttling_retries} with fresh connection in {wait_time}s")
+
+ fresh_conn_msg = f"\\n🔄 Starting fresh connection... (attempt {token_throttling_retries}/{max_token_throttling_retries})\\n"
+ yield f"data: {json.dumps({'content': fresh_conn_msg})}\n\n"
+
+ await asyncio.sleep(wait_time)
+
+ # End current stream and trigger new one via recursive call
+ yield f"data: {json.dumps({'retry_with_fresh_stream': True})}\n\n"
+
+ # Start completely new stream
+ async for chunk in stream_chunks(body):
+ yield chunk
+ return
+
+ # Gracefully close stream with error message
+ if is_timeout_error:
+ error_msg = "⚠️ Request timed out. The response may be incomplete."
+ elif is_token_throttling:
+ error_msg = "⚠️ Rate limit exceeded. Please try again in a moment."
+ else:
+ error_msg = f"⚠️ An error occurred: {str(e)}"
+
+ # Send error to client
+ error_content = f"\n\n{error_msg}\\n"
+ yield f"data: {json.dumps({'content': error_content})}\n\n"
+
+ # Send completion signal
+ yield f"data: {json.dumps({'done': True})}\n\n"
+
+ # Clean up and exit gracefully
+ await cleanup_stream(conversation_id)
+ return
+
  if is_token_throttling and token_throttling_retries < max_token_throttling_retries:
  token_throttling_retries += 1
  logger.info(f"🔄 TOKEN_THROTTLING: Detected token throttling in multi-round session, attempt {token_throttling_retries}/{max_token_throttling_retries}")
@@ -2354,8 +2377,8 @@ async def stream_chunks(body):
  "retry_attempt": token_throttling_retries,
  "wait_time": 20
  }
- ops = [{"op": "add", "path": "/streamed_output_str/-", "value": f"\n🔄 Retrying with fresh connection... (attempt {token_throttling_retries}/{max_token_throttling_retries})\n"}]
- yield f"data: {json.dumps({'ops': ops})}\n\n"
+ final_retry_msg = f"\\n🔄 Retrying with fresh connection... (attempt {token_throttling_retries}/{max_token_throttling_retries})\\n"
+ yield f"data: {json.dumps({'content': final_retry_msg})}\n\n"

  # Wait 20 seconds and retry with fresh connection
  await asyncio.sleep(20)
@@ -2372,12 +2395,11 @@ async def stream_chunks(body):
  logger.debug(f"PARTIAL RESPONSE PRESERVED (AGENT ERROR):\n{current_response}")

  # Send the partial content to the frontend
- ops = [{"op": "add", "path": "/streamed_output_str/-", "value": current_response}]
- yield f"data: {json.dumps({'ops': ops})}\n\n"
+ yield f"data: {json.dumps({'content': current_response})}\n\n"

  # Send warning about partial response
- warning_signal = {"op": "add", "path": "/warning", "value": f"Server encountered an error after generating {len(current_response)} characters. The partial response has been preserved."}
- yield f"data: {json.dumps({'ops': [warning_signal]})}\n\n"
+ warning_msg = f"Server encountered an error after generating {len(current_response)} characters. The partial response has been preserved."
+ yield f"data: {json.dumps({'warning': warning_msg})}\n\n"

  full_response = current_response # Ensure it's preserved in full_response

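Editor's note: across this hunk and the earlier ones, the stream drops LangServe's JSON-Patch `ops` envelope in favour of flat JSON payloads: `content` for text, `warning` for non-fatal notices, `done` for completion, and `retry_with_fresh_stream` as the retry signal. A side-by-side sketch of the two framings for the same partial response; neither dict is copied verbatim from the package, but the keys match the added and removed lines above:

```python
# Editor's illustration of the framing change from JSON-Patch ops to flat payloads.
import json

partial = "...partial answer generated before the error..."

# Old: LangServe-style JSON-Patch operation appended to streamed_output_str
old_frame = {"ops": [{"op": "add", "path": "/streamed_output_str/-", "value": partial}]}

# New: flat payloads, one concern per frame
new_frames = [
    {"content": partial},
    {"warning": f"Server encountered an error after generating {len(partial)} characters."},
    {"done": True},
]

print(f"data: {json.dumps(old_frame)}\n")
for frame in new_frames:
    print(f"data: {json.dumps(frame)}\n")
```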
@@ -2456,118 +2478,8 @@ async def stream_chunks(body):
  await cleanup_stream(conversation_id)

  # Override the stream endpoint with our error handling
- @app.post("/ziya/stream")
- async def stream_endpoint(request: Request, body: dict):
- """Stream endpoint with centralized error handling."""
- logger.info(f"🔍 STREAM_ENDPOINT: Direct /ziya/stream called - this should be using stream_chunks")
- logger.info(f"🔍 STREAM_ENDPOINT: Request body keys: {body.keys()}")
-
- # Check for direct streaming mode
- import os
- use_direct_streaming = os.getenv('ZIYA_USE_DIRECT_STREAMING', 'true').lower() == 'true'
- logger.info(f"🚀 DIRECT_STREAMING: stream_endpoint check = {use_direct_streaming}")
-
- if use_direct_streaming:
- logger.info("🚀 DIRECT_STREAMING: Using direct streaming in stream_endpoint")
- return StreamingResponse(
- stream_chunks(body),
- media_type="text/event-stream",
- headers={
- "Cache-Control": "no-cache",
- "Connection": "keep-alive",
- "X-Accel-Buffering": "no",
- "Access-Control-Allow-Origin": "*",
- "Access-Control-Allow-Methods": "POST, OPTIONS",
- "Access-Control-Allow-Headers": "Content-Type"
- }
- )
-
- try:
- # Debug logging
- logger.info("[INSTRUMENTATION] /ziya/stream received request")
- logger.info(f"[INSTRUMENTATION] /ziya/stream question: '{body.get('question', 'EMPTY')[:50]}...' (truncated)")
- logger.info(f"[INSTRUMENTATION] /ziya/stream chat_history length: {len(body.get('chat_history', []))}")
- logger.info(f"[INSTRUMENTATION] /ziya/stream files count: {len(body.get('config', {}).get('files', []))}")
-
- # Log body structure
- logger.info(f"[INSTRUMENTATION] /ziya/stream body keys: {body.keys() if isinstance(body, dict) else type(body)}")
-
- # Log chat history structure if present
- chat_history = body.get('chat_history', [])
- if chat_history and len(chat_history) > 0:
- logger.info(f"[INSTRUMENTATION] /ziya/stream first history item type: {type(chat_history[0])}")
- if isinstance(chat_history[0], list) and len(chat_history[0]) >= 2:
- logger.info(f"[INSTRUMENTATION] /ziya/stream first history format: ['{chat_history[0][0][:20]}...', '{chat_history[0][1][:20]}...'] (truncated)")
- elif isinstance(chat_history[0], dict):
- logger.info(f"[INSTRUMENTATION] /ziya/stream first history keys: {chat_history[0].keys()}")
-
- # Check if the question is empty or missing
- if not body.get("question") or not body.get("question").strip():
- logger.warning("[INSTRUMENTATION] /ziya/stream empty question detected")
- raise ValidationError("Please provide a question to continue.")
-
- # Clean chat history if present
- if "chat_history" in body:
- logger.info(f"[INSTRUMENTATION] /ziya/stream cleaning chat history of length {len(chat_history)}")
- cleaned_history = []
- for pair in body["chat_history"]:
- try:
- # Handle both tuple format [role, content] and dict format {"type": role, "content": content}
- if isinstance(pair, dict) and 'type' in pair and 'content' in pair:
- role, content = pair['type'], pair['content']
- elif isinstance(pair, (list, tuple)) and len(pair) == 2:
- role, content = pair[0], pair[1]
- else:
- logger.warning(f"[INSTRUMENTATION] /ziya/stream invalid chat history pair format: {type(pair)}")
- continue
-
- if not isinstance(role, str) or not isinstance(content, str):
- logger.warning(f"[INSTRUMENTATION] /ziya/stream non-string message: role={type(role)}, content={type(content)}")
- continue
-
- if role.strip() and content.strip():
- cleaned_history.append((role.strip(), content.strip()))
- logger.info(f"[INSTRUMENTATION] /ziya/stream added valid message: role='{role}', content='{content[:20]}...' (truncated)")
- else:
- logger.warning(f"[INSTRUMENTATION] /ziya/stream empty message content")
- except Exception as e:
- logger.error(f"[INSTRUMENTATION] /ziya/stream error processing chat history item: {str(e)}")
-
- logger.info(f"[INSTRUMENTATION] /ziya/stream cleaned chat history from {len(body['chat_history'])} to {len(cleaned_history)} pairs")
- body["chat_history"] = cleaned_history
-
- logger.info("[INSTRUMENTATION] /ziya/stream starting stream endpoint with body size: %d", len(str(body)))
-
- # Convert to ChatPromptValue if needed
- if isinstance(body, dict) and "messages" in body:
- logger.info(f"[INSTRUMENTATION] /ziya/stream converting {len(body['messages'])} messages to ChatPromptValue")
- from langchain_core.prompt_values import ChatPromptValue
- from langchain_core.messages import HumanMessage
- messages = [HumanMessage(content=msg) for msg in body["messages"]]
- prompt_value = ChatPromptValue(messages=messages)
- # Keep body as dict but store the prompt value for later use if needed
- logger.info(f"[INSTRUMENTATION] /ziya/stream created ChatPromptValue with {len(messages)} messages")
-
- # Return the streaming response
- logger.info("[INSTRUMENTATION] /ziya/stream calling stream_chunks()")
- return StreamingResponse(
- stream_chunks(body),
- media_type="text/event-stream",
- headers={
- "Cache-Control": "no-cache",
- "Connection": "keep-alive",
- "X-Accel-Buffering": "no",
- "Access-Control-Allow-Origin": "*",
- "Access-Control-Allow-Methods": "POST, OPTIONS",
- "Access-Control-Allow-Headers": "Content-Type"
- }
- )
- except Exception as e:
- # Handle any exceptions using the centralized error handler
- logger.error(f"Exception in stream_endpoint: {str(e)}")
- return handle_request_exception(request, e)
-
- async def stream_agent_response(body, request):
+ # DISABLED: Manual /ziya/stream endpoint conflicts with /api/chat
+ # @app.post("/ziya/stream")
  """Stream the agent's response with centralized error handling."""
  try:
  first_chunk = True
@@ -2761,28 +2673,30 @@ async def get_folder(request: FolderRequest):
  return {"error": str(e)}

  # Import scan progress from directory_util
- from app.utils.directory_util import get_scan_progress, cancel_scan, _scan_progress
+ # from app.utils.directory_util import get_scan_progress, cancel_scan, _scan_progress

  @app.get("/folder-progress")
  async def get_folder_progress():
  """Get current folder scanning progress."""
- progress = get_scan_progress()
+ # progress = get_scan_progress()
  # Only return active=True if there's actual progress to report
- if progress["active"] and not progress["progress"]:
- # No actual progress data, don't report as active
- progress["active"] = False
- progress["progress"] = {}
- return progress
+ # if progress["active"] and not progress["progress"]:
+ # # No actual progress data, don't report as active
+ # progress["active"] = False
+ # progress["progress"] = {}
+ # return progress
+ return {"active": False, "progress": {}}

  @app.post("/folder-cancel")
  async def cancel_folder_scan():
  """Cancel current folder scanning operation."""
- was_active = cancel_scan()
- if was_active:
- logger.info("Folder scan cancellation requested")
- return {"status": "cancellation_requested"}
- else:
- return {"status": "no_active_scan"}
+ # was_active = cancel_scan()
+ # if was_active:
+ # logger.info("Folder scan cancellation requested")
+ logger.info("Folder scan cancellation not available")
+ return {"cancelled": False}
+
+ @app.post("/file")
  async def get_file(request: FileRequest):
  """Get the content of a file."""
  try:
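Editor's note: with the `directory_util` import commented out, `/folder-progress` and `/folder-cancel` become static stubs. For reference, a self-contained sketch of equivalent stub routes (a throwaway FastAPI app is used here so the example runs on its own, whereas the package registers these handlers on its main app):

```python
# Editor's sketch of the stubbed endpoints shown in the hunk above.
from fastapi import FastAPI

app = FastAPI()


@app.get("/folder-progress")
async def get_folder_progress():
    # Scanning progress is no longer tracked, so always report an idle state.
    return {"active": False, "progress": {}}


@app.post("/folder-cancel")
async def cancel_folder_scan():
    # Cancellation is unavailable without directory_util's scan bookkeeping.
    return {"cancelled": False}
```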
@@ -3569,13 +3483,10 @@ async def set_model(request: SetModelRequest):
  logger.error(f"Failed to create agent: {str(agent_error)}", exc_info=True)
  raise agent_error

- # Reinitialize langserve routes with new agent_executor
- try:
- initialize_langserve(app, agent_executor)
- logger.info("Reinitialized langserve routes")
- except Exception as langserve_error:
- logger.error(f"Failed to initialize langserve: {str(langserve_error)}", exc_info=True)
- raise langserve_error
+ # COMPLETELY DISABLED: LangServe routes cause duplicate execution with /api/chat
+ # initialize_langserve(app, agent_executor)
+ # _langserve_initialized = True
+ logger.info("LangServe completely disabled to prevent duplicate execution - using /api/chat only")

  # Force garbage collection after successful model change
  import gc