lollms-client 0.19.6__tar.gz → 0.19.7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of lollms-client might be problematic.

Files changed (81)
  1. {lollms_client-0.19.6 → lollms_client-0.19.7}/PKG-INFO +1 -1
  2. {lollms_client-0.19.6 → lollms_client-0.19.7}/lollms_client/__init__.py +1 -1
  3. {lollms_client-0.19.6 → lollms_client-0.19.7}/lollms_client/lollms_core.py +145 -147
  4. {lollms_client-0.19.6 → lollms_client-0.19.7}/lollms_client.egg-info/PKG-INFO +1 -1
  5. {lollms_client-0.19.6 → lollms_client-0.19.7}/LICENSE +0 -0
  6. {lollms_client-0.19.6 → lollms_client-0.19.7}/README.md +0 -0
  7. {lollms_client-0.19.6 → lollms_client-0.19.7}/examples/article_summary/article_summary.py +0 -0
  8. {lollms_client-0.19.6 → lollms_client-0.19.7}/examples/deep_analyze/deep_analyse.py +0 -0
  9. {lollms_client-0.19.6 → lollms_client-0.19.7}/examples/deep_analyze/deep_analyze_multiple_files.py +0 -0
  10. {lollms_client-0.19.6 → lollms_client-0.19.7}/examples/function_calling_with_local_custom_mcp.py +0 -0
  11. {lollms_client-0.19.6 → lollms_client-0.19.7}/examples/generate_and_speak/generate_and_speak.py +0 -0
  12. {lollms_client-0.19.6 → lollms_client-0.19.7}/examples/generate_game_sfx/generate_game_fx.py +0 -0
  13. {lollms_client-0.19.6 → lollms_client-0.19.7}/examples/generate_text_with_multihop_rag_example.py +0 -0
  14. {lollms_client-0.19.6 → lollms_client-0.19.7}/examples/internet_search_with_rag.py +0 -0
  15. {lollms_client-0.19.6 → lollms_client-0.19.7}/examples/local_mcp.py +0 -0
  16. {lollms_client-0.19.6 → lollms_client-0.19.7}/examples/personality_test/chat_test.py +0 -0
  17. {lollms_client-0.19.6 → lollms_client-0.19.7}/examples/personality_test/chat_with_aristotle.py +0 -0
  18. {lollms_client-0.19.6 → lollms_client-0.19.7}/examples/personality_test/tesks_test.py +0 -0
  19. {lollms_client-0.19.6 → lollms_client-0.19.7}/examples/simple_text_gen_test.py +0 -0
  20. {lollms_client-0.19.6 → lollms_client-0.19.7}/examples/simple_text_gen_with_image_test.py +0 -0
  21. {lollms_client-0.19.6 → lollms_client-0.19.7}/examples/test_local_models/local_chat.py +0 -0
  22. {lollms_client-0.19.6 → lollms_client-0.19.7}/examples/text_2_audio.py +0 -0
  23. {lollms_client-0.19.6 → lollms_client-0.19.7}/examples/text_2_image.py +0 -0
  24. {lollms_client-0.19.6 → lollms_client-0.19.7}/examples/text_2_image_diffusers.py +0 -0
  25. {lollms_client-0.19.6 → lollms_client-0.19.7}/examples/text_and_image_2_audio.py +0 -0
  26. {lollms_client-0.19.6 → lollms_client-0.19.7}/examples/text_gen.py +0 -0
  27. {lollms_client-0.19.6 → lollms_client-0.19.7}/examples/text_gen_system_prompt.py +0 -0
  28. {lollms_client-0.19.6 → lollms_client-0.19.7}/lollms_client/llm_bindings/__init__.py +0 -0
  29. {lollms_client-0.19.6 → lollms_client-0.19.7}/lollms_client/llm_bindings/llamacpp/__init__.py +0 -0
  30. {lollms_client-0.19.6 → lollms_client-0.19.7}/lollms_client/llm_bindings/lollms/__init__.py +0 -0
  31. {lollms_client-0.19.6 → lollms_client-0.19.7}/lollms_client/llm_bindings/ollama/__init__.py +0 -0
  32. {lollms_client-0.19.6 → lollms_client-0.19.7}/lollms_client/llm_bindings/openai/__init__.py +0 -0
  33. {lollms_client-0.19.6 → lollms_client-0.19.7}/lollms_client/llm_bindings/openllm/__init__.py +0 -0
  34. {lollms_client-0.19.6 → lollms_client-0.19.7}/lollms_client/llm_bindings/pythonllamacpp/__init__.py +0 -0
  35. {lollms_client-0.19.6 → lollms_client-0.19.7}/lollms_client/llm_bindings/tensor_rt/__init__.py +0 -0
  36. {lollms_client-0.19.6 → lollms_client-0.19.7}/lollms_client/llm_bindings/transformers/__init__.py +0 -0
  37. {lollms_client-0.19.6 → lollms_client-0.19.7}/lollms_client/llm_bindings/vllm/__init__.py +0 -0
  38. {lollms_client-0.19.6 → lollms_client-0.19.7}/lollms_client/lollms_config.py +0 -0
  39. {lollms_client-0.19.6 → lollms_client-0.19.7}/lollms_client/lollms_discussion.py +0 -0
  40. {lollms_client-0.19.6 → lollms_client-0.19.7}/lollms_client/lollms_js_analyzer.py +0 -0
  41. {lollms_client-0.19.6 → lollms_client-0.19.7}/lollms_client/lollms_llm_binding.py +0 -0
  42. {lollms_client-0.19.6 → lollms_client-0.19.7}/lollms_client/lollms_mcp_binding.py +0 -0
  43. {lollms_client-0.19.6 → lollms_client-0.19.7}/lollms_client/lollms_python_analyzer.py +0 -0
  44. {lollms_client-0.19.6 → lollms_client-0.19.7}/lollms_client/lollms_stt_binding.py +0 -0
  45. {lollms_client-0.19.6 → lollms_client-0.19.7}/lollms_client/lollms_tti_binding.py +0 -0
  46. {lollms_client-0.19.6 → lollms_client-0.19.7}/lollms_client/lollms_ttm_binding.py +0 -0
  47. {lollms_client-0.19.6 → lollms_client-0.19.7}/lollms_client/lollms_tts_binding.py +0 -0
  48. {lollms_client-0.19.6 → lollms_client-0.19.7}/lollms_client/lollms_ttv_binding.py +0 -0
  49. {lollms_client-0.19.6 → lollms_client-0.19.7}/lollms_client/lollms_types.py +0 -0
  50. {lollms_client-0.19.6 → lollms_client-0.19.7}/lollms_client/lollms_utilities.py +0 -0
  51. {lollms_client-0.19.6 → lollms_client-0.19.7}/lollms_client/mcp_bindings/local_mcp/__init__.py +0 -0
  52. {lollms_client-0.19.6 → lollms_client-0.19.7}/lollms_client/mcp_bindings/local_mcp/default_tools/file_writer/file_writer.py +0 -0
  53. {lollms_client-0.19.6 → lollms_client-0.19.7}/lollms_client/mcp_bindings/local_mcp/default_tools/generate_image_from_prompt/generate_image_from_prompt.py +0 -0
  54. {lollms_client-0.19.6 → lollms_client-0.19.7}/lollms_client/mcp_bindings/local_mcp/default_tools/internet_search/internet_search.py +0 -0
  55. {lollms_client-0.19.6 → lollms_client-0.19.7}/lollms_client/mcp_bindings/local_mcp/default_tools/python_interpreter/python_interpreter.py +0 -0
  56. {lollms_client-0.19.6 → lollms_client-0.19.7}/lollms_client/stt_bindings/__init__.py +0 -0
  57. {lollms_client-0.19.6 → lollms_client-0.19.7}/lollms_client/stt_bindings/lollms/__init__.py +0 -0
  58. {lollms_client-0.19.6 → lollms_client-0.19.7}/lollms_client/stt_bindings/whisper/__init__.py +0 -0
  59. {lollms_client-0.19.6 → lollms_client-0.19.7}/lollms_client/stt_bindings/whispercpp/__init__.py +0 -0
  60. {lollms_client-0.19.6 → lollms_client-0.19.7}/lollms_client/tti_bindings/__init__.py +0 -0
  61. {lollms_client-0.19.6 → lollms_client-0.19.7}/lollms_client/tti_bindings/dalle/__init__.py +0 -0
  62. {lollms_client-0.19.6 → lollms_client-0.19.7}/lollms_client/tti_bindings/diffusers/__init__.py +0 -0
  63. {lollms_client-0.19.6 → lollms_client-0.19.7}/lollms_client/tti_bindings/gemini/__init__.py +0 -0
  64. {lollms_client-0.19.6 → lollms_client-0.19.7}/lollms_client/tti_bindings/lollms/__init__.py +0 -0
  65. {lollms_client-0.19.6 → lollms_client-0.19.7}/lollms_client/ttm_bindings/__init__.py +0 -0
  66. {lollms_client-0.19.6 → lollms_client-0.19.7}/lollms_client/ttm_bindings/audiocraft/__init__.py +0 -0
  67. {lollms_client-0.19.6 → lollms_client-0.19.7}/lollms_client/ttm_bindings/bark/__init__.py +0 -0
  68. {lollms_client-0.19.6 → lollms_client-0.19.7}/lollms_client/ttm_bindings/lollms/__init__.py +0 -0
  69. {lollms_client-0.19.6 → lollms_client-0.19.7}/lollms_client/tts_bindings/__init__.py +0 -0
  70. {lollms_client-0.19.6 → lollms_client-0.19.7}/lollms_client/tts_bindings/bark/__init__.py +0 -0
  71. {lollms_client-0.19.6 → lollms_client-0.19.7}/lollms_client/tts_bindings/lollms/__init__.py +0 -0
  72. {lollms_client-0.19.6 → lollms_client-0.19.7}/lollms_client/tts_bindings/piper_tts/__init__.py +0 -0
  73. {lollms_client-0.19.6 → lollms_client-0.19.7}/lollms_client/tts_bindings/xtts/__init__.py +0 -0
  74. {lollms_client-0.19.6 → lollms_client-0.19.7}/lollms_client/ttv_bindings/__init__.py +0 -0
  75. {lollms_client-0.19.6 → lollms_client-0.19.7}/lollms_client/ttv_bindings/lollms/__init__.py +0 -0
  76. {lollms_client-0.19.6 → lollms_client-0.19.7}/lollms_client.egg-info/SOURCES.txt +0 -0
  77. {lollms_client-0.19.6 → lollms_client-0.19.7}/lollms_client.egg-info/dependency_links.txt +0 -0
  78. {lollms_client-0.19.6 → lollms_client-0.19.7}/lollms_client.egg-info/requires.txt +0 -0
  79. {lollms_client-0.19.6 → lollms_client-0.19.7}/lollms_client.egg-info/top_level.txt +0 -0
  80. {lollms_client-0.19.6 → lollms_client-0.19.7}/pyproject.toml +0 -0
  81. {lollms_client-0.19.6 → lollms_client-0.19.7}/setup.cfg +0 -0
{lollms_client-0.19.6 → lollms_client-0.19.7}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: lollms_client
- Version: 0.19.6
+ Version: 0.19.7
  Summary: A client library for LoLLMs generate endpoint
  Author-email: ParisNeo <parisneoai@gmail.com>
  License: Apache Software License
{lollms_client-0.19.6 → lollms_client-0.19.7}/lollms_client/__init__.py
@@ -7,7 +7,7 @@ from lollms_client.lollms_utilities import PromptReshaper # Keep general utiliti
  from lollms_client.lollms_mcp_binding import LollmsMCPBinding, LollmsMCPBindingManager


- __version__ = "0.19.6" # Updated version
+ __version__ = "0.19.7" # Updated version

  # Optionally, you could define __all__ if you want to be explicit about exports
  __all__ = [
{lollms_client-0.19.6 → lollms_client-0.19.7}/lollms_client/lollms_core.py
@@ -853,8 +853,6 @@ Respond with a JSON object containing ONE of the following structures:
  turn_history.append({"type":"final_answer_generated", "content":final_answer_text})
  return {"final_answer": final_answer_text, "tool_calls": tool_calls_made_this_turn, "error": None}

- # --- RAG ---
-
  def generate_text_with_rag(
  self,
  prompt: str,
@@ -878,16 +876,17 @@ Respond with a JSON object containing ONE of the following structures:
  ctx_size: int | None = None,
  streaming_callback: Optional[Callable[[str, MSG_TYPE, Optional[Dict], Optional[List]], bool]] = None,
  rag_hop_query_generation_temperature: float = 0.2,
- rag_hop_summary_temperature: float = 0.3,
+ # rag_hop_summary_temperature is no longer needed
+ max_rag_context_characters: int = 32000,
  **llm_generation_kwargs
  ) -> Dict[str, Any]:
  if not self.binding:
  return {"final_answer": "", "rag_hops_history": [], "all_retrieved_sources": [], "error": "LLM binding not initialized."}

  turn_rag_history_for_callback: List[Dict[str, Any]] = []
- accumulated_rag_context_str = ""
  rag_hops_details_list: List[Dict[str, Any]] = []
- all_unique_retrieved_chunks_map: Dict[str, Dict[str, Any]] = {} # To store unique chunks by content hash or path+text
+ # Stores all unique chunks with their full details, keyed by a unique identifier (e.g., path + content hash snippet)
+ all_unique_retrieved_chunks_map: Dict[str, Dict[str, Any]] = {}
  current_query_for_rag = rag_query_text
  original_user_prompt = prompt

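This hunk changes the public signature of generate_text_with_rag: the rag_hop_summary_temperature parameter is removed and max_rag_context_characters (default 32000) is added. A minimal caller-side sketch under those assumptions follows; the client instance lc, the stub retriever, and the literal values are illustrative and not part of the diff.

    # Hypothetical caller update for 0.19.7; lc is assumed to be an already configured LollmsClient.
    def my_rag_query(query, vectorizer_name, top_k, min_similarity_percent):
        # A real retriever would search a vector store; this stub returns a single chunk
        # using the keys the diff reads: 'file_path', 'chunk_text', 'similarity_percent'.
        return [{"file_path": "docs/readme.md",
                 "chunk_text": "lollms_client is a client library for the LoLLMs generate endpoint.",
                 "similarity_percent": 88.0}]

    result = lc.generate_text_with_rag(
        prompt="Summarize what lollms_client does.",
        rag_query_function=my_rag_query,
        max_rag_hops=2,
        max_rag_context_characters=32000,  # new in 0.19.7; rag_hop_summary_temperature no longer exists
    )
    print(result["final_answer"])
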
@@ -896,205 +895,204 @@ Respond with a JSON object containing ONE of the following structures:
  streaming_callback(f"Starting RAG Hop {hop_count + 1}", MSG_TYPE.MSG_TYPE_STEP, {"type": "rag_hop_start", "hop": hop_count + 1}, turn_rag_history_for_callback)

  # 1. Determine/Generate RAG Query Text
- if hop_count > 0 or (current_query_for_rag is None and max_rag_hops > 0):
+ if hop_count > 0: # Query generation for multi-hop (hop 2 onwards)
  if streaming_callback:
  streaming_callback("LLM generating refined RAG query...", MSG_TYPE.MSG_TYPE_STEP_START, {"type": "rag_query_generation", "hop": hop_count + 1}, turn_rag_history_for_callback)

+ system_prompt_q_gen = "You are an expert research assistant. Your task is to formulate the best possible *new* search query to find additional information relevant to the user's original request, considering previous search attempts."
  query_gen_prompt_parts = [
- f"{self.system_full_header}You are an expert research assistant. Your task is to formulate the best possible search query to find information relevant to the user's original request, considering the information already gathered.",
- f"{self.user_full_header}Original user request: '{original_user_prompt}'"
+ f"Original user request:\n'{original_user_prompt}'"
  ]
- if accumulated_rag_context_str:
- query_gen_prompt_parts.append(f"Information gathered so far (summaries):\n{accumulated_rag_context_str}")
  if rag_hops_details_list:
- query_gen_prompt_parts.append("Previous search attempts and their summarized findings:")
- for prev_hop in rag_hops_details_list:
- query_gen_prompt_parts.append(f" - Queried for: '{prev_hop['query']}', Summary: '{prev_hop.get('new_information_summary', 'N/A')}'")
+ query_gen_prompt_parts.append("\nPrevious search queries and number of chunks found:")
+ for i, prev_hop in enumerate(rag_hops_details_list):
+ num_chunks_found_in_hop = len(prev_hop.get("retrieved_chunks_details", []))
+ query_gen_prompt_parts.append(f" - Query {i+1}: '{prev_hop['query']}' (Found {num_chunks_found_in_hop} chunks)")

- query_gen_prompt_parts.append("Based on this, what is the most effective and specific search query to perform next to get closer to answering the user's request? Output only the search query text, nothing else.")
+ query_gen_prompt_parts.append("\nBased on the original request and the queries already attempted, what is the most effective and specific *new* search query to perform next to get closer to answering the user's request? The query should aim to find information not likely covered by previous queries. Output only the search query text, nothing else.")
  query_gen_prompt_parts.append(self.ai_full_header)

- new_query_text_raw = self.remove_thinking_blocks(self.generate_text(prompt="".join(query_gen_prompt_parts), temperature=rag_hop_query_generation_temperature, n_predict=100, stream=False))
+ new_query_text_raw = self.generate_text(
+ prompt="".join(query_gen_prompt_parts),
+ system_prompt=system_prompt_q_gen,
+ temperature=rag_hop_query_generation_temperature,
+ n_predict=100,
+ stream=False
+ )
+
  if isinstance(new_query_text_raw, dict) and "error" in new_query_text_raw:
- return {"final_answer": "", "rag_hops_history": rag_hops_details_list, "all_retrieved_sources": list(all_unique_retrieved_chunks_map.values()), "error": f"Failed to generate RAG query: {new_query_text_raw['error']}"}
+ return {"final_answer": "", "rag_hops_history": rag_hops_details_list, "all_retrieved_sources": list(all_unique_retrieved_chunks_map.values()), "error": f"Failed to generate RAG query for hop {hop_count + 1}: {new_query_text_raw['error']}"}

- current_query_for_rag = new_query_text_raw.strip().replace("Search query:", "").replace("Query:", "").strip("\"'")
+ current_query_for_rag = self.remove_thinking_blocks(new_query_text_raw).strip().replace("Search query:", "").replace("Query:", "").strip("\"'")

  if streaming_callback:
- streaming_callback(f"Generated RAG query: {current_query_for_rag}", MSG_TYPE.MSG_TYPE_STEP_END, {"type": "rag_query_generation", "hop": hop_count + 1, "query": current_query_for_rag}, turn_rag_history_for_callback)
-
- elif current_query_for_rag is None and max_rag_hops == 0:
+ streaming_callback(f"Generated RAG query for hop {hop_count + 1}: {current_query_for_rag}", MSG_TYPE.MSG_TYPE_STEP_END, {"type": "rag_query_generation", "hop": hop_count + 1, "query": current_query_for_rag}, turn_rag_history_for_callback)
+
+ elif current_query_for_rag is None: # First hop, and no rag_query_text provided
  current_query_for_rag = original_user_prompt
+
+ # If current_query_for_rag was provided as an argument, it's used for the first hop.

  if not current_query_for_rag:
- if max_rag_hops > 0 and hop_count < max_rag_hops:
- ASCIIColors.warning(f"RAG Hop {hop_count + 1}: Generated query was empty. Skipping hop.")
- rag_hops_details_list.append({"query": "EMPTY_QUERY_SKIPPED", "retrieved_chunks_details": [], "new_information_summary": "Skipped due to empty query.", "llm_decision_json": {"need_more_data": True if hop_count < max_rag_hops -1 else False}})
- turn_rag_history_for_callback.append({"type":"rag_hop_info", "hop": hop_count + 1, "query": "EMPTY_QUERY_SKIPPED", "summary":"Skipped."})
- continue
- else:
- ASCIIColors.warning("RAG query is empty. Proceeding without RAG context.")
- break
+ ASCIIColors.warning(f"RAG Hop {hop_count + 1}: Query is empty. Stopping RAG process.")
+ # Add a detail for this aborted hop
+ rag_hops_details_list.append({
+ "query": "EMPTY_QUERY_STOPPED_HOPS",
+ "retrieved_chunks_details": [],
+ "status": "Query became empty, RAG stopped."
+ })
+ turn_rag_history_for_callback.append({"type":"rag_hop_info", "hop": hop_count + 1, "query": "EMPTY_QUERY_STOPPED_HOPS", "status":"Stopped."})
+ break # Stop if query is empty

  # 2. Perform RAG Query
  if streaming_callback:
- streaming_callback(f"Querying knowledge base for: '{current_query_for_rag}'...", MSG_TYPE.MSG_TYPE_STEP_START, {"type": "rag_retrieval", "hop": hop_count + 1, "query": current_query_for_rag}, turn_rag_history_for_callback)
+ streaming_callback(f"Querying knowledge base for (Hop {hop_count + 1}): '{current_query_for_rag}'...", MSG_TYPE.MSG_TYPE_STEP_START, {"type": "rag_retrieval", "hop": hop_count + 1, "query": current_query_for_rag}, turn_rag_history_for_callback)

  try:
- retrieved_chunks_raw = rag_query_function(current_query_for_rag, rag_vectorizer_name, rag_top_k, rag_min_similarity_percent)
+ retrieved_chunks_raw_this_hop = rag_query_function(current_query_for_rag, rag_vectorizer_name, rag_top_k, rag_min_similarity_percent)
  except Exception as e_rag_query:
  trace_exception(e_rag_query)
- return {"final_answer": "", "rag_hops_history": rag_hops_details_list, "all_retrieved_sources": list(all_unique_retrieved_chunks_map.values()), "error": f"RAG query function failed: {e_rag_query}"}
+ return {"final_answer": "", "rag_hops_history": rag_hops_details_list, "all_retrieved_sources": list(all_unique_retrieved_chunks_map.values()), "error": f"RAG query function failed on hop {hop_count + 1}: {e_rag_query}"}

  if streaming_callback:
- streaming_callback(f"Retrieved {len(retrieved_chunks_raw)} chunks.", MSG_TYPE.MSG_TYPE_STEP_END, {"type": "rag_retrieval", "hop": hop_count + 1, "num_chunks": len(retrieved_chunks_raw)}, turn_rag_history_for_callback)
+ streaming_callback(f"Retrieved {len(retrieved_chunks_raw_this_hop)} chunks for hop {hop_count + 1}.", MSG_TYPE.MSG_TYPE_STEP_END, {"type": "rag_retrieval", "hop": hop_count + 1, "num_chunks": len(retrieved_chunks_raw_this_hop)}, turn_rag_history_for_callback)

- current_hop_details = {"query": current_query_for_rag, "retrieved_chunks_details": []}
-
- formatted_new_chunks_for_llm_summary = ""
- if retrieved_chunks_raw:
- for i, chunk in enumerate(retrieved_chunks_raw):
+ current_hop_chunk_details_for_history = []
+ new_chunks_added_this_hop = 0
+ if retrieved_chunks_raw_this_hop:
+ for chunk in retrieved_chunks_raw_this_hop:
  doc_path = chunk.get('file_path', 'Unknown Document')
- similarity = chunk.get('similarity_percent', 'N/A')
  content = chunk.get('chunk_text', '')
+ similarity = chunk.get('similarity_percent', 0.0) # Default to 0.0 if not present
+
+ # Ensure content is string and similarity is float for sorting later
+ if not isinstance(content, str): content = str(content)
+ try:
+ similarity = float(similarity)
+ except (ValueError, TypeError):
+ similarity = 0.0 # Default if conversion fails
+
+ chunk_detail_for_map_and_history = {
+ "document": doc_path,
+ "similarity": similarity,
+ "content": content,
+ "retrieved_in_hop": hop_count + 1,
+ "query_used": current_query_for_rag
+ }
+ current_hop_chunk_details_for_history.append(chunk_detail_for_map_and_history)

- chunk_detail_for_history = {"document": doc_path, "similarity": similarity, "content": content}
- current_hop_details["retrieved_chunks_details"].append(chunk_detail_for_history)
-
- # Add to unique list for final output
- # Use a combination of path and content to uniquely identify a chunk to avoid duplicates if same content appears from different queries.
- # A more robust unique key might involve hashing content if it's very large.
- unique_key = f"{doc_path}::{content[:100]}" # Simple key
+ unique_key = f"{doc_path}::{content[:100]}" # Simple key for uniqueness
  if unique_key not in all_unique_retrieved_chunks_map:
- all_unique_retrieved_chunks_map[unique_key] = chunk_detail_for_history
-
- # Format for LLM processing (summary or direct use)
- formatted_new_chunks_for_llm_summary += f"Document: {doc_path} (Similarity: {similarity}%)\nContent:\n{content}\n---\n"
+ all_unique_retrieved_chunks_map[unique_key] = chunk_detail_for_map_and_history
+ new_chunks_added_this_hop +=1

- if not retrieved_chunks_raw:
- current_hop_details["new_information_summary"] = "No relevant information found for this query."
- current_hop_details["llm_decision_json"] = {"need_more_data": True if max_rag_hops > 0 and hop_count < max_rag_hops -1 else False, "reasoning_for_decision":"No new information retrieved."}
- rag_hops_details_list.append(current_hop_details)
- turn_rag_history_for_callback.append({"type":"rag_hop_info", **current_hop_details})
- if max_rag_hops == 0 or hop_count >= max_rag_hops -1 :
- break
- else:
- accumulated_rag_context_str += f"\n\n---\nAttempted query: '{current_query_for_rag}' - No new information found.\n---"
- continue
-
- if max_rag_hops == 0: # Classic RAG
- accumulated_rag_context_str += formatted_new_chunks_for_llm_summary
- current_hop_details["new_information_summary"] = "Directly used in context (classic RAG)."
- current_hop_details["llm_decision_json"] = {"need_more_data": False}
- rag_hops_details_list.append(current_hop_details)
- turn_rag_history_for_callback.append({"type":"rag_hop_info", **current_hop_details})
- break
-
- # Multi-hop: LLM summarizes and decides
- if streaming_callback:
- streaming_callback("LLM processing retrieved data and deciding next step...", MSG_TYPE.MSG_TYPE_STEP_START, {"type": "rag_llm_decision", "hop": hop_count + 1}, turn_rag_history_for_callback)
-
- decision_prompt_llm_parts = [
- f"{self.system_full_header}You are an AI research assistant. Analyze newly retrieved information against the user's request and prior knowledge, then decide if more searching is needed.",
- f"{self.user_full_header}Original user request: '{original_user_prompt}'",
- ]
- if accumulated_rag_context_str:
- decision_prompt_llm_parts.append(f"Current accumulated knowledge summary:\n{accumulated_rag_context_str}")
- decision_prompt_llm_parts.append(f"You just searched for: '{current_query_for_rag}'")
- decision_prompt_llm_parts.append(f"And found this new information:\n--- New Information Start ---\n{formatted_new_chunks_for_llm_summary}--- New Information End ---")
- decision_prompt_llm_parts.append(
- "Task: Provide a concise summary of ONLY the new information relevant to the original request. "
- "Then, assess if you now have sufficient information to comprehensively answer the user's original request or if another, more targeted search is necessary. "
- "Respond STRICTLY in the following JSON format, with no other text before or after the JSON block:"
- )
- json_template_for_decision = """
- {
- "new_information_summary": "<Your concise summary of ONLY the new_information relevant to the original_user_request. Focus on what's new and useful. If nothing new is relevant, state that.>",
- "need_more_data": <true_or_false>,
- "reasoning_for_decision": "<Briefly explain why you need more data or why you have enough. If needing more, suggest what kind of information is still missing.>"
- }
- """
- decision_prompt_llm_parts.append(f"```json\n{json_template_for_decision}\n```")
- decision_prompt_llm_parts.append(self.ai_full_header)
-
- llm_decision_json_str = self.generate_code(prompt="".join(decision_prompt_llm_parts), language="json", template=json_template_for_decision, temperature=rag_hop_summary_temperature, max_size=1024)
-
- if isinstance(llm_decision_json_str, dict) and "error" in llm_decision_json_str:
- return {"final_answer": "", "rag_hops_history": rag_hops_details_list, "all_retrieved_sources": list(all_unique_retrieved_chunks_map.values()), "error": f"LLM failed to make RAG decision: {llm_decision_json_str['error']}"}
- if not llm_decision_json_str:
- return {"final_answer": "", "rag_hops_history": rag_hops_details_list, "all_retrieved_sources": list(all_unique_retrieved_chunks_map.values()), "error": "LLM provided empty decision for RAG hop."}
-
- try:
- llm_decision = json.loads(llm_decision_json_str)
- except json.JSONDecodeError:
- try:
- match = re.search(r"```json\s*(\{.*?\})\s*```", llm_decision_json_str, re.DOTALL)
- if match: llm_decision = json.loads(match.group(1))
- else: llm_decision = json.loads(self.extract_code_blocks(llm_decision_json_str, format="markdown")[0]["content"])
- except Exception as e_json_parse:
- trace_exception(e_json_parse)
- return {"final_answer": "", "rag_hops_history": rag_hops_details_list, "all_retrieved_sources": list(all_unique_retrieved_chunks_map.values()), "error": f"Failed to parse LLM RAG decision JSON: {llm_decision_json_str}. Error: {e_json_parse}"}
-
- new_summary = llm_decision.get("new_information_summary", "Summary not provided by LLM.")
- need_more_data = llm_decision.get("need_more_data", True)
-
- current_hop_details["new_information_summary"] = new_summary
- current_hop_details["llm_decision_json"] = llm_decision
+ hop_status = "Completed"
+ if not retrieved_chunks_raw_this_hop:
+ hop_status = "No chunks retrieved for this query."
+ elif new_chunks_added_this_hop == 0 and hop_count > 0: # Only consider "no new unique chunks" for subsequent hops
+ hop_status = "No *new* unique chunks retrieved."
+ # Optionally, could break here if no new unique chunks are found in a multi-hop scenario
+ # ASCIIColors.warning(f"RAG Hop {hop_count + 1}: No new unique chunks found. Consider stopping if this persists.")
+
+
+ current_hop_details = {
+ "query": current_query_for_rag,
+ "retrieved_chunks_details": current_hop_chunk_details_for_history, # Chunks from THIS hop
+ "status": hop_status
+ }
  rag_hops_details_list.append(current_hop_details)
  turn_rag_history_for_callback.append({"type":"rag_hop_info", **current_hop_details})

+ # Reset for next potential query generation if it's not the last planned hop
+ if hop_count < max_rag_hops:
+ current_query_for_rag = None
+ else: # This was the last hop
+ break
+
+
+ # 3. Prepare Final Context from All Unique Retrieved Chunks
+ accumulated_rag_context_str = ""
+ if all_unique_retrieved_chunks_map:
  if streaming_callback:
- streaming_callback(f"LLM decision: Summary='{new_summary[:100]}...', NeedMoreData={need_more_data}", MSG_TYPE.MSG_TYPE_STEP_END, {"type": "rag_llm_decision", "hop": hop_count + 1, "decision": llm_decision}, turn_rag_history_for_callback)
+ streaming_callback("Preparing final RAG context from all retrieved chunks...", MSG_TYPE.MSG_TYPE_STEP, {"type": "context_preparation"}, turn_rag_history_for_callback)

- accumulated_rag_context_str += f"\n\n--- Summary of findings from query '{current_query_for_rag}' (Hop {hop_count + 1}) ---\n{new_summary}\n---"
+ # Sort all unique chunks by similarity (highest first)
+ sorted_unique_chunks = sorted(
+ list(all_unique_retrieved_chunks_map.values()),
+ key=lambda c: c.get('similarity', 0.0),
+ reverse=True
+ )

- if not need_more_data or hop_count >= max_rag_hops -1 : # Subtract 1 because current hop is finishing
- break
+ current_context_chars = 0
+ chunks_used_in_final_context = 0
+ context_lines = []
+ for chunk in sorted_unique_chunks:
+ chunk_text_to_add = f"Source: {chunk['document']} (Similarity: {chunk['similarity']:.2f}%, Hop: {chunk['retrieved_in_hop']}, Query: '{chunk['query_used']}')\nContent:\n{chunk['content']}\n---\n"
+ if current_context_chars + len(chunk_text_to_add) <= max_rag_context_characters:
+ context_lines.append(chunk_text_to_add)
+ current_context_chars += len(chunk_text_to_add)
+ chunks_used_in_final_context +=1
+ else:
+ ASCIIColors.warning(f"Reached max RAG context character limit ({max_rag_context_characters}). Used {chunks_used_in_final_context} of {len(sorted_unique_chunks)} unique chunks.")
+ break
+ accumulated_rag_context_str = "".join(context_lines)
+
+ if streaming_callback:
+ streaming_callback(f"Final RAG context prepared using {chunks_used_in_final_context} chunks ({current_context_chars} chars).", MSG_TYPE.MSG_TYPE_STEP_END, {"type": "context_preparation", "num_chunks_in_context": chunks_used_in_final_context, "chars_in_context": current_context_chars}, turn_rag_history_for_callback)
+

  # 4. Final Answer Generation
  if streaming_callback:
- streaming_callback("LLM generating final answer using all gathered information...", MSG_TYPE.MSG_TYPE_STEP_START, {"type": "final_answer_generation"}, turn_rag_history_for_callback)
+ streaming_callback("LLM generating final answer...", MSG_TYPE.MSG_TYPE_STEP_START, {"type": "final_answer_generation"}, turn_rag_history_for_callback)

- final_answer_prompt_parts = []
- if system_prompt:
- final_answer_prompt_parts.append(f"{self.system_full_header}{system_prompt}")
-
- final_answer_prompt_parts.append(f"{self.user_full_header}Original request: {original_user_prompt}")
+ final_answer_prompt_parts = [f"Original request: {original_user_prompt}"]
  if accumulated_rag_context_str:
- final_answer_prompt_parts.append(f"\nBased on the information I have gathered:\n--- Gathered Context Start ---\n{accumulated_rag_context_str.strip()}\n--- Gathered Context End ---")
+ final_answer_prompt_parts.append(f"\nBased on the following information I have gathered from a knowledge base:\n--- Gathered Context Start ---\n{accumulated_rag_context_str.strip()}\n--- Gathered Context End ---")
  else:
  final_answer_prompt_parts.append("\n(No specific information was retrieved from the knowledge base for this request.)")

- final_answer_prompt_parts.append("\nPlease provide a comprehensive answer to the original request using ONLY the provided gathered context. If the context is insufficient, clearly state that.")
+ final_answer_prompt_parts.append("\nPlease provide a comprehensive answer to the original request using ONLY the provided gathered context. If the context is insufficient, clearly state that. If the context contains code examples, ensure they are accurately reproduced.")
  final_answer_prompt_parts.append(self.ai_full_header)

  final_answer_llm_prompt = "\n".join(final_answer_prompt_parts)

- final_answer_streaming_callback = None
- if streaming_callback:
- def final_answer_cb_adapter(chunk, msg_type):
- return streaming_callback(chunk, msg_type, {"type": "final_answer_chunk"}, turn_rag_history_for_callback)
- final_answer_streaming_callback = final_answer_cb_adapter
+ final_answer_streaming_callback_adapted = None
+ if streaming_callback and stream:
+ def final_answer_cb_adapter(chunk_text, msg_type_llm):
+ return streaming_callback(chunk_text, msg_type_llm, {"type": "final_answer_chunk"}, turn_rag_history_for_callback)
+ final_answer_streaming_callback_adapted = final_answer_cb_adapter
+
+ actual_streaming_cb_for_generate = final_answer_streaming_callback_adapted if stream else None

- final_answer_text = self.remove_thinking_blocks(self.generate_text(
- prompt=final_answer_llm_prompt, images=images,
+ final_answer_raw = self.generate_text(
+ prompt=final_answer_llm_prompt, images=images, system_prompt=system_prompt,
  n_predict=n_predict, stream=stream, temperature=temperature, top_k=top_k, top_p=top_p,
  repeat_penalty=repeat_penalty, repeat_last_n=repeat_last_n, seed=seed, n_threads=n_threads,
- ctx_size=ctx_size, streaming_callback=final_answer_streaming_callback, **llm_generation_kwargs
- ))
-
- if streaming_callback:
- streaming_callback("Final answer generation complete.", MSG_TYPE.MSG_TYPE_STEP_END, {"type": "final_answer_generation"}, turn_rag_history_for_callback)
+ ctx_size=ctx_size, streaming_callback=actual_streaming_cb_for_generate, **llm_generation_kwargs
+ )

- if isinstance(final_answer_text, dict) and "error" in final_answer_text:
- return {"final_answer": "", "rag_hops_history": rag_hops_details_list, "all_retrieved_sources": list(all_unique_retrieved_chunks_map.values()), "error": f"Final answer generation failed: {final_answer_text['error']}"}
+ if isinstance(final_answer_raw, dict) and "error" in final_answer_raw:
+ return {"final_answer": "", "rag_hops_history": rag_hops_details_list, "all_retrieved_sources": list(all_unique_retrieved_chunks_map.values()), "error": f"Final answer generation failed: {final_answer_raw['error']}"}

- return {"final_answer": final_answer_text, "rag_hops_history": rag_hops_details_list, "all_retrieved_sources": list(all_unique_retrieved_chunks_map.values()), "error": None}
+ final_answer_text = self.remove_thinking_blocks(final_answer_raw)

+ if streaming_callback:
+ streaming_callback("Final answer generation complete.", MSG_TYPE.MSG_TYPE_STEP_END, {"type": "final_answer_generation"}, turn_rag_history_for_callback)
+ if not stream and final_answer_text:
+ streaming_callback(final_answer_text, MSG_TYPE.MSG_TYPE_CHUNK, {"type": "final_answer_full"}, turn_rag_history_for_callback)
+
+ return {
+ "final_answer": final_answer_text,
+ "rag_hops_history": rag_hops_details_list,
+ "all_retrieved_sources": list(all_unique_retrieved_chunks_map.values()), # All unique chunks found
+ "error": None
+ }
+
  def generate_code(
  self,
  prompt,
  images=[],
+ system_prompt=None,
  template=None,
  language="json",
  code_tag_format="markdown", # or "html"
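
Taken together, this hunk drops the old per-hop summarize-and-decide step: every unique retrieved chunk is now kept, sorted by similarity, and concatenated into the final context until max_rag_context_characters is reached. The self-contained sketch below distills that assembly step with made-up data; it mirrors the logic added in the hunk but is not code from the package.

    # Standalone illustration of the new context-assembly behavior (illustrative data).
    chunks = [
        {"document": "a.md", "similarity": 91.0, "retrieved_in_hop": 1, "query_used": "q1", "content": "alpha " * 60},
        {"document": "b.md", "similarity": 72.5, "retrieved_in_hop": 2, "query_used": "q2", "content": "beta " * 60},
        {"document": "c.md", "similarity": 40.0, "retrieved_in_hop": 2, "query_used": "q2", "content": "gamma " * 60},
    ]
    max_rag_context_characters = 800  # small budget so the cut-off is visible

    context_parts, used_chars = [], 0
    for chunk in sorted(chunks, key=lambda c: c.get("similarity", 0.0), reverse=True):
        block = (f"Source: {chunk['document']} (Similarity: {chunk['similarity']:.2f}%, "
                 f"Hop: {chunk['retrieved_in_hop']}, Query: '{chunk['query_used']}')\n"
                 f"Content:\n{chunk['content']}\n---\n")
        if used_chars + len(block) > max_rag_context_characters:
            break  # as in the diff, assembly stops at the first chunk that does not fit
        context_parts.append(block)
        used_chars += len(block)

    print(f"kept {len(context_parts)} of {len(chunks)} chunks, {used_chars} characters")

Lower-similarity chunks are therefore the first to be dropped when the character budget runs out, regardless of which hop retrieved them.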
@@ -1111,8 +1109,8 @@ Respond with a JSON object containing ONE of the following structures:
  Uses the underlying LLM binding via `generate_text`.
  Handles potential continuation if the code block is incomplete.
  """
-
- system_prompt = f"""Act as a code generation assistant that generates code from user prompt."""
+ if not system_prompt:
+ system_prompt = f"""Act as a code generation assistant that generates code from user prompt."""

  if template:
  system_prompt += "Here is a template of the answer:\n"
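
With this change, generate_code accepts an optional system_prompt and only falls back to its built-in "code generation assistant" instruction when none is given. A hypothetical call sketch follows; lc is again assumed to be an already configured LollmsClient and the prompt text is illustrative.

    code = lc.generate_code(
        prompt="Write a function that reverses a string.",
        language="python",
        system_prompt="You are a senior Python developer. Return only the code.",  # optional override added in 0.19.7
    )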
{lollms_client-0.19.6 → lollms_client-0.19.7}/lollms_client.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: lollms_client
- Version: 0.19.6
+ Version: 0.19.7
  Summary: A client library for LoLLMs generate endpoint
  Author-email: ParisNeo <parisneoai@gmail.com>
  License: Apache Software License