kssrag 0.2.2__py3-none-any.whl → 0.2.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kssrag/core/agents.py +210 -188
- kssrag/server.py +26 -8
- {kssrag-0.2.2.dist-info → kssrag-0.2.4.dist-info}/METADATA +3 -2
- {kssrag-0.2.2.dist-info → kssrag-0.2.4.dist-info}/RECORD +7 -7
- {kssrag-0.2.2.dist-info → kssrag-0.2.4.dist-info}/WHEEL +1 -1
- {kssrag-0.2.2.dist-info → kssrag-0.2.4.dist-info}/entry_points.txt +0 -0
- {kssrag-0.2.2.dist-info → kssrag-0.2.4.dist-info}/top_level.txt +0 -0
kssrag/core/agents.py
CHANGED

@@ -21,13 +21,36 @@ class RAGAgent:
         if not any(msg.get("role") == "system" for msg in self.conversation):
             self.add_message("system", self.system_prompt)
 
+    # def add_message(self, role: str, content: str):
+    #     """Add a message to the conversation history"""
+    #     self.conversation.append({"role": role, "content": content})
+
+    #     # Keep conversation manageable (last 15 messages)
+    #     if len(self.conversation) > 15:
+    #         self._smart_trim_conversation()
+
     def add_message(self, role: str, content: str):
-        """Add a message to the conversation history"""
+        """Add a message to the conversation history (with simple dedupe for assistant)."""
+        content = content.strip()
+        # Prevent adding empty messages
+        if not content:
+            logger.info("Attempted to add empty message – ignored.")
+            return
+
+        # If last message is identical assistant content, skip to avoid duplicates
+        if self.conversation:
+            last = self.conversation[-1]
+            if role == "assistant" and last.get("role") == "assistant":
+                if last.get("content", "").strip() == content:
+                    logger.info("Duplicate assistant message suppressed.")
+                    return
+
         self.conversation.append({"role": role, "content": content})
-
+
         # Keep conversation manageable (last 15 messages)
         if len(self.conversation) > 15:
             self._smart_trim_conversation()
+
 
     def _smart_trim_conversation(self):
         """Trim conversation while preserving system message and recent exchanges"""
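The rewritten `add_message` is the behavioural core of this hunk: empty messages are dropped and back-to-back identical assistant messages are suppressed. A minimal standalone sketch of just that guard (the `_AgentStub` class and its fields are illustrative; the real `RAGAgent` constructor and logger are not part of this hunk):

```python
# Sketch only: reproduces the dedupe/empty-message guard from the hunk above.
from typing import Dict, List

class _AgentStub:
    def __init__(self) -> None:
        self.conversation: List[Dict[str, str]] = []

    def add_message(self, role: str, content: str) -> None:
        content = content.strip()
        if not content:                       # empty messages are ignored
            return
        if self.conversation:
            last = self.conversation[-1]
            if role == "assistant" and last.get("role") == "assistant":
                if last.get("content", "").strip() == content:
                    return                    # duplicate assistant message suppressed
        self.conversation.append({"role": role, "content": content})

agent = _AgentStub()
agent.add_message("assistant", "Hello!")
agent.add_message("assistant", "Hello!")      # suppressed as a duplicate
agent.add_message("user", "   ")              # ignored after strip
assert len(agent.conversation) == 1
```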
@@ -63,43 +86,55 @@ class RAGAgent:
         for i, doc in enumerate(context_docs, 1):
             context += f"\n--- Document {i} ---\n{doc['content']}\n"
         return context
-
+
     def _build_messages(self, question: str, context: str = "") -> List[Dict[str, str]]:
-        """
-
-
+        """
+        Build messages for the LLM including context, conversation history, and summaries.
+
+        Improvements:
+        - Prevents token explosion by trimming conversation smartly
+        - Injects last 3 summaries only
+        - Adds stealth summarization only if there are at least 2 user-assistant exchanges
+        - Preserves system messages and formatting
+        """
+        # Start with system + conversation history
+        messages: List[Dict[str, str]] = []
+
+        # Always include system message at top
+        system_msg = next((msg for msg in self.conversation if msg["role"] == "system"), None)
+        if system_msg:
+            messages.append(system_msg)
 
+        # Keep only last 12 user/assistant messages to prevent token overload
+        conversation_tail = [msg for msg in self.conversation if msg["role"] != "system"][-12:]
+        messages.extend(conversation_tail)
+
         logger.info(f"Building messages for query: '{question}'")
-        logger.info(f"Conversation
+        logger.info(f"Conversation tail: {len(conversation_tail)} messages")
         logger.info(f"Active summaries: {len(self.conversation_summaries)}")
-        logger.info(f"
+        logger.info(f"Context length: {len(context)} chars" if context else "No retrieved context")
 
-        #
+        # Inject last 5 summaries safely as a system message
         if self.conversation_summaries:
-
-            summary_context = "Previous conversation context:\n" + "\n".join(
-
-            )
-
-
-
-
-
-            # Add
-
-
-            # ✅ FIX: Always append new user message (don't replace existing ones)
-            messages.append({"role": "user", "content": user_message})
-
-            # Add stealth summarization instruction for ongoing conversations
-        if len(self.conversation) >= 1:  # More than just system + current user message + 2nd Query
+            summaries_to_use = self.conversation_summaries[-5:]
+            summary_context = "Previous conversation context:\n" + "\n".join(f"- {s}" for s in summaries_to_use)
+            messages.append({"role": "system", "content": summary_context})
+            logger.info(f"Injected {len(summaries_to_use)} conversation summaries")
+
+        # Add the user's current question + retrieved context
+        user_content = f"{context}\n\nQuestion: {question}" if context else question
+        messages.append({"role": "user", "content": user_content})
+
+        # Add stealth summarization only if conversation has at least 2 user-assistant pairs
+        exchange_count = sum(1 for msg in self.conversation if msg["role"] != "system") // 2
+        if exchange_count >= 2:
             summary_instruction = self._create_summary_instruction()
             messages.append({"role": "system", "content": summary_instruction})
-            logger.info(f"
-            logger.debug(f"Instruction content: {summary_instruction}")
+            logger.info(f"Stealth summary instruction added ({len(summary_instruction)} chars)")
 
-        logger.info(f"
+        logger.info(f"Final message count to LLM: {len(messages)}")
         return messages
+
 
     def _create_summary_instruction(self) -> str:
         """Create the stealth summarization instruction with examples"""
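Taken together, the new `_build_messages` produces a predictable layout (note the docstring says the last 3 summaries while the code keeps the last 5). An illustrative sketch of the list shape it sends to the LLM; the sample values are made up, while the roles, ordering, and template strings come from the hunk above:

```python
# Illustrative only: assumes one prior exchange, stored summaries, and retrieved context.
messages = [
    {"role": "system", "content": "<system prompt>"},                        # always first
    {"role": "user", "content": "What is kssrag?"},                          # tail: last <=12 turns
    {"role": "assistant", "content": "kssrag is a RAG framework ..."},
    {"role": "system", "content": "Previous conversation context:\n- ..."},  # last <=5 summaries
    {"role": "user", "content": "<retrieved context>\n\nQuestion: <question>"},
    {"role": "system", "content": "<stealth summary instruction>"},          # only after >=2 exchanges
]
```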
@@ -123,136 +158,85 @@ class RAGAgent:
         - Focus on user preferences, names, important context
 
         The summary will be automatically hidden from the user."""
-
-    # def _extract_summary_and_response(self, full_response: str) -> tuple[str, Optional[str]]:
-    #     """Extract summary from response and return clean user response - handles partial markers"""
-    #     summary_start = "[SUMMARY_START]"
-    #     summary_end = "[SUMMARY_END]"
-
-    #     # Check if we have complete markers
-    #     if summary_start in full_response and summary_end in full_response:
-    #         start_idx = full_response.find(summary_start) + len(summary_start)
-    #         end_idx = full_response.find(summary_end)
-
-    #         summary = full_response[start_idx:end_idx].strip()
-    #         user_response = full_response[:full_response.find(summary_start)].strip()
-
-    #         logger.info(f"✅ SUCCESS: Summary extracted and separated from user response")
-    #         logger.info(f"User response length: {len(user_response)} chars")
-    #         logger.info(f"Summary extracted: '{summary}'")
-    #         return user_response, summary
-
-    #     # Check if we have partial markers (common in streaming)
-    #     elif summary_start in full_response:
-    #         # We have start marker but no end marker - extract what we can
-    #         start_idx = full_response.find(summary_start) + len(summary_start)
-    #         potential_summary = full_response[start_idx:].strip()
-
-    #         # Clean up any partial end markers or weird formatting
-    #         if potential_summary:
-    #             # Remove any trailing partial markers or whitespace
-    #             cleaned_summary = potential_summary.split('[SUMMARY_')[0].split('[SUMMARY')[0].strip()
-    #             user_response = full_response[:full_response.find(summary_start)].strip()
-
-    #             if cleaned_summary and len(cleaned_summary) > 10:  # Only if meaningful content
-    #                 logger.info(f"⚠️ Partial summary extracted (missing end marker): '{cleaned_summary}'")
-    #                 return user_response, cleaned_summary
-
-    #         logger.info("❌ Incomplete summary markers found")
-    #         return full_response, None
-
-    #     logger.info("❌ No summary markers found, returning full response")
-    #     logger.info(f"Full response length: {len(full_response)} chars")
-    #     return full_response, None
 
     def _extract_summary_and_response(self, full_response: str) -> tuple[str, Optional[str]]:
-        """Extract summary from response and return clean user response
-
+        """Extract summary from response and return clean user response safely."""
+        if not full_response:
+            return "", None
+
         summary_start = "[SUMMARY_START]"
         summary_end = "[SUMMARY_END]"
-
-
-        normalized =
-
-        #
+
+        original = full_response
+        normalized = original.replace('\r\n', '\n').replace('\r', '\n')
+
+        # Case 1: Full summary markers
         if summary_start in normalized and summary_end in normalized:
             start_idx = normalized.find(summary_start) + len(summary_start)
             end_idx = normalized.find(summary_end)
-
             summary = normalized[start_idx:end_idx].strip()
-
-
-
-            logger.info(f"User response length: {len(user_response)} chars")
-            logger.info(f"Summary extracted: '{summary}'")
-
-            # NEW: Add validation from improved version
+
+            user_response = original.split(summary_start)[0].strip()
+
             if not summary or len(summary) < 5:
-                logger.info("
-                return
-
+                logger.info("Summary too short or invalid – returning full response as user response")
+                return original.strip(), None
+
             return user_response, summary
-
-        #
-
-            # We have start marker but no end marker - extract what we can
+
+        # Case 2: Partial summary start only
+        if summary_start in normalized:
             start_idx = normalized.find(summary_start) + len(summary_start)
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            user_response
-
-
-
-
-
-
-                return user_response, cleaned_summary
-
-        logger.info("❌ Incomplete summary markers found")
-        return full_response.strip(), None  # NEW: strip for consistency
-
-        # No markers found - KEEP original but with normalization
-        logger.info("❌ No summary markers found, returning full response")
-        logger.info(f"Full response length: {len(full_response)} chars")
-        return full_response.strip(), None  # NEW: strip for consistency
+            potential = normalized[start_idx:start_idx + 200].strip()
+
+            cleaned_summary = (
+                potential
+                .split('[SUMMARY_')[0]
+                .split('[SUMMARY')[0]
+                .split('[')[0]
+                .strip()
+            )
+
+            user_response = original.split(summary_start)[0].strip()
+
+            if cleaned_summary and len(cleaned_summary) >= 10:
+                logger.info("Partial summary extracted safely")
+                return user_response, cleaned_summary
+
+            logger.info("Partial summary invalid or too short")
+            return original.strip(), None
+
+        # Case 3: No markers
+        return original.strip(), None
 
     def _add_conversation_summary(self, new_summary: str):
         """Add a new discrete conversation summary"""
         if not new_summary or new_summary.lower() == "none":
-            logger.info("
+            logger.info(" No summary to add (empty or 'none')")
             return
-
-
+
+        new_summary = new_summary.strip()
+        if not new_summary:
+            logger.info(" No summary to add after strip")
+            return
+
+        # Append new summary
         self.conversation_summaries.append(new_summary)
-        logger.info(f"
+        logger.info(f" ADDED Summary #{len(self.conversation_summaries)}: '{new_summary}'")
 
         # Keep only recent summaries (last 7)
         if len(self.conversation_summaries) > 7:
             self.conversation_summaries = self.conversation_summaries[-7:]
-
-
-        logger.info(f"📊 Summary count maintained at {len(self.conversation_summaries)}")
-        logger.info(f"Added conversation summary #{len(self.conversation_summaries)}: {new_summary}")
+            logger.info(f" Summary count trimmed to {len(self.conversation_summaries)}")
+
 
     def query(self, question: str, top_k: int = 5, include_context: bool = True) -> str:
         """Process a query with stealth conversation summarization"""
         try:
             # Retrieve relevant context
-            logger.info(f"
+            logger.info(f" QUERY START: '{question}' (top_k: {top_k})")
             context_docs = self.retriever.retrieve(question, top_k)
-            logger.info(f"
+            logger.info(f" Retrieved {len(context_docs)} context documents")
 
             if not context_docs and include_context:
                 logger.warning(f"No context found for query: {question}")
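The extraction contract now has three explicit cases: full markers, a partial start marker, and no markers at all. A standalone sketch of the same logic, with the logger calls omitted and hypothetical sample strings, can be used to sanity-check the behaviour:

```python
# Sketch of the three-case contract above; sample strings are invented.
from typing import Optional

def extract(full_response: str) -> tuple[str, Optional[str]]:
    if not full_response:
        return "", None
    start, end = "[SUMMARY_START]", "[SUMMARY_END]"
    original = full_response
    normalized = original.replace('\r\n', '\n').replace('\r', '\n')
    if start in normalized and end in normalized:        # Case 1: both markers
        summary = normalized[normalized.find(start) + len(start):normalized.find(end)].strip()
        user_response = original.split(start)[0].strip()
        if not summary or len(summary) < 5:
            return original.strip(), None
        return user_response, summary
    if start in normalized:                              # Case 2: start marker only
        i = normalized.find(start) + len(start)
        potential = normalized[i:i + 200].strip()
        cleaned = potential.split('[SUMMARY_')[0].split('[SUMMARY')[0].split('[')[0].strip()
        user_response = original.split(start)[0].strip()
        if cleaned and len(cleaned) >= 10:
            return user_response, cleaned
        return original.strip(), None
    return original.strip(), None                        # Case 3: no markers

assert extract("Hi![SUMMARY_START]User likes Python.[SUMMARY_END]") == ("Hi!", "User likes Python.")
assert extract("Hi![SUMMARY_START]User likes Python, a lot") == ("Hi!", "User likes Python, a lot")
assert extract("Plain answer") == ("Plain answer", None)
```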
@@ -266,7 +250,7 @@ class RAGAgent:
 
             # Generate response
             full_response = self.llm.predict(messages)
-            logger.info(f"
+            logger.info(f" LLM response received: {len(full_response)} chars")
 
             # Extract summary and clean response
             user_response, conversation_summary = self._extract_summary_and_response(full_response)

@@ -281,12 +265,12 @@ class RAGAgent:
             # Add assistant response to conversation (clean version only)
             self.add_message("assistant", user_response)
 
-            logger.info(f"
+            logger.info(f" Final user response: {len(user_response)} chars")
             return user_response
 
         except Exception as e:
             logger.error(f"Error processing query: {str(e)}")
-            # logger.error(f"
+            # logger.error(f" QUERY FAILED: {str(e)}")
             return "I encountered an issue processing your query. Please try again."
 
     def query_stream(self, question: str, top_k: int = 5) -> Generator[str, None, None]:

@@ -294,7 +278,7 @@ class RAGAgent:
         Professional-grade streaming with multiple fallback strategies
         """
         try:
-            logger.info(f"
+            logger.info(f" STREAMING QUERY START: '{question}'")
 
             # Strategy 1: Try true streaming first
             if hasattr(self.llm, 'predict_stream'):
@@ -305,81 +289,119 @@ class RAGAgent:
                 logger.warning(f"Streaming failed, falling back: {stream_error}")
 
             # Strategy 2: Fallback to simulated streaming
-            logger.info("
+            logger.info(" Falling back to simulated streaming")
             yield from self._simulated_streaming(question, top_k)
 
         except Exception as e:
-            logger.error(f"
+            logger.error(f" ALL STREAMING STRATEGIES FAILED: {str(e)}")
             yield f"Error: {str(e)}"
 
+    # def _stream_with_summary_protection(self, question: str, top_k: int) -> Generator[str, None, None]:
+    #     """Streaming-safe: never leak summary markers mid-stream."""
+    #     relevant_docs = self.retriever.retrieve(question, top_k=top_k)
+    #     context = self._build_context(relevant_docs)
+    #     messages = self._build_messages(question, context)
+
+    #     buffer = ""
+    #     summary_buffer = ""
+    #     in_summary = False
+
+    #     for chunk in self.llm.predict_stream(messages):
+    #         buffer += chunk
+
+    #         # Detect summary start
+    #         if '[SUMMARY_START]' in buffer:
+    #             in_summary = True
+    #             clean_part = buffer.split('[SUMMARY_START]')[0].strip()
+    #             if clean_part:
+    #                 yield clean_part
+    #             summary_buffer = buffer.split('[SUMMARY_START]')[1]
+    #             buffer = ""
+    #             continue
+
+    #         if in_summary:
+    #             summary_buffer += chunk
+    #             if '[SUMMARY_END]' in summary_buffer:
+    #                 in_summary = False
+    #                 summary_content = summary_buffer.split('[SUMMARY_END]')[0].strip()
+    #                 if summary_content:
+    #                     self._add_conversation_summary(summary_content)
+    #                     logger.info(f"Summary extracted in stream: '{summary_content}'")
+    #                 buffer = summary_buffer.split('[SUMMARY_END]')[1]  # remainder
+    #                 summary_buffer = ""
+    #                 if buffer:
+    #                     yield buffer.strip()
+    #                     buffer = ""
+    #             continue
+
+    #         if not in_summary:
+    #             yield chunk
+
+    #     # Flush leftover buffer
+    #     if buffer.strip() and not in_summary:
+    #         yield buffer.strip()
+    #     elif in_summary:
+    #         logger.info("Leftover buffer contains partial summary – discarded to prevent marker leak")
+
     def _stream_with_summary_protection(self, question: str, top_k: int) -> Generator[str, None, None]:
-        """
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                    # Don't break here - let the method complete naturally
-                    continue
-
-                if not summary_started:
-                    yield chunk
-
-            # Process the complete response
-            self._process_complete_response(buffer)
-
-        except Exception as e:
-            logger.error(f"Streaming error: {e}")
-            raise  # Re-raise to trigger fallback
+        """Token-only streaming. Never reconstruct or re-emit content."""
+        relevant_docs = self.retriever.retrieve(question, top_k=top_k)
+        context = self._build_context(relevant_docs)
+        messages = self._build_messages(question, context)
+
+        buffer = ""
+
+        for chunk in self.llm.predict_stream(messages):
+            buffer += chunk
+
+            # The moment summary markers appear, stop streaming to client
+            if '[SUMMARY_START]' in buffer or 'SUMMARY_' in buffer:
+                logger.info("Summary marker detected — stopping client stream")
+                break
+
+            # Yield ONLY raw tokens
+            yield chunk
+
+        # After streaming finishes, process full response exactly once
+        self._process_complete_response(buffer)
 
     def _process_complete_response(self, full_response: str):
         """Process complete response and extract summary"""
         user_response, conversation_summary = self._extract_summary_and_response(full_response)
-
+
         if conversation_summary:
-            logger.info(f"
+            logger.info(f" Summary extracted: '{conversation_summary}'")
             self._add_conversation_summary(conversation_summary)
-
-
+
+        # extra guard: only add assistant message if different from last assistant message
+        if user_response:
+            last = self.conversation[-1] if self.conversation else None
+            if not (last and last.get("role") == "assistant" and last.get("content", "").strip() == user_response.strip()):
+                self.add_message("assistant", user_response)
+            else:
+                logger.info("Skipped adding duplicate assistant message in _process_complete_response.")
 
     def _simulated_streaming(self, question: str, top_k: int) -> Generator[str, None, None]:
-        """Simulated streaming that guarantees no summary leakage"""
+        """Simulated streaming that guarantees no summary leakage."""
         relevant_docs = self.retriever.retrieve(question, top_k=top_k)
         context = self._build_context(relevant_docs)
         messages = self._build_messages(question, context)
-
-        # Get complete response
+
         complete_response = self.llm.predict(messages)
-
-        # Extract clean response
         user_response, conversation_summary = self._extract_summary_and_response(complete_response)
-
+
         if conversation_summary:
-            logger.info(f"📝 Summary extracted: '{conversation_summary}'")
             self._add_conversation_summary(conversation_summary)
-
+
         self.add_message("assistant", user_response)
-
-        # Simulate streaming
-        chunk_size = 2
+
+        # Simulate streaming chunks
+        chunk_size = 2
         for i in range(0, len(user_response), chunk_size):
-            yield user_response[i:i+chunk_size]
+            yield user_response[i:i + chunk_size]
             import time
-            time.sleep(0.02)
+            time.sleep(0.02)
+
 
     def _extract_clean_content(self, buffer: str) -> str:
         """Extract clean content before any summary markers"""
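The replacement `_stream_with_summary_protection` trades the old marker-splitting state machine (kept above as a commented-out block) for a simpler rule: accumulate everything, stop yielding the moment a marker substring shows up in the buffer, then post-process the full buffer once. A small self-contained sketch with a fake token stream (the chunks and marker placement are invented):

```python
# Fake token stream; chunk boundaries here are made up for the illustration.
def fake_predict_stream():
    yield from ["Hello ", "world.", "[SUMMARY_START]", " User greeted us. ", "[SUMMARY_END]"]

def stream_with_guard():
    """Mirror of the guard above: yield raw tokens until a marker appears."""
    buffer = ""
    for chunk in fake_predict_stream():
        buffer += chunk
        if '[SUMMARY_START]' in buffer or 'SUMMARY_' in buffer:
            break      # stop the client stream; the marker chunk is never yielded
        yield chunk
    # In the real method, the full buffer is then handed to
    # _process_complete_response() exactly once.

client_view = "".join(stream_with_guard())
assert client_view == "Hello world."
assert "SUMMARY" not in client_view
```

One caveat visible in the diff: the check runs against the accumulated buffer, so the opening characters of a marker split across chunks can still reach the client before the `'SUMMARY_'` substring check trips.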
@@ -403,5 +425,5 @@ class RAGAgent:
             "message_count": len(self.conversation),
             "recent_messages": [f"{msg['role']}: {msg['content'][:50]}..." for msg in self.conversation[-3:]]
         }
-        logger.info(f"
+        logger.info(f" Context snapshot: {context}")
         return context
kssrag/server.py
CHANGED

@@ -109,23 +109,41 @@ def create_app(rag_agent: RAGAgent, server_config: Optional[ServerConfig] = None
 
     agent = sessions[session_id]
 
+    # async def generate():
+    #     full_response = ""
+    #     try:
+    #         # Use agent's query_stream which handles context and summarization
+    #         for chunk in agent.query_stream(query, top_k=5):
+    #             full_response += chunk
+    #             yield f"data: {json.dumps({'chunk': chunk, 'done': False})}\n\n"
+
+    #         yield f"data: {json.dumps({'chunk': '', 'done': True})}\n\n"
+
+    #     except Exception as e:
+    #         logger.error(f"Streaming error: {str(e)}")
+    #         yield f"data: {json.dumps({'error': str(e), 'done': True})}\n\n"
+
     async def generate():
-        full_response = ""
         try:
-            #
-            for
-
-
-
+            # Stream tokens ONLY
+            for token in agent.query_stream(query, top_k=5):
+                if not token:
+                    continue
+
+                yield f"data: {json.dumps({'chunk': token, 'done': False})}\n\n"
+
+            # Signal completion (no payload)
             yield f"data: {json.dumps({'chunk': '', 'done': True})}\n\n"
-
+
         except Exception as e:
             logger.error(f"Streaming error: {str(e)}")
             yield f"data: {json.dumps({'error': str(e), 'done': True})}\n\n"
+
+
 
     return StreamingResponse(
         generate(),
-        media_type="text/
+        media_type="text/event-stream",
         headers={
             "Cache-Control": "no-cache",
             "Connection": "keep-alive",
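With the corrected `text/event-stream` media type, the endpoint now emits standard SSE frames: one JSON object per `data:` line, with a final `done: true` frame. A sketch of how a client might consume it (the route path, port, and request payload are assumptions, not shown in this diff; only the frame format comes from the code above):

```python
# Hypothetical client for the SSE endpoint; URL and payload are illustrative.
import json
import requests

with requests.post(
    "http://localhost:8000/chat/stream",          # assumed route, not in this diff
    json={"query": "What is kssrag?", "session_id": "demo"},
    stream=True,
) as resp:
    for line in resp.iter_lines(decode_unicode=True):
        if not line or not line.startswith("data: "):
            continue                               # skip blank SSE separator lines
        event = json.loads(line[len("data: "):])
        if event.get("done"):
            break                                  # final frame carries no chunk
        print(event.get("chunk", ""), end="", flush=True)
```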
{kssrag-0.2.2.dist-info → kssrag-0.2.4.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: kssrag
-Version: 0.2.2
+Version: 0.2.4
 Summary: A flexible Retrieval-Augmented Generation framework by Ksschkw
 Home-page: https://github.com/Ksschkw/kssrag
 Author: Ksschkw

@@ -85,7 +85,7 @@ Dynamic: summary
 
 ![](https://img.shields.io/badge/license-MIT-blue)
 ![](https://img.shields.io/pypi/pyversions/kssrag)
-![](https://img.shields.io/badge/downloads-180%2B-brightgreen)
 ![](https://static.pepy.tech/badge/kssrag)
 ![](https://img.shields.io/pypi/v/kssrag)
 

@@ -809,6 +809,7 @@ kssrag/
 - [**Full Documentation**](https://github.com/Ksschkw/kssrag/docs)
 - [**API Reference**](https://github.com/Ksschkw/kssrag/docs/api_reference.md)
 - [**Examples Directory**](https://github.com/Ksschkw/kssrag/examples)
+- [**PyPi**](https://pypi.org/project/kssrag/0.2.4/)
 
 ### Community
 - [**GitHub Issues**](https://github.com/Ksschkw/kssrag/issues) - Bug reports and feature requests
{kssrag-0.2.2.dist-info → kssrag-0.2.4.dist-info}/RECORD
CHANGED

@@ -2,9 +2,9 @@ kssrag/__init__.py,sha256=N1XfR8IRKtEJAzcOVyHnKXtgx-ztlrSFtFwiVkGbAX8,2041
 kssrag/cli.py,sha256=9AbtUEV9X63bhRj4EU-eHhud8iPM7LJAGSbu_IPlMUE,9703
 kssrag/config.py,sha256=zd978GZQ66TlLZnk9yP7uvoXoWD89BS8VHi7w_yGXrM,6529
 kssrag/kssrag.py,sha256=vy3oCHeHFAp_dJW0JjLbTxeEwCcwtXuOL_Ejmv0qz8Y,5251
-kssrag/server.py,sha256=
+kssrag/server.py,sha256=kgp3dFGeUJwEJkWRU_i6ykUUgPvIgbrqS2DqO7KiG5s,6937
 kssrag/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-kssrag/core/agents.py,sha256=
+kssrag/core/agents.py,sha256=4I4CYTClfhFH8p30LQjT6JqD0RpCsdzumbP94gU3WN0,19425
 kssrag/core/chunkers.py,sha256=HmWL3y2DhhobV5zIlIdZP2KK2N7TASqeirPqmc3_inI,7324
 kssrag/core/retrievers.py,sha256=1e9c7ukUD4pFSVasOMTXSKoz_rapXQTl-FrSHK6Osqg,3037
 kssrag/core/vectorstores.py,sha256=H8hTpjc6hAFMhqAO2Cjq-Jp6xrJhsJKiRN9qxb_-6XM,21003

@@ -26,8 +26,8 @@ tests/test_integration.py,sha256=TY7MrTcAiu1KG4MlgIC7VVlzUTnOoqp9pieK8rhBNrg,105
 tests/test_ocr.py,sha256=PoGKLNISpAwaoPvGuS7qiOf6dsVnsFRFtYkG1WFi6TU,6202
 tests/test_streaming.py,sha256=rMQ0w8_HQFFV0PbHDqQXRBqaNfbd3WqJVNT2hKVbsqw,1442
 tests/test_vectorstores.py,sha256=YOwI2bfqprzbq8ahIw4pbbbEOaKGcg-XPcLCO7WiLxE,1474
-kssrag-0.2.2.dist-info/METADATA,sha256=
-kssrag-0.2.2.dist-info/WHEEL,sha256=
-kssrag-0.2.2.dist-info/entry_points.txt,sha256=
-kssrag-0.2.2.dist-info/top_level.txt,sha256=
-kssrag-0.2.2.dist-info/RECORD,,
+kssrag-0.2.4.dist-info/METADATA,sha256=RldRylUJJ7qBiVlGKJGjPQq8iXs8uS1JGMOCFJBHKcM,24069
+kssrag-0.2.4.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+kssrag-0.2.4.dist-info/entry_points.txt,sha256=g4tQj5YUqPK3Osb9BI85tsErxleSBUENiqlnX0fWK5M,43
+kssrag-0.2.4.dist-info/top_level.txt,sha256=sO9LGINa0GEjLoHTtufpz01yM5SmeTw6M4zWHEF0R2s,13
+kssrag-0.2.4.dist-info/RECORD,,
{kssrag-0.2.2.dist-info → kssrag-0.2.4.dist-info}/entry_points.txt
File without changes

{kssrag-0.2.2.dist-info → kssrag-0.2.4.dist-info}/top_level.txt
File without changes