dataknobs-bots 0.2.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataknobs_bots/__init__.py +42 -0
- dataknobs_bots/api/__init__.py +42 -0
- dataknobs_bots/api/dependencies.py +140 -0
- dataknobs_bots/api/exceptions.py +289 -0
- dataknobs_bots/bot/__init__.py +15 -0
- dataknobs_bots/bot/base.py +1091 -0
- dataknobs_bots/bot/context.py +102 -0
- dataknobs_bots/bot/manager.py +430 -0
- dataknobs_bots/bot/registry.py +629 -0
- dataknobs_bots/config/__init__.py +39 -0
- dataknobs_bots/config/resolution.py +353 -0
- dataknobs_bots/knowledge/__init__.py +82 -0
- dataknobs_bots/knowledge/query/__init__.py +25 -0
- dataknobs_bots/knowledge/query/expander.py +262 -0
- dataknobs_bots/knowledge/query/transformer.py +288 -0
- dataknobs_bots/knowledge/rag.py +738 -0
- dataknobs_bots/knowledge/retrieval/__init__.py +23 -0
- dataknobs_bots/knowledge/retrieval/formatter.py +249 -0
- dataknobs_bots/knowledge/retrieval/merger.py +279 -0
- dataknobs_bots/memory/__init__.py +56 -0
- dataknobs_bots/memory/base.py +38 -0
- dataknobs_bots/memory/buffer.py +58 -0
- dataknobs_bots/memory/vector.py +188 -0
- dataknobs_bots/middleware/__init__.py +11 -0
- dataknobs_bots/middleware/base.py +92 -0
- dataknobs_bots/middleware/cost.py +421 -0
- dataknobs_bots/middleware/logging.py +184 -0
- dataknobs_bots/reasoning/__init__.py +65 -0
- dataknobs_bots/reasoning/base.py +50 -0
- dataknobs_bots/reasoning/react.py +299 -0
- dataknobs_bots/reasoning/simple.py +51 -0
- dataknobs_bots/registry/__init__.py +41 -0
- dataknobs_bots/registry/backend.py +181 -0
- dataknobs_bots/registry/memory.py +244 -0
- dataknobs_bots/registry/models.py +102 -0
- dataknobs_bots/registry/portability.py +210 -0
- dataknobs_bots/tools/__init__.py +5 -0
- dataknobs_bots/tools/knowledge_search.py +113 -0
- dataknobs_bots/utils/__init__.py +1 -0
- dataknobs_bots-0.2.4.dist-info/METADATA +591 -0
- dataknobs_bots-0.2.4.dist-info/RECORD +42 -0
- dataknobs_bots-0.2.4.dist-info/WHEEL +4 -0
dataknobs_bots/knowledge/retrieval/__init__.py
@@ -0,0 +1,23 @@
+"""Retrieval utilities for RAG knowledge bases.
+
+This module provides post-retrieval processing for improving RAG quality,
+including chunk merging, context formatting, and result optimization.
+"""
+
+from dataknobs_bots.knowledge.retrieval.formatter import (
+    ContextFormatter,
+    FormatterConfig,
+)
+from dataknobs_bots.knowledge.retrieval.merger import (
+    ChunkMerger,
+    MergedChunk,
+    MergerConfig,
+)
+
+__all__ = [
+    "ChunkMerger",
+    "MergedChunk",
+    "MergerConfig",
+    "ContextFormatter",
+    "FormatterConfig",
+]
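
The `__init__.py` above re-exports the merge and format stages as a single public surface. A minimal end-to-end sketch of how they compose (the synthetic `results` list is an assumption modeled on the result shape documented in `merger.py` and `formatter.py` below):

```python
from dataknobs_bots.knowledge.retrieval import (
    ChunkMerger,
    ContextFormatter,
    FormatterConfig,
    MergerConfig,
)

# Synthetic results in the shape the merger/formatter docstrings describe
results = [
    {
        "text": "Set AUTH_MODE=oidc in the environment.",
        "source": "docs/auth.md",
        "heading_path": "Configuration > Authentication",
        "similarity": 0.91,
        "metadata": {"chunk_index": 3},
    },
]

# Merge adjacent chunks, then format and wrap for prompt injection
merged = ChunkMerger(MergerConfig(max_merged_size=2000)).merge(results)
formatter = ContextFormatter(FormatterConfig(include_scores=True))
print(formatter.wrap_for_prompt(formatter.format_merged(merged)))
```

Merging before formatting means adjacent fragments under one heading reach the LLM as a single block rather than as separate numbered chunks.
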
dataknobs_bots/knowledge/retrieval/formatter.py
@@ -0,0 +1,249 @@
+"""Context formatting utilities for RAG retrieval.
+
+This module provides formatting for retrieved chunks to optimize
+LLM context window usage and improve comprehension.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Any
+
+from dataknobs_bots.knowledge.retrieval.merger import MergedChunk
+
+
+@dataclass
+class FormatterConfig:
+    """Configuration for context formatting.
+
+    Attributes:
+        small_chunk_threshold: Max chars for "small" chunks (full heading path)
+        medium_chunk_threshold: Max chars for "medium" chunks (last 2 headings)
+        include_scores: Whether to include similarity scores
+        include_source: Whether to include source file information
+        group_by_source: Whether to group chunks by source file
+    """
+
+    small_chunk_threshold: int = 200
+    medium_chunk_threshold: int = 800
+    include_scores: bool = False
+    include_source: bool = True
+    group_by_source: bool = False
+
+
+class ContextFormatter:
+    """Formats retrieved chunks for LLM context with dynamic heading inclusion.
+
+    This formatter applies intelligent heading inclusion based on content
+    size to optimize token usage while maintaining context clarity:
+    - Small chunks: Full heading path (need context)
+    - Medium chunks: Last 2 heading levels
+    - Large chunks: No headings (content is self-contained)
+
+    Example:
+        ```python
+        formatter = ContextFormatter(FormatterConfig(
+            small_chunk_threshold=200,
+            include_scores=True
+        ))
+
+        # Format standard results
+        context = formatter.format(results)
+
+        # Format merged chunks
+        context = formatter.format_merged(merged_chunks)
+        ```
+    """
+
+    def __init__(self, config: FormatterConfig | None = None):
+        """Initialize the context formatter.
+
+        Args:
+            config: Formatter configuration, uses defaults if not provided
+        """
+        self.config = config or FormatterConfig()
+
+    def format(self, results: list[dict[str, Any]]) -> str:
+        """Format search results for LLM context.
+
+        Args:
+            results: Search results from RAGKnowledgeBase.query()
+
+        Returns:
+            Formatted context string
+        """
+        if not results:
+            return ""
+
+        if self.config.group_by_source:
+            return self._format_grouped_by_source(results)
+
+        formatted_chunks = []
+        for i, result in enumerate(results, 1):
+            formatted = self._format_result(result, i)
+            formatted_chunks.append(formatted)
+
+        return "\n\n---\n\n".join(formatted_chunks)
+
+    def format_merged(self, merged_chunks: list[MergedChunk]) -> str:
+        """Format merged chunks for LLM context.
+
+        Args:
+            merged_chunks: Merged chunks from ChunkMerger
+
+        Returns:
+            Formatted context string
+        """
+        if not merged_chunks:
+            return ""
+
+        # Convert to result format and use standard formatting
+        results = []
+        for chunk in merged_chunks:
+            results.append({
+                "text": chunk.text,
+                "source": chunk.source,
+                "heading_path": chunk.heading_display,
+                "similarity": chunk.avg_similarity,
+                "metadata": {
+                    "headings": chunk.heading_path,
+                    "content_length": chunk.content_length,
+                },
+            })
+
+        return self.format(results)
+
+    def _format_result(self, result: dict[str, Any], index: int) -> str:
+        """Format a single result with dynamic heading inclusion.
+
+        Args:
+            result: Search result dictionary
+            index: Result index for numbering
+
+        Returns:
+            Formatted chunk string
+        """
+        text = result.get("text", "")
+        source = result.get("source", "")
+        similarity = result.get("similarity", 0.0)
+        metadata = result.get("metadata", {})
+
+        # Get heading information
+        headings = metadata.get("headings", [])
+        if not headings:
+            heading_path = result.get("heading_path", "")
+            if isinstance(heading_path, str) and heading_path:
+                headings = heading_path.split(" > ")
+
+        # Determine content length for heading decision
+        content_length = metadata.get("content_length", len(text))
+
+        # Get headings to display based on content size
+        display_headings = self._get_display_headings(headings, content_length)
+
+        # Build formatted chunk
+        lines = []
+
+        # Add index and heading
+        if display_headings:
+            heading_str = " > ".join(display_headings)
+            if self.config.include_scores:
+                lines.append(f"[{index}] [{similarity:.2f}] {heading_str}")
+            else:
+                lines.append(f"[{index}] {heading_str}")
+        else:
+            if self.config.include_scores:
+                lines.append(f"[{index}] [{similarity:.2f}]")
+            else:
+                lines.append(f"[{index}]")
+
+        # Add content
+        lines.append(text.strip())
+
+        # Add source
+        if self.config.include_source and source:
+            lines.append(f"(Source: {source})")
+
+        return "\n".join(lines)
+
+    def _get_display_headings(
+        self,
+        headings: list[str],
+        content_length: int,
+    ) -> list[str]:
+        """Get headings to display based on content length.
+
+        Implements dynamic heading inclusion:
+        - Small chunks: Full heading path
+        - Medium chunks: Last 2 heading levels
+        - Large chunks: No headings
+
+        Args:
+            headings: Full heading path
+            content_length: Length of content in characters
+
+        Returns:
+            List of headings to display
+        """
+        if not headings:
+            return []
+
+        if content_length < self.config.small_chunk_threshold:
+            # Small chunks: include full heading path
+            return headings
+        elif content_length < self.config.medium_chunk_threshold:
+            # Medium chunks: include last 2 heading levels
+            return headings[-2:] if len(headings) > 2 else headings
+        else:
+            # Large chunks: omit headings (content is self-contained)
+            return []
+
+    def _format_grouped_by_source(self, results: list[dict[str, Any]]) -> str:
+        """Format results grouped by source file.
+
+        Args:
+            results: Search results
+
+        Returns:
+            Formatted context string with source grouping
+        """
+        from collections import defaultdict
+
+        # Group by source
+        groups: dict[str, list[dict[str, Any]]] = defaultdict(list)
+        for result in results:
+            source = result.get("source", "unknown")
+            groups[source].append(result)
+
+        # Format each group
+        formatted_groups = []
+        chunk_index = 1
+
+        for source, source_results in groups.items():
+            group_lines = [f"## Source: {source}"]
+
+            for result in source_results:
+                formatted = self._format_result(result, chunk_index)
+                # Remove source line since we're grouping
+                lines = formatted.split("\n")
+                lines = [line for line in lines if not line.startswith("(Source:")]
+                group_lines.append("\n".join(lines))
+                chunk_index += 1
+
+            formatted_groups.append("\n\n".join(group_lines))
+
+        return "\n\n---\n\n".join(formatted_groups)
+
+    def wrap_for_prompt(self, context: str, tag: str = "knowledge_base") -> str:
+        """Wrap formatted context in XML tags for prompt injection.
+
+        Args:
+            context: Formatted context string
+            tag: Tag name to wrap with
+
+        Returns:
+            Context wrapped in XML tags
+        """
+        if not context:
+            return ""
+        return f"<{tag}>\n{context}\n</{tag}>"
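
The size thresholds in `FormatterConfig` drive the dynamic heading behavior documented in `_get_display_headings`. A small sketch against the defaults (200/800 chars), using hypothetical headings and a hypothetical source name:

```python
from dataknobs_bots.knowledge.retrieval.formatter import (
    ContextFormatter,
    FormatterConfig,
)

formatter = ContextFormatter(FormatterConfig())  # thresholds: 200 / 800 chars

def result(text: str) -> dict:
    # Hypothetical result with a three-level heading path
    return {
        "text": text,
        "source": "docs/guide.md",
        "similarity": 0.8,
        "metadata": {"headings": ["Guide", "Setup", "Auth"]},
    }

# Small chunk (< 200 chars): full path "Guide > Setup > Auth"
print(formatter.format([result("Short note.")]))

# Medium chunk (200-799 chars): last two levels, "Setup > Auth"
print(formatter.format([result("x" * 400)]))

# Large chunk (>= 800 chars): no headings, just "[1]" plus the content
print(formatter.format([result("x" * 900)]))
```
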
dataknobs_bots/knowledge/retrieval/merger.py
@@ -0,0 +1,279 @@
+"""Chunk merging utilities for RAG retrieval optimization.
+
+This module provides functionality to merge adjacent chunks that share
+the same heading path, improving context coherence for LLM consumption.
+"""
+
+from __future__ import annotations
+
+from collections import defaultdict
+from dataclasses import dataclass
+from typing import Any
+
+
+@dataclass
+class MergerConfig:
+    """Configuration for chunk merging.
+
+    Attributes:
+        max_merged_size: Maximum size of merged chunk content in characters
+        preserve_order: Whether to preserve positional ordering within groups
+    """
+
+    max_merged_size: int = 2000
+    preserve_order: bool = True
+
+
+@dataclass
+class MergedChunk:
+    """A merged chunk combining multiple related chunks.
+
+    Attributes:
+        text: Combined text content
+        source: Source file path
+        heading_path: Shared heading path
+        heading_display: Formatted heading display string
+        chunks: Original chunks that were merged
+        avg_similarity: Average similarity score of merged chunks
+        content_length: Total content length
+    """
+
+    text: str
+    source: str
+    heading_path: list[str]
+    heading_display: str
+    chunks: list[dict[str, Any]]
+    avg_similarity: float
+    content_length: int
+
+
+class ChunkMerger:
+    """Merges adjacent chunks sharing the same heading path.
+
+    This merger groups search results by their heading path and source,
+    then combines them into coherent context units while respecting
+    size limits.
+
+    Example:
+        ```python
+        merger = ChunkMerger(MergerConfig(max_merged_size=2000))
+        results = await kb.query("How do I configure auth?", k=10)
+        merged = merger.merge(results)
+
+        for chunk in merged:
+            print(f"[{chunk.avg_similarity:.2f}] {chunk.heading_display}")
+            print(chunk.text)
+        ```
+    """
+
+    def __init__(self, config: MergerConfig | None = None):
+        """Initialize the chunk merger.
+
+        Args:
+            config: Merger configuration, uses defaults if not provided
+        """
+        self.config = config or MergerConfig()
+
+    def merge(self, results: list[dict[str, Any]]) -> list[MergedChunk]:
+        """Merge search results by shared heading path.
+
+        Groups chunks by (source, heading_path) and merges those that
+        share identical heading paths. Chunks are ordered by their
+        position within the document.
+
+        Args:
+            results: Search results from RAGKnowledgeBase.query()
+                Each result should have:
+                - text: Chunk content
+                - source: Source file
+                - heading_path: Heading hierarchy string or list
+                - similarity: Similarity score
+                - metadata: Full chunk metadata
+
+        Returns:
+            List of MergedChunk objects, sorted by average similarity
+        """
+        if not results:
+            return []
+
+        # Group chunks by (source, heading_path)
+        groups: dict[tuple[str, tuple[str, ...]], list[dict[str, Any]]] = defaultdict(list)
+
+        for result in results:
+            source = result.get("source", "")
+            heading_path = self._normalize_heading_path(result)
+            key = (source, tuple(heading_path))
+            groups[key].append(result)
+
+        # Merge each group
+        merged_chunks = []
+        for (source, heading_path_tuple), chunks in groups.items():
+            heading_path = list(heading_path_tuple)
+
+            # Sort by position if available
+            if self.config.preserve_order:
+                chunks = self._sort_by_position(chunks)
+
+            # Merge chunks respecting size limit
+            merged = self._merge_chunk_group(chunks, source, heading_path)
+            merged_chunks.extend(merged)
+
+        # Sort by average similarity (descending)
+        merged_chunks.sort(key=lambda c: c.avg_similarity, reverse=True)
+
+        return merged_chunks
+
+    def _normalize_heading_path(self, result: dict[str, Any]) -> list[str]:
+        """Extract and normalize heading path from result.
+
+        Args:
+            result: Search result dictionary
+
+        Returns:
+            List of heading strings
+        """
+        # Try to get from metadata first (may have list format)
+        metadata = result.get("metadata", {})
+        headings = metadata.get("headings", [])
+        if headings:
+            return headings
+
+        # Fall back to heading_path string
+        heading_path = result.get("heading_path", "")
+        if isinstance(heading_path, list):
+            return heading_path
+        elif heading_path:
+            return heading_path.split(" > ")
+
+        return []
+
+    def _sort_by_position(self, chunks: list[dict[str, Any]]) -> list[dict[str, Any]]:
+        """Sort chunks by their position in the document.
+
+        Args:
+            chunks: List of chunk results
+
+        Returns:
+            Sorted list
+        """
+        def get_position(chunk: dict[str, Any]) -> int:
+            metadata = chunk.get("metadata", {})
+            # Try chunk_index first, then line_number
+            return metadata.get("chunk_index", metadata.get("line_number", 0))
+
+        return sorted(chunks, key=get_position)
+
+    def _merge_chunk_group(
+        self,
+        chunks: list[dict[str, Any]],
+        source: str,
+        heading_path: list[str],
+    ) -> list[MergedChunk]:
+        """Merge a group of chunks with the same heading path.
+
+        Combines chunks until max_merged_size is reached, then starts
+        a new merged chunk. Overflow chunks are returned as separate
+        merged chunks.
+
+        Args:
+            chunks: Chunks to merge
+            source: Source file path
+            heading_path: Shared heading path
+
+        Returns:
+            List of merged chunks
+        """
+        if not chunks:
+            return []
+
+        merged_results = []
+        current_chunks: list[dict[str, Any]] = []
+        current_size = 0
+
+        for chunk in chunks:
+            chunk_text = chunk.get("text", "")
+            chunk_size = len(chunk_text)
+
+            # Check if adding this chunk would exceed the limit
+            if current_size + chunk_size > self.config.max_merged_size and current_chunks:
+                # Save current merge and start new one
+                merged_results.append(
+                    self._create_merged_chunk(current_chunks, source, heading_path)
+                )
+                current_chunks = []
+                current_size = 0
+
+            current_chunks.append(chunk)
+            current_size += chunk_size
+
+        # Don't forget the last group
+        if current_chunks:
+            merged_results.append(
+                self._create_merged_chunk(current_chunks, source, heading_path)
+            )
+
+        return merged_results
+
+    def _create_merged_chunk(
+        self,
+        chunks: list[dict[str, Any]],
+        source: str,
+        heading_path: list[str],
+    ) -> MergedChunk:
+        """Create a MergedChunk from a list of chunks.
+
+        Args:
+            chunks: Chunks to combine
+            source: Source file path
+            heading_path: Shared heading path
+
+        Returns:
+            MergedChunk object
+        """
+        # Combine text with double newline separator
+        texts = [chunk.get("text", "") for chunk in chunks]
+        combined_text = "\n\n".join(text.strip() for text in texts if text.strip())
+
+        # Calculate average similarity
+        similarities = [chunk.get("similarity", 0.0) for chunk in chunks]
+        avg_similarity = sum(similarities) / len(similarities) if similarities else 0.0
+
+        # Build heading display
+        heading_display = " > ".join(heading_path) if heading_path else ""
+
+        return MergedChunk(
+            text=combined_text,
+            source=source,
+            heading_path=heading_path,
+            heading_display=heading_display,
+            chunks=chunks,
+            avg_similarity=avg_similarity,
+            content_length=len(combined_text),
+        )
+
+    def to_result_list(self, merged_chunks: list[MergedChunk]) -> list[dict[str, Any]]:
+        """Convert merged chunks back to result list format.
+
+        Useful for compatibility with existing code that expects
+        the standard result format.
+
+        Args:
+            merged_chunks: List of merged chunks
+
+        Returns:
+            List of result dictionaries
+        """
+        results = []
+        for merged in merged_chunks:
+            results.append({
+                "text": merged.text,
+                "source": merged.source,
+                "heading_path": merged.heading_display,
+                "similarity": merged.avg_similarity,
+                "metadata": {
+                    "headings": merged.heading_path,
+                    "content_length": merged.content_length,
+                    "merged_count": len(merged.chunks),
+                },
+            })
+        return results
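
To see the grouping and reordering behavior of `merge()` concretely, here is a sketch with two out-of-order chunks sharing one `(source, heading path)` key; the chunk texts, sources, and scores are invented for illustration, and the expected values follow from the code above:

```python
from dataknobs_bots.knowledge.retrieval.merger import ChunkMerger, MergerConfig

merger = ChunkMerger(MergerConfig(max_merged_size=2000))
results = [
    {
        "text": "Step two: run the migration.",
        "source": "docs/setup.md",
        "similarity": 0.82,
        "metadata": {"headings": ["Setup", "Database"], "chunk_index": 2},
    },
    {
        "text": "Step one: create the database.",
        "source": "docs/setup.md",
        "similarity": 0.9,
        "metadata": {"headings": ["Setup", "Database"], "chunk_index": 1},
    },
]

merged = merger.merge(results)
assert len(merged) == 1                          # one (source, heading path) key
assert merged[0].text.startswith("Step one")     # document order restored
assert abs(merged[0].avg_similarity - 0.86) < 1e-9
```
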
dataknobs_bots/memory/__init__.py
@@ -0,0 +1,56 @@
+"""Memory implementations for DynaBot."""
+
+from typing import Any
+
+from .base import Memory
+from .buffer import BufferMemory
+from .vector import VectorMemory
+
+__all__ = ["Memory", "BufferMemory", "VectorMemory", "create_memory_from_config"]
+
+
+async def create_memory_from_config(config: dict[str, Any]) -> Memory:
+    """Create memory instance from configuration.
+
+    Args:
+        config: Memory configuration with 'type' field and type-specific params
+
+    Returns:
+        Configured Memory instance
+
+    Raises:
+        ValueError: If memory type is not recognized
+
+    Example:
+        ```python
+        # Buffer memory
+        config = {
+            "type": "buffer",
+            "max_messages": 10
+        }
+        memory = await create_memory_from_config(config)
+
+        # Vector memory
+        config = {
+            "type": "vector",
+            "backend": "faiss",
+            "dimension": 1536,
+            "embedding_provider": "openai",
+            "embedding_model": "text-embedding-3-small"
+        }
+        memory = await create_memory_from_config(config)
+        ```
+    """
+    memory_type = config.get("type", "buffer").lower()
+
+    if memory_type == "buffer":
+        return BufferMemory(max_messages=config.get("max_messages", 10))
+
+    elif memory_type == "vector":
+        return await VectorMemory.from_config(config)
+
+    else:
+        raise ValueError(
+            f"Unknown memory type: {memory_type}. "
+            f"Available types: buffer, vector"
+        )
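
`create_memory_from_config` is a coroutine, so callers need an event loop. A minimal sketch using the buffer branch, which needs no external services (the vector branch would additionally require a configured embedding provider and backend, per the docstring above):

```python
import asyncio

from dataknobs_bots.memory import create_memory_from_config

async def main() -> None:
    # Buffer memory: only "type" and "max_messages" are consulted
    memory = await create_memory_from_config({"type": "buffer", "max_messages": 5})
    await memory.add_message("Hello", role="user")
    print(await memory.get_context("current turn"))

asyncio.run(main())
```
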
dataknobs_bots/memory/base.py
@@ -0,0 +1,38 @@
+"""Base memory interface for bot memory implementations."""
+
+from abc import ABC, abstractmethod
+from typing import Any
+
+
+class Memory(ABC):
+    """Abstract base class for memory implementations."""
+
+    @abstractmethod
+    async def add_message(
+        self, content: str, role: str, metadata: dict[str, Any] | None = None
+    ) -> None:
+        """Add message to memory.
+
+        Args:
+            content: Message content
+            role: Message role (user, assistant, system, etc.)
+            metadata: Optional metadata for the message
+        """
+        pass
+
+    @abstractmethod
+    async def get_context(self, current_message: str) -> list[dict[str, Any]]:
+        """Get relevant context for current message.
+
+        Args:
+            current_message: The current message to get context for
+
+        Returns:
+            List of relevant message dictionaries
+        """
+        pass
+
+    @abstractmethod
+    async def clear(self) -> None:
+        """Clear all memory."""
+        pass
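
Because `Memory` is a plain ABC with three coroutine methods, custom stores can plug in by subclassing it. A hypothetical sketch of the smallest conforming implementation (the `ListMemory` name is invented for illustration; it is not part of the package):

```python
from typing import Any

from dataknobs_bots.memory.base import Memory

class ListMemory(Memory):
    """Illustrative Memory subclass backed by a plain list (hypothetical)."""

    def __init__(self) -> None:
        self._messages: list[dict[str, Any]] = []

    async def add_message(
        self, content: str, role: str, metadata: dict[str, Any] | None = None
    ) -> None:
        self._messages.append(
            {"content": content, "role": role, "metadata": metadata or {}}
        )

    async def get_context(self, current_message: str) -> list[dict[str, Any]]:
        # Unbounded: returns everything ever stored, unlike BufferMemory
        return list(self._messages)

    async def clear(self) -> None:
        self._messages.clear()
```
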
dataknobs_bots/memory/buffer.py
@@ -0,0 +1,58 @@
+"""Buffer memory implementation for simple FIFO message storage."""
+
+from collections import deque
+from typing import Any
+
+from .base import Memory
+
+
+class BufferMemory(Memory):
+    """Simple buffer memory keeping last N messages.
+
+    This implementation uses a fixed-size buffer that keeps the most recent
+    messages in memory. When the buffer is full, the oldest messages are
+    automatically removed.
+
+    Attributes:
+        max_messages: Maximum number of messages to keep in buffer
+        messages: Deque containing the messages
+    """
+
+    def __init__(self, max_messages: int = 10):
+        """Initialize buffer memory.
+
+        Args:
+            max_messages: Maximum number of messages to keep
+        """
+        self.max_messages = max_messages
+        self.messages: deque[dict[str, Any]] = deque(maxlen=max_messages)
+
+    async def add_message(
+        self, content: str, role: str, metadata: dict[str, Any] | None = None
+    ) -> None:
+        """Add message to buffer.
+
+        Args:
+            content: Message content
+            role: Message role
+            metadata: Optional metadata
+        """
+        self.messages.append({"content": content, "role": role, "metadata": metadata or {}})
+
+    async def get_context(self, current_message: str) -> list[dict[str, Any]]:
+        """Get all messages in buffer.
+
+        The current_message parameter is not used in buffer memory since
+        we simply return all buffered messages in order.
+
+        Args:
+            current_message: Not used in buffer memory
+
+        Returns:
+            List of all buffered messages
+        """
+        return list(self.messages)
+
+    async def clear(self) -> None:
+        """Clear all messages from buffer."""
+        self.messages.clear()
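
Since `BufferMemory` delegates eviction to `deque(maxlen=...)`, overflow is silent: the oldest entries simply disappear. A quick sketch demonstrating that behavior:

```python
import asyncio

from dataknobs_bots.memory.buffer import BufferMemory

async def main() -> None:
    memory = BufferMemory(max_messages=2)
    for text in ("first", "second", "third"):
        await memory.add_message(text, role="user")

    context = await memory.get_context("ignored")
    # deque(maxlen=2) silently dropped "first"
    assert [m["content"] for m in context] == ["second", "third"]

asyncio.run(main())
```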