dataknobs_bots-0.2.4-py3-none-any.whl
This diff shows the contents of publicly released package versions as they appear in their public registries, and is provided for informational purposes only.
- dataknobs_bots/__init__.py +42 -0
- dataknobs_bots/api/__init__.py +42 -0
- dataknobs_bots/api/dependencies.py +140 -0
- dataknobs_bots/api/exceptions.py +289 -0
- dataknobs_bots/bot/__init__.py +15 -0
- dataknobs_bots/bot/base.py +1091 -0
- dataknobs_bots/bot/context.py +102 -0
- dataknobs_bots/bot/manager.py +430 -0
- dataknobs_bots/bot/registry.py +629 -0
- dataknobs_bots/config/__init__.py +39 -0
- dataknobs_bots/config/resolution.py +353 -0
- dataknobs_bots/knowledge/__init__.py +82 -0
- dataknobs_bots/knowledge/query/__init__.py +25 -0
- dataknobs_bots/knowledge/query/expander.py +262 -0
- dataknobs_bots/knowledge/query/transformer.py +288 -0
- dataknobs_bots/knowledge/rag.py +738 -0
- dataknobs_bots/knowledge/retrieval/__init__.py +23 -0
- dataknobs_bots/knowledge/retrieval/formatter.py +249 -0
- dataknobs_bots/knowledge/retrieval/merger.py +279 -0
- dataknobs_bots/memory/__init__.py +56 -0
- dataknobs_bots/memory/base.py +38 -0
- dataknobs_bots/memory/buffer.py +58 -0
- dataknobs_bots/memory/vector.py +188 -0
- dataknobs_bots/middleware/__init__.py +11 -0
- dataknobs_bots/middleware/base.py +92 -0
- dataknobs_bots/middleware/cost.py +421 -0
- dataknobs_bots/middleware/logging.py +184 -0
- dataknobs_bots/reasoning/__init__.py +65 -0
- dataknobs_bots/reasoning/base.py +50 -0
- dataknobs_bots/reasoning/react.py +299 -0
- dataknobs_bots/reasoning/simple.py +51 -0
- dataknobs_bots/registry/__init__.py +41 -0
- dataknobs_bots/registry/backend.py +181 -0
- dataknobs_bots/registry/memory.py +244 -0
- dataknobs_bots/registry/models.py +102 -0
- dataknobs_bots/registry/portability.py +210 -0
- dataknobs_bots/tools/__init__.py +5 -0
- dataknobs_bots/tools/knowledge_search.py +113 -0
- dataknobs_bots/utils/__init__.py +1 -0
- dataknobs_bots-0.2.4.dist-info/METADATA +591 -0
- dataknobs_bots-0.2.4.dist-info/RECORD +42 -0
- dataknobs_bots-0.2.4.dist-info/WHEEL +4 -0
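
The bulk of this release is the new RAG knowledge base module, `dataknobs_bots/knowledge/rag.py`, whose full diff follows. As orientation, here is a minimal end-to-end sketch assembled from the docstrings in that file; the FAISS backend, OpenAI embedding model, and `./docs` path are illustrative values taken from those docstrings, not requirements:

```python
import asyncio

from dataknobs_bots.knowledge.rag import RAGKnowledgeBase


async def main() -> None:
    # Values mirror the from_config() docstring below; swap in your own
    # backend, embedding model, and documents path.
    config = {
        "vector_store": {"backend": "faiss", "dimensions": 1536, "collection": "docs"},
        "embedding_provider": "openai",
        "embedding_model": "text-embedding-3-small",
        "chunking": {"max_chunk_size": 500, "chunk_overlap": 50},
        "documents_path": "./docs",  # from_config() chunks and embeds every **/*.md here
    }
    kb = await RAGKnowledgeBase.from_config(config)
    try:
        results = await kb.query("How do I configure the database?", k=3, merge_adjacent=True)
        print(kb.format_context(results))  # <knowledge_base>-wrapped context for an LLM prompt
    finally:
        await kb.close()  # releases the vector store and the provider's HTTP sessions


asyncio.run(main())
```

Note that when `documents_path` is set, `from_config()` eagerly chunks and embeds every matching document, so construction cost scales with the corpus.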
dataknobs_bots/knowledge/rag.py
@@ -0,0 +1,738 @@
"""RAG (Retrieval-Augmented Generation) knowledge base implementation."""

import types
from pathlib import Path
from typing import Any

from dataknobs_xization import (
    ChunkQualityConfig,
    ContentTransformer,
    HeadingInclusion,
    chunk_markdown_tree,
    parse_markdown,
)
from dataknobs_bots.knowledge.retrieval import (
    ChunkMerger,
    ContextFormatter,
    FormatterConfig,
    MergerConfig,
)


class RAGKnowledgeBase:
    """RAG knowledge base using dataknobs-xization for chunking and vector search.

    This implementation:
    - Parses markdown documents using dataknobs-xization
    - Chunks documents intelligently based on structure
    - Stores chunks with embeddings in vector store
    - Provides semantic search for relevant context

    Attributes:
        vector_store: Vector store backend from dataknobs_data
        embedding_provider: LLM provider for generating embeddings
        chunking_config: Configuration for document chunking
    """

    def __init__(
        self,
        vector_store: Any,
        embedding_provider: Any,
        chunking_config: dict[str, Any] | None = None,
        merger_config: MergerConfig | None = None,
        formatter_config: FormatterConfig | None = None,
    ):
        """Initialize RAG knowledge base.

        Args:
            vector_store: Vector store backend instance
            embedding_provider: LLM provider with embed() method
            chunking_config: Configuration for chunking:
                - max_chunk_size: Maximum chunk size in characters
                - chunk_overlap: Overlap between chunks
                - combine_under_heading: Combine text under same heading
                - quality_filter: ChunkQualityConfig for filtering
                - generate_embeddings: Whether to generate enriched embedding text
            merger_config: Configuration for chunk merging (optional)
            formatter_config: Configuration for context formatting (optional)
        """
        self.vector_store = vector_store
        self.embedding_provider = embedding_provider
        self.chunking_config = chunking_config or {
            "max_chunk_size": 500,
            "chunk_overlap": 50,
            "combine_under_heading": True,
        }

        # Initialize merger and formatter
        self.merger = ChunkMerger(merger_config) if merger_config else ChunkMerger()
        self.formatter = ContextFormatter(formatter_config) if formatter_config else ContextFormatter()

    @classmethod
    async def from_config(cls, config: dict[str, Any]) -> "RAGKnowledgeBase":
        """Create RAG knowledge base from configuration.

        Args:
            config: Configuration dictionary with:
                - vector_store: Vector store configuration
                - embedding_provider: LLM provider name
                - embedding_model: Model for embeddings
                - chunking: Optional chunking configuration
                - documents_path: Optional path to load documents from
                - document_pattern: Optional glob pattern for documents

        Returns:
            Configured RAGKnowledgeBase instance

        Example:
            ```python
            config = {
                "vector_store": {
                    "backend": "faiss",
                    "dimensions": 1536,
                    "collection": "docs"
                },
                "embedding_provider": "openai",
                "embedding_model": "text-embedding-3-small",
                "chunking": {
                    "max_chunk_size": 500,
                    "chunk_overlap": 50
                },
                "documents_path": "./docs"
            }
            kb = await RAGKnowledgeBase.from_config(config)
            ```
        """
        from dataknobs_data.vector.stores import VectorStoreFactory
        from dataknobs_llm.llm import LLMProviderFactory

        # Create vector store
        vs_config = config["vector_store"]
        factory = VectorStoreFactory()
        vector_store = factory.create(**vs_config)
        await vector_store.initialize()

        # Create embedding provider
        llm_factory = LLMProviderFactory(is_async=True)
        embedding_provider = llm_factory.create(
            {
                "provider": config.get("embedding_provider", "openai"),
                "model": config.get("embedding_model", "text-embedding-ada-002"),
            }
        )
        await embedding_provider.initialize()

        # Create merger config if specified
        merger_config = None
        if "merger" in config:
            merger_config = MergerConfig(**config["merger"])

        # Create formatter config if specified
        formatter_config = None
        if "formatter" in config:
            formatter_config = FormatterConfig(**config["formatter"])

        # Create instance
        kb = cls(
            vector_store=vector_store,
            embedding_provider=embedding_provider,
            chunking_config=config.get("chunking", {}),
            merger_config=merger_config,
            formatter_config=formatter_config,
        )

        # Load documents if path provided
        if "documents_path" in config:
            await kb.load_documents_from_directory(
                config["documents_path"], config.get("document_pattern", "**/*.md")
            )

        return kb

    async def load_markdown_document(
        self, filepath: str | Path, metadata: dict[str, Any] | None = None
    ) -> int:
        """Load and chunk a markdown document.

        Args:
            filepath: Path to markdown file
            metadata: Optional metadata to attach to all chunks

        Returns:
            Number of chunks created

        Example:
            ```python
            num_chunks = await kb.load_markdown_document(
                "docs/api.md",
                metadata={"category": "api", "version": "1.0"}
            )
            ```
        """
        import numpy as np

        # Read document
        filepath = Path(filepath)
        with open(filepath, encoding="utf-8") as f:
            markdown_text = f.read()

        # Parse markdown
        tree = parse_markdown(markdown_text)

        # Build quality filter config if specified
        quality_filter = None
        if "quality_filter" in self.chunking_config:
            qf_config = self.chunking_config["quality_filter"]
            if isinstance(qf_config, ChunkQualityConfig):
                quality_filter = qf_config
            elif isinstance(qf_config, dict):
                quality_filter = ChunkQualityConfig(**qf_config)

        # Chunk the document with enhanced options
        chunks = chunk_markdown_tree(
            tree,
            max_chunk_size=self.chunking_config.get("max_chunk_size", 500),
            chunk_overlap=self.chunking_config.get("chunk_overlap", 50),
            heading_inclusion=HeadingInclusion.IN_METADATA,  # Keep headings in metadata only
            combine_under_heading=self.chunking_config.get("combine_under_heading", True),
            quality_filter=quality_filter,
            generate_embeddings=self.chunking_config.get("generate_embeddings", True),
        )

        # Process and store chunks
        vectors = []
        ids = []
        metadatas = []

        for i, chunk in enumerate(chunks):
            # Use embedding_text if available, otherwise use chunk text
            text_for_embedding = chunk.metadata.embedding_text or chunk.text

            # Generate embedding
            embedding = await self.embedding_provider.embed(text_for_embedding)

            # Convert to numpy if needed
            if not isinstance(embedding, np.ndarray):
                embedding = np.array(embedding, dtype=np.float32)

            # Prepare metadata with new fields
            chunk_id = f"{filepath.stem}_{i}"
            chunk_metadata = {
                "text": chunk.text,
                "source": str(filepath),
                "chunk_index": i,
                "heading_path": chunk.metadata.heading_display or chunk.metadata.get_heading_path(),
                "headings": chunk.metadata.headings,
                "heading_levels": chunk.metadata.heading_levels,
                "line_number": chunk.metadata.line_number,
                "chunk_size": chunk.metadata.chunk_size,
                "content_length": chunk.metadata.content_length,
            }

            # Merge with user metadata
            if metadata:
                chunk_metadata.update(metadata)

            vectors.append(embedding)
            ids.append(chunk_id)
            metadatas.append(chunk_metadata)

        # Batch insert into vector store
        if vectors:
            await self.vector_store.add_vectors(
                vectors=vectors, ids=ids, metadata=metadatas
            )

        return len(chunks)

    async def load_documents_from_directory(
        self, directory: str | Path, pattern: str = "**/*.md"
    ) -> dict[str, Any]:
        """Load all markdown documents from a directory.

        Args:
            directory: Directory path containing documents
            pattern: Glob pattern for files to load (default: **/*.md)

        Returns:
            Dictionary with loading statistics:
            - total_files: Number of files processed
            - total_chunks: Total chunks created
            - errors: List of errors encountered

        Example:
            ```python
            results = await kb.load_documents_from_directory(
                "docs/",
                pattern="**/*.md"
            )
            print(f"Loaded {results['total_chunks']} chunks from {results['total_files']} files")
            ```
        """
        directory = Path(directory)
        results = {"total_files": 0, "total_chunks": 0, "errors": []}

        for filepath in directory.glob(pattern):
            if not filepath.is_file():
                continue

            try:
                num_chunks = await self.load_markdown_document(
                    filepath, metadata={"filename": filepath.name}
                )
                results["total_files"] += 1
                results["total_chunks"] += num_chunks
            except Exception as e:
                results["errors"].append({"file": str(filepath), "error": str(e)})

        return results

    async def load_json_document(
        self,
        filepath: str | Path,
        metadata: dict[str, Any] | None = None,
        schema: str | None = None,
        transformer: ContentTransformer | None = None,
        title: str | None = None,
    ) -> int:
        """Load and chunk a JSON document by converting it to markdown.

        This method converts JSON data to markdown format using ContentTransformer,
        then processes it like any other markdown document.

        Args:
            filepath: Path to JSON file
            metadata: Optional metadata to attach to all chunks
            schema: Optional schema name (requires transformer with registered schema)
            transformer: Optional ContentTransformer instance with custom configuration
            title: Optional document title for the markdown

        Returns:
            Number of chunks created

        Example:
            ```python
            # Generic conversion
            num_chunks = await kb.load_json_document(
                "data/patterns.json",
                metadata={"content_type": "patterns"}
            )

            # With custom schema
            transformer = ContentTransformer()
            transformer.register_schema("pattern", {
                "title_field": "name",
                "sections": [
                    {"field": "description", "heading": "Description"},
                    {"field": "example", "heading": "Example", "format": "code"}
                ]
            })
            num_chunks = await kb.load_json_document(
                "data/patterns.json",
                transformer=transformer,
                schema="pattern"
            )
            ```
        """
        import json

        filepath = Path(filepath)

        # Read JSON
        with open(filepath, encoding="utf-8") as f:
            data = json.load(f)

        # Convert to markdown
        if transformer is None:
            transformer = ContentTransformer()

        markdown_text = transformer.transform_json(
            data,
            schema=schema,
            title=title or filepath.stem.replace("_", " ").title(),
        )

        return await self._load_markdown_text(
            markdown_text,
            source=str(filepath),
            metadata=metadata,
        )

    async def load_yaml_document(
        self,
        filepath: str | Path,
        metadata: dict[str, Any] | None = None,
        schema: str | None = None,
        transformer: ContentTransformer | None = None,
        title: str | None = None,
    ) -> int:
        """Load and chunk a YAML document by converting it to markdown.

        Args:
            filepath: Path to YAML file
            metadata: Optional metadata to attach to all chunks
            schema: Optional schema name (requires transformer with registered schema)
            transformer: Optional ContentTransformer instance with custom configuration
            title: Optional document title for the markdown

        Returns:
            Number of chunks created

        Example:
            ```python
            num_chunks = await kb.load_yaml_document(
                "data/config.yaml",
                metadata={"content_type": "configuration"}
            )
            ```
        """
        filepath = Path(filepath)

        # Convert to markdown
        if transformer is None:
            transformer = ContentTransformer()

        markdown_text = transformer.transform_yaml(
            filepath,
            schema=schema,
            title=title or filepath.stem.replace("_", " ").title(),
        )

        return await self._load_markdown_text(
            markdown_text,
            source=str(filepath),
            metadata=metadata,
        )

    async def load_csv_document(
        self,
        filepath: str | Path,
        metadata: dict[str, Any] | None = None,
        title: str | None = None,
        title_field: str | None = None,
        transformer: ContentTransformer | None = None,
    ) -> int:
        """Load and chunk a CSV document by converting it to markdown.

        Each row becomes a section with the first column (or title_field) as heading.

        Args:
            filepath: Path to CSV file
            metadata: Optional metadata to attach to all chunks
            title: Optional document title for the markdown
            title_field: Column to use as section title (default: first column)
            transformer: Optional ContentTransformer instance with custom configuration

        Returns:
            Number of chunks created

        Example:
            ```python
            num_chunks = await kb.load_csv_document(
                "data/faq.csv",
                title="Frequently Asked Questions",
                title_field="question"
            )
            ```
        """
        filepath = Path(filepath)

        # Convert to markdown
        if transformer is None:
            transformer = ContentTransformer()

        markdown_text = transformer.transform_csv(
            filepath,
            title=title or filepath.stem.replace("_", " ").title(),
            title_field=title_field,
        )

        return await self._load_markdown_text(
            markdown_text,
            source=str(filepath),
            metadata=metadata,
        )

    async def _load_markdown_text(
        self,
        markdown_text: str,
        source: str,
        metadata: dict[str, Any] | None = None,
    ) -> int:
        """Internal method to load markdown text directly.

        Used by load_json_document, load_yaml_document, and load_csv_document.

        Args:
            markdown_text: Markdown content to load
            source: Source identifier for metadata
            metadata: Optional metadata to attach to all chunks

        Returns:
            Number of chunks created
        """
        import numpy as np

        # Parse markdown
        tree = parse_markdown(markdown_text)

        # Build quality filter config if specified
        quality_filter = None
        if "quality_filter" in self.chunking_config:
            qf_config = self.chunking_config["quality_filter"]
            if isinstance(qf_config, ChunkQualityConfig):
                quality_filter = qf_config
            elif isinstance(qf_config, dict):
                quality_filter = ChunkQualityConfig(**qf_config)

        # Chunk the document with enhanced options
        chunks = chunk_markdown_tree(
            tree,
            max_chunk_size=self.chunking_config.get("max_chunk_size", 500),
            chunk_overlap=self.chunking_config.get("chunk_overlap", 50),
            heading_inclusion=HeadingInclusion.IN_METADATA,
            combine_under_heading=self.chunking_config.get("combine_under_heading", True),
            quality_filter=quality_filter,
            generate_embeddings=self.chunking_config.get("generate_embeddings", True),
        )

        # Process and store chunks
        vectors = []
        ids = []
        metadatas = []

        # Generate a base ID from source
        source_stem = Path(source).stem if source else "doc"

        for i, chunk in enumerate(chunks):
            # Use embedding_text if available, otherwise use chunk text
            text_for_embedding = chunk.metadata.embedding_text or chunk.text

            # Generate embedding
            embedding = await self.embedding_provider.embed(text_for_embedding)

            # Convert to numpy if needed
            if not isinstance(embedding, np.ndarray):
                embedding = np.array(embedding, dtype=np.float32)

            # Prepare metadata with new fields
            chunk_id = f"{source_stem}_{i}"
            chunk_metadata = {
                "text": chunk.text,
                "source": source,
                "chunk_index": i,
                "heading_path": chunk.metadata.heading_display or chunk.metadata.get_heading_path(),
                "headings": chunk.metadata.headings,
                "heading_levels": chunk.metadata.heading_levels,
                "line_number": chunk.metadata.line_number,
                "chunk_size": chunk.metadata.chunk_size,
                "content_length": chunk.metadata.content_length,
            }

            # Merge with user metadata
            if metadata:
                chunk_metadata.update(metadata)

            vectors.append(embedding)
            ids.append(chunk_id)
            metadatas.append(chunk_metadata)

        # Batch insert into vector store
        if vectors:
            await self.vector_store.add_vectors(
                vectors=vectors, ids=ids, metadata=metadatas
            )

        return len(chunks)

    async def query(
        self,
        query: str,
        k: int = 5,
        filter_metadata: dict[str, Any] | None = None,
        min_similarity: float = 0.0,
        merge_adjacent: bool = False,
        max_chunk_size: int | None = None,
    ) -> list[dict[str, Any]]:
        """Query knowledge base for relevant chunks.

        Args:
            query: Query text to search for
            k: Number of results to return
            filter_metadata: Optional metadata filters
            min_similarity: Minimum similarity score (0-1)
            merge_adjacent: Whether to merge adjacent chunks with same heading
            max_chunk_size: Maximum size for merged chunks (uses merger config default if not specified)

        Returns:
            List of result dictionaries with:
            - text: Chunk text
            - source: Source file
            - heading_path: Heading hierarchy
            - similarity: Similarity score
            - metadata: Full chunk metadata

        Example:
            ```python
            results = await kb.query(
                "How do I configure the database?",
                k=3,
                merge_adjacent=True
            )
            for result in results:
                print(f"[{result['similarity']:.2f}] {result['heading_path']}")
                print(result['text'])
            ```
        """
        import numpy as np

        # Generate query embedding
        query_embedding = await self.embedding_provider.embed(query)

        # Convert to numpy if needed
        if not isinstance(query_embedding, np.ndarray):
            query_embedding = np.array(query_embedding, dtype=np.float32)

        # Search vector store
        search_results = await self.vector_store.search(
            query_vector=query_embedding,
            k=k,
            filter=filter_metadata,
            include_metadata=True,
        )

        # Format results
        results = []
        for _chunk_id, similarity, chunk_metadata in search_results:
            if chunk_metadata and similarity >= min_similarity:
                results.append(
                    {
                        "text": chunk_metadata.get("text", ""),
                        "source": chunk_metadata.get("source", ""),
                        "heading_path": chunk_metadata.get("heading_path", ""),
                        "similarity": similarity,
                        "metadata": chunk_metadata,
                    }
                )

        # Apply chunk merging if requested
        if merge_adjacent and results:
            # Update merger config if max_chunk_size specified
            if max_chunk_size is not None:
                merger = ChunkMerger(MergerConfig(max_merged_size=max_chunk_size))
            else:
                merger = self.merger

            merged_chunks = merger.merge(results)
            results = merger.to_result_list(merged_chunks)

        return results

    def format_context(
        self,
        results: list[dict[str, Any]],
        wrap_in_tags: bool = True,
    ) -> str:
        """Format search results for LLM context.

        Convenience method to format results using the configured formatter.

        Args:
            results: Search results from query()
            wrap_in_tags: Whether to wrap in <knowledge_base> tags

        Returns:
            Formatted context string
        """
        context = self.formatter.format(results)
        if wrap_in_tags:
            context = self.formatter.wrap_for_prompt(context)
        return context

    async def clear(self) -> None:
        """Clear all documents from the knowledge base.

        Warning: This removes all stored chunks and embeddings.
        """
        if hasattr(self.vector_store, "clear"):
            await self.vector_store.clear()
        else:
            raise NotImplementedError(
                "Vector store does not support clearing. "
                "Consider creating a new knowledge base with a fresh collection."
            )

    async def save(self) -> None:
        """Save the knowledge base to persistent storage.

        This persists the vector store index and metadata to disk.
        Only applicable for vector stores that support persistence (e.g., FAISS).

        Example:
            ```python
            await kb.load_markdown_document("docs/api.md")
            await kb.save()  # Persist to disk
            ```
        """
        if hasattr(self.vector_store, "save"):
            await self.vector_store.save()

    async def close(self) -> None:
        """Close the knowledge base and release resources.

        This method:
        - Saves the vector store to disk (if persistence is configured)
        - Closes the vector store connection
        - Closes the embedding provider (releases HTTP sessions)

        Should be called when done using the knowledge base to prevent
        resource leaks (e.g., unclosed aiohttp sessions).

        Example:
            ```python
            kb = await RAGKnowledgeBase.from_config(config)
            try:
                await kb.load_markdown_document("docs/api.md")
                results = await kb.query("How do I configure?")
            finally:
                await kb.close()
            ```
        """
        # Close vector store (will save if persist_path is set)
        if hasattr(self.vector_store, "close"):
            await self.vector_store.close()

        # Close embedding provider (releases HTTP client sessions)
        if hasattr(self.embedding_provider, "close"):
            await self.embedding_provider.close()

    async def __aenter__(self) -> "RAGKnowledgeBase":
        """Async context manager entry.

        Returns:
            Self for use in async with statement

        Example:
            ```python
            async with await RAGKnowledgeBase.from_config(config) as kb:
                await kb.load_markdown_document("docs/api.md")
                results = await kb.query("How do I configure?")
                # Automatically saved and closed
            ```
        """
        return self

    async def __aexit__(
        self,
        exc_type: type[BaseException] | None,
        exc_val: BaseException | None,
        exc_tb: types.TracebackType | None,
    ) -> None:
        """Async context manager exit - ensures cleanup.

        Args:
            exc_type: Exception type if an exception occurred
            exc_val: Exception value if an exception occurred
            exc_tb: Exception traceback if an exception occurred
        """
        await self.close()