agno 2.4.5__py3-none-any.whl → 2.4.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agno/agent/agent.py +2 -1
- agno/db/surrealdb/models.py +1 -1
- agno/knowledge/chunking/agentic.py +1 -5
- agno/knowledge/chunking/code.py +1 -1
- agno/knowledge/chunking/document.py +22 -42
- agno/knowledge/chunking/fixed.py +1 -5
- agno/knowledge/chunking/markdown.py +9 -25
- agno/knowledge/chunking/recursive.py +1 -3
- agno/knowledge/chunking/row.py +3 -2
- agno/knowledge/chunking/semantic.py +1 -1
- agno/knowledge/chunking/strategy.py +19 -0
- agno/knowledge/knowledge.py +173 -14
- agno/knowledge/reader/text_reader.py +1 -1
- agno/learn/stores/learned_knowledge.py +108 -131
- agno/utils/print_response/agent.py +8 -8
- agno/utils/print_response/team.py +8 -8
- {agno-2.4.5.dist-info → agno-2.4.6.dist-info}/METADATA +33 -58
- {agno-2.4.5.dist-info → agno-2.4.6.dist-info}/RECORD +21 -21
- {agno-2.4.5.dist-info → agno-2.4.6.dist-info}/WHEEL +0 -0
- {agno-2.4.5.dist-info → agno-2.4.6.dist-info}/licenses/LICENSE +0 -0
- {agno-2.4.5.dist-info → agno-2.4.6.dist-info}/top_level.txt +0 -0
agno/agent/agent.py
CHANGED
@@ -858,8 +858,9 @@ class Agent:
             return

         # Handle learning=True: create default LearningMachine
+        # Enables user_profile (structured fields) and user_memory (unstructured observations)
         if self.learning is True:
-            self._learning = LearningMachine(db=self.db, model=self.model, user_profile=True)
+            self._learning = LearningMachine(db=self.db, model=self.model, user_profile=True, user_memory=True)
             return

         # Handle learning=LearningMachine(...): inject dependencies
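For orientation, a minimal stand-in sketch of what the new default amounts to; `LearningMachineSketch` is not the real class, and only the two flags (`user_profile`, `user_memory`) come from the diff above.

```python
from dataclasses import dataclass

@dataclass
class LearningMachineSketch:
    # Stand-in for agno's LearningMachine; only the flag names come from the diff.
    user_profile: bool = False
    user_memory: bool = False

def resolve_learning(learning):
    # learning=True now enables both the structured user profile and the
    # unstructured user memory store, instead of the profile alone.
    if learning is True:
        return LearningMachineSketch(user_profile=True, user_memory=True)
    return learning

print(resolve_learning(True))
# LearningMachineSketch(user_profile=True, user_memory=True)
```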
agno/db/surrealdb/models.py
CHANGED
@@ -48,7 +48,7 @@ def surrealize_dates(record: dict) -> dict:
         if isinstance(value, date):
             copy[key] = datetime.combine(value, datetime.min.time()).replace(tzinfo=timezone.utc)
         elif key in ["created_at", "updated_at"] and isinstance(value, (int, float)):
-            copy[key] = datetime.fromtimestamp(value)
+            copy[key] = datetime.fromtimestamp(value, tz=timezone.utc)
         elif key in ["created_at", "updated_at"] and isinstance(value, str):
             # Handle ISO string format - convert back to datetime object for SurrealDB
             try:
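The fix matters because `datetime.fromtimestamp()` without a `tz` argument returns a naive local-time datetime. A small standalone illustration with the standard library:

```python
from datetime import datetime, timezone

ts = 1_700_000_000
naive = datetime.fromtimestamp(ts)                   # local time, tzinfo is None
aware = datetime.fromtimestamp(ts, tz=timezone.utc)  # 2023-11-14 22:13:20+00:00

print(naive.tzinfo)        # None
print(aware.tzinfo)        # UTC
print(aware.isoformat())   # 2023-11-14T22:13:20+00:00
```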
agno/knowledge/chunking/agentic.py
CHANGED

@@ -55,11 +55,7 @@ class AgenticChunking(ChunkingStrategy):
             chunk = remaining_text[:break_point].strip()
             meta_data = chunk_meta_data.copy()
             meta_data["chunk"] = chunk_number
-            chunk_id =
-            if document.id:
-                chunk_id = f"{document.id}_{chunk_number}"
-            elif document.name:
-                chunk_id = f"{document.name}_{chunk_number}"
+            chunk_id = self._generate_chunk_id(document, chunk_number, chunk)
             meta_data["chunk_size"] = len(chunk)
             chunks.append(
                 Document(
agno/knowledge/chunking/code.py
CHANGED
@@ -82,7 +82,7 @@ class CodeChunking(ChunkingStrategy):
         for i, chunk in enumerate(chunks, 1):
             meta_data = document.meta_data.copy()
             meta_data["chunk"] = i
-            chunk_id =
+            chunk_id = self._generate_chunk_id(document, i, chunk.text)
             meta_data["chunk_size"] = len(chunk.text)

             chunked_documents.append(Document(id=chunk_id, name=document.name, meta_data=meta_data, content=chunk.text))
agno/knowledge/chunking/document.py
CHANGED

@@ -38,17 +38,10 @@ class DocumentChunking(ChunkingStrategy):
            if current_chunk:
                meta_data = chunk_meta_data.copy()
                meta_data["chunk"] = chunk_number
-
-
-
-
-               chunk_id = f"{document.name}_{chunk_number}"
-               meta_data["chunk_size"] = len("\n\n".join(current_chunk))
-               chunks.append(
-                   Document(
-                       id=chunk_id, name=document.name, meta_data=meta_data, content="\n\n".join(current_chunk)
-                   )
-               )
+               chunk_content = "\n\n".join(current_chunk)
+               chunk_id = self._generate_chunk_id(document, chunk_number, chunk_content)
+               meta_data["chunk_size"] = len(chunk_content)
+               chunks.append(Document(id=chunk_id, name=document.name, meta_data=meta_data, content=chunk_content))
                chunk_number += 1
                current_chunk = []
                current_size = 0

@@ -70,18 +63,15 @@ class DocumentChunking(ChunkingStrategy):
        if current_chunk:
            meta_data = chunk_meta_data.copy()
            meta_data["chunk"] = chunk_number
-
-
-
-           elif document.name:
-               chunk_id = f"{document.name}_{chunk_number}"
-           meta_data["chunk_size"] = len(" ".join(current_chunk))
+           chunk_content = " ".join(current_chunk)
+           chunk_id = self._generate_chunk_id(document, chunk_number, chunk_content)
+           meta_data["chunk_size"] = len(chunk_content)
            chunks.append(
                Document(
                    id=chunk_id,
                    name=document.name,
                    meta_data=meta_data,
-                   content=
+                   content=chunk_content,
                )
            )
            chunk_number += 1

@@ -94,18 +84,11 @@ class DocumentChunking(ChunkingStrategy):
            else:
                meta_data = chunk_meta_data.copy()
                meta_data["chunk"] = chunk_number
-
-
-
-               elif document.name:
-                   chunk_id = f"{document.name}_{chunk_number}"
-               meta_data["chunk_size"] = len("\n\n".join(current_chunk))
+               chunk_content = "\n\n".join(current_chunk)
+               chunk_id = self._generate_chunk_id(document, chunk_number, chunk_content)
+               meta_data["chunk_size"] = len(chunk_content)
                if current_chunk:
-                   chunks.append(
-                       Document(
-                           id=chunk_id, name=document.name, meta_data=meta_data, content="\n\n".join(current_chunk)
-                       )
-                   )
+                   chunks.append(Document(id=chunk_id, name=document.name, meta_data=meta_data, content=chunk_content))
                chunk_number += 1
                current_chunk = [para]
                current_size = para_size

@@ -113,15 +96,10 @@ class DocumentChunking(ChunkingStrategy):
        if current_chunk:
            meta_data = chunk_meta_data.copy()
            meta_data["chunk"] = chunk_number
-
-
-
-
-           chunk_id = f"{document.name}_{chunk_number}"
-           meta_data["chunk_size"] = len("\n\n".join(current_chunk))
-           chunks.append(
-               Document(id=chunk_id, name=document.name, meta_data=meta_data, content="\n\n".join(current_chunk))
-           )
+           chunk_content = "\n\n".join(current_chunk)
+           chunk_id = self._generate_chunk_id(document, chunk_number, chunk_content)
+           meta_data["chunk_size"] = len(chunk_content)
+           chunks.append(Document(id=chunk_id, name=document.name, meta_data=meta_data, content=chunk_content))

        # Handle overlap if specified
        if self.overlap > 0:

@@ -131,11 +109,11 @@ class DocumentChunking(ChunkingStrategy):
                # Add overlap from previous chunk
                prev_text = chunks[i - 1].content[-self.overlap :]
                meta_data = chunk_meta_data.copy()
-
-
-
-               chunk_id = f"{document.id}_{chunk_number}"
+               # Use the chunk's existing metadata and ID instead of stale chunk_number
+               meta_data["chunk"] = chunks[i].meta_data["chunk"]
+               chunk_id = chunks[i].id
                meta_data["chunk_size"] = len(prev_text + chunks[i].content)
+
                if prev_text:
                    overlapped_chunks.append(
                        Document(

@@ -145,6 +123,8 @@ class DocumentChunking(ChunkingStrategy):
                            content=prev_text + chunks[i].content,
                        )
                    )
+               else:
+                   overlapped_chunks.append(chunks[i])
            else:
                overlapped_chunks.append(chunks[i])
        chunks = overlapped_chunks
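The overlap pass above prepends the tail of the previous chunk to the current one and, with this change, keeps the current chunk's own ID and chunk number. A minimal sketch of the overlap mechanics on plain strings (not the real Document objects):

```python
# Plain-string sketch; `overlap` plays the role of self.overlap.
chunks = ["First paragraph text.", "Second paragraph text.", "Third paragraph text."]
overlap = 10

overlapped = [chunks[0]]
for i in range(1, len(chunks)):
    prev_text = chunks[i - 1][-overlap:]  # tail of the previous chunk
    overlapped.append(prev_text + chunks[i] if prev_text else chunks[i])

print(overlapped[1])  # "raph text.Second paragraph text."
```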
agno/knowledge/chunking/fixed.py
CHANGED
@@ -38,11 +38,7 @@ class FixedSizeChunking(ChunkingStrategy):
            chunk = content[start:end]
            meta_data = chunk_meta_data.copy()
            meta_data["chunk"] = chunk_number
-           chunk_id =
-           if document.id:
-               chunk_id = f"{document.id}_{chunk_number}"
-           elif document.name:
-               chunk_id = f"{document.name}_{chunk_number}"
+           chunk_id = self._generate_chunk_id(document, chunk_number, chunk)
            meta_data["chunk_size"] = len(chunk)
            chunked_documents.append(
                Document(
agno/knowledge/chunking/markdown.py
CHANGED

@@ -267,11 +267,7 @@ class MarkdownChunking(ChunkingStrategy):
                for sub_chunk in sub_chunks:
                    meta_data = chunk_meta_data.copy()
                    meta_data["chunk"] = chunk_number
-                   chunk_id =
-                   if document.id:
-                       chunk_id = f"{document.id}_{chunk_number}"
-                   elif document.name:
-                       chunk_id = f"{document.name}_{chunk_number}"
+                   chunk_id = self._generate_chunk_id(document, chunk_number, sub_chunk)
                    meta_data["chunk_size"] = len(sub_chunk)

                    chunks.append(Document(id=chunk_id, name=document.name, meta_data=meta_data, content=sub_chunk))

@@ -282,19 +278,12 @@ class MarkdownChunking(ChunkingStrategy):
            else:
                meta_data = chunk_meta_data.copy()
                meta_data["chunk"] = chunk_number
-
-
-
-               elif document.name:
-                   chunk_id = f"{document.name}_{chunk_number}"
-               meta_data["chunk_size"] = len("\n\n".join(current_chunk))
+               chunk_content = "\n\n".join(current_chunk)
+               chunk_id = self._generate_chunk_id(document, chunk_number, chunk_content)
+               meta_data["chunk_size"] = len(chunk_content)

                if current_chunk:
-                   chunks.append(
-                       Document(
-                           id=chunk_id, name=document.name, meta_data=meta_data, content="\n\n".join(current_chunk)
-                       )
-                   )
+                   chunks.append(Document(id=chunk_id, name=document.name, meta_data=meta_data, content=chunk_content))
                chunk_number += 1

                current_chunk = [section]

@@ -304,15 +293,10 @@ class MarkdownChunking(ChunkingStrategy):
        if current_chunk and not self.split_on_headings:
            meta_data = chunk_meta_data.copy()
            meta_data["chunk"] = chunk_number
-
-
-
-
-           chunk_id = f"{document.name}_{chunk_number}"
-           meta_data["chunk_size"] = len("\n\n".join(current_chunk))
-           chunks.append(
-               Document(id=chunk_id, name=document.name, meta_data=meta_data, content="\n\n".join(current_chunk))
-           )
+           chunk_content = "\n\n".join(current_chunk)
+           chunk_id = self._generate_chunk_id(document, chunk_number, chunk_content)
+           meta_data["chunk_size"] = len(chunk_content)
+           chunks.append(Document(id=chunk_id, name=document.name, meta_data=meta_data, content=chunk_content))

        # Handle overlap if specified
        if self.overlap > 0:
agno/knowledge/chunking/recursive.py
CHANGED

@@ -46,9 +46,7 @@ class RecursiveChunking(ChunkingStrategy):
            chunk = self.clean_text(content[start:end])
            meta_data = chunk_meta_data.copy()
            meta_data["chunk"] = chunk_number
-           chunk_id =
-           if document.id:
-               chunk_id = f"{document.id}_{chunk_number}"
+           chunk_id = self._generate_chunk_id(document, chunk_number, chunk)
            chunk_number += 1
            meta_data["chunk_size"] = len(chunk)
            chunks.append(Document(id=chunk_id, name=document.name, meta_data=meta_data, content=chunk))
agno/knowledge/chunking/row.py
CHANGED
@@ -33,7 +33,8 @@ class RowChunking(ChunkingStrategy):

            if chunk_content:  # Skip empty rows
                meta_data = document.meta_data.copy()
-
-
+               row_number = start_index + i
+               meta_data["row_number"] = row_number  # Preserve logical row numbering
+               chunk_id = self._generate_chunk_id(document, row_number, chunk_content, prefix="row")
                chunks.append(Document(id=chunk_id, name=document.name, meta_data=meta_data, content=chunk_content))
        return chunks
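Combined with the new `_generate_chunk_id` helper (see the strategy.py hunk below), a row chunk for a document that has an ID now gets an ID of the form `{document.id}_row_{row_number}`. A small sketch of the numbering, with made-up values:

```python
# Hypothetical values; only the ID/numbering scheme is taken from the diff.
document_id = "sales.csv"
start_index = 100          # first logical row covered by this batch
rows = ["row a", "row b", "row c"]

for i, chunk_content in enumerate(rows):
    row_number = start_index + i
    chunk_id = f"{document_id}_row_{row_number}"
    print(chunk_id)  # sales.csv_row_100, sales.csv_row_101, sales.csv_row_102
```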
agno/knowledge/chunking/semantic.py
CHANGED

@@ -160,7 +160,7 @@ class SemanticChunking(ChunkingStrategy):
        for i, chunk in enumerate(chunks, 1):
            meta_data = document.meta_data.copy()
            meta_data["chunk"] = i
-           chunk_id =
+           chunk_id = self._generate_chunk_id(document, i, chunk.text)
            meta_data["chunk_size"] = len(chunk.text)

            chunked_documents.append(Document(id=chunk_id, name=document.name, meta_data=meta_data, content=chunk.text))
agno/knowledge/chunking/strategy.py
CHANGED

@@ -1,3 +1,4 @@
+import hashlib
 from abc import ABC, abstractmethod
 from enum import Enum
 from typing import List, Optional

@@ -12,6 +13,24 @@ class ChunkingStrategy(ABC):
     def chunk(self, document: Document) -> List[Document]:
         raise NotImplementedError

+    def _generate_chunk_id(
+        self, document: Document, chunk_number: int, content: Optional[str] = None, prefix: Optional[str] = None
+    ) -> Optional[str]:
+        """Generate a deterministic ID for the chunk."""
+        suffix = f"_{prefix}_{chunk_number}" if prefix else f"_{chunk_number}"
+
+        if document.id:
+            return f"{document.id}{suffix}"
+        elif document.name:
+            return f"{document.name}{suffix}"
+        else:
+            # Hash the chunk content for a deterministic ID when no identifier exists
+            hash_source = content if content else document.content
+            if hash_source:
+                content_hash = hashlib.md5(hash_source.encode("utf-8")).hexdigest()[:12]  # nosec B324
+                return f"chunk_{content_hash}{suffix}"
+            return None
+
     async def achunk(self, document: Document) -> List[Document]:
         """Async version of chunk. Override for truly async implementations."""
         return self.chunk(document)
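A standalone sketch of the fallback path added above: when a document has neither an `id` nor a `name`, the chunk content is md5-hashed so the resulting ID stays stable across runs (the example values are illustrative).

```python
import hashlib
from typing import Optional

def fallback_chunk_id(content: str, chunk_number: int, prefix: Optional[str] = None) -> str:
    # Mirrors the fallback branch of _generate_chunk_id shown in the hunk above.
    suffix = f"_{prefix}_{chunk_number}" if prefix else f"_{chunk_number}"
    content_hash = hashlib.md5(content.encode("utf-8")).hexdigest()[:12]
    return f"chunk_{content_hash}{suffix}"

print(fallback_chunk_id("Some chunk text", 1))
print(fallback_chunk_id("Some chunk text", 1))  # identical output: deterministic
```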
agno/knowledge/knowledge.py
CHANGED
@@ -823,7 +823,13 @@ class Knowledge:
                    log_warning(f"Invalid filter key: {key} - not present in knowledge base")

        elif isinstance(filters, List):
-           # Validate
+           # Validate list filters against known metadata keys
+           if valid_metadata_filters is None or not valid_metadata_filters:
+               # Can't validate keys without metadata - return original list
+               log_warning("No valid metadata filters tracked yet. Cannot validate list filter keys.")
+               return filters, []
+
+           valid_list_filters: List[FilterExpr] = []
            for i, filter_item in enumerate(filters):
                if not isinstance(filter_item, FilterExpr):
                    log_warning(

@@ -832,9 +838,23 @@ class Knowledge:
                        f"Use filter expressions like EQ('key', 'value'), IN('key', [values]), "
                        f"AND(...), OR(...), NOT(...) from agno.filters"
                    )
-
-
-
+                   continue
+
+               # Check if filter has a key attribute and validate it
+               if hasattr(filter_item, "key"):
+                   key = filter_item.key
+                   base_key = key.split(".")[-1] if "." in key else key
+                   if base_key in valid_metadata_filters or key in valid_metadata_filters:
+                       valid_list_filters.append(filter_item)
+                   else:
+                       invalid_keys.append(key)
+                       log_warning(f"Invalid filter key: {key} - not present in knowledge base")
+               else:
+                   # Complex filters (AND, OR, NOT) - keep them as-is
+                   # They contain nested filters that will be validated by the vector DB
+                   valid_list_filters.append(filter_item)
+
+           return valid_list_filters, invalid_keys

        return valid_filters, invalid_keys
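The key check above also accepts dotted keys by comparing their final segment against the tracked metadata keys. A small sketch with made-up key names:

```python
# Hypothetical tracked keys; the dotted-key handling mirrors the diff above.
valid_metadata_filters = {"author", "year"}

def is_valid_filter_key(key: str) -> bool:
    base_key = key.split(".")[-1] if "." in key else key
    return base_key in valid_metadata_filters or key in valid_metadata_filters

print(is_valid_filter_key("meta_data.author"))  # True
print(is_valid_filter_key("publisher"))         # False
```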
@@ -1541,7 +1561,49 @@ class Knowledge:
        # 6. Chunk documents if needed
        if reader and not reader.chunk:
            read_documents = await reader.chunk_documents_async(read_documents)
-
+
+       # 7. Group documents by source URL for multi-page readers (like WebsiteReader)
+       docs_by_source: Dict[str, List[Document]] = {}
+       for doc in read_documents:
+           source_url = doc.meta_data.get("url", content.url) if doc.meta_data else content.url
+           source_url = source_url or "unknown"
+           if source_url not in docs_by_source:
+               docs_by_source[source_url] = []
+           docs_by_source[source_url].append(doc)
+
+       # 8. Process each source separately if multiple sources exist
+       if len(docs_by_source) > 1:
+           for source_url, source_docs in docs_by_source.items():
+               # Compute per-document hash based on actual source URL
+               doc_hash = self._build_document_content_hash(source_docs[0], content)
+
+               # Check skip_if_exists for each source individually
+               if self._should_skip(doc_hash, skip_if_exists):
+                   log_debug(f"Skipping already indexed: {source_url}")
+                   continue
+
+               doc_id = generate_id(doc_hash)
+               self._prepare_documents_for_insert(source_docs, doc_id, calculate_sizes=True)
+
+               # Insert with per-document hash
+               if self.vector_db.upsert_available() and upsert:
+                   try:
+                       await self.vector_db.async_upsert(doc_hash, source_docs, content.metadata)
+                   except Exception as e:
+                       log_error(f"Error upserting document from {source_url}: {e}")
+                       continue
+               else:
+                   try:
+                       await self.vector_db.async_insert(doc_hash, documents=source_docs, filters=content.metadata)
+                   except Exception as e:
+                       log_error(f"Error inserting document from {source_url}: {e}")
+                       continue
+
+           content.status = ContentStatus.COMPLETED
+           await self._aupdate_content(content)
+           return
+
+       # 9. Single source - use existing logic with original content hash
        if not content.id:
            content.id = generate_id(content.content_hash or "")
        self._prepare_documents_for_insert(read_documents, content.id, calculate_sizes=True)
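A plain-dict sketch of the grouping step above: documents coming back from a multi-page reader are bucketed by the URL recorded in their metadata, falling back to the content's URL or "unknown" (the dicts and URLs below are illustrative stand-ins for agno Document objects).

```python
read_documents = [
    {"content": "page one",         "meta_data": {"url": "https://example.com/a"}},
    {"content": "page two",         "meta_data": {"url": "https://example.com/b"}},
    {"content": "page one, part 2", "meta_data": {"url": "https://example.com/a"}},
]
content_url = "https://example.com"  # fallback when a document carries no URL

docs_by_source = {}
for doc in read_documents:
    source_url = (doc.get("meta_data") or {}).get("url", content_url) or "unknown"
    docs_by_source.setdefault(source_url, []).append(doc)

print({url: len(docs) for url, docs in docs_by_source.items()})
# {'https://example.com/a': 2, 'https://example.com/b': 1}
```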
@@ -1648,7 +1710,48 @@ class Knowledge:
        if reader:
            read_documents = self._chunk_documents_sync(reader, read_documents)

-       # 7.
+       # 7. Group documents by source URL for multi-page readers (like WebsiteReader)
+       docs_by_source: Dict[str, List[Document]] = {}
+       for doc in read_documents:
+           source_url = doc.meta_data.get("url", content.url) if doc.meta_data else content.url
+           source_url = source_url or "unknown"
+           if source_url not in docs_by_source:
+               docs_by_source[source_url] = []
+           docs_by_source[source_url].append(doc)
+
+       # 8. Process each source separately if multiple sources exist
+       if len(docs_by_source) > 1:
+           for source_url, source_docs in docs_by_source.items():
+               # Compute per-document hash based on actual source URL
+               doc_hash = self._build_document_content_hash(source_docs[0], content)
+
+               # Check skip_if_exists for each source individually
+               if self._should_skip(doc_hash, skip_if_exists):
+                   log_debug(f"Skipping already indexed: {source_url}")
+                   continue
+
+               doc_id = generate_id(doc_hash)
+               self._prepare_documents_for_insert(source_docs, doc_id, calculate_sizes=True)
+
+               # Insert with per-document hash
+               if self.vector_db.upsert_available() and upsert:
+                   try:
+                       self.vector_db.upsert(doc_hash, source_docs, content.metadata)
+                   except Exception as e:
+                       log_error(f"Error upserting document from {source_url}: {e}")
+                       continue
+               else:
+                   try:
+                       self.vector_db.insert(doc_hash, documents=source_docs, filters=content.metadata)
+                   except Exception as e:
+                       log_error(f"Error inserting document from {source_url}: {e}")
+                       continue
+
+           content.status = ContentStatus.COMPLETED
+           self._update_content(content)
+           return
+
+       # 9. Single source - use existing logic with original content hash
        if not content.id:
            content.id = generate_id(content.content_hash or "")
        self._prepare_documents_for_insert(read_documents, content.id, calculate_sizes=True)
@@ -1900,11 +2003,11 @@ class Knowledge:
            if self._should_skip(content.content_hash, skip_if_exists):
                content.status = ContentStatus.COMPLETED
                await self._aupdate_content(content)
-
+               continue  # Skip to next topic, don't exit loop

            if self.vector_db.__class__.__name__ == "LightRag":
                await self._aprocess_lightrag_content(content, KnowledgeContentOrigin.TOPIC)
-
+               continue  # Skip to next topic, don't exit loop

            if self.vector_db and self.vector_db.content_hash_exists(content.content_hash) and skip_if_exists:
                log_info(f"Content {content.content_hash} already exists, skipping")

@@ -1961,11 +2064,11 @@ class Knowledge:
            if self._should_skip(content.content_hash, skip_if_exists):
                content.status = ContentStatus.COMPLETED
                self._update_content(content)
-
+               continue  # Skip to next topic, don't exit loop

            if self.vector_db.__class__.__name__ == "LightRag":
                self._process_lightrag_content(content, KnowledgeContentOrigin.TOPIC)
-
+               continue  # Skip to next topic, don't exit loop

            if self.vector_db and self.vector_db.content_hash_exists(content.content_hash) and skip_if_exists:
                log_info(f"Content {content.content_hash} already exists, skipping")
@@ -3896,6 +3999,42 @@ class Knowledge:
        hash_input = ":".join(hash_parts)
        return hashlib.sha256(hash_input.encode()).hexdigest()

+   def _build_document_content_hash(self, document: Document, content: Content) -> str:
+       """
+       Build content hash for a specific document.
+
+       Used for multi-page readers (like WebsiteReader) where each crawled page
+       should have its own unique content hash based on its actual URL.
+
+       Args:
+           document: The document to build the hash for
+           content: The original content object (for fallback name/description)
+
+       Returns:
+           A unique hash string for this specific document
+       """
+       hash_parts = []
+
+       if content.name:
+           hash_parts.append(content.name)
+       if content.description:
+           hash_parts.append(content.description)
+
+       # Use document's own URL if available (set by WebsiteReader)
+       doc_url = document.meta_data.get("url") if document.meta_data else None
+       if doc_url:
+           hash_parts.append(str(doc_url))
+       elif content.url:
+           hash_parts.append(content.url)
+       elif content.path:
+           hash_parts.append(str(content.path))
+       else:
+           # Fallback: use content hash for uniqueness
+           hash_parts.append(hashlib.sha256(document.content.encode()).hexdigest()[:16])
+
+       hash_input = ":".join(hash_parts)
+       return hashlib.sha256(hash_input.encode()).hexdigest()
+
    def _ensure_string_field(self, value: Any, field_name: str, default: str = "") -> str:
        """
        Safely ensure a field is a string, handling various edge cases.
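A standalone sketch of the per-document hashing added above: the page's own URL feeds the hash, so two pages indexed under the same content entry get distinct hashes (names and URLs below are made up).

```python
import hashlib

def document_content_hash(name, description, page_url):
    parts = [p for p in (name, description, page_url) if p]
    return hashlib.sha256(":".join(parts).encode()).hexdigest()

h1 = document_content_hash("Example Docs", None, "https://example.com/a")
h2 = document_content_hash("Example Docs", None, "https://example.com/b")
print(h1 != h2)  # True: each crawled page gets its own hash
```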
@@ -4625,7 +4764,12 @@ Make sure to pass the filters as [Dict[str: Any]] to the tool. FOLLOW THIS STRUC
        retrieval_timer = Timer()
        retrieval_timer.start()

-
+       try:
+           docs = self.search(query=query, filters=knowledge_filters)
+       except Exception as e:
+           retrieval_timer.stop()
+           log_warning(f"Knowledge search failed: {e}")
+           return f"Error searching knowledge base: {type(e).__name__}"

        if run_response is not None and docs:
            references = MessageReferences(

@@ -4657,7 +4801,12 @@ Make sure to pass the filters as [Dict[str: Any]] to the tool. FOLLOW THIS STRUC
        retrieval_timer = Timer()
        retrieval_timer.start()

-
+       try:
+           docs = await self.asearch(query=query, filters=knowledge_filters)
+       except Exception as e:
+           retrieval_timer.stop()
+           log_warning(f"Knowledge search failed: {e}")
+           return f"Error searching knowledge base: {type(e).__name__}"

        if run_response is not None and docs:
            references = MessageReferences(

@@ -4735,7 +4884,12 @@ Make sure to pass the filters as [Dict[str: Any]] to the tool. FOLLOW THIS STRUC
        retrieval_timer = Timer()
        retrieval_timer.start()

-
+       try:
+           docs = self.search(query=query, filters=search_filters)
+       except Exception as e:
+           retrieval_timer.stop()
+           log_warning(f"Knowledge search failed: {e}")
+           return f"Error searching knowledge base: {type(e).__name__}"

        if run_response is not None and docs:
            references = MessageReferences(

@@ -4789,7 +4943,12 @@ Make sure to pass the filters as [Dict[str: Any]] to the tool. FOLLOW THIS STRUC
        retrieval_timer = Timer()
        retrieval_timer.start()

-
+       try:
+           docs = await self.asearch(query=query, filters=search_filters)
+       except Exception as e:
+           retrieval_timer.stop()
+           log_warning(f"Knowledge search failed: {e}")
+           return f"Error searching knowledge base: {type(e).__name__}"

        if run_response is not None and docs:
            references = MessageReferences(
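The four hunks above all apply the same guard: the retrieval timer is stopped, the failure is logged, and an error string is returned instead of letting the exception escape the tool call. A generic sketch of that pattern (the timer, search function, and message text are stand-ins):

```python
import time

def guarded_search(search_fn, query):
    start = time.perf_counter()
    try:
        return search_fn(query)
    except Exception as e:
        elapsed = time.perf_counter() - start
        print(f"Knowledge search failed after {elapsed:.2f}s: {e}")
        return f"Error searching knowledge base: {type(e).__name__}"

def flaky_search(query):
    raise TimeoutError("vector db unreachable")

print(guarded_search(flaky_search, "what changed in 2.4.6?"))
# Error searching knowledge base: TimeoutError
```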
agno/knowledge/reader/text_reader.py
CHANGED

@@ -110,7 +110,7 @@ class TextReader(Reader):
            chunked_documents = self.chunk_document(document)

            if not chunked_documents:
-               return [
+               return []

            tasks = [process_chunk(chunk_doc) for chunk_doc in chunked_documents]
            return await asyncio.gather(*tasks)