agno 1.7.9__py3-none-any.whl → 1.7.11__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
- agno/agent/agent.py +1 -1
- agno/app/fastapi/app.py +3 -1
- agno/app/fastapi/async_router.py +1 -1
- agno/app/playground/app.py +1 -0
- agno/document/chunking/semantic.py +1 -3
- agno/document/reader/markdown_reader.py +2 -7
- agno/document/reader/pdf_reader.py +69 -13
- agno/document/reader/text_reader.py +2 -2
- agno/knowledge/agent.py +70 -75
- agno/knowledge/markdown.py +15 -2
- agno/knowledge/pdf.py +32 -8
- agno/knowledge/pdf_url.py +13 -5
- agno/knowledge/website.py +4 -1
- agno/media.py +2 -0
- agno/models/aws/bedrock.py +51 -21
- agno/models/dashscope/__init__.py +5 -0
- agno/models/dashscope/dashscope.py +81 -0
- agno/models/openai/chat.py +3 -0
- agno/models/openai/responses.py +53 -7
- agno/models/qwen/__init__.py +5 -0
- agno/run/response.py +4 -0
- agno/run/team.py +4 -0
- agno/storage/in_memory.py +234 -0
- agno/team/team.py +25 -9
- agno/tools/brandfetch.py +210 -0
- agno/tools/github.py +46 -18
- agno/tools/trafilatura.py +372 -0
- agno/vectordb/clickhouse/clickhousedb.py +1 -1
- agno/vectordb/milvus/milvus.py +89 -1
- agno/vectordb/weaviate/weaviate.py +84 -18
- agno/workflow/workflow.py +3 -0
- {agno-1.7.9.dist-info → agno-1.7.11.dist-info}/METADATA +5 -1
- {agno-1.7.9.dist-info → agno-1.7.11.dist-info}/RECORD +37 -31
- {agno-1.7.9.dist-info → agno-1.7.11.dist-info}/WHEEL +0 -0
- {agno-1.7.9.dist-info → agno-1.7.11.dist-info}/entry_points.txt +0 -0
- {agno-1.7.9.dist-info → agno-1.7.11.dist-info}/licenses/LICENSE +0 -0
- {agno-1.7.9.dist-info → agno-1.7.11.dist-info}/top_level.txt +0 -0
agno/tools/trafilatura.py ADDED
@@ -0,0 +1,372 @@
+import json
+from typing import Any, Callable, Dict, List, Optional, Set
+
+from agno.tools import Toolkit
+from agno.utils.log import log_debug, logger
+
+try:
+    from trafilatura import (
+        extract,
+        extract_metadata,
+        fetch_url,
+        html2txt,
+    )
+    from trafilatura.meta import reset_caches
+
+    # Import spider functionality
+    try:
+        from trafilatura.spider import focused_crawler
+
+        SPIDER_AVAILABLE = True
+    except ImportError:
+        SPIDER_AVAILABLE = False
+        logger.warning("Trafilatura spider module not available. Web crawling functionality will be disabled.")
+
+except ImportError:
+    raise ImportError("`trafilatura` not installed. Please install using `pip install trafilatura`")
+
+
+class TrafilaturaTools(Toolkit):
+    """
+    TrafilaturaTools is a toolkit for web scraping and text extraction.
+
+    Args:
+        output_format (str): Default output format for extractions. Options: 'txt', 'json', 'xml', 'markdown', 'csv', 'html', 'xmltei'.
+        include_comments (bool): Whether to extract comments along with main text by default.
+        include_tables (bool): Whether to include table content by default.
+        include_images (bool): Whether to include image information by default (experimental).
+        include_formatting (bool): Whether to preserve formatting by default.
+        include_links (bool): Whether to preserve links by default (experimental).
+        with_metadata (bool): Whether to include metadata in extractions by default.
+        favor_precision (bool): Whether to prefer precision over recall by default.
+        favor_recall (bool): Whether to prefer recall over precision by default.
+        target_language (Optional[str]): Default target language filter (ISO 639-1 format).
+        deduplicate (bool): Whether to remove duplicate segments by default.
+        max_tree_size (Optional[int]): Maximum tree size for processing.
+        max_crawl_urls (int): Maximum number of URLs to crawl per website.
+        max_known_urls (int): Maximum number of known URLs during crawling.
+    """
+
+    def __init__(
+        self,
+        output_format: str = "txt",
+        include_comments: bool = True,
+        include_tables: bool = True,
+        include_images: bool = False,
+        include_formatting: bool = False,
+        include_links: bool = False,
+        with_metadata: bool = False,
+        favor_precision: bool = False,
+        favor_recall: bool = False,
+        target_language: Optional[str] = None,
+        deduplicate: bool = False,
+        max_tree_size: Optional[int] = None,
+        max_crawl_urls: int = 10,
+        max_known_urls: int = 100000,
+        **kwargs,
+    ):
+        self.output_format = output_format
+        self.include_comments = include_comments
+        self.include_tables = include_tables
+        self.include_images = include_images
+        self.include_formatting = include_formatting
+        self.include_links = include_links
+        self.with_metadata = with_metadata
+        self.favor_precision = favor_precision
+        self.favor_recall = favor_recall
+        self.target_language = target_language
+        self.deduplicate = deduplicate
+        self.max_tree_size = max_tree_size
+        self.max_crawl_urls = max_crawl_urls
+        self.max_known_urls = max_known_urls
+
+        tools: List[Callable] = [self.extract_text, self.extract_metadata_only, self.html_to_text, self.extract_batch]
+
+        if not SPIDER_AVAILABLE:
+            logger.warning("Web crawling requested but spider module not available. Skipping crawler tool.")
+        else:
+            tools.append(self.crawl_website)
+
+        super().__init__(name="trafilatura_tools", tools=tools, **kwargs)
+
+    def _get_extraction_params(
+        self,
+        output_format: Optional[str] = None,
+        include_comments: Optional[bool] = None,
+        include_tables: Optional[bool] = None,
+        include_images: Optional[bool] = None,
+        include_formatting: Optional[bool] = None,
+        include_links: Optional[bool] = None,
+        with_metadata: Optional[bool] = None,
+        favor_precision: Optional[bool] = None,
+        favor_recall: Optional[bool] = None,
+        target_language: Optional[str] = None,
+        deduplicate: Optional[bool] = None,
+        max_tree_size: Optional[int] = None,
+        url_blacklist: Optional[Set[str]] = None,
+        author_blacklist: Optional[Set[str]] = None,
+    ) -> Dict[str, Any]:
+        """Helper method to build extraction parameters with fallbacks to instance defaults."""
+        return {
+            "output_format": output_format if output_format is not None else self.output_format,
+            "include_comments": include_comments if include_comments is not None else self.include_comments,
+            "include_tables": include_tables if include_tables is not None else self.include_tables,
+            "include_images": include_images if include_images is not None else self.include_images,
+            "include_formatting": include_formatting if include_formatting is not None else self.include_formatting,
+            "include_links": include_links if include_links is not None else self.include_links,
+            "with_metadata": with_metadata if with_metadata is not None else self.with_metadata,
+            "favor_precision": favor_precision if favor_precision is not None else self.favor_precision,
+            "favor_recall": favor_recall if favor_recall is not None else self.favor_recall,
+            "target_language": target_language if target_language is not None else self.target_language,
+            "deduplicate": deduplicate if deduplicate is not None else self.deduplicate,
+            "max_tree_size": max_tree_size if max_tree_size is not None else self.max_tree_size,
+            "url_blacklist": url_blacklist,
+            "author_blacklist": author_blacklist,
+        }
+
+    def extract_text(
+        self,
+        url: str,
+        output_format: Optional[str] = None,
+    ) -> str:
+        """
+        Extract main text content from a web page URL using Trafilatura.
+
+        Args:
+            url (str): The URL to extract content from.
+            output_format (Optional[str]): Output format. Options: 'txt', 'json', 'xml', 'markdown', 'csv', 'html', 'xmltei'.
+
+        Returns:
+            str: Extracted content in the specified format, or error message if extraction fails.
+        """
+        try:
+            log_debug(f"Extracting text from URL: {url}")
+
+            # Fetch the webpage content
+            html_content = fetch_url(url)
+            if not html_content:
+                return f"Error: Could not fetch content from URL: {url}"
+
+            # Get extraction parameters
+            params = self._get_extraction_params(output_format=output_format)
+
+            result = extract(html_content, url=url, **params)
+
+            if result is None:
+                return f"Error: Could not extract readable content from URL: {url}"
+
+            # Reset caches
+            reset_caches()
+
+            return result
+
+        except Exception as e:
+            logger.warning(f"Error extracting text from {url}: {e}")
+            return f"Error extracting text from {url}: {e}"
+
+    def extract_metadata_only(
+        self,
+        url: str,
+        as_json: bool = True,
+    ) -> str:
+        """
+        Extract only metadata from a web page URL.
+
+        Args:
+            url (str): The URL to extract metadata from.
+            as_json (bool): Whether to return metadata as JSON string.
+
+        Returns:
+            str: Extracted metadata as JSON string or formatted text.
+        """
+        try:
+            log_debug(f"Extracting metadata from URL: {url}")
+
+            # Fetch the webpage content
+            html_content = fetch_url(url)
+            if not html_content:
+                return f"Error: Could not fetch content from URL: {url}"
+
+            # Extract metadata
+            metadata_doc = extract_metadata(
+                html_content,
+                default_url=url,
+                extensive=True,  # default
+                author_blacklist=None,
+            )
+
+            if metadata_doc is None:
+                return f"Error: Could not extract metadata from URL: {url}"
+
+            metadata_dict = metadata_doc.as_dict()
+
+            # Reset caches
+            reset_caches()
+
+            if as_json:
+                return json.dumps(metadata_dict, indent=2, default=str)
+            else:
+                return "\n".join(f"{key}: {value}" for key, value in metadata_dict.items())
+
+        except Exception as e:
+            logger.warning(f"Error extracting metadata from {url}: {e}")
+            return f"Error extracting metadata from {url}: {e}"
+
+    def crawl_website(
+        self,
+        homepage_url: str,
+        extract_content: bool = False,
+    ) -> str:
+        """
+        Crawl a website and optionally extract content from discovered pages.
+
+        Args:
+            homepage_url (str): The starting URL (preferably homepage) to crawl from.
+            extract_content (bool): Whether to extract content from discovered URLs.
+
+        Returns:
+            str: JSON containing crawl results and optionally extracted content.
+        """
+        if not SPIDER_AVAILABLE:
+            return "Error: Web crawling functionality not available. Trafilatura spider module could not be imported."
+
+        try:
+            log_debug(f"Starting website crawl from: {homepage_url}")
+
+            # Use instance configuration
+            max_seen = self.max_crawl_urls
+            max_known = self.max_known_urls
+            lang = self.target_language
+
+            # Perform focused crawling
+            to_visit, known_links = focused_crawler(
+                homepage=homepage_url,
+                max_seen_urls=max_seen,
+                max_known_urls=max_known,
+                lang=lang,
+            )
+
+            crawl_results = {
+                "homepage": homepage_url,
+                "to_visit": list(to_visit) if to_visit else [],
+                "known_links": list(known_links) if known_links else [],
+                "stats": {
+                    "urls_to_visit": len(to_visit) if to_visit else 0,
+                    "known_links_count": len(known_links) if known_links else 0,
+                },
+            }
+
+            # Optionally extract content from discovered URLs
+            if extract_content and known_links:
+                log_debug("Extracting content from discovered URLs")
+                extracted_content = {}
+
+                # Limit extraction to avoid overwhelming responses
+                urls_to_extract = list(known_links)[: min(10, len(known_links))]
+
+                for url in urls_to_extract:
+                    try:
+                        params = self._get_extraction_params()
+
+                        html_content = fetch_url(url)
+                        if html_content:
+                            content = extract(html_content, url=url, **params)
+                            if content:
+                                extracted_content[url] = content
+                    except Exception as e:
+                        extracted_content[url] = f"Error extracting content: {e}"
+
+                crawl_results["extracted_content"] = extracted_content
+
+            # Reset caches
+            reset_caches()
+
+            return json.dumps(crawl_results, indent=2, default=str)
+
+        except Exception as e:
+            logger.warning(f"Error crawling website {homepage_url}: {e}")
+            return f"Error crawling website {homepage_url}: {e}"
+
+    def html_to_text(
+        self,
+        html_content: str,
+        clean: bool = True,
+    ) -> str:
+        """
+        Convert HTML content to plain text using Trafilatura's html2txt function.
+
+        Args:
+            html_content (str): The HTML content to convert.
+            clean (bool): Whether to remove potentially undesirable elements.
+
+        Returns:
+            str: Plain text extracted from HTML.
+        """
+        try:
+            log_debug("Converting HTML to text")
+
+            result = html2txt(html_content, clean=clean)
+
+            # Reset caches
+            reset_caches()
+
+            return result if result else "Error: Could not extract text from HTML content"
+
+        except Exception as e:
+            logger.warning(f"Error converting HTML to text: {e}")
+            return f"Error converting HTML to text: {e}"
+
+    def extract_batch(
+        self,
+        urls: List[str],
+    ) -> str:
+        """
+        Extract content from multiple URLs in batch.
+
+        Args:
+            urls (List[str]): List of URLs to extract content from.
+
+        Returns:
+            str: JSON containing batch extraction results.
+        """
+        try:
+            log_debug(f"Starting batch extraction for {len(urls)} URLs")
+
+            results = {}
+            failed_urls = []
+
+            for url in urls:
+                try:
+                    params = self._get_extraction_params()
+
+                    html_content = fetch_url(url)
+                    if html_content:
+                        content = extract(html_content, url=url, **params)
+                        if content:
+                            results[url] = content
+                        else:
+                            failed_urls.append(url)
+                    else:
+                        failed_urls.append(url)
+
+                except Exception as e:
+                    failed_urls.append(url)
+                    results[url] = f"Error: {e}"
+
+            # Reset caches after batch processing
+            reset_caches()
+
+            batch_results = {
+                "successful_extractions": len(results)
+                - len([k for k, v in results.items() if str(v).startswith("Error:")]),
+                "failed_extractions": len(failed_urls),
+                "total_urls": len(urls),
+                "results": results,
+                "failed_urls": failed_urls,
+            }
+
+            return json.dumps(batch_results, indent=2, default=str)
+
+        except Exception as e:
+            logger.warning(f"Error in batch extraction: {e}")
+            return f"Error in batch extraction: {e}"
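The new TrafilaturaTools toolkit registers extract_text, extract_metadata_only, html_to_text, and extract_batch as tools, plus crawl_website when trafilatura's spider module is importable. Below is a minimal sketch of exercising the toolkit directly, assuming trafilatura is installed; the URL is a placeholder.

```python
from agno.tools.trafilatura import TrafilaturaTools

# Constructor arguments map to the defaults documented in the new class.
toolkit = TrafilaturaTools(output_format="markdown", favor_precision=True)

# Placeholder URL, for illustration only.
print(toolkit.extract_text("https://example.com"))
print(toolkit.extract_metadata_only("https://example.com", as_json=True))
```

In an agno Agent the toolkit would normally be passed through the agent's tools list like any other Toolkit, which exposes the same methods as callable tools.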
agno/vectordb/clickhouse/clickhousedb.py CHANGED
@@ -140,7 +140,7 @@ class Clickhouse(VectorDb):
 
             if isinstance(self.index, HNSW):
                 index = (
-                    f"INDEX embedding_index embedding TYPE vector_similarity('hnsw', 'L2Distance', {self.index.quantization}, "
+                    f"INDEX embedding_index embedding TYPE vector_similarity('hnsw', 'L2Distance', {self.embedder.dimensions}, {self.index.quantization}, "
                     f"{self.index.hnsw_max_connections_per_layer}, {self.index.hnsw_candidate_list_size_for_construction})"
                 )
                 self.client.command("SET allow_experimental_vector_similarity_index = 1")
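The single change here inserts the embedder's dimension count into the vector_similarity index definition, ahead of the quantization setting, which appears to match the argument order recent ClickHouse builds expect for HNSW vector indexes. A sketch of the DDL fragment the updated f-string produces, with hypothetical values standing in for self.embedder.dimensions and the HNSW settings:

```python
# Hypothetical values for illustration; in the library they come from
# self.embedder.dimensions and the configured HNSW index.
dimensions = 1536
quantization = "bf16"
max_connections_per_layer = 32
candidate_list_size = 128

index = (
    f"INDEX embedding_index embedding TYPE vector_similarity('hnsw', 'L2Distance', {dimensions}, {quantization}, "
    f"{max_connections_per_layer}, {candidate_list_size})"
)
print(index)
# INDEX embedding_index embedding TYPE vector_similarity('hnsw', 'L2Distance', 1536, bf16, 32, 128)
```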
agno/vectordb/milvus/milvus.py CHANGED
@@ -568,7 +568,7 @@ class Milvus(VectorDb):
         self, query: str, limit: int = 5, filters: Optional[Dict[str, Any]] = None
     ) -> List[Document]:
         if self.search_type == SearchType.hybrid:
-            return self.hybrid_search(query, limit, filters)
+            return await self.async_hybrid_search(query, limit, filters)
 
         query_embedding = self.embedder.get_embedding(query)
         if query_embedding is None:
@@ -691,6 +691,94 @@ class Milvus(VectorDb):
             logger.error(f"Error during hybrid search: {e}")
             return []
 
+    async def async_hybrid_search(
+        self, query: str, limit: int = 5, filters: Optional[Dict[str, Any]] = None
+    ) -> List[Document]:
+        """
+        Perform an asynchronous hybrid search combining dense and sparse vector similarity.
+
+        Args:
+            query (str): Query string to search for
+            limit (int): Maximum number of results to return
+            filters (Optional[Dict[str, Any]]): Filters to apply to the search
+
+        Returns:
+            List[Document]: List of matching documents
+        """
+        from pymilvus import AnnSearchRequest, RRFRanker
+
+        # Get query embeddings
+        dense_vector = self.embedder.get_embedding(query)
+        sparse_vector = self._get_sparse_vector(query)
+
+        if dense_vector is None:
+            logger.error(f"Error getting dense embedding for Query: {query}")
+            return []
+
+        try:
+            # Refer to docs for details- https://milvus.io/docs/multi-vector-search.md
+
+            # Create search request for dense vectors
+            dense_search_param = {
+                "data": [dense_vector],
+                "anns_field": "dense_vector",
+                "param": {"metric_type": self._get_metric_type(), "params": {"nprobe": 10}},
+                "limit": limit
+                * 2,  # Fetch more candidates for better reranking quality - each vector search returns 2x results which are then merged and reranked
+            }
+
+            # Create search request for sparse vectors
+            sparse_search_param = {
+                "data": [sparse_vector],
+                "anns_field": "sparse_vector",
+                "param": {"metric_type": "IP", "params": {"drop_ratio_build": 0.2}},
+                "limit": limit * 2,  # Match dense search limit to ensure balanced candidate pool for reranking
+            }
+
+            # Create search requests
+            dense_request = AnnSearchRequest(**dense_search_param)
+            sparse_request = AnnSearchRequest(**sparse_search_param)
+            reqs = [dense_request, sparse_request]
+
+            # Use RRFRanker for balanced importance between vectors
+            ranker = RRFRanker(60)  # Default k=60
+
+            log_info("Performing async hybrid search")
+            results = await self.async_client.hybrid_search(
+                collection_name=self.collection, reqs=reqs, ranker=ranker, limit=limit, output_fields=["*"]
+            )
+
+            # Build search results
+            search_results: List[Document] = []
+            for hits in results:
+                for hit in hits:
+                    entity = hit.get("entity", {})
+                    meta_data = json.loads(entity.get("meta_data", "{}")) if entity.get("meta_data") else {}
+                    usage = json.loads(entity.get("usage", "{}")) if entity.get("usage") else None
+
+                    search_results.append(
+                        Document(
+                            id=hit.get("id"),
+                            name=entity.get("name", None),
+                            meta_data=meta_data,  # Now a dictionary
+                            content=entity.get("content", ""),
+                            embedder=self.embedder,
+                            embedding=entity.get("dense_vector", None),
+                            usage=usage,  # Now a dictionary or None
+                        )
+                    )
+
+            # Apply additional reranking if custom reranker is provided
+            if self.reranker and search_results:
+                search_results = self.reranker.rerank(query=query, documents=search_results)
+
+            log_info(f"Found {len(search_results)} documents")
+            return search_results
+
+        except Exception as e:
+            logger.error(f"Error during async hybrid search: {e}")
+            return []
+
     def drop(self) -> None:
         if self.exists():
             log_debug(f"Deleting collection: {self.collection}")
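With this change, the hybrid branch of the asynchronous search path awaits the new async_hybrid_search, which issues dense and sparse AnnSearchRequest queries and merges them with an RRFRanker before optional custom reranking. A hedged usage sketch, assuming vector_db is an already-configured agno Milvus instance created with search_type=SearchType.hybrid:

```python
import asyncio


async def demo(vector_db) -> None:
    # vector_db is assumed to be an agno Milvus instance configured with
    # search_type=SearchType.hybrid; its async search path now awaits the
    # Milvus async client instead of falling back to synchronous code.
    documents = await vector_db.async_search("how does hybrid search work?", limit=5)
    for doc in documents:
        print(doc.name, (doc.content or "")[:80])


# asyncio.run(demo(vector_db))  # run with a configured instance
```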
agno/vectordb/weaviate/weaviate.py CHANGED
@@ -2,7 +2,7 @@ import json
 import uuid
 from hashlib import md5
 from os import getenv
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List, Optional, Tuple
 
 try:
     from warnings import filterwarnings
@@ -73,6 +73,13 @@ class Weaviate(VectorDb):
         self.reranker: Optional[Reranker] = reranker
         self.hybrid_search_alpha = hybrid_search_alpha
 
+    @staticmethod
+    def _get_doc_uuid(document: Document) -> Tuple[uuid.UUID, str]:
+        cleaned_content = document.content.replace("\x00", "\ufffd")
+        content_hash = md5(cleaned_content.encode()).hexdigest()
+        doc_uuid = uuid.UUID(hex=content_hash[:32])
+        return doc_uuid, cleaned_content
+
     def get_client(self) -> weaviate.WeaviateClient:
         """Initialize and return a Weaviate client instance.
 
@@ -118,7 +125,7 @@ class Weaviate(VectorDb):
         await self.async_client.connect()  # type: ignore
 
         if not await self.async_client.is_ready():  # type: ignore
-            raise
+            raise ConnectionError("Weaviate async client is not ready")
 
         return self.async_client  # type: ignore
 
@@ -155,6 +162,54 @@ class Weaviate(VectorDb):
         finally:
             await client.close()
 
+    def doc_content_changed(self, document: Document, check_existing: Optional[bool] = True) -> Optional[bool]:
+        """
+        Check if the content of the document has changed by comparing its UUID.
+
+        Args:
+            document (Document): Document to check
+
+        Returns:
+            bool: True if the document content has changed, False otherwise. None on wrong input.
+            check_existing (bool): If True, check if the document exists before checking if the content changed.
+        """
+        if not document or not document.content:
+            logger.warning("Invalid document: Missing content.")
+            return None
+
+        if check_existing and document.name and not self.name_exists(document.name):
+            logger.warning(f"A document by this name does not exist: {document.name}")
+            return None
+
+        doc_uuid, _ = self._get_doc_uuid(document)
+
+        collection = self.get_client().collections.get(self.collection)
+        existing_doc = collection.query.fetch_object_by_id(doc_uuid)
+
+        if not existing_doc:
+            return True
+        else:
+            return False
+
+    def doc_delete(self, name: str) -> None:
+        """
+        Delete all documents from Weaviate with a specific 'name' property.
+
+        Args:
+            name (str): Document name to delete.
+        """
+        collection = self.get_client().collections.get(self.collection)
+        filter_expr = Filter.by_property("name").equal(name)
+
+        result = collection.data.delete_many(where=filter_expr)
+
+        log_debug(f"Deleted document by name: '{name}' - {result.successful} documents deleted.")
+        if result.failed > 0:
+            logger.warning(
+                f"Failed to delete (some chunks of) document with name: '{name}' - "
+                f"Failed {result.failed} out of {result.matches} times. {result.successful} successful deletions."
+            )
+
     def doc_exists(self, document: Document) -> bool:
         """
         Validate if the document exists using consistent UUID generation.
@@ -169,9 +224,7 @@ class Weaviate(VectorDb):
             logger.warning("Invalid document: Missing content.")
            return False  # Early exit for invalid input
 
-        cleaned_content = document.content.replace("\x00", "\ufffd")
-        content_hash = md5(cleaned_content.encode()).hexdigest()
-        doc_uuid = uuid.UUID(hex=content_hash[:32])
+        doc_uuid, _ = self._get_doc_uuid(document)
 
         collection = self.get_client().collections.get(self.collection)
         return collection.data.exists(doc_uuid)
@@ -190,9 +243,7 @@ class Weaviate(VectorDb):
             logger.warning("Invalid document: Missing content.")
             return False  # Early exit for invalid input
 
-        cleaned_content = document.content.replace("\x00", "\ufffd")
-        content_hash = md5(cleaned_content.encode()).hexdigest()
-        doc_uuid = uuid.UUID(hex=content_hash[:32])
+        doc_uuid, _ = self._get_doc_uuid(document)
 
         client = await self.get_async_client()
         try:
@@ -256,9 +307,7 @@ class Weaviate(VectorDb):
                     logger.error(f"Document embedding is None: {document.name}")
                    continue
 
-                cleaned_content = document.content.replace("\x00", "\ufffd")
-                content_hash = md5(cleaned_content.encode()).hexdigest()
-                doc_uuid = uuid.UUID(hex=content_hash[:32])
+                doc_uuid, cleaned_content = self._get_doc_uuid(document)
 
                 # Merge filters with metadata
                 meta_data = document.meta_data or {}
@@ -305,9 +354,7 @@ class Weaviate(VectorDb):
                    continue
 
                # Clean content and generate UUID
-                cleaned_content = document.content.replace("\x00", "\ufffd")
-                content_hash = md5(cleaned_content.encode()).hexdigest()
-                doc_uuid = uuid.UUID(hex=content_hash[:32])
+                doc_uuid, cleaned_content = self._get_doc_uuid(document)
 
                # Serialize meta_data to JSON string
                meta_data_str = json.dumps(document.meta_data) if document.meta_data else None
@@ -338,7 +385,28 @@ class Weaviate(VectorDb):
             filters (Optional[Dict[str, Any]]): Filters to apply while upserting
         """
         log_debug(f"Upserting {len(documents)} documents into Weaviate.")
-
+
+        _docs_to_insert = []
+        for document in documents:
+            assert document.name is not None, "Document name must be set for upsert operation."
+
+            if self.name_exists(document.name):
+                if self.doc_content_changed(document, check_existing=False):
+                    log_debug(
+                        f"Document already exists, but content changed. Document will be deleted and added again: {document.name}"
+                    )
+
+                    is_first_or_only_chunk = ("chunk" in document.meta_data and document.meta_data["chunk"] == 1) or (
+                        "chunk" not in document.meta_data
+                    )
+                    if is_first_or_only_chunk:
+                        self.doc_delete(document.name)
+                    _docs_to_insert.append(document)
+                else:
+                    log_debug(f"Document skipped, content is unchanged: {document.name}")
+            else:
+                _docs_to_insert.append(document)
+        self.insert(_docs_to_insert)
 
     async def async_upsert(self, documents: List[Document], filters: Optional[Dict[str, Any]] = None) -> None:
         """
@@ -365,9 +433,7 @@ class Weaviate(VectorDb):
                    logger.error(f"Document embedding is None: {document.name}")
                    continue
 
-                cleaned_content = document.content.replace("\x00", "\ufffd")
-                content_hash = md5(cleaned_content.encode()).hexdigest()
-                doc_uuid = uuid.UUID(hex=content_hash[:32])
+                doc_uuid, cleaned_content = self._get_doc_uuid(document)
 
                # Serialize meta_data to JSON string
                meta_data_str = json.dumps(document.meta_data) if document.meta_data else None
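Most of the Weaviate changes hang off the new _get_doc_uuid helper, which derives a deterministic object UUID from the null-byte-sanitized document content, and off upsert, which now deletes and re-inserts a named document only when doc_content_changed reports that the content-derived UUID no longer matches an existing object, skipping unchanged documents. A standalone sketch of the UUID derivation, mirroring the helper:

```python
import uuid
from hashlib import md5


def doc_uuid_for(content: str) -> uuid.UUID:
    # Mirrors Weaviate._get_doc_uuid: sanitize null bytes, hash the content,
    # and build a UUID from the 32 hex characters of the MD5 digest.
    cleaned = content.replace("\x00", "\ufffd")
    return uuid.UUID(hex=md5(cleaned.encode()).hexdigest()[:32])


# Identical content always maps to the same object id, which is what lets
# upsert detect unchanged chunks and skip re-inserting them.
assert doc_uuid_for("hello world") == doc_uuid_for("hello world")
assert doc_uuid_for("hello world") != doc_uuid_for("hello world!")
```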
agno/workflow/workflow.py CHANGED
@@ -369,6 +369,9 @@ class Workflow:
         if self.storage is not None:
             self.storage.mode = "workflow"
 
+    def initialize_workflow(self):
+        self.set_storage_mode()
+
     def set_workflow_id(self) -> str:
         if self.workflow_id is None:
             self.workflow_id = str(uuid4())
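The new initialize_workflow hook simply delegates to set_storage_mode, so attached storage is tagged with mode "workflow" during initialization. A minimal sketch under two assumptions: that Workflow can be instantiated bare, and that the stub storage object below stands in for a real backend from agno.storage:

```python
from agno.workflow.workflow import Workflow


class _StubStorage:
    """Placeholder for a real agno storage backend, used only to show the mode hand-off."""

    mode = None


workflow = Workflow()
workflow.storage = _StubStorage()  # in practice, a configured agno.storage backend
workflow.initialize_workflow()     # newly added: delegates to set_storage_mode()
assert workflow.storage.mode == "workflow"
```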