agno 1.7.10__py3-none-any.whl → 1.7.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,372 @@
+import json
+from typing import Any, Callable, Dict, List, Optional, Set
+
+from agno.tools import Toolkit
+from agno.utils.log import log_debug, logger
+
+try:
+    from trafilatura import (
+        extract,
+        extract_metadata,
+        fetch_url,
+        html2txt,
+    )
+    from trafilatura.meta import reset_caches
+
+    # Import spider functionality
+    try:
+        from trafilatura.spider import focused_crawler
+
+        SPIDER_AVAILABLE = True
+    except ImportError:
+        SPIDER_AVAILABLE = False
+        logger.warning("Trafilatura spider module not available. Web crawling functionality will be disabled.")
+
+except ImportError:
+    raise ImportError("`trafilatura` not installed. Please install using `pip install trafilatura`")
+
+
+class TrafilaturaTools(Toolkit):
+    """
+    TrafilaturaTools is a toolkit for web scraping and text extraction.
+
+    Args:
+        output_format (str): Default output format for extractions. Options: 'txt', 'json', 'xml', 'markdown', 'csv', 'html', 'xmltei'.
+        include_comments (bool): Whether to extract comments along with main text by default.
+        include_tables (bool): Whether to include table content by default.
+        include_images (bool): Whether to include image information by default (experimental).
+        include_formatting (bool): Whether to preserve formatting by default.
+        include_links (bool): Whether to preserve links by default (experimental).
+        with_metadata (bool): Whether to include metadata in extractions by default.
+        favor_precision (bool): Whether to prefer precision over recall by default.
+        favor_recall (bool): Whether to prefer recall over precision by default.
+        target_language (Optional[str]): Default target language filter (ISO 639-1 format).
+        deduplicate (bool): Whether to remove duplicate segments by default.
+        max_tree_size (Optional[int]): Maximum tree size for processing.
+        max_crawl_urls (int): Maximum number of URLs to crawl per website.
+        max_known_urls (int): Maximum number of known URLs during crawling.
+    """
+
+    def __init__(
+        self,
+        output_format: str = "txt",
+        include_comments: bool = True,
+        include_tables: bool = True,
+        include_images: bool = False,
+        include_formatting: bool = False,
+        include_links: bool = False,
+        with_metadata: bool = False,
+        favor_precision: bool = False,
+        favor_recall: bool = False,
+        target_language: Optional[str] = None,
+        deduplicate: bool = False,
+        max_tree_size: Optional[int] = None,
+        max_crawl_urls: int = 10,
+        max_known_urls: int = 100000,
+        **kwargs,
+    ):
+        self.output_format = output_format
+        self.include_comments = include_comments
+        self.include_tables = include_tables
+        self.include_images = include_images
+        self.include_formatting = include_formatting
+        self.include_links = include_links
+        self.with_metadata = with_metadata
+        self.favor_precision = favor_precision
+        self.favor_recall = favor_recall
+        self.target_language = target_language
+        self.deduplicate = deduplicate
+        self.max_tree_size = max_tree_size
+        self.max_crawl_urls = max_crawl_urls
+        self.max_known_urls = max_known_urls
+
+        tools: List[Callable] = [self.extract_text, self.extract_metadata_only, self.html_to_text, self.extract_batch]
+
+        if not SPIDER_AVAILABLE:
+            logger.warning("Web crawling requested but spider module not available. Skipping crawler tool.")
+        else:
+            tools.append(self.crawl_website)
+
+        super().__init__(name="trafilatura_tools", tools=tools, **kwargs)
+
+    def _get_extraction_params(
+        self,
+        output_format: Optional[str] = None,
+        include_comments: Optional[bool] = None,
+        include_tables: Optional[bool] = None,
+        include_images: Optional[bool] = None,
+        include_formatting: Optional[bool] = None,
+        include_links: Optional[bool] = None,
+        with_metadata: Optional[bool] = None,
+        favor_precision: Optional[bool] = None,
+        favor_recall: Optional[bool] = None,
+        target_language: Optional[str] = None,
+        deduplicate: Optional[bool] = None,
+        max_tree_size: Optional[int] = None,
+        url_blacklist: Optional[Set[str]] = None,
+        author_blacklist: Optional[Set[str]] = None,
+    ) -> Dict[str, Any]:
+        """Helper method to build extraction parameters with fallbacks to instance defaults."""
+        return {
+            "output_format": output_format if output_format is not None else self.output_format,
+            "include_comments": include_comments if include_comments is not None else self.include_comments,
+            "include_tables": include_tables if include_tables is not None else self.include_tables,
+            "include_images": include_images if include_images is not None else self.include_images,
+            "include_formatting": include_formatting if include_formatting is not None else self.include_formatting,
+            "include_links": include_links if include_links is not None else self.include_links,
+            "with_metadata": with_metadata if with_metadata is not None else self.with_metadata,
+            "favor_precision": favor_precision if favor_precision is not None else self.favor_precision,
+            "favor_recall": favor_recall if favor_recall is not None else self.favor_recall,
+            "target_language": target_language if target_language is not None else self.target_language,
+            "deduplicate": deduplicate if deduplicate is not None else self.deduplicate,
+            "max_tree_size": max_tree_size if max_tree_size is not None else self.max_tree_size,
+            "url_blacklist": url_blacklist,
+            "author_blacklist": author_blacklist,
+        }
+
+    def extract_text(
+        self,
+        url: str,
+        output_format: Optional[str] = None,
+    ) -> str:
+        """
+        Extract main text content from a web page URL using Trafilatura.
+
+        Args:
+            url (str): The URL to extract content from.
+            output_format (Optional[str]): Output format. Options: 'txt', 'json', 'xml', 'markdown', 'csv', 'html', 'xmltei'.
+
+        Returns:
+            str: Extracted content in the specified format, or error message if extraction fails.
+        """
+        try:
+            log_debug(f"Extracting text from URL: {url}")
+
+            # Fetch the webpage content
+            html_content = fetch_url(url)
+            if not html_content:
+                return f"Error: Could not fetch content from URL: {url}"
+
+            # Get extraction parameters
+            params = self._get_extraction_params(output_format=output_format)
+
+            result = extract(html_content, url=url, **params)
+
+            if result is None:
+                return f"Error: Could not extract readable content from URL: {url}"
+
+            # Reset caches
+            reset_caches()
+
+            return result
+
+        except Exception as e:
+            logger.warning(f"Error extracting text from {url}: {e}")
+            return f"Error extracting text from {url}: {e}"
+
+    def extract_metadata_only(
+        self,
+        url: str,
+        as_json: bool = True,
+    ) -> str:
+        """
+        Extract only metadata from a web page URL.
+
+        Args:
+            url (str): The URL to extract metadata from.
+            as_json (bool): Whether to return metadata as JSON string.
+
+        Returns:
+            str: Extracted metadata as JSON string or formatted text.
+        """
+        try:
+            log_debug(f"Extracting metadata from URL: {url}")
+
+            # Fetch the webpage content
+            html_content = fetch_url(url)
+            if not html_content:
+                return f"Error: Could not fetch content from URL: {url}"
+
+            # Extract metadata
+            metadata_doc = extract_metadata(
+                html_content,
+                default_url=url,
+                extensive=True,  # default
+                author_blacklist=None,
+            )
+
+            if metadata_doc is None:
+                return f"Error: Could not extract metadata from URL: {url}"
+
+            metadata_dict = metadata_doc.as_dict()
+
+            # Reset caches
+            reset_caches()
+
+            if as_json:
+                return json.dumps(metadata_dict, indent=2, default=str)
+            else:
+                return "\n".join(f"{key}: {value}" for key, value in metadata_dict.items())
+
+        except Exception as e:
+            logger.warning(f"Error extracting metadata from {url}: {e}")
+            return f"Error extracting metadata from {url}: {e}"
+
+    def crawl_website(
+        self,
+        homepage_url: str,
+        extract_content: bool = False,
+    ) -> str:
+        """
+        Crawl a website and optionally extract content from discovered pages.
+
+        Args:
+            homepage_url (str): The starting URL (preferably homepage) to crawl from.
+            extract_content (bool): Whether to extract content from discovered URLs.
+
+        Returns:
+            str: JSON containing crawl results and optionally extracted content.
+        """
+        if not SPIDER_AVAILABLE:
+            return "Error: Web crawling functionality not available. Trafilatura spider module could not be imported."
+
+        try:
+            log_debug(f"Starting website crawl from: {homepage_url}")
+
+            # Use instance configuration
+            max_seen = self.max_crawl_urls
+            max_known = self.max_known_urls
+            lang = self.target_language
+
+            # Perform focused crawling
+            to_visit, known_links = focused_crawler(
+                homepage=homepage_url,
+                max_seen_urls=max_seen,
+                max_known_urls=max_known,
+                lang=lang,
+            )
+
+            crawl_results = {
+                "homepage": homepage_url,
+                "to_visit": list(to_visit) if to_visit else [],
+                "known_links": list(known_links) if known_links else [],
+                "stats": {
+                    "urls_to_visit": len(to_visit) if to_visit else 0,
+                    "known_links_count": len(known_links) if known_links else 0,
+                },
+            }
+
+            # Optionally extract content from discovered URLs
+            if extract_content and known_links:
+                log_debug("Extracting content from discovered URLs")
+                extracted_content = {}
+
+                # Limit extraction to avoid overwhelming responses
+                urls_to_extract = list(known_links)[: min(10, len(known_links))]
+
+                for url in urls_to_extract:
+                    try:
+                        params = self._get_extraction_params()
+
+                        html_content = fetch_url(url)
+                        if html_content:
+                            content = extract(html_content, url=url, **params)
+                            if content:
+                                extracted_content[url] = content
+                    except Exception as e:
+                        extracted_content[url] = f"Error extracting content: {e}"
+
+                crawl_results["extracted_content"] = extracted_content
+
+            # Reset caches
+            reset_caches()
+
+            return json.dumps(crawl_results, indent=2, default=str)
+
+        except Exception as e:
+            logger.warning(f"Error crawling website {homepage_url}: {e}")
+            return f"Error crawling website {homepage_url}: {e}"
+
+    def html_to_text(
+        self,
+        html_content: str,
+        clean: bool = True,
+    ) -> str:
+        """
+        Convert HTML content to plain text using Trafilatura's html2txt function.
+
+        Args:
+            html_content (str): The HTML content to convert.
+            clean (bool): Whether to remove potentially undesirable elements.
+
+        Returns:
+            str: Plain text extracted from HTML.
+        """
+        try:
+            log_debug("Converting HTML to text")
+
+            result = html2txt(html_content, clean=clean)
+
+            # Reset caches
+            reset_caches()
+
+            return result if result else "Error: Could not extract text from HTML content"
+
+        except Exception as e:
+            logger.warning(f"Error converting HTML to text: {e}")
+            return f"Error converting HTML to text: {e}"
+
+    def extract_batch(
+        self,
+        urls: List[str],
+    ) -> str:
+        """
+        Extract content from multiple URLs in batch.
+
+        Args:
+            urls (List[str]): List of URLs to extract content from.
+
+        Returns:
+            str: JSON containing batch extraction results.
+        """
+        try:
+            log_debug(f"Starting batch extraction for {len(urls)} URLs")
+
+            results = {}
+            failed_urls = []
+
+            for url in urls:
+                try:
+                    params = self._get_extraction_params()
+
+                    html_content = fetch_url(url)
+                    if html_content:
+                        content = extract(html_content, url=url, **params)
+                        if content:
+                            results[url] = content
+                        else:
+                            failed_urls.append(url)
+                    else:
+                        failed_urls.append(url)
+
+                except Exception as e:
+                    failed_urls.append(url)
+                    results[url] = f"Error: {e}"
+
+            # Reset caches after batch processing
+            reset_caches()
+
+            batch_results = {
+                "successful_extractions": len(results)
+                - len([k for k, v in results.items() if str(v).startswith("Error:")]),
+                "failed_extractions": len(failed_urls),
+                "total_urls": len(urls),
+                "results": results,
+                "failed_urls": failed_urls,
+            }
+
+            return json.dumps(batch_results, indent=2, default=str)
+
+        except Exception as e:
+            logger.warning(f"Error in batch extraction: {e}")
+            return f"Error in batch extraction: {e}"
@@ -140,7 +140,7 @@ class Clickhouse(VectorDb):
 
         if isinstance(self.index, HNSW):
             index = (
-                f"INDEX embedding_index embedding TYPE vector_similarity('hnsw', 'L2Distance', {self.index.quantization}, "
+                f"INDEX embedding_index embedding TYPE vector_similarity('hnsw', 'L2Distance', {self.embedder.dimensions}, {self.index.quantization}, "
                 f"{self.index.hnsw_max_connections_per_layer}, {self.index.hnsw_candidate_list_size_for_construction})"
            )
            self.client.command("SET allow_experimental_vector_similarity_index = 1")
@@ -568,7 +568,7 @@ class Milvus(VectorDb):
         self, query: str, limit: int = 5, filters: Optional[Dict[str, Any]] = None
     ) -> List[Document]:
         if self.search_type == SearchType.hybrid:
-            return self.hybrid_search(query, limit, filters)
+            return await self.async_hybrid_search(query, limit, filters)
 
         query_embedding = self.embedder.get_embedding(query)
         if query_embedding is None:
@@ -691,6 +691,94 @@ class Milvus(VectorDb):
             logger.error(f"Error during hybrid search: {e}")
             return []
 
+    async def async_hybrid_search(
+        self, query: str, limit: int = 5, filters: Optional[Dict[str, Any]] = None
+    ) -> List[Document]:
+        """
+        Perform an asynchronous hybrid search combining dense and sparse vector similarity.
+
+        Args:
+            query (str): Query string to search for
+            limit (int): Maximum number of results to return
+            filters (Optional[Dict[str, Any]]): Filters to apply to the search
+
+        Returns:
+            List[Document]: List of matching documents
+        """
+        from pymilvus import AnnSearchRequest, RRFRanker
+
+        # Get query embeddings
+        dense_vector = self.embedder.get_embedding(query)
+        sparse_vector = self._get_sparse_vector(query)
+
+        if dense_vector is None:
+            logger.error(f"Error getting dense embedding for Query: {query}")
+            return []
+
+        try:
+            # Refer to docs for details- https://milvus.io/docs/multi-vector-search.md
+
+            # Create search request for dense vectors
+            dense_search_param = {
+                "data": [dense_vector],
+                "anns_field": "dense_vector",
+                "param": {"metric_type": self._get_metric_type(), "params": {"nprobe": 10}},
+                "limit": limit
+                * 2,  # Fetch more candidates for better reranking quality - each vector search returns 2x results which are then merged and reranked
+            }
+
+            # Create search request for sparse vectors
+            sparse_search_param = {
+                "data": [sparse_vector],
+                "anns_field": "sparse_vector",
+                "param": {"metric_type": "IP", "params": {"drop_ratio_build": 0.2}},
+                "limit": limit * 2,  # Match dense search limit to ensure balanced candidate pool for reranking
+            }
+
+            # Create search requests
+            dense_request = AnnSearchRequest(**dense_search_param)
+            sparse_request = AnnSearchRequest(**sparse_search_param)
+            reqs = [dense_request, sparse_request]
+
+            # Use RRFRanker for balanced importance between vectors
+            ranker = RRFRanker(60)  # Default k=60
+
+            log_info("Performing async hybrid search")
+            results = await self.async_client.hybrid_search(
+                collection_name=self.collection, reqs=reqs, ranker=ranker, limit=limit, output_fields=["*"]
+            )
+
+            # Build search results
+            search_results: List[Document] = []
+            for hits in results:
+                for hit in hits:
+                    entity = hit.get("entity", {})
+                    meta_data = json.loads(entity.get("meta_data", "{}")) if entity.get("meta_data") else {}
+                    usage = json.loads(entity.get("usage", "{}")) if entity.get("usage") else None
+
+                    search_results.append(
+                        Document(
+                            id=hit.get("id"),
+                            name=entity.get("name", None),
+                            meta_data=meta_data,  # Now a dictionary
+                            content=entity.get("content", ""),
+                            embedder=self.embedder,
+                            embedding=entity.get("dense_vector", None),
+                            usage=usage,  # Now a dictionary or None
+                        )
+                    )
+
+            # Apply additional reranking if custom reranker is provided
+            if self.reranker and search_results:
+                search_results = self.reranker.rerank(query=query, documents=search_results)
+
+            log_info(f"Found {len(search_results)} documents")
+            return search_results
+
+        except Exception as e:
+            logger.error(f"Error during async hybrid search: {e}")
+            return []
+
     def drop(self) -> None:
         if self.exists():
             log_debug(f"Deleting collection: {self.collection}")
agno/workflow/workflow.py CHANGED
@@ -369,6 +369,9 @@ class Workflow:
         if self.storage is not None:
             self.storage.mode = "workflow"
 
+    def initialize_workflow(self):
+        self.set_storage_mode()
+
     def set_workflow_id(self) -> str:
         if self.workflow_id is None:
             self.workflow_id = str(uuid4())
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: agno
-Version: 1.7.10
+Version: 1.7.11
 Summary: Agno: a lightweight library for building Multi-Agent Systems
 Author-email: Ashpreet Bedi <ashpreet@agno.com>
 License: Copyright (c) Agno, Inc.
@@ -549,6 +549,8 @@ Provides-Extra: daytona
 Requires-Dist: daytona; extra == "daytona"
 Provides-Extra: oxylabs
 Requires-Dist: oxylabs; extra == "oxylabs"
+Provides-Extra: trafilatura
+Requires-Dist: trafilatura; extra == "trafilatura"
 Provides-Extra: sql
 Requires-Dist: sqlalchemy; extra == "sql"
 Provides-Extra: postgres
@@ -664,6 +666,7 @@ Requires-Dist: agno[zep]; extra == "tools"
 Requires-Dist: agno[mem0]; extra == "tools"
 Requires-Dist: agno[google_bigquery]; extra == "tools"
 Requires-Dist: agno[psycopg]; extra == "tools"
+Requires-Dist: agno[trafilatura]; extra == "tools"
 Provides-Extra: storage
 Requires-Dist: agno[sql]; extra == "storage"
 Requires-Dist: agno[postgres]; extra == "storage"
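Together with the new toolkit, these METADATA changes register trafilatura as an optional dependency, so the extraction backend can be pulled in either directly or through the aggregate tools extra, for example:

pip install "agno[trafilatura]"    # or: pip install "agno[tools]"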