agno 1.7.9__py3-none-any.whl → 1.7.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. agno/agent/agent.py +1 -1
  2. agno/app/fastapi/app.py +3 -1
  3. agno/app/fastapi/async_router.py +1 -1
  4. agno/app/playground/app.py +1 -0
  5. agno/document/chunking/semantic.py +1 -3
  6. agno/document/reader/markdown_reader.py +2 -7
  7. agno/document/reader/pdf_reader.py +69 -13
  8. agno/document/reader/text_reader.py +2 -2
  9. agno/knowledge/agent.py +70 -75
  10. agno/knowledge/markdown.py +15 -2
  11. agno/knowledge/pdf.py +32 -8
  12. agno/knowledge/pdf_url.py +13 -5
  13. agno/knowledge/website.py +4 -1
  14. agno/media.py +2 -0
  15. agno/models/aws/bedrock.py +51 -21
  16. agno/models/dashscope/__init__.py +5 -0
  17. agno/models/dashscope/dashscope.py +81 -0
  18. agno/models/openai/chat.py +3 -0
  19. agno/models/openai/responses.py +53 -7
  20. agno/models/qwen/__init__.py +5 -0
  21. agno/run/response.py +4 -0
  22. agno/run/team.py +4 -0
  23. agno/storage/in_memory.py +234 -0
  24. agno/team/team.py +25 -9
  25. agno/tools/brandfetch.py +210 -0
  26. agno/tools/github.py +46 -18
  27. agno/tools/trafilatura.py +372 -0
  28. agno/vectordb/clickhouse/clickhousedb.py +1 -1
  29. agno/vectordb/milvus/milvus.py +89 -1
  30. agno/vectordb/weaviate/weaviate.py +84 -18
  31. agno/workflow/workflow.py +3 -0
  32. {agno-1.7.9.dist-info → agno-1.7.11.dist-info}/METADATA +5 -1
  33. {agno-1.7.9.dist-info → agno-1.7.11.dist-info}/RECORD +37 -31
  34. {agno-1.7.9.dist-info → agno-1.7.11.dist-info}/WHEEL +0 -0
  35. {agno-1.7.9.dist-info → agno-1.7.11.dist-info}/entry_points.txt +0 -0
  36. {agno-1.7.9.dist-info → agno-1.7.11.dist-info}/licenses/LICENSE +0 -0
  37. {agno-1.7.9.dist-info → agno-1.7.11.dist-info}/top_level.txt +0 -0
agno/tools/trafilatura.py ADDED
@@ -0,0 +1,372 @@
+ import json
+ from typing import Any, Callable, Dict, List, Optional, Set
+
+ from agno.tools import Toolkit
+ from agno.utils.log import log_debug, logger
+
+ try:
+     from trafilatura import (
+         extract,
+         extract_metadata,
+         fetch_url,
+         html2txt,
+     )
+     from trafilatura.meta import reset_caches
+
+     # Import spider functionality
+     try:
+         from trafilatura.spider import focused_crawler
+
+         SPIDER_AVAILABLE = True
+     except ImportError:
+         SPIDER_AVAILABLE = False
+         logger.warning("Trafilatura spider module not available. Web crawling functionality will be disabled.")
+
+ except ImportError:
+     raise ImportError("`trafilatura` not installed. Please install using `pip install trafilatura`")
+
+
+ class TrafilaturaTools(Toolkit):
+     """
+     TrafilaturaTools is a toolkit for web scraping and text extraction.
+
+     Args:
+         output_format (str): Default output format for extractions. Options: 'txt', 'json', 'xml', 'markdown', 'csv', 'html', 'xmltei'.
+         include_comments (bool): Whether to extract comments along with main text by default.
+         include_tables (bool): Whether to include table content by default.
+         include_images (bool): Whether to include image information by default (experimental).
+         include_formatting (bool): Whether to preserve formatting by default.
+         include_links (bool): Whether to preserve links by default (experimental).
+         with_metadata (bool): Whether to include metadata in extractions by default.
+         favor_precision (bool): Whether to prefer precision over recall by default.
+         favor_recall (bool): Whether to prefer recall over precision by default.
+         target_language (Optional[str]): Default target language filter (ISO 639-1 format).
+         deduplicate (bool): Whether to remove duplicate segments by default.
+         max_tree_size (Optional[int]): Maximum tree size for processing.
+         max_crawl_urls (int): Maximum number of URLs to crawl per website.
+         max_known_urls (int): Maximum number of known URLs during crawling.
+     """
+
+     def __init__(
+         self,
+         output_format: str = "txt",
+         include_comments: bool = True,
+         include_tables: bool = True,
+         include_images: bool = False,
+         include_formatting: bool = False,
+         include_links: bool = False,
+         with_metadata: bool = False,
+         favor_precision: bool = False,
+         favor_recall: bool = False,
+         target_language: Optional[str] = None,
+         deduplicate: bool = False,
+         max_tree_size: Optional[int] = None,
+         max_crawl_urls: int = 10,
+         max_known_urls: int = 100000,
+         **kwargs,
+     ):
+         self.output_format = output_format
+         self.include_comments = include_comments
+         self.include_tables = include_tables
+         self.include_images = include_images
+         self.include_formatting = include_formatting
+         self.include_links = include_links
+         self.with_metadata = with_metadata
+         self.favor_precision = favor_precision
+         self.favor_recall = favor_recall
+         self.target_language = target_language
+         self.deduplicate = deduplicate
+         self.max_tree_size = max_tree_size
+         self.max_crawl_urls = max_crawl_urls
+         self.max_known_urls = max_known_urls
+
+         tools: List[Callable] = [self.extract_text, self.extract_metadata_only, self.html_to_text, self.extract_batch]
+
+         if not SPIDER_AVAILABLE:
+             logger.warning("Web crawling requested but spider module not available. Skipping crawler tool.")
+         else:
+             tools.append(self.crawl_website)
+
+         super().__init__(name="trafilatura_tools", tools=tools, **kwargs)
+
+     def _get_extraction_params(
+         self,
+         output_format: Optional[str] = None,
+         include_comments: Optional[bool] = None,
+         include_tables: Optional[bool] = None,
+         include_images: Optional[bool] = None,
+         include_formatting: Optional[bool] = None,
+         include_links: Optional[bool] = None,
+         with_metadata: Optional[bool] = None,
+         favor_precision: Optional[bool] = None,
+         favor_recall: Optional[bool] = None,
+         target_language: Optional[str] = None,
+         deduplicate: Optional[bool] = None,
+         max_tree_size: Optional[int] = None,
+         url_blacklist: Optional[Set[str]] = None,
+         author_blacklist: Optional[Set[str]] = None,
+     ) -> Dict[str, Any]:
+         """Helper method to build extraction parameters with fallbacks to instance defaults."""
+         return {
+             "output_format": output_format if output_format is not None else self.output_format,
+             "include_comments": include_comments if include_comments is not None else self.include_comments,
+             "include_tables": include_tables if include_tables is not None else self.include_tables,
+             "include_images": include_images if include_images is not None else self.include_images,
+             "include_formatting": include_formatting if include_formatting is not None else self.include_formatting,
+             "include_links": include_links if include_links is not None else self.include_links,
+             "with_metadata": with_metadata if with_metadata is not None else self.with_metadata,
+             "favor_precision": favor_precision if favor_precision is not None else self.favor_precision,
+             "favor_recall": favor_recall if favor_recall is not None else self.favor_recall,
+             "target_language": target_language if target_language is not None else self.target_language,
+             "deduplicate": deduplicate if deduplicate is not None else self.deduplicate,
+             "max_tree_size": max_tree_size if max_tree_size is not None else self.max_tree_size,
+             "url_blacklist": url_blacklist,
+             "author_blacklist": author_blacklist,
+         }
+
+     def extract_text(
+         self,
+         url: str,
+         output_format: Optional[str] = None,
+     ) -> str:
+         """
+         Extract main text content from a web page URL using Trafilatura.
+
+         Args:
+             url (str): The URL to extract content from.
+             output_format (Optional[str]): Output format. Options: 'txt', 'json', 'xml', 'markdown', 'csv', 'html', 'xmltei'.
+
+         Returns:
+             str: Extracted content in the specified format, or error message if extraction fails.
+         """
+         try:
+             log_debug(f"Extracting text from URL: {url}")
+
+             # Fetch the webpage content
+             html_content = fetch_url(url)
+             if not html_content:
+                 return f"Error: Could not fetch content from URL: {url}"
+
+             # Get extraction parameters
+             params = self._get_extraction_params(output_format=output_format)
+
+             result = extract(html_content, url=url, **params)
+
+             if result is None:
+                 return f"Error: Could not extract readable content from URL: {url}"
+
+             # Reset caches
+             reset_caches()
+
+             return result
+
+         except Exception as e:
+             logger.warning(f"Error extracting text from {url}: {e}")
+             return f"Error extracting text from {url}: {e}"
+
+     def extract_metadata_only(
+         self,
+         url: str,
+         as_json: bool = True,
+     ) -> str:
+         """
+         Extract only metadata from a web page URL.
+
+         Args:
+             url (str): The URL to extract metadata from.
+             as_json (bool): Whether to return metadata as JSON string.
+
+         Returns:
+             str: Extracted metadata as JSON string or formatted text.
+         """
+         try:
+             log_debug(f"Extracting metadata from URL: {url}")
+
+             # Fetch the webpage content
+             html_content = fetch_url(url)
+             if not html_content:
+                 return f"Error: Could not fetch content from URL: {url}"
+
+             # Extract metadata
+             metadata_doc = extract_metadata(
+                 html_content,
+                 default_url=url,
+                 extensive=True,  # default
+                 author_blacklist=None,
+             )
+
+             if metadata_doc is None:
+                 return f"Error: Could not extract metadata from URL: {url}"
+
+             metadata_dict = metadata_doc.as_dict()
+
+             # Reset caches
+             reset_caches()
+
+             if as_json:
+                 return json.dumps(metadata_dict, indent=2, default=str)
+             else:
+                 return "\n".join(f"{key}: {value}" for key, value in metadata_dict.items())
+
+         except Exception as e:
+             logger.warning(f"Error extracting metadata from {url}: {e}")
+             return f"Error extracting metadata from {url}: {e}"
+
+     def crawl_website(
+         self,
+         homepage_url: str,
+         extract_content: bool = False,
+     ) -> str:
+         """
+         Crawl a website and optionally extract content from discovered pages.
+
+         Args:
+             homepage_url (str): The starting URL (preferably homepage) to crawl from.
+             extract_content (bool): Whether to extract content from discovered URLs.
+
+         Returns:
+             str: JSON containing crawl results and optionally extracted content.
+         """
+         if not SPIDER_AVAILABLE:
+             return "Error: Web crawling functionality not available. Trafilatura spider module could not be imported."
+
+         try:
+             log_debug(f"Starting website crawl from: {homepage_url}")
+
+             # Use instance configuration
+             max_seen = self.max_crawl_urls
+             max_known = self.max_known_urls
+             lang = self.target_language
+
+             # Perform focused crawling
+             to_visit, known_links = focused_crawler(
+                 homepage=homepage_url,
+                 max_seen_urls=max_seen,
+                 max_known_urls=max_known,
+                 lang=lang,
+             )
+
+             crawl_results = {
+                 "homepage": homepage_url,
+                 "to_visit": list(to_visit) if to_visit else [],
+                 "known_links": list(known_links) if known_links else [],
+                 "stats": {
+                     "urls_to_visit": len(to_visit) if to_visit else 0,
+                     "known_links_count": len(known_links) if known_links else 0,
+                 },
+             }
+
+             # Optionally extract content from discovered URLs
+             if extract_content and known_links:
+                 log_debug("Extracting content from discovered URLs")
+                 extracted_content = {}
+
+                 # Limit extraction to avoid overwhelming responses
+                 urls_to_extract = list(known_links)[: min(10, len(known_links))]
+
+                 for url in urls_to_extract:
+                     try:
+                         params = self._get_extraction_params()
+
+                         html_content = fetch_url(url)
+                         if html_content:
+                             content = extract(html_content, url=url, **params)
+                             if content:
+                                 extracted_content[url] = content
+                     except Exception as e:
+                         extracted_content[url] = f"Error extracting content: {e}"
+
+                 crawl_results["extracted_content"] = extracted_content
+
+             # Reset caches
+             reset_caches()
+
+             return json.dumps(crawl_results, indent=2, default=str)
+
+         except Exception as e:
+             logger.warning(f"Error crawling website {homepage_url}: {e}")
+             return f"Error crawling website {homepage_url}: {e}"
+
+     def html_to_text(
+         self,
+         html_content: str,
+         clean: bool = True,
+     ) -> str:
+         """
+         Convert HTML content to plain text using Trafilatura's html2txt function.
+
+         Args:
+             html_content (str): The HTML content to convert.
+             clean (bool): Whether to remove potentially undesirable elements.
+
+         Returns:
+             str: Plain text extracted from HTML.
+         """
+         try:
+             log_debug("Converting HTML to text")
+
+             result = html2txt(html_content, clean=clean)
+
+             # Reset caches
+             reset_caches()
+
+             return result if result else "Error: Could not extract text from HTML content"
+
+         except Exception as e:
+             logger.warning(f"Error converting HTML to text: {e}")
+             return f"Error converting HTML to text: {e}"
+
+     def extract_batch(
+         self,
+         urls: List[str],
+     ) -> str:
+         """
+         Extract content from multiple URLs in batch.
+
+         Args:
+             urls (List[str]): List of URLs to extract content from.
+
+         Returns:
+             str: JSON containing batch extraction results.
+         """
+         try:
+             log_debug(f"Starting batch extraction for {len(urls)} URLs")
+
+             results = {}
+             failed_urls = []
+
+             for url in urls:
+                 try:
+                     params = self._get_extraction_params()
+
+                     html_content = fetch_url(url)
+                     if html_content:
+                         content = extract(html_content, url=url, **params)
+                         if content:
+                             results[url] = content
+                         else:
+                             failed_urls.append(url)
+                     else:
+                         failed_urls.append(url)
+
+                 except Exception as e:
+                     failed_urls.append(url)
+                     results[url] = f"Error: {e}"
+
+             # Reset caches after batch processing
+             reset_caches()
+
+             batch_results = {
+                 "successful_extractions": len(results)
+                 - len([k for k, v in results.items() if str(v).startswith("Error:")]),
+                 "failed_extractions": len(failed_urls),
+                 "total_urls": len(urls),
+                 "results": results,
+                 "failed_urls": failed_urls,
+             }
+
+             return json.dumps(batch_results, indent=2, default=str)
+
+         except Exception as e:
+             logger.warning(f"Error in batch extraction: {e}")
+             return f"Error in batch extraction: {e}"
agno/vectordb/clickhouse/clickhousedb.py CHANGED
@@ -140,7 +140,7 @@ class Clickhouse(VectorDb):
 
         if isinstance(self.index, HNSW):
            index = (
-                 f"INDEX embedding_index embedding TYPE vector_similarity('hnsw', 'L2Distance', {self.index.quantization}, "
+                 f"INDEX embedding_index embedding TYPE vector_similarity('hnsw', 'L2Distance', {self.embedder.dimensions}, {self.index.quantization}, "
                f"{self.index.hnsw_max_connections_per_layer}, {self.index.hnsw_candidate_list_size_for_construction})"
            )
            self.client.command("SET allow_experimental_vector_similarity_index = 1")
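
The one-line ClickHouse fix inserts the embedder's dimension count into the generated vector_similarity(...) index clause, ahead of the quantization setting. A hypothetical rendering of the resulting DDL fragment, with example values that are not from the diff:

# Example values only; the f-string mirrors the updated code above.
dimensions = 1536          # stands in for self.embedder.dimensions
quantization = "bf16"      # stands in for self.index.quantization
max_connections = 32       # stands in for self.index.hnsw_max_connections_per_layer
candidate_list_size = 128  # stands in for self.index.hnsw_candidate_list_size_for_construction

index = (
    f"INDEX embedding_index embedding TYPE vector_similarity('hnsw', 'L2Distance', {dimensions}, {quantization}, "
    f"{max_connections}, {candidate_list_size})"
)
print(index)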
agno/vectordb/milvus/milvus.py CHANGED
@@ -568,7 +568,7 @@ class Milvus(VectorDb):
        self, query: str, limit: int = 5, filters: Optional[Dict[str, Any]] = None
    ) -> List[Document]:
        if self.search_type == SearchType.hybrid:
-             return self.hybrid_search(query, limit, filters)
+             return await self.async_hybrid_search(query, limit, filters)
 
        query_embedding = self.embedder.get_embedding(query)
        if query_embedding is None:
@@ -691,6 +691,94 @@ class Milvus(VectorDb):
            logger.error(f"Error during hybrid search: {e}")
            return []
 
+     async def async_hybrid_search(
+         self, query: str, limit: int = 5, filters: Optional[Dict[str, Any]] = None
+     ) -> List[Document]:
+         """
+         Perform an asynchronous hybrid search combining dense and sparse vector similarity.
+
+         Args:
+             query (str): Query string to search for
+             limit (int): Maximum number of results to return
+             filters (Optional[Dict[str, Any]]): Filters to apply to the search
+
+         Returns:
+             List[Document]: List of matching documents
+         """
+         from pymilvus import AnnSearchRequest, RRFRanker
+
+         # Get query embeddings
+         dense_vector = self.embedder.get_embedding(query)
+         sparse_vector = self._get_sparse_vector(query)
+
+         if dense_vector is None:
+             logger.error(f"Error getting dense embedding for Query: {query}")
+             return []
+
+         try:
+             # Refer to docs for details - https://milvus.io/docs/multi-vector-search.md
+
+             # Create search request for dense vectors
+             dense_search_param = {
+                 "data": [dense_vector],
+                 "anns_field": "dense_vector",
+                 "param": {"metric_type": self._get_metric_type(), "params": {"nprobe": 10}},
+                 "limit": limit
+                 * 2,  # Fetch more candidates for better reranking quality - each vector search returns 2x results which are then merged and reranked
+             }
+
+             # Create search request for sparse vectors
+             sparse_search_param = {
+                 "data": [sparse_vector],
+                 "anns_field": "sparse_vector",
+                 "param": {"metric_type": "IP", "params": {"drop_ratio_build": 0.2}},
+                 "limit": limit * 2,  # Match dense search limit to ensure balanced candidate pool for reranking
+             }
+
+             # Create search requests
+             dense_request = AnnSearchRequest(**dense_search_param)
+             sparse_request = AnnSearchRequest(**sparse_search_param)
+             reqs = [dense_request, sparse_request]
+
+             # Use RRFRanker for balanced importance between vectors
+             ranker = RRFRanker(60)  # Default k=60
+
+             log_info("Performing async hybrid search")
+             results = await self.async_client.hybrid_search(
+                 collection_name=self.collection, reqs=reqs, ranker=ranker, limit=limit, output_fields=["*"]
+             )
+
+             # Build search results
+             search_results: List[Document] = []
+             for hits in results:
+                 for hit in hits:
+                     entity = hit.get("entity", {})
+                     meta_data = json.loads(entity.get("meta_data", "{}")) if entity.get("meta_data") else {}
+                     usage = json.loads(entity.get("usage", "{}")) if entity.get("usage") else None
+
+                     search_results.append(
+                         Document(
+                             id=hit.get("id"),
+                             name=entity.get("name", None),
+                             meta_data=meta_data,  # Now a dictionary
+                             content=entity.get("content", ""),
+                             embedder=self.embedder,
+                             embedding=entity.get("dense_vector", None),
+                             usage=usage,  # Now a dictionary or None
+                         )
+                     )
+
+             # Apply additional reranking if custom reranker is provided
+             if self.reranker and search_results:
+                 search_results = self.reranker.rerank(query=query, documents=search_results)
+
+             log_info(f"Found {len(search_results)} documents")
+             return search_results
+
+         except Exception as e:
+             logger.error(f"Error during async hybrid search: {e}")
+             return []
+
    def drop(self) -> None:
        if self.exists():
            log_debug(f"Deleting collection: {self.collection}")
agno/vectordb/weaviate/weaviate.py CHANGED
@@ -2,7 +2,7 @@ import json
 import uuid
 from hashlib import md5
 from os import getenv
- from typing import Any, Dict, List, Optional
+ from typing import Any, Dict, List, Optional, Tuple
 
 try:
     from warnings import filterwarnings
@@ -73,6 +73,13 @@ class Weaviate(VectorDb):
        self.reranker: Optional[Reranker] = reranker
        self.hybrid_search_alpha = hybrid_search_alpha
 
+     @staticmethod
+     def _get_doc_uuid(document: Document) -> Tuple[uuid.UUID, str]:
+         cleaned_content = document.content.replace("\x00", "\ufffd")
+         content_hash = md5(cleaned_content.encode()).hexdigest()
+         doc_uuid = uuid.UUID(hex=content_hash[:32])
+         return doc_uuid, cleaned_content
+
    def get_client(self) -> weaviate.WeaviateClient:
        """Initialize and return a Weaviate client instance.
 
@@ -118,7 +125,7 @@
        await self.async_client.connect()  # type: ignore
 
        if not await self.async_client.is_ready():  # type: ignore
-             raise Exception("Weaviate async client is not ready")
+             raise ConnectionError("Weaviate async client is not ready")
 
        return self.async_client  # type: ignore
 
@@ -155,6 +162,54 @@
        finally:
            await client.close()
 
+     def doc_content_changed(self, document: Document, check_existing: Optional[bool] = True) -> Optional[bool]:
+         """
+         Check if the content of the document has changed by comparing its UUID.
+
+         Args:
+             document (Document): Document to check
+             check_existing (bool): If True, check that the document exists before checking whether the content changed.
+
+         Returns:
+             bool: True if the document content has changed, False otherwise. None on invalid input.
+         """
+         if not document or not document.content:
+             logger.warning("Invalid document: Missing content.")
+             return None
+
+         if check_existing and document.name and not self.name_exists(document.name):
+             logger.warning(f"A document by this name does not exist: {document.name}")
+             return None
+
+         doc_uuid, _ = self._get_doc_uuid(document)
+
+         collection = self.get_client().collections.get(self.collection)
+         existing_doc = collection.query.fetch_object_by_id(doc_uuid)
+
+         if not existing_doc:
+             return True
+         else:
+             return False
+
+     def doc_delete(self, name: str) -> None:
+         """
+         Delete all documents from Weaviate with a specific 'name' property.
+
+         Args:
+             name (str): Document name to delete.
+         """
+         collection = self.get_client().collections.get(self.collection)
+         filter_expr = Filter.by_property("name").equal(name)
+
+         result = collection.data.delete_many(where=filter_expr)
+
+         log_debug(f"Deleted document by name: '{name}' - {result.successful} documents deleted.")
+         if result.failed > 0:
+             logger.warning(
+                 f"Failed to delete (some chunks of) document with name: '{name}' - "
+                 f"Failed {result.failed} out of {result.matches} times. {result.successful} successful deletions."
+             )
+
    def doc_exists(self, document: Document) -> bool:
        """
        Validate if the document exists using consistent UUID generation.
@@ -169,9 +224,7 @@
            logger.warning("Invalid document: Missing content.")
            return False  # Early exit for invalid input
 
-         cleaned_content = document.content.replace("\x00", "\ufffd")
-         content_hash = md5(cleaned_content.encode()).hexdigest()
-         doc_uuid = uuid.UUID(hex=content_hash[:32])
+         doc_uuid, _ = self._get_doc_uuid(document)
 
        collection = self.get_client().collections.get(self.collection)
        return collection.data.exists(doc_uuid)
@@ -190,9 +243,7 @@
            logger.warning("Invalid document: Missing content.")
            return False  # Early exit for invalid input
 
-         cleaned_content = document.content.replace("\x00", "\ufffd")
-         content_hash = md5(cleaned_content.encode()).hexdigest()
-         doc_uuid = uuid.UUID(hex=content_hash[:32])
+         doc_uuid, _ = self._get_doc_uuid(document)
 
        client = await self.get_async_client()
        try:
@@ -256,9 +307,7 @@
                logger.error(f"Document embedding is None: {document.name}")
                continue
 
-             cleaned_content = document.content.replace("\x00", "\ufffd")
-             content_hash = md5(cleaned_content.encode()).hexdigest()
-             doc_uuid = uuid.UUID(hex=content_hash[:32])
+             doc_uuid, cleaned_content = self._get_doc_uuid(document)
 
            # Merge filters with metadata
            meta_data = document.meta_data or {}
@@ -305,9 +354,7 @@
                continue
 
            # Clean content and generate UUID
-             cleaned_content = document.content.replace("\x00", "\ufffd")
-             content_hash = md5(cleaned_content.encode()).hexdigest()
-             doc_uuid = uuid.UUID(hex=content_hash[:32])
+             doc_uuid, cleaned_content = self._get_doc_uuid(document)
 
            # Serialize meta_data to JSON string
            meta_data_str = json.dumps(document.meta_data) if document.meta_data else None
@@ -338,7 +385,28 @@
            filters (Optional[Dict[str, Any]]): Filters to apply while upserting
        """
        log_debug(f"Upserting {len(documents)} documents into Weaviate.")
-         self.insert(documents)
+
+         _docs_to_insert = []
+         for document in documents:
+             assert document.name is not None, "Document name must be set for upsert operation."
+
+             if self.name_exists(document.name):
+                 if self.doc_content_changed(document, check_existing=False):
+                     log_debug(
+                         f"Document already exists, but content changed. Document will be deleted and added again: {document.name}"
+                     )
+
+                     is_first_or_only_chunk = ("chunk" in document.meta_data and document.meta_data["chunk"] == 1) or (
+                         "chunk" not in document.meta_data
+                     )
+                     if is_first_or_only_chunk:
+                         self.doc_delete(document.name)
+                     _docs_to_insert.append(document)
+                 else:
+                     log_debug(f"Document skipped, content is unchanged: {document.name}")
+             else:
+                 _docs_to_insert.append(document)
+         self.insert(_docs_to_insert)
 
    async def async_upsert(self, documents: List[Document], filters: Optional[Dict[str, Any]] = None) -> None:
        """
@@ -365,9 +433,7 @@
                logger.error(f"Document embedding is None: {document.name}")
                continue
 
-             cleaned_content = document.content.replace("\x00", "\ufffd")
-             content_hash = md5(cleaned_content.encode()).hexdigest()
-             doc_uuid = uuid.UUID(hex=content_hash[:32])
+             doc_uuid, cleaned_content = self._get_doc_uuid(document)
 
            # Serialize meta_data to JSON string
            meta_data_str = json.dumps(document.meta_data) if document.meta_data else None
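
The new upsert logic hinges on deterministic, content-derived object ids. A small illustrative sketch (standard library only, not part of the diff) of the scheme implemented by _get_doc_uuid above:

import uuid
from hashlib import md5

def content_uuid(content: str) -> uuid.UUID:
    # Mirror the cleanup in _get_doc_uuid: replace NUL bytes before hashing.
    cleaned = content.replace("\x00", "\ufffd")
    # An md5 hex digest is exactly 32 hex characters, i.e. one full UUID.
    return uuid.UUID(hex=md5(cleaned.encode()).hexdigest()[:32])

# Identical content always maps to the same id, so an unchanged document can be
# found via fetch_object_by_id and skipped, while changed content gets a new id.
assert content_uuid("hello world") == content_uuid("hello world")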
agno/workflow/workflow.py CHANGED
@@ -369,6 +369,9 @@ class Workflow:
        if self.storage is not None:
            self.storage.mode = "workflow"
 
+     def initialize_workflow(self):
+         self.set_storage_mode()
+
    def set_workflow_id(self) -> str:
        if self.workflow_id is None:
            self.workflow_id = str(uuid4())