agno 2.4.4__py3-none-any.whl → 2.4.6__py3-none-any.whl

This diff shows the changes between publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
@@ -823,7 +823,13 @@ class Knowledge:
                     log_warning(f"Invalid filter key: {key} - not present in knowledge base")
 
         elif isinstance(filters, List):
-            # Validate that list contains FilterExpr instances
+            # Validate list filters against known metadata keys
+            if valid_metadata_filters is None or not valid_metadata_filters:
+                # Can't validate keys without metadata - return original list
+                log_warning("No valid metadata filters tracked yet. Cannot validate list filter keys.")
+                return filters, []
+
+            valid_list_filters: List[FilterExpr] = []
             for i, filter_item in enumerate(filters):
                 if not isinstance(filter_item, FilterExpr):
                     log_warning(
@@ -832,9 +838,23 @@ class Knowledge:
                         f"Use filter expressions like EQ('key', 'value'), IN('key', [values]), "
                         f"AND(...), OR(...), NOT(...) from agno.filters"
                     )
-            # Filter expressions are already validated, return empty dict/list
-            # The actual filtering happens in the vector_db layer
-            return filters, []
+                    continue
+
+                # Check if filter has a key attribute and validate it
+                if hasattr(filter_item, "key"):
+                    key = filter_item.key
+                    base_key = key.split(".")[-1] if "." in key else key
+                    if base_key in valid_metadata_filters or key in valid_metadata_filters:
+                        valid_list_filters.append(filter_item)
+                    else:
+                        invalid_keys.append(key)
+                        log_warning(f"Invalid filter key: {key} - not present in knowledge base")
+                else:
+                    # Complex filters (AND, OR, NOT) - keep them as-is
+                    # They contain nested filters that will be validated by the vector DB
+                    valid_list_filters.append(filter_item)
+
+            return valid_list_filters, invalid_keys
 
         return valid_filters, invalid_keys
 
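A rough, stand-alone sketch of what the new list handling amounts to (the function name keep_valid_filters and the tracked_keys argument are illustrative, not part of the agno API; EQ, IN, AND, OR, NOT are the filter expressions from agno.filters referenced above):

    def keep_valid_filters(filters, tracked_keys):
        # Illustrative stand-alone version of the list branch above. Leaf
        # expressions (EQ, IN, ... from agno.filters) expose a .key attribute;
        # compound expressions (AND/OR/NOT) do not and are passed through so
        # the vector DB can validate their nested filters itself.
        kept, invalid = [], []
        for f in filters:
            key = getattr(f, "key", None)
            if key is None:
                kept.append(f)
            elif key in tracked_keys or key.split(".")[-1] in tracked_keys:
                kept.append(f)
            else:
                invalid.append(key)
        return kept, invalid

    # e.g. keep_valid_filters([EQ("author", "jane"), IN("year", [2024])], {"author"})
    # would keep the EQ filter and report "year" as an invalid key.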
@@ -1541,7 +1561,49 @@ class Knowledge:
         # 6. Chunk documents if needed
         if reader and not reader.chunk:
             read_documents = await reader.chunk_documents_async(read_documents)
-        # 7. Prepare and insert the content in the vector database
+
+        # 7. Group documents by source URL for multi-page readers (like WebsiteReader)
+        docs_by_source: Dict[str, List[Document]] = {}
+        for doc in read_documents:
+            source_url = doc.meta_data.get("url", content.url) if doc.meta_data else content.url
+            source_url = source_url or "unknown"
+            if source_url not in docs_by_source:
+                docs_by_source[source_url] = []
+            docs_by_source[source_url].append(doc)
+
+        # 8. Process each source separately if multiple sources exist
+        if len(docs_by_source) > 1:
+            for source_url, source_docs in docs_by_source.items():
+                # Compute per-document hash based on actual source URL
+                doc_hash = self._build_document_content_hash(source_docs[0], content)
+
+                # Check skip_if_exists for each source individually
+                if self._should_skip(doc_hash, skip_if_exists):
+                    log_debug(f"Skipping already indexed: {source_url}")
+                    continue
+
+                doc_id = generate_id(doc_hash)
+                self._prepare_documents_for_insert(source_docs, doc_id, calculate_sizes=True)
+
+                # Insert with per-document hash
+                if self.vector_db.upsert_available() and upsert:
+                    try:
+                        await self.vector_db.async_upsert(doc_hash, source_docs, content.metadata)
+                    except Exception as e:
+                        log_error(f"Error upserting document from {source_url}: {e}")
+                        continue
+                else:
+                    try:
+                        await self.vector_db.async_insert(doc_hash, documents=source_docs, filters=content.metadata)
+                    except Exception as e:
+                        log_error(f"Error inserting document from {source_url}: {e}")
+                        continue
+
+            content.status = ContentStatus.COMPLETED
+            await self._aupdate_content(content)
+            return
+
+        # 9. Single source - use existing logic with original content hash
         if not content.id:
             content.id = generate_id(content.content_hash or "")
         self._prepare_documents_for_insert(read_documents, content.id, calculate_sizes=True)
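The grouping step, taken in isolation, is a bucket-by-URL pass over the chunked documents. A minimal sketch, with a stand-in Document dataclass in place of agno's own:

    from dataclasses import dataclass
    from typing import Dict, List, Optional

    @dataclass
    class Document:                      # stand-in for agno's Document
        content: str
        meta_data: Optional[dict] = None

    def group_by_source(docs: List[Document], fallback_url: Optional[str]) -> Dict[str, List[Document]]:
        # Pages crawled by a multi-page reader carry their own "url" in
        # meta_data; anything without one falls back to the Content URL.
        grouped: Dict[str, List[Document]] = {}
        for doc in docs:
            url = (doc.meta_data or {}).get("url", fallback_url) or "unknown"
            grouped.setdefault(url, []).append(doc)
        return grouped

When more than one bucket comes out of this, each bucket is hashed, skipped, or inserted on its own, so a crawl that partially overlaps already-indexed pages no longer re-indexes (or skips) the whole batch at once. The next hunk applies the same change to the synchronous load path.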
@@ -1648,7 +1710,48 @@ class Knowledge:
         if reader:
             read_documents = self._chunk_documents_sync(reader, read_documents)
 
-        # 7. Prepare and insert the content in the vector database
+        # 7. Group documents by source URL for multi-page readers (like WebsiteReader)
+        docs_by_source: Dict[str, List[Document]] = {}
+        for doc in read_documents:
+            source_url = doc.meta_data.get("url", content.url) if doc.meta_data else content.url
+            source_url = source_url or "unknown"
+            if source_url not in docs_by_source:
+                docs_by_source[source_url] = []
+            docs_by_source[source_url].append(doc)
+
+        # 8. Process each source separately if multiple sources exist
+        if len(docs_by_source) > 1:
+            for source_url, source_docs in docs_by_source.items():
+                # Compute per-document hash based on actual source URL
+                doc_hash = self._build_document_content_hash(source_docs[0], content)
+
+                # Check skip_if_exists for each source individually
+                if self._should_skip(doc_hash, skip_if_exists):
+                    log_debug(f"Skipping already indexed: {source_url}")
+                    continue
+
+                doc_id = generate_id(doc_hash)
+                self._prepare_documents_for_insert(source_docs, doc_id, calculate_sizes=True)
+
+                # Insert with per-document hash
+                if self.vector_db.upsert_available() and upsert:
+                    try:
+                        self.vector_db.upsert(doc_hash, source_docs, content.metadata)
+                    except Exception as e:
+                        log_error(f"Error upserting document from {source_url}: {e}")
+                        continue
+                else:
+                    try:
+                        self.vector_db.insert(doc_hash, documents=source_docs, filters=content.metadata)
+                    except Exception as e:
+                        log_error(f"Error inserting document from {source_url}: {e}")
+                        continue
+
+            content.status = ContentStatus.COMPLETED
+            self._update_content(content)
+            return
+
+        # 9. Single source - use existing logic with original content hash
         if not content.id:
             content.id = generate_id(content.content_hash or "")
         self._prepare_documents_for_insert(read_documents, content.id, calculate_sizes=True)
@@ -1900,11 +2003,11 @@ class Knowledge:
             if self._should_skip(content.content_hash, skip_if_exists):
                 content.status = ContentStatus.COMPLETED
                 await self._aupdate_content(content)
-                return
+                continue  # Skip to next topic, don't exit loop
 
             if self.vector_db.__class__.__name__ == "LightRag":
                 await self._aprocess_lightrag_content(content, KnowledgeContentOrigin.TOPIC)
-                return
+                continue  # Skip to next topic, don't exit loop
 
             if self.vector_db and self.vector_db.content_hash_exists(content.content_hash) and skip_if_exists:
                 log_info(f"Content {content.content_hash} already exists, skipping")
@@ -1961,11 +2064,11 @@ class Knowledge:
             if self._should_skip(content.content_hash, skip_if_exists):
                 content.status = ContentStatus.COMPLETED
                 self._update_content(content)
-                return
+                continue  # Skip to next topic, don't exit loop
 
             if self.vector_db.__class__.__name__ == "LightRag":
                 self._process_lightrag_content(content, KnowledgeContentOrigin.TOPIC)
-                return
+                continue  # Skip to next topic, don't exit loop
 
             if self.vector_db and self.vector_db.content_hash_exists(content.content_hash) and skip_if_exists:
                 log_info(f"Content {content.content_hash} already exists, skipping")
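These two hunks sit inside the per-topic loops of the async and sync topic loaders, so the one-word change decides whether the remaining topics get processed at all. A minimal self-contained sketch of the difference (names are illustrative, only the branch bodies appear in the hunks):

    def load_topics(topics, already_indexed):
        loaded = []
        for topic in topics:
            if topic in already_indexed:    # stands in for the _should_skip check
                continue  # 2.4.6 behavior: move on to the remaining topics
                # 2.4.4 used a bare `return` here, silently dropping the rest
            loaded.append(topic)
        return loaded

    # load_topics(["a", "b", "c"], {"a"}) -> ["b", "c"]
    # With the old `return`, the call would have stopped at "a" and loaded nothing.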
@@ -3896,6 +3999,42 @@ class Knowledge:
         hash_input = ":".join(hash_parts)
         return hashlib.sha256(hash_input.encode()).hexdigest()
 
+    def _build_document_content_hash(self, document: Document, content: Content) -> str:
+        """
+        Build content hash for a specific document.
+
+        Used for multi-page readers (like WebsiteReader) where each crawled page
+        should have its own unique content hash based on its actual URL.
+
+        Args:
+            document: The document to build the hash for
+            content: The original content object (for fallback name/description)
+
+        Returns:
+            A unique hash string for this specific document
+        """
+        hash_parts = []
+
+        if content.name:
+            hash_parts.append(content.name)
+        if content.description:
+            hash_parts.append(content.description)
+
+        # Use document's own URL if available (set by WebsiteReader)
+        doc_url = document.meta_data.get("url") if document.meta_data else None
+        if doc_url:
+            hash_parts.append(str(doc_url))
+        elif content.url:
+            hash_parts.append(content.url)
+        elif content.path:
+            hash_parts.append(str(content.path))
+        else:
+            # Fallback: use content hash for uniqueness
+            hash_parts.append(hashlib.sha256(document.content.encode()).hexdigest()[:16])
+
+        hash_input = ":".join(hash_parts)
+        return hashlib.sha256(hash_input.encode()).hexdigest()
+
     def _ensure_string_field(self, value: Any, field_name: str, default: str = "") -> str:
         """
         Safely ensure a field is a string, handling various edge cases.
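The resulting hash is simply a SHA-256 over the content name/description plus the page's own URL, so two pages crawled from the same Content object get distinct hashes. Roughly (the values below are made up):

    import hashlib

    hash_parts = ["agno docs", "https://docs.agno.com/knowledge"]  # content.name, per-page url
    page_hash = hashlib.sha256(":".join(hash_parts).encode()).hexdigest()
    # A second crawled page with a different URL yields a different hash,
    # which is what lets skip_if_exists work per page rather than per crawl.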
@@ -4451,12 +4590,10 @@ class Knowledge:
     # ========================================================================
 
     # Shared context strings
-    _KNOWLEDGE_BASE_SEARCH_INSTRUCTION = (
-        "You have access to a knowledge base.\n"
-        "IMPORTANT: For any user question that could be answered from the knowledge base, you MUST call the "
-        "search_knowledge_base tool before responding.\n"
-        "If the user question is ambiguous (e.g., 'the candidate') do NOT ask clarifying questions first—search the "
-        "knowledge base to identify the relevant documents.\n"
+    _SEARCH_KNOWLEDGE_INSTRUCTIONS = (
+        "You have a knowledge base you can search using the search_knowledge_base tool. "
+        "Search before answering questions—don't assume you know the answer. "
+        "For ambiguous questions, search first rather than asking for clarification."
     )
 
     _AGENTIC_FILTER_INSTRUCTION_TEMPLATE = """
@@ -4499,7 +4636,7 @@ Make sure to pass the filters as [Dict[str: Any]] to the tool. FOLLOW THIS STRUC
         Returns:
             Context string to add to system prompt.
         """
-        context_parts: List[str] = [self._KNOWLEDGE_BASE_SEARCH_INSTRUCTION]
+        context_parts: List[str] = [self._SEARCH_KNOWLEDGE_INSTRUCTIONS]
 
         # Add filter instructions if agentic filters are enabled
         if enable_agentic_filters:
@@ -4507,7 +4644,7 @@ Make sure to pass the filters as [Dict[str: Any]] to the tool. FOLLOW THIS STRUC
             if valid_filters:
                 context_parts.append(self._get_agentic_filter_instructions(valid_filters))
 
-        return "\n".join(context_parts)
+        return "<knowledge_base>\n" + "\n".join(context_parts) + "\n</knowledge_base>"
 
     async def abuild_context(
         self,
@@ -4526,7 +4663,7 @@ Make sure to pass the filters as [Dict[str: Any]] to the tool. FOLLOW THIS STRUC
         Returns:
            Context string to add to system prompt.
        """
-        context_parts: List[str] = [self._KNOWLEDGE_BASE_SEARCH_INSTRUCTION]
+        context_parts: List[str] = [self._SEARCH_KNOWLEDGE_INSTRUCTIONS]
 
         # Add filter instructions if agentic filters are enabled
         if enable_agentic_filters:
@@ -4534,7 +4671,7 @@ Make sure to pass the filters as [Dict[str: Any]] to the tool. FOLLOW THIS STRUC
             if valid_filters:
                 context_parts.append(self._get_agentic_filter_instructions(valid_filters))
 
-        return "\n".join(context_parts)
+        return "<knowledge_base>\n" + "\n".join(context_parts) + "\n</knowledge_base>"
 
     def get_tools(
         self,
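With the shorter instruction string and the new tag wrapping, the context that abuild_context and its synchronous counterpart hand to the system prompt now looks roughly like this (filter instructions, when agentic filters are enabled, are appended inside the tags):

    context = (
        "<knowledge_base>\n"
        "You have a knowledge base you can search using the search_knowledge_base tool. "
        "Search before answering questions—don't assume you know the answer. "
        "For ambiguous questions, search first rather than asking for clarification."
        "\n</knowledge_base>"
    )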
@@ -4627,7 +4764,12 @@ Make sure to pass the filters as [Dict[str: Any]] to the tool. FOLLOW THIS STRUC
             retrieval_timer = Timer()
             retrieval_timer.start()
 
-            docs = self.search(query=query, filters=knowledge_filters)
+            try:
+                docs = self.search(query=query, filters=knowledge_filters)
+            except Exception as e:
+                retrieval_timer.stop()
+                log_warning(f"Knowledge search failed: {e}")
+                return f"Error searching knowledge base: {type(e).__name__}"
 
             if run_response is not None and docs:
                 references = MessageReferences(
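The same try/except guard is applied in the next three hunks (the async search and both agentic-filter variants). As a standalone pattern it boils down to the sketch below; the wrapper function and the knowledge parameter are illustrative, while search() and the error string come from the hunk above:

    def search_knowledge_base(knowledge, query: str) -> str:
        # The tool now degrades to an error string instead of letting a
        # vector-DB failure propagate out of the agent's tool call.
        try:
            docs = knowledge.search(query=query)
        except Exception as e:
            return f"Error searching knowledge base: {type(e).__name__}"
        return "\n".join(doc.content for doc in docs)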
@@ -4659,7 +4801,12 @@ Make sure to pass the filters as [Dict[str: Any]] to the tool. FOLLOW THIS STRUC
             retrieval_timer = Timer()
             retrieval_timer.start()
 
-            docs = await self.asearch(query=query, filters=knowledge_filters)
+            try:
+                docs = await self.asearch(query=query, filters=knowledge_filters)
+            except Exception as e:
+                retrieval_timer.stop()
+                log_warning(f"Knowledge search failed: {e}")
+                return f"Error searching knowledge base: {type(e).__name__}"
 
             if run_response is not None and docs:
                 references = MessageReferences(
@@ -4737,7 +4884,12 @@ Make sure to pass the filters as [Dict[str: Any]] to the tool. FOLLOW THIS STRUC
             retrieval_timer = Timer()
             retrieval_timer.start()
 
-            docs = self.search(query=query, filters=search_filters)
+            try:
+                docs = self.search(query=query, filters=search_filters)
+            except Exception as e:
+                retrieval_timer.stop()
+                log_warning(f"Knowledge search failed: {e}")
+                return f"Error searching knowledge base: {type(e).__name__}"
 
             if run_response is not None and docs:
                 references = MessageReferences(
@@ -4791,7 +4943,12 @@ Make sure to pass the filters as [Dict[str: Any]] to the tool. FOLLOW THIS STRUC
             retrieval_timer = Timer()
             retrieval_timer.start()
 
-            docs = await self.asearch(query=query, filters=search_filters)
+            try:
+                docs = await self.asearch(query=query, filters=search_filters)
+            except Exception as e:
+                retrieval_timer.stop()
+                log_warning(f"Knowledge search failed: {e}")
+                return f"Error searching knowledge base: {type(e).__name__}"
 
             if run_response is not None and docs:
                 references = MessageReferences(
@@ -110,7 +110,7 @@ class TextReader(Reader):
         chunked_documents = self.chunk_document(document)
 
         if not chunked_documents:
-            return [document]
+            return []
 
         tasks = [process_chunk(chunk_doc) for chunk_doc in chunked_documents]
         return await asyncio.gather(*tasks)