agno 2.0.10__py3-none-any.whl → 2.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85)
  1. agno/agent/agent.py +608 -175
  2. agno/db/in_memory/in_memory_db.py +42 -29
  3. agno/db/postgres/postgres.py +6 -4
  4. agno/exceptions.py +62 -1
  5. agno/guardrails/__init__.py +6 -0
  6. agno/guardrails/base.py +19 -0
  7. agno/guardrails/openai.py +144 -0
  8. agno/guardrails/pii.py +94 -0
  9. agno/guardrails/prompt_injection.py +51 -0
  10. agno/knowledge/embedder/aws_bedrock.py +9 -4
  11. agno/knowledge/embedder/azure_openai.py +54 -0
  12. agno/knowledge/embedder/base.py +2 -0
  13. agno/knowledge/embedder/cohere.py +184 -5
  14. agno/knowledge/embedder/google.py +79 -1
  15. agno/knowledge/embedder/huggingface.py +9 -4
  16. agno/knowledge/embedder/jina.py +63 -0
  17. agno/knowledge/embedder/mistral.py +78 -11
  18. agno/knowledge/embedder/ollama.py +5 -0
  19. agno/knowledge/embedder/openai.py +18 -54
  20. agno/knowledge/embedder/voyageai.py +69 -16
  21. agno/knowledge/knowledge.py +5 -4
  22. agno/knowledge/reader/pdf_reader.py +4 -3
  23. agno/knowledge/reader/website_reader.py +3 -2
  24. agno/models/base.py +125 -32
  25. agno/models/cerebras/cerebras.py +1 -0
  26. agno/models/cerebras/cerebras_openai.py +1 -0
  27. agno/models/dashscope/dashscope.py +1 -0
  28. agno/models/google/gemini.py +27 -5
  29. agno/models/litellm/chat.py +17 -0
  30. agno/models/openai/chat.py +13 -4
  31. agno/models/perplexity/perplexity.py +2 -3
  32. agno/models/requesty/__init__.py +5 -0
  33. agno/models/requesty/requesty.py +49 -0
  34. agno/models/vllm/vllm.py +1 -0
  35. agno/models/xai/xai.py +1 -0
  36. agno/os/app.py +167 -148
  37. agno/os/interfaces/whatsapp/router.py +2 -0
  38. agno/os/mcp.py +1 -1
  39. agno/os/middleware/__init__.py +7 -0
  40. agno/os/middleware/jwt.py +233 -0
  41. agno/os/router.py +181 -45
  42. agno/os/routers/home.py +2 -2
  43. agno/os/routers/memory/memory.py +23 -1
  44. agno/os/routers/memory/schemas.py +1 -1
  45. agno/os/routers/session/session.py +20 -3
  46. agno/os/utils.py +172 -8
  47. agno/run/agent.py +120 -77
  48. agno/run/team.py +115 -72
  49. agno/run/workflow.py +5 -15
  50. agno/session/summary.py +9 -10
  51. agno/session/team.py +2 -1
  52. agno/team/team.py +720 -168
  53. agno/tools/firecrawl.py +4 -4
  54. agno/tools/function.py +42 -2
  55. agno/tools/knowledge.py +3 -3
  56. agno/tools/searxng.py +2 -2
  57. agno/tools/serper.py +2 -2
  58. agno/tools/spider.py +2 -2
  59. agno/tools/workflow.py +4 -5
  60. agno/utils/events.py +66 -1
  61. agno/utils/hooks.py +57 -0
  62. agno/utils/media.py +11 -9
  63. agno/utils/print_response/agent.py +43 -5
  64. agno/utils/print_response/team.py +48 -12
  65. agno/vectordb/cassandra/cassandra.py +44 -4
  66. agno/vectordb/chroma/chromadb.py +79 -8
  67. agno/vectordb/clickhouse/clickhousedb.py +43 -6
  68. agno/vectordb/couchbase/couchbase.py +76 -5
  69. agno/vectordb/lancedb/lance_db.py +38 -3
  70. agno/vectordb/llamaindex/__init__.py +3 -0
  71. agno/vectordb/milvus/milvus.py +76 -4
  72. agno/vectordb/mongodb/mongodb.py +76 -4
  73. agno/vectordb/pgvector/pgvector.py +50 -6
  74. agno/vectordb/pineconedb/pineconedb.py +39 -2
  75. agno/vectordb/qdrant/qdrant.py +76 -26
  76. agno/vectordb/singlestore/singlestore.py +77 -4
  77. agno/vectordb/upstashdb/upstashdb.py +42 -2
  78. agno/vectordb/weaviate/weaviate.py +39 -3
  79. agno/workflow/types.py +1 -0
  80. agno/workflow/workflow.py +58 -2
  81. {agno-2.0.10.dist-info → agno-2.1.0.dist-info}/METADATA +4 -3
  82. {agno-2.0.10.dist-info → agno-2.1.0.dist-info}/RECORD +85 -75
  83. {agno-2.0.10.dist-info → agno-2.1.0.dist-info}/WHEEL +0 -0
  84. {agno-2.0.10.dist-info → agno-2.1.0.dist-info}/licenses/LICENSE +0 -0
  85. {agno-2.0.10.dist-info → agno-2.1.0.dist-info}/top_level.txt +0 -0

--- a/agno/knowledge/embedder/mistral.py
+++ b/agno/knowledge/embedder/mistral.py
@@ -3,13 +3,13 @@ from os import getenv
 from typing import Any, Dict, List, Optional, Tuple
 
 from agno.knowledge.embedder.base import Embedder
-from agno.utils.log import logger
+from agno.utils.log import log_error, log_info, log_warning
 
 try:
     from mistralai import Mistral  # type: ignore
     from mistralai.models.embeddingresponse import EmbeddingResponse  # type: ignore
 except ImportError:
-    logger.error("`mistralai` not installed")
+    log_error("`mistralai` not installed")
     raise
 
 
@@ -50,7 +50,7 @@ class MistralEmbedder(Embedder):
 
     def _response(self, text: str) -> EmbeddingResponse:
         _request_params: Dict[str, Any] = {
-            "inputs": text,
+            "inputs": [text],  # Mistral API expects a list
             "model": self.id,
         }
         if self.request_params:
@@ -67,7 +67,7 @@ class MistralEmbedder(Embedder):
                 return response.data[0].embedding
             return []
         except Exception as e:
-            logger.warning(f"Error getting embedding: {e}")
+            log_warning(f"Error getting embedding: {e}")
             return []
 
     def get_embedding_and_usage(self, text: str) -> Tuple[List[float], Dict[str, Any]]:
@@ -79,7 +79,7 @@ class MistralEmbedder(Embedder):
             usage: Dict[str, Any] = response.usage.model_dump() if response.usage else {}
             return embedding, usage
         except Exception as e:
-            logger.warning(f"Error getting embedding and usage: {e}")
+            log_warning(f"Error getting embedding and usage: {e}")
             return [], {}
 
     async def async_get_embedding(self, text: str) -> List[float]:
@@ -88,7 +88,7 @@ class MistralEmbedder(Embedder):
             # Check if the client has an async version of embeddings.create
             if hasattr(self.client.embeddings, "create_async"):
                 response: EmbeddingResponse = await self.client.embeddings.create_async(
-                    inputs=text, model=self.id, **self.request_params if self.request_params else {}
+                    inputs=[text], model=self.id, **self.request_params if self.request_params else {}
                 )
             else:
                 # Fallback to running sync method in thread executor
@@ -98,7 +98,7 @@ class MistralEmbedder(Embedder):
                 response: EmbeddingResponse = await loop.run_in_executor(  # type: ignore
                     None,
                     lambda: self.client.embeddings.create(
-                        inputs=text, model=self.id, **self.request_params if self.request_params else {}
+                        inputs=[text], model=self.id, **self.request_params if self.request_params else {}
                     ),
                 )
 
@@ -106,7 +106,7 @@ class MistralEmbedder(Embedder):
                 return response.data[0].embedding
             return []
         except Exception as e:
-            logger.warning(f"Error getting embedding: {e}")
+            log_warning(f"Error getting embedding: {e}")
             return []
 
     async def async_get_embedding_and_usage(self, text: str) -> Tuple[List[float], Dict[str, Any]]:
@@ -115,7 +115,7 @@ class MistralEmbedder(Embedder):
             # Check if the client has an async version of embeddings.create
             if hasattr(self.client.embeddings, "create_async"):
                 response: EmbeddingResponse = await self.client.embeddings.create_async(
-                    inputs=text, model=self.id, **self.request_params if self.request_params else {}
+                    inputs=[text], model=self.id, **self.request_params if self.request_params else {}
                 )
             else:
                 # Fallback to running sync method in thread executor
@@ -125,7 +125,7 @@ class MistralEmbedder(Embedder):
                 response: EmbeddingResponse = await loop.run_in_executor(  # type: ignore
                     None,
                     lambda: self.client.embeddings.create(
-                        inputs=text, model=self.id, **self.request_params if self.request_params else {}
+                        inputs=[text], model=self.id, **self.request_params if self.request_params else {}
                     ),
                 )
 
@@ -135,5 +135,72 @@ class MistralEmbedder(Embedder):
             usage: Dict[str, Any] = response.usage.model_dump() if response.usage else {}
             return embedding, usage
         except Exception as e:
-            logger.warning(f"Error getting embedding and usage: {e}")
+            log_warning(f"Error getting embedding and usage: {e}")
             return [], {}
+
+    async def async_get_embeddings_batch_and_usage(
+        self, texts: List[str]
+    ) -> Tuple[List[List[float]], List[Optional[Dict[str, Any]]]]:
+        """
+        Get embeddings and usage for multiple texts in batches.
+
+        Args:
+            texts: List of text strings to embed
+
+        Returns:
+            Tuple of (List of embedding vectors, List of usage dictionaries)
+        """
+        all_embeddings = []
+        all_usage = []
+        log_info(f"Getting embeddings and usage for {len(texts)} texts in batches of {self.batch_size}")
+
+        for i in range(0, len(texts), self.batch_size):
+            batch_texts = texts[i : i + self.batch_size]
+
+            _request_params: Dict[str, Any] = {
+                "inputs": batch_texts,  # Mistral API expects a list for batch processing
+                "model": self.id,
+            }
+            if self.request_params:
+                _request_params.update(self.request_params)
+
+            try:
+                # Check if the client has an async version of embeddings.create
+                if hasattr(self.client.embeddings, "create_async"):
+                    response: EmbeddingResponse = await self.client.embeddings.create_async(**_request_params)
+                else:
+                    # Fallback to running sync method in thread executor
+                    import asyncio
+
+                    loop = asyncio.get_running_loop()
+                    response: EmbeddingResponse = await loop.run_in_executor(  # type: ignore
+                        None, lambda: self.client.embeddings.create(**_request_params)
+                    )
+
+                # Extract embeddings from batch response
+                if response.data:
+                    batch_embeddings = [data.embedding for data in response.data if data.embedding]
+                    all_embeddings.extend(batch_embeddings)
+                else:
+                    # If no embeddings, add empty lists for each text in batch
+                    all_embeddings.extend([[] for _ in batch_texts])
+
+                # Extract usage information
+                usage_dict = response.usage.model_dump() if response.usage else None
+                # Add same usage info for each embedding in the batch
+                all_usage.extend([usage_dict] * len(batch_texts))
+
+            except Exception as e:
+                log_warning(f"Error in async batch embedding: {e}")
+                # Fallback to individual calls for this batch
+                for text in batch_texts:
+                    try:
+                        embedding, usage = await self.async_get_embedding_and_usage(text)
+                        all_embeddings.append(embedding)
+                        all_usage.append(usage)
+                    except Exception as e2:
+                        log_warning(f"Error in individual async embedding fallback: {e2}")
+                        all_embeddings.append([])
+                        all_usage.append(None)
+
+        return all_embeddings, all_usage
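
The net effect of the mistral.py changes: single-text calls now wrap the input in a list (matching what the Mistral API expects), and the new async_get_embeddings_batch_and_usage method embeds texts in self.batch_size chunks, falling back to per-text calls when a batch fails. A minimal usage sketch; the default model id and API-key resolution are assumptions not shown in this diff:

    import asyncio

    from agno.knowledge.embedder.mistral import MistralEmbedder

    async def main() -> None:
        embedder = MistralEmbedder()  # model id / API key resolution assumed
        texts = ["first document", "second document", "third document"]
        embeddings, usage = await embedder.async_get_embeddings_batch_and_usage(texts)
        # One embedding per input; texts in the same batch share one usage dict
        print(len(embeddings), usage[0])

    asyncio.run(main())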

--- a/agno/knowledge/embedder/ollama.py
+++ b/agno/knowledge/embedder/ollama.py
@@ -45,6 +45,11 @@ class OllamaEmbedder(Embedder):
     ollama_client: Optional[OllamaClient] = None
     async_client: Optional[AsyncOllamaClient] = None
 
+    def __post_init__(self):
+        if self.enable_batch:
+            logger.warning("OllamaEmbedder does not support batch embeddings, setting enable_batch to False")
+            self.enable_batch = False
+
     @property
     def client(self) -> OllamaClient:
         if self.ollama_client:
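
Because __post_init__ is used, the embedder is presumably a dataclass, so the guard runs at construction time: asking an OllamaEmbedder for batching logs a warning and resets the flag. A sketch of the resulting behavior (enable_batch as a constructor field is an assumption from this diff's context):

    from agno.knowledge.embedder.ollama import OllamaEmbedder

    embedder = OllamaEmbedder(enable_batch=True)
    # __post_init__ has already logged the warning and reset the flag
    assert embedder.enable_batch is False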

--- a/agno/knowledge/embedder/openai.py
+++ b/agno/knowledge/embedder/openai.py
@@ -140,66 +140,24 @@ class OpenAIEmbedder(Embedder):
             logger.warning(e)
             return [], None
 
-    def get_embeddings_batch(self, texts: List[str], batch_size: int = 100) -> List[List[float]]:
+    async def async_get_embeddings_batch_and_usage(
+        self, texts: List[str]
+    ) -> Tuple[List[List[float]], List[Optional[Dict]]]:
         """
-        Get embeddings for multiple texts in batches.
+        Get embeddings and usage for multiple texts in batches (async version).
 
         Args:
             texts: List of text strings to embed
-            batch_size: Number of texts to process in each API call (max ~2048)
 
         Returns:
-            List of embedding vectors
+            Tuple of (List of embedding vectors, List of usage dictionaries)
         """
         all_embeddings = []
+        all_usage = []
+        logger.info(f"Getting embeddings and usage for {len(texts)} texts in batches of {self.batch_size} (async)")
 
-        for i in range(0, len(texts), batch_size):
-            batch_texts = texts[i : i + batch_size]
-
-            req: Dict[str, Any] = {
-                "input": batch_texts,
-                "model": self.id,
-                "encoding_format": self.encoding_format,
-            }
-            if self.user is not None:
-                req["user"] = self.user
-            if self.id.startswith("text-embedding-3"):
-                req["dimensions"] = self.dimensions
-            if self.request_params:
-                req.update(self.request_params)
-
-            try:
-                response: CreateEmbeddingResponse = self.client.embeddings.create(**req)
-                batch_embeddings = [data.embedding for data in response.data]
-                all_embeddings.extend(batch_embeddings)
-            except Exception as e:
-                logger.warning(f"Error in batch embedding: {e}")
-                # Fallback to individual calls for this batch
-                for text in batch_texts:
-                    try:
-                        embedding = self.get_embedding(text)
-                        all_embeddings.append(embedding)
-                    except Exception as e2:
-                        logger.warning(f"Error in individual embedding fallback: {e2}")
-                        all_embeddings.append([])
-
-        return all_embeddings
-
-    async def async_get_embeddings_batch(self, texts: List[str], batch_size: int = 100) -> List[List[float]]:
-        """
-        Get embeddings for multiple texts in batches (async version).
-
-        Args:
-            texts: List of text strings to embed
-            batch_size: Number of texts to process in each API call (max ~2048)
-
-        Returns:
-            List of embedding vectors
-        """
-        all_embeddings = []
-
-        for i in range(0, len(texts), batch_size):
-            batch_texts = texts[i : i + batch_size]
+        for i in range(0, len(texts), self.batch_size):
+            batch_texts = texts[i : i + self.batch_size]
 
             req: Dict[str, Any] = {
                 "input": batch_texts,
@@ -217,15 +175,21 @@ class OpenAIEmbedder(Embedder):
                 response: CreateEmbeddingResponse = await self.aclient.embeddings.create(**req)
                 batch_embeddings = [data.embedding for data in response.data]
                 all_embeddings.extend(batch_embeddings)
+
+                # For each embedding in the batch, add the same usage information
+                usage_dict = response.usage.model_dump() if response.usage else None
+                all_usage.extend([usage_dict] * len(batch_embeddings))
             except Exception as e:
                 logger.warning(f"Error in async batch embedding: {e}")
-                # Fallback to individual async calls for this batch
+                # Fallback to individual calls for this batch
                 for text in batch_texts:
                     try:
-                        embedding = await self.async_get_embedding(text)
+                        embedding, usage = await self.async_get_embedding_and_usage(text)
                         all_embeddings.append(embedding)
+                        all_usage.append(usage)
                     except Exception as e2:
                         logger.warning(f"Error in individual async embedding fallback: {e2}")
                         all_embeddings.append([])
+                        all_usage.append(None)
 
-        return all_embeddings
+        return all_embeddings, all_usage
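
Note the breaking change in openai.py: the sync get_embeddings_batch and async async_get_embeddings_batch methods, each with a per-call batch_size argument, are replaced by a single async_get_embeddings_batch_and_usage that reads batch_size from the instance and also returns usage. A migration sketch; the constructor arguments are assumptions based on the fields this diff references:

    import asyncio

    from agno.knowledge.embedder.openai import OpenAIEmbedder

    async def main() -> None:
        embedder = OpenAIEmbedder(id="text-embedding-3-small", batch_size=100)

        # 2.0.10: embeddings = embedder.get_embeddings_batch(texts, batch_size=100)
        # 2.1.0: batch size comes from the instance, and usage is returned too
        embeddings, usage = await embedder.async_get_embeddings_batch_and_usage(
            ["alpha", "beta", "gamma"]
        )
        print(len(embeddings), usage[0])

    asyncio.run(main())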

--- a/agno/knowledge/embedder/voyageai.py
+++ b/agno/knowledge/embedder/voyageai.py
@@ -30,12 +30,13 @@ class VoyageAIEmbedder(Embedder):
         if self.voyage_client:
             return self.voyage_client
 
-        _client_params = {
-            "api_key": self.api_key,
-            "max_retries": self.max_retries,
-            "timeout": self.timeout,
-        }
-        _client_params = {k: v for k, v in _client_params.items() if v is not None}
+        _client_params: Dict[str, Any] = {}
+        if self.api_key is not None:
+            _client_params["api_key"] = self.api_key
+        if self.max_retries is not None:
+            _client_params["max_retries"] = self.max_retries
+        if self.timeout is not None:
+            _client_params["timeout"] = self.timeout
         if self.client_params:
             _client_params.update(self.client_params)
         self.voyage_client = VoyageClient(**_client_params)
@@ -46,12 +47,13 @@ class VoyageAIEmbedder(Embedder):
         if self.async_client:
             return self.async_client
 
-        _client_params = {
-            "api_key": self.api_key,
-            "max_retries": self.max_retries,
-            "timeout": self.timeout,
-        }
-        _client_params = {k: v for k, v in _client_params.items() if v is not None}
+        _client_params: Dict[str, Any] = {}
+        if self.api_key is not None:
+            _client_params["api_key"] = self.api_key
+        if self.max_retries is not None:
+            _client_params["max_retries"] = self.max_retries
+        if self.timeout is not None:
+            _client_params["timeout"] = self.timeout
         if self.client_params:
             _client_params.update(self.client_params)
         self.async_client = AsyncVoyageClient(**_client_params)
@@ -69,7 +71,8 @@ class VoyageAIEmbedder(Embedder):
     def get_embedding(self, text: str) -> List[float]:
         response: EmbeddingsObject = self._response(text=text)
         try:
-            return response.embeddings[0]
+            embedding = response.embeddings[0]
+            return [float(x) for x in embedding]  # Ensure all values are float
         except Exception as e:
             logger.warning(e)
             return []
@@ -79,7 +82,7 @@ class VoyageAIEmbedder(Embedder):
 
         embedding = response.embeddings[0]
         usage = {"total_tokens": response.total_tokens}
-        return embedding, usage
+        return [float(x) for x in embedding], usage
 
     async def _async_response(self, text: str) -> EmbeddingsObject:
         """Async version of _response using AsyncVoyageClient."""
@@ -95,7 +98,8 @@ class VoyageAIEmbedder(Embedder):
         """Async version of get_embedding."""
         try:
             response: EmbeddingsObject = await self._async_response(text=text)
-            return response.embeddings[0]
+            embedding = response.embeddings[0]
+            return [float(x) for x in embedding]  # Ensure all values are float
         except Exception as e:
             logger.warning(f"Error getting embedding: {e}")
             return []
@@ -106,7 +110,56 @@ class VoyageAIEmbedder(Embedder):
             response: EmbeddingsObject = await self._async_response(text=text)
             embedding = response.embeddings[0]
             usage = {"total_tokens": response.total_tokens}
-            return embedding, usage
+            return [float(x) for x in embedding], usage
         except Exception as e:
             logger.warning(f"Error getting embedding and usage: {e}")
             return [], None
+
+    async def async_get_embeddings_batch_and_usage(
+        self, texts: List[str]
+    ) -> Tuple[List[List[float]], List[Optional[Dict]]]:
+        """
+        Get embeddings and usage for multiple texts in batches.
+
+        Args:
+            texts: List of text strings to embed
+
+        Returns:
+            Tuple of (List of embedding vectors, List of usage dictionaries)
+        """
+        all_embeddings: List[List[float]] = []
+        all_usage: List[Optional[Dict]] = []
+        logger.info(f"Getting embeddings and usage for {len(texts)} texts in batches of {self.batch_size}")
+
+        for i in range(0, len(texts), self.batch_size):
+            batch_texts = texts[i : i + self.batch_size]
+
+            req: Dict[str, Any] = {
+                "texts": batch_texts,
+                "model": self.id,
+            }
+            if self.request_params:
+                req.update(self.request_params)
+
+            try:
+                response: EmbeddingsObject = await self.aclient.embed(**req)
+                batch_embeddings = [[float(x) for x in emb] for emb in response.embeddings]
+                all_embeddings.extend(batch_embeddings)
+
+                # For each embedding in the batch, add the same usage information
+                usage_dict = {"total_tokens": response.total_tokens}
+                all_usage.extend([usage_dict] * len(batch_embeddings))
+            except Exception as e:
+                logger.warning(f"Error in async batch embedding: {e}")
+                # Fallback to individual calls for this batch
+                for text in batch_texts:
+                    try:
+                        embedding, usage = await self.async_get_embedding_and_usage(text)
+                        all_embeddings.append(embedding)
+                        all_usage.append(usage)
+                    except Exception as e2:
+                        logger.warning(f"Error in individual async embedding fallback: {e2}")
+                        all_embeddings.append([])
+                        all_usage.append(None)
+
+        return all_embeddings, all_usage
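
One bookkeeping caveat shared by all three batch implementations: every text in a successful batch receives the same usage dict object ([usage_dict] * len(batch_embeddings)), so naively summing total_tokens over the returned list overcounts. A sketch that de-duplicates by object identity, which that repetition pattern makes safe; per-text fallback entries remain distinct objects and are counted individually:

    from typing import Dict, List, Optional

    def total_tokens(all_usage: List[Optional[Dict]]) -> int:
        # Batch entries share one dict object; fallback entries do not.
        seen: set = set()
        total = 0
        for usage in all_usage:
            if usage is not None and id(usage) not in seen:
                seen.add(id(usage))
                total += usage.get("total_tokens", 0)
        return total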

--- a/agno/knowledge/knowledge.py
+++ b/agno/knowledge/knowledge.py
@@ -89,7 +89,7 @@ class Knowledge:
            url=argument.get("url"),
            metadata=argument.get("metadata"),
            topics=argument.get("topics"),
-           text_contents=argument.get("text_contents"),
+           text_content=argument.get("text_content"),
            reader=argument.get("reader"),
            include=argument.get("include"),
            exclude=argument.get("exclude"),
@@ -251,7 +251,9 @@ class Knowledge:
     ) -> None:
         # Validation: At least one of the parameters must be provided
         if all(argument is None for argument in [path, url, text_content, topics, remote_content]):
-            log_info("At least one of 'path', 'url', 'text_content', 'topics', or 'remote_content' must be provided.")
+            log_warning(
+                "At least one of 'path', 'url', 'text_content', 'topics', or 'remote_content' must be provided."
+            )
             return
 
         if not skip_if_exists:
@@ -534,7 +536,6 @@ class Knowledge:
             reader = content.reader
             name = content.name if content.name else content.url
             # Else select based on file extension
-
             if reader is None:
                 if file_extension == ".csv":
                     name = basename(parsed_url.path) or "data.csv"
@@ -570,6 +571,7 @@ class Knowledge:
                     read_documents = reader.read(bytes_content, name=name)
                 else:
                     read_documents = reader.read(content.url, name=name)
+
             except Exception as e:
                 log_error(f"Error reading URL: {content.url} - {str(e)}")
                 content.status = ContentStatus.FAILED
@@ -580,7 +582,6 @@ class Knowledge:
         # 6. Chunk documents if needed
         if reader and not reader.chunk:
             read_documents = await reader.chunk_documents_async(read_documents)
-
         # 7. Prepare and insert the content in the vector database
         file_size = 0
         if read_documents:

--- a/agno/knowledge/reader/pdf_reader.py
+++ b/agno/knowledge/reader/pdf_reader.py
@@ -117,6 +117,10 @@ def _clean_page_numbers(
     page_numbers = [find_page_number(content) for content in page_content_list]
     if all(x is None or x > 5 for x in page_numbers):
         # This approach won't work reliably for higher page numbers.
+        page_content_list = [
+            f"\n{page_content_list[i]}\n{extra_content[i]}" if extra_content else page_content_list[i]
+            for i in range(len(page_content_list))
+        ]
         return page_content_list, None
 
     # Possible range shifts to detect page numbering
@@ -261,7 +265,6 @@ class BasePDFReader(Reader):
 
         if self.chunk:
             return self._build_chunked_documents(documents)
-
         return documents
 
     def _pdf_reader_to_documents(
@@ -339,8 +342,6 @@ class PDFReader(BasePDFReader):
         except Exception:
             doc_name = "pdf"
 
-        log_info(f"Reading: {doc_name}")
-
         try:
             DocumentReader(pdf)
         except PdfStreamError as e:

--- a/agno/knowledge/reader/website_reader.py
+++ b/agno/knowledge/reader/website_reader.py
@@ -112,7 +112,8 @@ class WebsiteReader(Reader):
         if tag.name in ["article", "main", "section"]:
             return True
 
-        classes = tag.get("class", [])
+        classes_attr = tag.get("class")
+        classes: List[str] = classes_attr if isinstance(classes_attr, list) else []
         content_classes = ["content", "main-content", "post-content", "entry-content", "article-body"]
         if any(cls in content_classes for cls in classes):
             return True
@@ -126,7 +127,7 @@ class WebsiteReader(Reader):
 
         # Try to find main content element
         element = soup.find(match)
-        if element:
+        if element and hasattr(element, "find_all"):
             # Remove common unwanted elements from the found content
             for unwanted in element.find_all(["script", "style", "nav", "header", "footer"]):
                 unwanted.decompose()
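
Both website_reader.py fixes guard against BeautifulSoup's loose return types: tag.get("class") can be None when the attribute is absent, and soup.find can return a NavigableString that has no find_all. A small illustration of the normalized class handling:

    from bs4 import BeautifulSoup

    soup = BeautifulSoup("<article class='content main'>hi</article>", "html.parser")
    tag = soup.find("article")

    # The multi-valued class attribute comes back as a list when present and
    # None when absent; the new code normalizes both cases to a list of strings.
    classes_attr = tag.get("class")
    classes = classes_attr if isinstance(classes_attr, list) else []
    print(classes)  # ['content', 'main']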