agno-2.4.7-py3-none-any.whl → agno-2.4.8-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agno/agent/agent.py +5 -1
- agno/db/base.py +2 -0
- agno/db/postgres/postgres.py +5 -5
- agno/db/sqlite/sqlite.py +4 -4
- agno/knowledge/knowledge.py +83 -1853
- agno/knowledge/loaders/__init__.py +29 -0
- agno/knowledge/loaders/azure_blob.py +423 -0
- agno/knowledge/loaders/base.py +187 -0
- agno/knowledge/loaders/gcs.py +267 -0
- agno/knowledge/loaders/github.py +415 -0
- agno/knowledge/loaders/s3.py +281 -0
- agno/knowledge/loaders/sharepoint.py +439 -0
- agno/knowledge/reader/website_reader.py +2 -2
- agno/knowledge/remote_knowledge.py +151 -0
- agno/learn/stores/session_context.py +10 -2
- agno/models/azure/openai_chat.py +6 -11
- agno/models/neosantara/__init__.py +5 -0
- agno/models/neosantara/neosantara.py +42 -0
- agno/models/utils.py +5 -0
- agno/os/app.py +4 -1
- agno/os/interfaces/agui/router.py +1 -1
- agno/os/routers/components/components.py +2 -0
- agno/os/routers/knowledge/knowledge.py +0 -1
- agno/os/routers/registry/registry.py +340 -192
- agno/os/routers/workflows/router.py +7 -1
- agno/os/schema.py +104 -0
- agno/registry/registry.py +4 -0
- agno/session/workflow.py +1 -1
- agno/skills/utils.py +100 -2
- agno/team/team.py +6 -3
- agno/vectordb/lancedb/lance_db.py +22 -7
- agno/workflow/__init__.py +4 -0
- agno/workflow/cel.py +299 -0
- agno/workflow/condition.py +145 -2
- agno/workflow/loop.py +177 -46
- agno/workflow/parallel.py +75 -4
- agno/workflow/router.py +260 -44
- agno/workflow/step.py +14 -7
- agno/workflow/steps.py +43 -0
- agno/workflow/workflow.py +104 -46
- {agno-2.4.7.dist-info → agno-2.4.8.dist-info}/METADATA +24 -36
- {agno-2.4.7.dist-info → agno-2.4.8.dist-info}/RECORD +45 -34
- {agno-2.4.7.dist-info → agno-2.4.8.dist-info}/WHEEL +0 -0
- {agno-2.4.7.dist-info → agno-2.4.8.dist-info}/licenses/LICENSE +0 -0
- {agno-2.4.7.dist-info → agno-2.4.8.dist-info}/top_level.txt +0 -0

agno/knowledge/loaders/__init__.py
@@ -0,0 +1,29 @@
+"""Remote content loaders for Knowledge.
+
+This module provides loaders for various cloud storage providers:
+- S3Loader: AWS S3
+- GCSLoader: Google Cloud Storage
+- SharePointLoader: Microsoft SharePoint
+- GitHubLoader: GitHub repositories
+- AzureBlobLoader: Azure Blob Storage
+
+All loaders inherit from BaseLoader which provides common utilities for
+computing content names, creating content entries, and merging metadata.
+"""
+
+from agno.knowledge.loaders.azure_blob import AzureBlobLoader
+from agno.knowledge.loaders.base import BaseLoader, FileToProcess
+from agno.knowledge.loaders.gcs import GCSLoader
+from agno.knowledge.loaders.github import GitHubLoader
+from agno.knowledge.loaders.s3 import S3Loader
+from agno.knowledge.loaders.sharepoint import SharePointLoader
+
+__all__ = [
+    "BaseLoader",
+    "FileToProcess",
+    "S3Loader",
+    "GCSLoader",
+    "SharePointLoader",
+    "GitHubLoader",
+    "AzureBlobLoader",
+]
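
The module's public surface is exactly the `__all__` list above. A minimal import sketch, assuming agno 2.4.8 is installed:

    from agno.knowledge.loaders import (
        AzureBlobLoader,
        GCSLoader,
        GitHubLoader,
        S3Loader,
        SharePointLoader,
    )

Per the BaseLoader docstring further down, these classes are mixins that expect to be combined with Knowledge (via RemoteKnowledge, also new in this release) rather than instantiated and used on their own.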

agno/knowledge/loaders/azure_blob.py
@@ -0,0 +1,423 @@
+"""Azure Blob Storage content loader for Knowledge.
+
+Provides methods for loading content from Azure Blob Storage.
+"""
+
+# mypy: disable-error-code="attr-defined"
+
+from io import BytesIO
+from typing import Any, Dict, List, Optional, cast
+
+from agno.knowledge.content import Content, ContentStatus
+from agno.knowledge.loaders.base import BaseLoader
+from agno.knowledge.reader import Reader
+from agno.knowledge.remote_content.config import AzureBlobConfig, RemoteContentConfig
+from agno.knowledge.remote_content.remote_content import AzureBlobContent
+from agno.utils.log import log_debug, log_error, log_info, log_warning
+from agno.utils.string import generate_id
+
+
+class AzureBlobLoader(BaseLoader):
+    """Loader for Azure Blob Storage content."""
+
+    # ==========================================
+    # AZURE BLOB HELPERS (shared between sync/async)
+    # ==========================================
+
+    def _validate_azure_config(
+        self,
+        content: Content,
+        config: Optional[RemoteContentConfig],
+    ) -> Optional[AzureBlobConfig]:
+        """Validate and extract Azure Blob config.
+
+        Returns:
+            AzureBlobConfig if valid, None otherwise
+        """
+        remote_content: AzureBlobContent = cast(AzureBlobContent, content.remote_content)
+        azure_config = cast(AzureBlobConfig, config) if isinstance(config, AzureBlobConfig) else None
+
+        if azure_config is None:
+            log_error(f"Azure Blob config not found for config_id: {remote_content.config_id}")
+            return None
+
+        return azure_config
+
+    def _get_azure_blob_client(self, azure_config: AzureBlobConfig):
+        """Get a sync Azure Blob Service Client using client credentials flow.
+
+        Requires the `azure-identity` and `azure-storage-blob` packages.
+        """
+        try:
+            from azure.identity import ClientSecretCredential  # type: ignore
+            from azure.storage.blob import BlobServiceClient  # type: ignore
+        except ImportError:
+            raise ImportError(
+                "The `azure-identity` and `azure-storage-blob` packages are not installed. "
+                "Please install them via `pip install azure-identity azure-storage-blob`."
+            )
+
+        credential = ClientSecretCredential(
+            tenant_id=azure_config.tenant_id,
+            client_id=azure_config.client_id,
+            client_secret=azure_config.client_secret,
+        )
+
+        blob_service = BlobServiceClient(
+            account_url=f"https://{azure_config.storage_account}.blob.core.windows.net",
+            credential=credential,
+        )
+
+        return blob_service
+
+    def _get_azure_blob_client_async(self, azure_config: AzureBlobConfig):
+        """Get an async Azure Blob Service Client using client credentials flow.
+
+        Requires the `azure-identity` and `azure-storage-blob` packages.
+        Uses the async versions from azure.storage.blob.aio and azure.identity.aio.
+        """
+        try:
+            from azure.identity.aio import ClientSecretCredential  # type: ignore
+            from azure.storage.blob.aio import BlobServiceClient  # type: ignore
+        except ImportError:
+            raise ImportError(
+                "The `azure-identity` and `azure-storage-blob` packages are not installed. "
+                "Please install them via `pip install azure-identity azure-storage-blob`."
+            )
+
+        credential = ClientSecretCredential(
+            tenant_id=azure_config.tenant_id,
+            client_id=azure_config.client_id,
+            client_secret=azure_config.client_secret,
+        )
+
+        blob_service = BlobServiceClient(
+            account_url=f"https://{azure_config.storage_account}.blob.core.windows.net",
+            credential=credential,
+        )
+
+        return blob_service
+
+    def _build_azure_metadata(
+        self,
+        azure_config: AzureBlobConfig,
+        blob_name: str,
+        file_name: str,
+    ) -> Dict[str, str]:
+        """Build Azure Blob-specific metadata dictionary."""
+        return {
+            "source_type": "azure_blob",
+            "source_config_id": azure_config.id,
+            "source_config_name": azure_config.name,
+            "azure_storage_account": azure_config.storage_account,
+            "azure_container": azure_config.container,
+            "azure_blob_name": blob_name,
+            "azure_filename": file_name,
+        }
+
+    def _build_azure_virtual_path(
+        self,
+        storage_account: str,
+        container: str,
+        blob_name: str,
+    ) -> str:
+        """Build virtual path for Azure Blob content."""
+        return f"azure://{storage_account}/{container}/{blob_name}"
+
+    def _get_azure_root_path(self, remote_content: AzureBlobContent) -> str:
+        """Get the root path for computing relative paths."""
+        return remote_content.prefix or ""
+
+    # ==========================================
+    # AZURE BLOB LOADERS
+    # ==========================================
+
+    async def _aload_from_azure_blob(
+        self,
+        content: Content,
+        upsert: bool,
+        skip_if_exists: bool,
+        config: Optional[RemoteContentConfig] = None,
+    ):
+        """Load content from Azure Blob Storage (async).
+
+        Requires the AzureBlobConfig to contain tenant_id, client_id, client_secret,
+        storage_account, and container.
+
+        Uses the async Azure SDK to avoid blocking the event loop.
+        """
+        remote_content: AzureBlobContent = cast(AzureBlobContent, content.remote_content)
+        azure_config = self._validate_azure_config(content, config)
+        if azure_config is None:
+            return
+
+        # Get async blob service client
+        try:
+            blob_service = self._get_azure_blob_client_async(azure_config)
+        except ImportError as e:
+            log_error(str(e))
+            return
+        except Exception as e:
+            log_error(f"Error creating Azure Blob client: {e}")
+            return
+
+        # Use async context manager for proper resource cleanup
+        async with blob_service:
+            container_client = blob_service.get_container_client(azure_config.container)
+
+            # Helper to list blobs with a given prefix (async)
+            async def list_blobs_with_prefix(prefix: str) -> List[Dict[str, Any]]:
+                """List all blobs under a given prefix (folder)."""
+                results: List[Dict[str, Any]] = []
+                normalized_prefix = prefix.rstrip("/") + "/" if not prefix.endswith("/") else prefix
+                async for blob in container_client.list_blobs(name_starts_with=normalized_prefix):
+                    if not blob.name.endswith("/"):
+                        results.append(
+                            {
+                                "name": blob.name,
+                                "size": blob.size,
+                                "content_type": blob.content_settings.content_type if blob.content_settings else None,
+                            }
+                        )
+                return results
+
+            # Identify blobs to process
+            blobs_to_process: List[Dict[str, Any]] = []
+
+            try:
+                if remote_content.blob_name:
+                    blob_client = container_client.get_blob_client(remote_content.blob_name)
+                    try:
+                        props = await blob_client.get_blob_properties()
+                        blobs_to_process.append(
+                            {
+                                "name": remote_content.blob_name,
+                                "size": props.size,
+                                "content_type": props.content_settings.content_type if props.content_settings else None,
+                            }
+                        )
+                    except Exception:
+                        log_debug(f"Blob {remote_content.blob_name} not found, checking if it's a folder...")
+                        blobs_to_process = await list_blobs_with_prefix(remote_content.blob_name)
+                        if not blobs_to_process:
+                            log_error(
+                                f"No blob or folder found at path: {remote_content.blob_name}. "
+                                "If this is a folder, ensure files exist inside it."
+                            )
+                            return
+                elif remote_content.prefix:
+                    blobs_to_process = await list_blobs_with_prefix(remote_content.prefix)
+            except Exception as e:
+                log_error(f"Error listing Azure blobs: {e}")
+                return
+
+            if not blobs_to_process:
+                log_warning(f"No blobs found in Azure container: {azure_config.container}")
+                return
+
+            log_info(f"Processing {len(blobs_to_process)} file(s) from Azure Blob Storage")
+            is_folder_upload = len(blobs_to_process) > 1
+            root_path = self._get_azure_root_path(remote_content)
+
+            for blob_info in blobs_to_process:
+                blob_name = blob_info["name"]
+                file_name = blob_name.split("/")[-1]
+
+                # Build metadata and virtual path using helpers
+                virtual_path = self._build_azure_virtual_path(
+                    azure_config.storage_account, azure_config.container, blob_name
+                )
+                azure_metadata = self._build_azure_metadata(azure_config, blob_name, file_name)
+                merged_metadata = self._merge_metadata(azure_metadata, content.metadata)
+
+                # Compute content name using base helper
+                content_name = self._compute_content_name(
+                    blob_name, file_name, content.name, root_path, is_folder_upload
+                )
+
+                # Create content entry using base helper
+                content_entry = self._create_content_entry(
+                    content, content_name, virtual_path, merged_metadata, "azure_blob", is_folder_upload
+                )
+
+                await self._ainsert_contents_db(content_entry)
+
+                if self._should_skip(content_entry.content_hash, skip_if_exists):
+                    content_entry.status = ContentStatus.COMPLETED
+                    await self._aupdate_content(content_entry)
+                    continue
+
+                # Download blob (async)
+                try:
+                    blob_client = container_client.get_blob_client(blob_name)
+                    download_stream = await blob_client.download_blob()
+                    blob_data = await download_stream.readall()
+                    file_content = BytesIO(blob_data)
+                except Exception as e:
+                    log_error(f"Error downloading Azure blob {blob_name}: {e}")
+                    content_entry.status = ContentStatus.FAILED
+                    content_entry.status_message = str(e)
+                    await self._aupdate_content(content_entry)
+                    continue
+
+                # Select reader and read content
+                reader = self._select_reader_by_uri(file_name, content.reader)
+                if reader is None:
+                    log_warning(f"No reader found for file: {file_name}")
+                    content_entry.status = ContentStatus.FAILED
+                    content_entry.status_message = "No suitable reader found"
+                    await self._aupdate_content(content_entry)
+                    continue
+
+                reader = cast(Reader, reader)
+                read_documents = await reader.async_read(file_content, name=file_name)
+
+                # Prepare and insert into vector database
+                if not content_entry.id:
+                    content_entry.id = generate_id(content_entry.content_hash or "")
+                self._prepare_documents_for_insert(read_documents, content_entry.id)
+                await self._ahandle_vector_db_insert(content_entry, read_documents, upsert)
+
+    def _load_from_azure_blob(
+        self,
+        content: Content,
+        upsert: bool,
+        skip_if_exists: bool,
+        config: Optional[RemoteContentConfig] = None,
+    ):
+        """Load content from Azure Blob Storage (sync).
+
+        Requires the AzureBlobConfig to contain tenant_id, client_id, client_secret,
+        storage_account, and container.
+        """
+        remote_content: AzureBlobContent = cast(AzureBlobContent, content.remote_content)
+        azure_config = self._validate_azure_config(content, config)
+        if azure_config is None:
+            return
+
+        # Get blob service client
+        try:
+            blob_service = self._get_azure_blob_client(azure_config)
+        except ImportError as e:
+            log_error(str(e))
+            return
+        except Exception as e:
+            log_error(f"Error creating Azure Blob client: {e}")
+            return
+
+        # Use context manager for proper resource cleanup
+        with blob_service:
+            container_client = blob_service.get_container_client(azure_config.container)
+
+            # Helper to list blobs with a given prefix
+            def list_blobs_with_prefix(prefix: str) -> List[Dict[str, Any]]:
+                """List all blobs under a given prefix (folder)."""
+                results: List[Dict[str, Any]] = []
+                normalized_prefix = prefix.rstrip("/") + "/" if not prefix.endswith("/") else prefix
+                blobs = container_client.list_blobs(name_starts_with=normalized_prefix)
+                for blob in blobs:
+                    if not blob.name.endswith("/"):
+                        results.append(
+                            {
+                                "name": blob.name,
+                                "size": blob.size,
+                                "content_type": blob.content_settings.content_type if blob.content_settings else None,
+                            }
+                        )
+                return results
+
+            # Identify blobs to process
+            blobs_to_process: List[Dict[str, Any]] = []
+
+            try:
+                if remote_content.blob_name:
+                    blob_client = container_client.get_blob_client(remote_content.blob_name)
+                    try:
+                        props = blob_client.get_blob_properties()
+                        blobs_to_process.append(
+                            {
+                                "name": remote_content.blob_name,
+                                "size": props.size,
+                                "content_type": props.content_settings.content_type if props.content_settings else None,
+                            }
+                        )
+                    except Exception:
+                        log_debug(f"Blob {remote_content.blob_name} not found, checking if it's a folder...")
+                        blobs_to_process = list_blobs_with_prefix(remote_content.blob_name)
+                        if not blobs_to_process:
+                            log_error(
+                                f"No blob or folder found at path: {remote_content.blob_name}. "
+                                "If this is a folder, ensure files exist inside it."
+                            )
+                            return
+                elif remote_content.prefix:
+                    blobs_to_process = list_blobs_with_prefix(remote_content.prefix)
+            except Exception as e:
+                log_error(f"Error listing Azure blobs: {e}")
+                return
+
+            if not blobs_to_process:
+                log_warning(f"No blobs found in Azure container: {azure_config.container}")
+                return
+
+            log_info(f"Processing {len(blobs_to_process)} file(s) from Azure Blob Storage")
+            is_folder_upload = len(blobs_to_process) > 1
+            root_path = self._get_azure_root_path(remote_content)
+
+            for blob_info in blobs_to_process:
+                blob_name = blob_info["name"]
+                file_name = blob_name.split("/")[-1]
+
+                # Build metadata and virtual path using helpers
+                virtual_path = self._build_azure_virtual_path(
+                    azure_config.storage_account, azure_config.container, blob_name
+                )
+                azure_metadata = self._build_azure_metadata(azure_config, blob_name, file_name)
+                merged_metadata = self._merge_metadata(azure_metadata, content.metadata)
+
+                # Compute content name using base helper
+                content_name = self._compute_content_name(
+                    blob_name, file_name, content.name, root_path, is_folder_upload
+                )
+
+                # Create content entry using base helper
+                content_entry = self._create_content_entry(
+                    content, content_name, virtual_path, merged_metadata, "azure_blob", is_folder_upload
+                )
+
+                self._insert_contents_db(content_entry)
+
+                if self._should_skip(content_entry.content_hash, skip_if_exists):
+                    content_entry.status = ContentStatus.COMPLETED
+                    self._update_content(content_entry)
+                    continue
+
+                # Download blob
+                try:
+                    blob_client = container_client.get_blob_client(blob_name)
+                    download_stream = blob_client.download_blob()
+                    file_content = BytesIO(download_stream.readall())
+                except Exception as e:
+                    log_error(f"Error downloading Azure blob {blob_name}: {e}")
+                    content_entry.status = ContentStatus.FAILED
+                    content_entry.status_message = str(e)
+                    self._update_content(content_entry)
+                    continue
+
+                # Select reader and read content
+                reader = self._select_reader_by_uri(file_name, content.reader)
+                if reader is None:
+                    log_warning(f"No reader found for file: {file_name}")
+                    content_entry.status = ContentStatus.FAILED
+                    content_entry.status_message = "No suitable reader found"
+                    self._update_content(content_entry)
+                    continue
+
+                reader = cast(Reader, reader)
+                read_documents = reader.read(file_content, name=file_name)
+
+                # Prepare and insert into vector database
+                if not content_entry.id:
+                    content_entry.id = generate_id(content_entry.content_hash or "")
+                self._prepare_documents_for_insert(read_documents, content_entry.id)
+                self._handle_vector_db_insert(content_entry, read_documents, upsert)
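
For context, the authentication path above is the standard Azure client-credentials flow. A standalone sketch of the same flow against the Azure SDK, outside the loader; the tenant, client, account, container, and prefix placeholders are hypothetical, not values from this diff:

    from azure.identity import ClientSecretCredential
    from azure.storage.blob import BlobServiceClient

    credential = ClientSecretCredential(
        tenant_id="<tenant-id>",
        client_id="<client-id>",
        client_secret="<client-secret>",
    )
    # Context manager mirrors the loader's `with blob_service:` cleanup.
    with BlobServiceClient(
        account_url="https://<storage-account>.blob.core.windows.net",
        credential=credential,
    ) as blob_service:
        container_client = blob_service.get_container_client("<container>")
        # Same filtering as list_blobs_with_prefix: folder-marker blobs end in "/".
        for blob in container_client.list_blobs(name_starts_with="docs/"):
            if not blob.name.endswith("/"):
                print(blob.name, blob.size)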
agno/knowledge/loaders/base.py
@@ -0,0 +1,187 @@
+"""Base loader class with shared utilities for all content loaders.
+
+Provides common helpers for:
+- Computing content names for files
+- Creating Content entries
+- Building metadata dictionaries
+"""
+
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional
+
+from agno.knowledge.content import Content, ContentStatus
+from agno.utils.string import generate_id
+
+
+@dataclass
+class FileToProcess:
+    """Represents a file identified for processing."""
+
+    path: str
+    name: str
+    size: Optional[int] = None
+    content_type: Optional[str] = None
+
+
+class BaseLoader:
+    """Base class with shared loader utilities.
+
+    This class provides common methods used by all content loaders to reduce
+    code duplication between sync and async implementations.
+
+    Methods that call self._build_content_hash() assume they are mixed into
+    a class that provides this method (e.g., Knowledge via RemoteKnowledge).
+    """
+
+    def _compute_content_name(
+        self,
+        file_path: str,
+        file_name: str,
+        base_name: Optional[str],
+        root_path: str,
+        is_folder_upload: bool,
+    ) -> str:
+        """Compute the content name for a file.
+
+        Args:
+            file_path: Full path to the file
+            file_name: Name of the file
+            base_name: User-provided base name for the content
+            root_path: Root path of the upload (for computing relative paths)
+            is_folder_upload: Whether this is part of a folder upload
+
+        Returns:
+            The computed content name
+        """
+        if is_folder_upload:
+            relative_path = file_path
+            if root_path and file_path.startswith(root_path + "/"):
+                relative_path = file_path[len(root_path) + 1 :]
+            return f"{base_name}/{relative_path}" if base_name else file_path
+        return base_name or file_name
+
+    def _create_content_entry_for_folder(
+        self,
+        content: Content,
+        content_name: str,
+        virtual_path: str,
+        metadata: Dict[str, Any],
+        file_type: str,
+    ) -> Content:
+        """Create a new Content entry for a file in a folder upload.
+
+        Args:
+            content: Original content object (used for description)
+            content_name: Name for the new content entry
+            virtual_path: Virtual path for hashing
+            metadata: Metadata dictionary
+            file_type: Type of file (e.g., 'github', 'azure_blob')
+
+        Returns:
+            New Content entry with hash and ID set
+        """
+        entry = Content(
+            name=content_name,
+            description=content.description,
+            path=virtual_path,
+            status=ContentStatus.PROCESSING,
+            metadata=metadata,
+            file_type=file_type,
+        )
+        entry.content_hash = self._build_content_hash(entry)  # type: ignore[attr-defined]
+        entry.id = generate_id(entry.content_hash)
+        return entry
+
+    def _update_content_entry_for_single_file(
+        self,
+        content: Content,
+        virtual_path: str,
+        metadata: Dict[str, Any],
+        file_type: str,
+    ) -> Content:
+        """Update an existing Content entry for a single file upload.
+
+        Args:
+            content: Original content object to update
+            virtual_path: Virtual path for hashing
+            metadata: Metadata dictionary
+            file_type: Type of file (e.g., 'github', 'azure_blob')
+
+        Returns:
+            Updated Content entry with hash and ID set if not already present
+        """
+        content.path = virtual_path
+        content.status = ContentStatus.PROCESSING
+        content.metadata = metadata
+        content.file_type = file_type
+        if not content.content_hash:
+            content.content_hash = self._build_content_hash(content)  # type: ignore[attr-defined]
+        if not content.id:
+            content.id = generate_id(content.content_hash)
+        return content
+
+    def _create_content_entry(
+        self,
+        content: Content,
+        content_name: str,
+        virtual_path: str,
+        metadata: Dict[str, Any],
+        file_type: str,
+        is_folder_upload: bool,
+    ) -> Content:
+        """Create or update a Content entry for a file.
+
+        For folder uploads, creates a new Content entry.
+        For single file uploads, updates the original Content object.
+
+        Args:
+            content: Original content object
+            content_name: Name for the content entry
+            virtual_path: Virtual path for hashing
+            metadata: Metadata dictionary
+            file_type: Type of file (e.g., 'github', 'azure_blob')
+            is_folder_upload: Whether this is part of a folder upload
+
+        Returns:
+            Content entry with hash and ID set
+        """
+        if is_folder_upload:
+            return self._create_content_entry_for_folder(content, content_name, virtual_path, metadata, file_type)
+        return self._update_content_entry_for_single_file(content, virtual_path, metadata, file_type)
+
+    def _merge_metadata(
+        self,
+        provider_metadata: Dict[str, str],
+        user_metadata: Optional[Dict[str, Any]],
+    ) -> Dict[str, Any]:
+        """Merge provider metadata with user-provided metadata.
+
+        User metadata takes precedence over provider metadata.
+
+        Args:
+            provider_metadata: Metadata from the provider (e.g., GitHub, Azure)
+            user_metadata: User-provided metadata
+
+        Returns:
+            Merged metadata dictionary
+        """
+        return {**provider_metadata, **(user_metadata or {})}
+
+    def _files_to_dict_list(self, files: List[FileToProcess]) -> List[Dict[str, Any]]:
+        """Convert FileToProcess objects to dict list for compatibility.
+
+        Args:
+            files: List of FileToProcess objects
+
+        Returns:
+            List of dictionaries with file info
+        """
+        return [
+            {
+                "path": f.path,
+                "name": f.name,
+                "size": f.size,
+                "content_type": f.content_type,
+            }
+            for f in files
+        ]