agno-2.4.1-py3-none-any.whl → agno-2.4.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agno/db/firestore/firestore.py +58 -65
- agno/db/mysql/async_mysql.py +47 -55
- agno/db/postgres/async_postgres.py +52 -61
- agno/db/sqlite/async_sqlite.py +52 -61
- agno/knowledge/knowledge.py +441 -4
- agno/knowledge/remote_content/__init__.py +4 -0
- agno/knowledge/remote_content/config.py +65 -3
- agno/knowledge/remote_content/remote_content.py +32 -1
- agno/models/ollama/__init__.py +2 -0
- agno/models/ollama/responses.py +100 -0
- agno/models/openai/__init__.py +2 -0
- agno/models/openai/open_responses.py +46 -0
- agno/models/openrouter/__init__.py +2 -0
- agno/models/openrouter/responses.py +146 -0
- agno/os/routers/knowledge/schemas.py +1 -1
- agno/vectordb/lightrag/lightrag.py +7 -6
- agno/vectordb/milvus/milvus.py +79 -48
- {agno-2.4.1.dist-info → agno-2.4.2.dist-info}/METADATA +1 -1
- {agno-2.4.1.dist-info → agno-2.4.2.dist-info}/RECORD +22 -19
- {agno-2.4.1.dist-info → agno-2.4.2.dist-info}/WHEEL +0 -0
- {agno-2.4.1.dist-info → agno-2.4.2.dist-info}/licenses/LICENSE +0 -0
- {agno-2.4.1.dist-info → agno-2.4.2.dist-info}/top_level.txt +0 -0
agno/knowledge/knowledge.py
CHANGED

@@ -19,6 +19,7 @@ from agno.knowledge.content import Content, ContentAuth, ContentStatus, FileData
 from agno.knowledge.document import Document
 from agno.knowledge.reader import Reader, ReaderFactory
 from agno.knowledge.remote_content.config import (
+    AzureBlobConfig,
     GcsConfig,
     GitHubConfig,
     RemoteContentConfig,
@@ -26,6 +27,7 @@ from agno.knowledge.remote_content.config import (
     SharePointConfig,
 )
 from agno.knowledge.remote_content.remote_content import (
+    AzureBlobContent,
     GCSContent,
     GitHubContent,
     RemoteContent,
@@ -1964,6 +1966,9 @@ class Knowledge:
         elif isinstance(remote_content, GitHubContent):
             await self._aload_from_github(content, upsert, skip_if_exists, config)

+        elif isinstance(remote_content, AzureBlobContent):
+            await self._aload_from_azure_blob(content, upsert, skip_if_exists, config)
+
         else:
             log_warning(f"Unsupported remote content type: {type(remote_content)}")

@@ -1976,6 +1981,8 @@ class Knowledge:
     ):
         """Load the contextual S3 content.

+        Note: Uses sync boto3 calls as boto3 doesn't have an async API.
+
         1. Identify objects to read
         2. Setup Content object
         3. Hash content and add it to the contents database
@@ -2042,7 +2049,7 @@ class Knowledge:
             if self._should_skip(content_entry.content_hash, skip_if_exists):
                 content_entry.status = ContentStatus.COMPLETED
                 await self._aupdate_content(content_entry)
-
+                continue

             # 4. Select reader
             reader = self._select_reader_by_uri(s3_object.uri, content.reader)
@@ -2079,6 +2086,8 @@ class Knowledge:
     ):
         """Load the contextual GCS content.

+        Note: Uses sync google-cloud-storage calls as it doesn't have an async API.
+
         1. Identify objects to read
         2. Setup Content object
         3. Hash content and add it to the contents database
@@ -2142,7 +2151,7 @@ class Knowledge:
             if self._should_skip(content_entry.content_hash, skip_if_exists):
                 content_entry.status = ContentStatus.COMPLETED
                 await self._aupdate_content(content_entry)
-
+                continue

             # 4. Select reader
             reader = self._select_reader_by_uri(gcs_object.name, content.reader)
@@ -2190,6 +2199,9 @@ class Knowledge:
         elif isinstance(remote_content, GitHubContent):
             self._load_from_github(content, upsert, skip_if_exists, config)

+        elif isinstance(remote_content, AzureBlobContent):
+            self._load_from_azure_blob(content, upsert, skip_if_exists, config)
+
         else:
             log_warning(f"Unsupported remote content type: {type(remote_content)}")

@@ -2266,7 +2278,7 @@ class Knowledge:
             if self._should_skip(content_entry.content_hash, skip_if_exists):
                 content_entry.status = ContentStatus.COMPLETED
                 self._update_content(content_entry)
-
+                continue

             # 4. Select reader
             reader = self._select_reader_by_uri(s3_object.uri, content.reader)
@@ -2367,7 +2379,7 @@ class Knowledge:
             if self._should_skip(content_entry.content_hash, skip_if_exists):
                 content_entry.status = ContentStatus.COMPLETED
                 self._update_content(content_entry)
-
+                continue

             # 4. Select reader
             reader = self._select_reader_by_uri(gcs_object.name, content.reader)
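
In the four `_should_skip` hunks above, content that already exists is now marked completed and the loop moves on; previously execution fell through to the reader-selection and vector-insert steps. A generic runnable sketch of the behaviour change (hypothetical stand-in helpers, not agno's real ones):

```python
# Sketch of the fixed skip flow: before 2.4.2 a skipped entry was marked
# completed but still fell through to the read/insert steps; the added
# `continue` now short-circuits that iteration.
seen_hashes = {"abc123"}  # hashes already in the contents database (placeholder)

def should_skip(content_hash: str, skip_if_exists: bool) -> bool:
    return skip_if_exists and content_hash in seen_hashes

for content_hash in ["abc123", "def456"]:
    if should_skip(content_hash, skip_if_exists=True):
        print(content_hash, "-> marked completed, skipped")
        continue  # the 2.4.2 fix: do not re-read or re-insert this object
    print(content_hash, "-> read and inserted into the vector db")
```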
@@ -3250,6 +3262,431 @@ class Knowledge:
         self._prepare_documents_for_insert(read_documents, content_entry.id)
         self._handle_vector_db_insert(content_entry, read_documents, upsert)

+    # --- Azure Blob Storage loaders ---
+
+    def _get_azure_blob_client(self, azure_config: AzureBlobConfig):
+        """Get a sync Azure Blob Service Client using client credentials flow.
+
+        Requires the `azure-identity` and `azure-storage-blob` packages.
+        """
+        try:
+            from azure.identity import ClientSecretCredential  # type: ignore
+            from azure.storage.blob import BlobServiceClient  # type: ignore
+        except ImportError:
+            raise ImportError(
+                "The `azure-identity` and `azure-storage-blob` packages are not installed. "
+                "Please install them via `pip install azure-identity azure-storage-blob`."
+            )
+
+        credential = ClientSecretCredential(
+            tenant_id=azure_config.tenant_id,
+            client_id=azure_config.client_id,
+            client_secret=azure_config.client_secret,
+        )
+
+        blob_service = BlobServiceClient(
+            account_url=f"https://{azure_config.storage_account}.blob.core.windows.net",
+            credential=credential,
+        )
+
+        return blob_service
+
+    def _get_azure_blob_client_async(self, azure_config: AzureBlobConfig):
+        """Get an async Azure Blob Service Client using client credentials flow.
+
+        Requires the `azure-identity` and `azure-storage-blob` packages.
+        Uses the async versions from azure.storage.blob.aio and azure.identity.aio.
+        """
+        try:
+            from azure.identity.aio import ClientSecretCredential  # type: ignore
+            from azure.storage.blob.aio import BlobServiceClient  # type: ignore
+        except ImportError:
+            raise ImportError(
+                "The `azure-identity` and `azure-storage-blob` packages are not installed. "
+                "Please install them via `pip install azure-identity azure-storage-blob`."
+            )
+
+        credential = ClientSecretCredential(
+            tenant_id=azure_config.tenant_id,
+            client_id=azure_config.client_id,
+            client_secret=azure_config.client_secret,
+        )
+
+        blob_service = BlobServiceClient(
+            account_url=f"https://{azure_config.storage_account}.blob.core.windows.net",
+            credential=credential,
+        )
+
+        return blob_service
+
+    async def _aload_from_azure_blob(
+        self,
+        content: Content,
+        upsert: bool,
+        skip_if_exists: bool,
+        config: Optional[RemoteContentConfig] = None,
+    ):
+        """Load content from Azure Blob Storage (async version).
+
+        Requires the AzureBlobConfig to contain tenant_id, client_id, client_secret,
+        storage_account, and container.
+
+        Uses the async Azure SDK to avoid blocking the event loop.
+
+        1. Authenticate with Azure AD using client credentials
+        2. List blobs in container (by prefix or single blob)
+        3. Download and process each blob
+        4. Insert to vector database
+        """
+        remote_content: AzureBlobContent = cast(AzureBlobContent, content.remote_content)
+        azure_config = cast(AzureBlobConfig, config) if isinstance(config, AzureBlobConfig) else None
+
+        if azure_config is None:
+            log_error(f"Azure Blob config not found for config_id: {remote_content.config_id}")
+            return
+
+        # Get async blob service client
+        try:
+            blob_service = self._get_azure_blob_client_async(azure_config)
+        except ImportError as e:
+            log_error(str(e))
+            return
+        except Exception as e:
+            log_error(f"Error creating Azure Blob client: {e}")
+            return
+
+        # Use async context manager for proper resource cleanup
+        async with blob_service:
+            container_client = blob_service.get_container_client(azure_config.container)
+
+            # Helper to list blobs with a given prefix (async)
+            async def list_blobs_with_prefix(prefix: str) -> List[Dict[str, Any]]:
+                """List all blobs under a given prefix (folder)."""
+                results: List[Dict[str, Any]] = []
+                normalized_prefix = prefix.rstrip("/") + "/" if not prefix.endswith("/") else prefix
+                async for blob in container_client.list_blobs(name_starts_with=normalized_prefix):
+                    # Skip "directory" markers (blobs ending with /)
+                    if not blob.name.endswith("/"):
+                        results.append(
+                            {
+                                "name": blob.name,
+                                "size": blob.size,
+                                "content_type": blob.content_settings.content_type if blob.content_settings else None,
+                            }
+                        )
+                return results
+
+            # Identify blobs to process
+            blobs_to_process: List[Dict[str, Any]] = []
+
+            try:
+                if remote_content.blob_name:
+                    # Try to get as a single blob first
+                    blob_client = container_client.get_blob_client(remote_content.blob_name)
+                    try:
+                        props = await blob_client.get_blob_properties()
+                        blobs_to_process.append(
+                            {
+                                "name": remote_content.blob_name,
+                                "size": props.size,
+                                "content_type": props.content_settings.content_type if props.content_settings else None,
+                            }
+                        )
+                    except Exception:
+                        # Blob doesn't exist - check if it's actually a folder (prefix)
+                        log_debug(f"Blob {remote_content.blob_name} not found, checking if it's a folder...")
+                        blobs_to_process = await list_blobs_with_prefix(remote_content.blob_name)
+                        if not blobs_to_process:
+                            log_error(
+                                f"No blob or folder found at path: {remote_content.blob_name}. "
+                                "If this is a folder, ensure files exist inside it."
+                            )
+                            return
+                elif remote_content.prefix:
+                    # List blobs with prefix
+                    blobs_to_process = await list_blobs_with_prefix(remote_content.prefix)
+            except Exception as e:
+                log_error(f"Error listing Azure blobs: {e}")
+                return
+
+            if not blobs_to_process:
+                log_warning(f"No blobs found in Azure container: {azure_config.container}")
+                return
+
+            # For single file uploads, use the original content object to preserve the ID
+            # returned by the API. For folder uploads, create new content entries for each file.
+            is_folder_upload = len(blobs_to_process) > 1
+
+            # Process each blob
+            for blob_info in blobs_to_process:
+                blob_name = blob_info["name"]
+                file_name = blob_name.split("/")[-1]
+
+                # Build a unique virtual path for hashing
+                virtual_path = f"azure://{azure_config.storage_account}/{azure_config.container}/{blob_name}"
+
+                # Build metadata
+                azure_metadata = {
+                    "source_type": "azure_blob",
+                    "source_config_id": azure_config.id,
+                    "source_config_name": azure_config.name,
+                    "azure_storage_account": azure_config.storage_account,
+                    "azure_container": azure_config.container,
+                    "azure_blob_name": blob_name,
+                    "azure_filename": file_name,
+                }
+                merged_metadata = {**azure_metadata, **(content.metadata or {})}
+
+                # Setup Content object
+                if is_folder_upload:
+                    # For folder uploads, create new content entries for each file
+                    relative_path = blob_name
+                    if remote_content.prefix and blob_name.startswith(remote_content.prefix):
+                        relative_path = blob_name[len(remote_content.prefix) :].lstrip("/")
+                    content_name = f"{content.name}/{relative_path}" if content.name else blob_name
+
+                    content_entry = Content(
+                        name=content_name,
+                        description=content.description,
+                        path=virtual_path,
+                        status=ContentStatus.PROCESSING,
+                        metadata=merged_metadata,
+                        file_type="azure_blob",
+                    )
+                    content_entry.content_hash = self._build_content_hash(content_entry)
+                    content_entry.id = generate_id(content_entry.content_hash)
+                else:
+                    # For single file uploads, use the original content object to preserve ID
+                    content_entry = content
+                    content_entry.path = virtual_path
+                    content_entry.status = ContentStatus.PROCESSING
+                    content_entry.metadata = merged_metadata
+                    content_entry.file_type = "azure_blob"
+                    # Use existing id and content_hash from the original content if available
+                    if not content_entry.content_hash:
+                        content_entry.content_hash = self._build_content_hash(content_entry)
+                    if not content_entry.id:
+                        content_entry.id = generate_id(content_entry.content_hash)
+
+                await self._ainsert_contents_db(content_entry)
+
+                if self._should_skip(content_entry.content_hash, skip_if_exists):
+                    content_entry.status = ContentStatus.COMPLETED
+                    await self._aupdate_content(content_entry)
+                    continue
+
+                # Download blob (async)
+                try:
+                    blob_client = container_client.get_blob_client(blob_name)
+                    download_stream = await blob_client.download_blob()
+                    blob_data = await download_stream.readall()
+                    file_content = BytesIO(blob_data)
+                except Exception as e:
+                    log_error(f"Error downloading Azure blob {blob_name}: {e}")
+                    content_entry.status = ContentStatus.FAILED
+                    content_entry.status_message = str(e)
+                    await self._aupdate_content(content_entry)
+                    continue
+
+                # Select reader and read content
+                reader = self._select_reader_by_uri(file_name, content.reader)
+                if reader is None:
+                    log_warning(f"No reader found for file: {file_name}")
+                    content_entry.status = ContentStatus.FAILED
+                    content_entry.status_message = "No suitable reader found"
+                    await self._aupdate_content(content_entry)
+                    continue
+
+                reader = cast(Reader, reader)
+                read_documents = await reader.async_read(file_content, name=file_name)
+
+                # Prepare and insert into vector database
+                if not content_entry.id:
+                    content_entry.id = generate_id(content_entry.content_hash or "")
+                self._prepare_documents_for_insert(read_documents, content_entry.id)
+                await self._ahandle_vector_db_insert(content_entry, read_documents, upsert)
+
+    def _load_from_azure_blob(
+        self,
+        content: Content,
+        upsert: bool,
+        skip_if_exists: bool,
+        config: Optional[RemoteContentConfig] = None,
+    ):
+        """Synchronous version of _load_from_azure_blob.
+
+        Load content from Azure Blob Storage:
+        1. Authenticate with Azure AD using client credentials
+        2. List blobs in container (by prefix or single blob)
+        3. Download and process each blob
+        4. Insert to vector database
+        """
+        remote_content: AzureBlobContent = cast(AzureBlobContent, content.remote_content)
+        azure_config = cast(AzureBlobConfig, config) if isinstance(config, AzureBlobConfig) else None
+
+        if azure_config is None:
+            log_error(f"Azure Blob config not found for config_id: {remote_content.config_id}")
+            return
+
+        # Get blob service client
+        try:
+            blob_service = self._get_azure_blob_client(azure_config)
+        except ImportError as e:
+            log_error(str(e))
+            return
+        except Exception as e:
+            log_error(f"Error creating Azure Blob client: {e}")
+            return
+
+        container_client = blob_service.get_container_client(azure_config.container)
+
+        # Helper to list blobs with a given prefix
+        def list_blobs_with_prefix(prefix: str) -> List[Dict[str, Any]]:
+            """List all blobs under a given prefix (folder)."""
+            results: List[Dict[str, Any]] = []
+            normalized_prefix = prefix.rstrip("/") + "/" if not prefix.endswith("/") else prefix
+            blobs = container_client.list_blobs(name_starts_with=normalized_prefix)
+            for blob in blobs:
+                # Skip "directory" markers (blobs ending with /)
+                if not blob.name.endswith("/"):
+                    results.append(
+                        {
+                            "name": blob.name,
+                            "size": blob.size,
+                            "content_type": blob.content_settings.content_type if blob.content_settings else None,
+                        }
+                    )
+            return results
+
+        # Identify blobs to process
+        blobs_to_process: List[Dict[str, Any]] = []
+
+        try:
+            if remote_content.blob_name:
+                # Try to get as a single blob first
+                blob_client = container_client.get_blob_client(remote_content.blob_name)
+                try:
+                    props = blob_client.get_blob_properties()
+                    blobs_to_process.append(
+                        {
+                            "name": remote_content.blob_name,
+                            "size": props.size,
+                            "content_type": props.content_settings.content_type if props.content_settings else None,
+                        }
+                    )
+                except Exception:
+                    # Blob doesn't exist - check if it's actually a folder (prefix)
+                    log_debug(f"Blob {remote_content.blob_name} not found, checking if it's a folder...")
+                    blobs_to_process = list_blobs_with_prefix(remote_content.blob_name)
+                    if not blobs_to_process:
+                        log_error(
+                            f"No blob or folder found at path: {remote_content.blob_name}. "
+                            "If this is a folder, ensure files exist inside it."
+                        )
+                        return
+            elif remote_content.prefix:
+                # List blobs with prefix
+                blobs_to_process = list_blobs_with_prefix(remote_content.prefix)
+        except Exception as e:
+            log_error(f"Error listing Azure blobs: {e}")
+            return
+
+        if not blobs_to_process:
+            log_warning(f"No blobs found in Azure container: {azure_config.container}")
+            return
+
+        # For single file uploads, use the original content object to preserve the ID
+        # returned by the API. For folder uploads, create new content entries for each file.
+        is_folder_upload = len(blobs_to_process) > 1
+
+        # Process each blob
+        for blob_info in blobs_to_process:
+            blob_name = blob_info["name"]
+            file_name = blob_name.split("/")[-1]
+
+            # Build a unique virtual path for hashing
+            virtual_path = f"azure://{azure_config.storage_account}/{azure_config.container}/{blob_name}"
+
+            # Build metadata
+            azure_metadata = {
+                "source_type": "azure_blob",
+                "source_config_id": azure_config.id,
+                "source_config_name": azure_config.name,
+                "azure_storage_account": azure_config.storage_account,
+                "azure_container": azure_config.container,
+                "azure_blob_name": blob_name,
+                "azure_filename": file_name,
+            }
+            merged_metadata = {**azure_metadata, **(content.metadata or {})}
+
+            # Setup Content object
+            if is_folder_upload:
+                # For folder uploads, create new content entries for each file
+                relative_path = blob_name
+                if remote_content.prefix and blob_name.startswith(remote_content.prefix):
+                    relative_path = blob_name[len(remote_content.prefix) :].lstrip("/")
+                content_name = f"{content.name}/{relative_path}" if content.name else blob_name
+
+                content_entry = Content(
+                    name=content_name,
+                    description=content.description,
+                    path=virtual_path,
+                    status=ContentStatus.PROCESSING,
+                    metadata=merged_metadata,
+                    file_type="azure_blob",
+                )
+                content_entry.content_hash = self._build_content_hash(content_entry)
+                content_entry.id = generate_id(content_entry.content_hash)
+            else:
+                # For single file uploads, use the original content object to preserve ID
+                content_entry = content
+                content_entry.path = virtual_path
+                content_entry.status = ContentStatus.PROCESSING
+                content_entry.metadata = merged_metadata
+                content_entry.file_type = "azure_blob"
+                # Use existing id and content_hash from the original content if available
+                if not content_entry.content_hash:
+                    content_entry.content_hash = self._build_content_hash(content_entry)
+                if not content_entry.id:
+                    content_entry.id = generate_id(content_entry.content_hash)
+
+            self._insert_contents_db(content_entry)
+
+            if self._should_skip(content_entry.content_hash, skip_if_exists):
+                content_entry.status = ContentStatus.COMPLETED
+                self._update_content(content_entry)
+                continue
+
+            # Download blob
+            try:
+                blob_client = container_client.get_blob_client(blob_name)
+                download_stream = blob_client.download_blob()
+                file_content = BytesIO(download_stream.readall())
+            except Exception as e:
+                log_error(f"Error downloading Azure blob {blob_name}: {e}")
+                content_entry.status = ContentStatus.FAILED
+                content_entry.status_message = str(e)
+                self._update_content(content_entry)
+                continue
+
+            # Select reader and read content
+            reader = self._select_reader_by_uri(file_name, content.reader)
+            if reader is None:
+                log_warning(f"No reader found for file: {file_name}")
+                content_entry.status = ContentStatus.FAILED
+                content_entry.status_message = "No suitable reader found"
+                self._update_content(content_entry)
+                continue
+
+            reader = cast(Reader, reader)
+            read_documents = reader.read(file_content, name=file_name)
+
+            # Prepare and insert into vector database
+            if not content_entry.id:
+                content_entry.id = generate_id(content_entry.content_hash or "")
+            self._prepare_documents_for_insert(read_documents, content_entry.id)
+            self._handle_vector_db_insert(content_entry, read_documents, upsert)
+
     async def _ahandle_vector_db_insert(self, content: Content, read_documents, upsert):
         from agno.vectordb import VectorDb

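The `_get_azure_blob_client*` helpers above wrap the standard Azure SDK client-credentials flow, and the loaders then list and download blobs through the resulting container client. As a standalone reference, here is a minimal sketch of the same pattern outside agno; the tenant, account, container, and prefix values are placeholders, and it assumes `azure-identity` and `azure-storage-blob` are installed:

```python
# Minimal sketch of the list-and-download pattern used by the new loaders.
# All credential/account values below are placeholders.
from io import BytesIO

from azure.identity import ClientSecretCredential
from azure.storage.blob import BlobServiceClient

credential = ClientSecretCredential(
    tenant_id="<tenant-id>",
    client_id="<client-id>",
    client_secret="<client-secret>",
)
service = BlobServiceClient(
    account_url="https://<storage-account>.blob.core.windows.net",
    credential=credential,
)
container = service.get_container_client("docs")

# List blobs under a prefix, skipping "directory" marker blobs, then download
for blob in container.list_blobs(name_starts_with="reports/"):
    if blob.name.endswith("/"):
        continue
    data = BytesIO(container.get_blob_client(blob.name).download_blob().readall())
    print(blob.name, len(data.getvalue()), "bytes")
```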

agno/knowledge/remote_content/__init__.py
CHANGED

@@ -1,4 +1,5 @@
 from agno.knowledge.remote_content.config import (
+    AzureBlobConfig,
     GcsConfig,
     GitHubConfig,
     RemoteContentConfig,
@@ -6,6 +7,7 @@ from agno.knowledge.remote_content.config import (
     SharePointConfig,
 )
 from agno.knowledge.remote_content.remote_content import (
+    AzureBlobContent,
     GCSContent,
     GitHubContent,
     RemoteContent,
@@ -20,10 +22,12 @@ __all__ = [
     "GcsConfig",
     "SharePointConfig",
     "GitHubConfig",
+    "AzureBlobConfig",
     # Content classes
     "RemoteContent",
     "S3Content",
     "GCSContent",
     "SharePointContent",
     "GitHubContent",
+    "AzureBlobContent",
 ]
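With these exports, an Azure source can be declared once and turned into content references via the `file()`/`folder()` helpers defined in config.py (see below). A sketch with placeholder credentials:

```python
# Sketch: declaring an Azure source and deriving content references from it.
# Credential values are placeholders.
from agno.knowledge.remote_content import AzureBlobConfig

config = AzureBlobConfig(
    id="company-docs",
    name="Company Documents",
    tenant_id="<tenant-id>",
    client_id="<client-id>",
    client_secret="<client-secret>",
    storage_account="<storage-account>",
    container="docs",
)

single_blob = config.file("reports/q3.pdf")  # AzureBlobContent with blob_name set
whole_folder = config.folder("reports/")     # AzureBlobContent with prefix set
```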

agno/knowledge/remote_content/config.py
CHANGED

@@ -2,10 +2,11 @@ from __future__ import annotations

 from typing import TYPE_CHECKING, Optional

-from pydantic import BaseModel
+from pydantic import BaseModel, ConfigDict

 if TYPE_CHECKING:
     from agno.knowledge.remote_content.remote_content import (
+        AzureBlobContent,
         GCSContent,
         GitHubContent,
         S3Content,
@@ -20,8 +21,7 @@ class RemoteContentConfig(BaseModel):
     name: str
     metadata: Optional[dict] = None

-
-    extra = "allow"
+    model_config = ConfigDict(extra="allow")


 class S3Config(RemoteContentConfig):
@@ -202,3 +202,65 @@ class GitHubConfig(RemoteContentConfig):
             folder_path=folder_path,
             branch=branch or self.branch,
         )
+
+
+class AzureBlobConfig(RemoteContentConfig):
+    """Configuration for Azure Blob Storage content source.
+
+    Uses Azure AD client credentials flow for authentication.
+
+    Required Azure AD App Registration permissions:
+    - Storage Blob Data Reader (or Contributor) role on the storage account
+
+    Example:
+        ```python
+        config = AzureBlobConfig(
+            id="company-docs",
+            name="Company Documents",
+            tenant_id=os.getenv("AZURE_TENANT_ID"),
+            client_id=os.getenv("AZURE_CLIENT_ID"),
+            client_secret=os.getenv("AZURE_CLIENT_SECRET"),
+            storage_account=os.getenv("AZURE_STORAGE_ACCOUNT_NAME"),
+            container=os.getenv("AZURE_CONTAINER_NAME"),
+        )
+        ```
+    """
+
+    tenant_id: str
+    client_id: str
+    client_secret: str
+    storage_account: str
+    container: str
+    prefix: Optional[str] = None
+
+    def file(self, blob_name: str) -> "AzureBlobContent":
+        """Create a content reference for a specific blob (file).
+
+        Args:
+            blob_name: The blob name (path to file in container).
+
+        Returns:
+            AzureBlobContent configured with this source's credentials.
+        """
+        from agno.knowledge.remote_content.remote_content import AzureBlobContent
+
+        return AzureBlobContent(
+            config_id=self.id,
+            blob_name=blob_name,
+        )
+
+    def folder(self, prefix: str) -> "AzureBlobContent":
+        """Create a content reference for a folder (prefix).
+
+        Args:
+            prefix: The blob prefix (folder path).
+
+        Returns:
+            AzureBlobContent configured with this source's credentials.
+        """
+        from agno.knowledge.remote_content.remote_content import AzureBlobContent
+
+        return AzureBlobContent(
+            config_id=self.id,
+            prefix=prefix,
+        )
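The `RemoteContentConfig` change replaces an `extra = "allow"` attribute, which Pydantic v2 does not recognize as configuration, with the supported `model_config = ConfigDict(extra="allow")` spelling, so configs can carry provider-specific extra keys. A minimal sketch with a hypothetical model:

```python
from pydantic import BaseModel, ConfigDict

class Cfg(BaseModel):
    # The Pydantic v2 way to accept and keep unknown keys.
    model_config = ConfigDict(extra="allow")
    id: str
    name: str

c = Cfg(id="docs", name="Docs", region="westeurope")  # unknown key is kept
print(c.region)  # -> westeurope
```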

agno/knowledge/remote_content/remote_content.py
CHANGED

@@ -142,4 +142,35 @@ class GitHubContent:
         }


-
+@dataclass
+class AzureBlobContent:
+    """Content reference for Azure Blob Storage files.
+
+    Used with AzureBlobConfig to load files from Azure Blob Storage containers.
+    Supports loading single blobs or entire prefixes (folders).
+    """
+
+    def __init__(
+        self,
+        config_id: str,
+        blob_name: Optional[str] = None,
+        prefix: Optional[str] = None,
+    ):
+        self.config_id = config_id
+        self.blob_name = blob_name
+        self.prefix = prefix
+
+        if self.blob_name is None and self.prefix is None:
+            raise ValueError("Either blob_name or prefix must be provided")
+        if self.blob_name is not None and self.prefix is not None:
+            raise ValueError("Provide either blob_name or prefix, not both")
+
+    def get_config(self):
+        return {
+            "config_id": self.config_id,
+            "blob_name": self.blob_name,
+            "prefix": self.prefix,
+        }
+
+
+RemoteContent = Union[S3Content, GCSContent, SharePointContent, GitHubContent, AzureBlobContent]
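`AzureBlobContent.__init__` enforces that exactly one of `blob_name` or `prefix` is given. A short sketch of the validation behaviour:

```python
from agno.knowledge.remote_content import AzureBlobContent

ok = AzureBlobContent(config_id="company-docs", blob_name="reports/q3.pdf")

try:
    AzureBlobContent(config_id="company-docs")  # neither argument given
except ValueError as e:
    print(e)  # Either blob_name or prefix must be provided

try:
    AzureBlobContent(config_id="company-docs", blob_name="a.pdf", prefix="reports/")
except ValueError as e:
    print(e)  # Provide either blob_name or prefix, not both
```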