agno 2.4.6__py3-none-any.whl → 2.4.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51)
  1. agno/agent/agent.py +5 -1
  2. agno/db/base.py +2 -0
  3. agno/db/postgres/postgres.py +5 -5
  4. agno/db/singlestore/singlestore.py +4 -5
  5. agno/db/sqlite/sqlite.py +4 -4
  6. agno/knowledge/embedder/aws_bedrock.py +325 -106
  7. agno/knowledge/knowledge.py +83 -1853
  8. agno/knowledge/loaders/__init__.py +29 -0
  9. agno/knowledge/loaders/azure_blob.py +423 -0
  10. agno/knowledge/loaders/base.py +187 -0
  11. agno/knowledge/loaders/gcs.py +267 -0
  12. agno/knowledge/loaders/github.py +415 -0
  13. agno/knowledge/loaders/s3.py +281 -0
  14. agno/knowledge/loaders/sharepoint.py +439 -0
  15. agno/knowledge/reader/website_reader.py +2 -2
  16. agno/knowledge/remote_knowledge.py +151 -0
  17. agno/knowledge/reranker/aws_bedrock.py +299 -0
  18. agno/learn/machine.py +5 -6
  19. agno/learn/stores/session_context.py +10 -2
  20. agno/models/azure/openai_chat.py +6 -11
  21. agno/models/neosantara/__init__.py +5 -0
  22. agno/models/neosantara/neosantara.py +42 -0
  23. agno/models/utils.py +5 -0
  24. agno/os/app.py +4 -1
  25. agno/os/interfaces/agui/router.py +1 -1
  26. agno/os/routers/components/components.py +2 -0
  27. agno/os/routers/knowledge/knowledge.py +0 -1
  28. agno/os/routers/registry/registry.py +340 -192
  29. agno/os/routers/workflows/router.py +7 -1
  30. agno/os/schema.py +104 -0
  31. agno/registry/registry.py +4 -0
  32. agno/run/workflow.py +3 -0
  33. agno/session/workflow.py +1 -1
  34. agno/skills/utils.py +100 -2
  35. agno/team/team.py +6 -3
  36. agno/tools/mcp/mcp.py +26 -1
  37. agno/vectordb/lancedb/lance_db.py +22 -7
  38. agno/workflow/__init__.py +4 -0
  39. agno/workflow/cel.py +299 -0
  40. agno/workflow/condition.py +280 -58
  41. agno/workflow/loop.py +177 -46
  42. agno/workflow/parallel.py +75 -4
  43. agno/workflow/router.py +260 -44
  44. agno/workflow/step.py +14 -7
  45. agno/workflow/steps.py +43 -0
  46. agno/workflow/workflow.py +104 -46
  47. {agno-2.4.6.dist-info → agno-2.4.8.dist-info}/METADATA +25 -37
  48. {agno-2.4.6.dist-info → agno-2.4.8.dist-info}/RECORD +51 -39
  49. {agno-2.4.6.dist-info → agno-2.4.8.dist-info}/WHEEL +0 -0
  50. {agno-2.4.6.dist-info → agno-2.4.8.dist-info}/licenses/LICENSE +0 -0
  51. {agno-2.4.6.dist-info → agno-2.4.8.dist-info}/top_level.txt +0 -0
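The headline change in this diff is the refactor of agno/knowledge/knowledge.py: the provider-specific remote-content loaders (S3, GCS, SharePoint, GitHub, Azure Blob) are removed from the Knowledge class and appear to be split out into the new agno/knowledge/loaders/ package, while Knowledge now inherits from the new RemoteKnowledge base class (agno/knowledge/remote_knowledge.py). A minimal sketch of the resulting class relationship, inferred only from the import and class-declaration hunks below; everything beyond those two lines is an assumption, not the actual agno 2.4.8 source:

    # Sketch of the 2.4.8 class relationship shown in the hunks below; not the real source.
    from dataclasses import dataclass
    from typing import Optional

    from agno.knowledge.remote_knowledge import RemoteKnowledge  # new module in 2.4.8

    @dataclass
    class Knowledge(RemoteKnowledge):
        """Knowledge class"""

        # Only the `name` field is visible in the hunk below; the remaining fields are omitted here.
        name: Optional[str] = None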
@@ -9,7 +9,6 @@ from os.path import basename
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Set, Tuple, Union, cast, overload
 
-import httpx
 from httpx import AsyncClient
 
 from agno.db.base import AsyncBaseDb, BaseDb
@@ -19,21 +18,12 @@ from agno.knowledge.content import Content, ContentAuth, ContentStatus, FileData
 from agno.knowledge.document import Document
 from agno.knowledge.reader import Reader, ReaderFactory
 from agno.knowledge.remote_content.config import (
-    AzureBlobConfig,
-    GcsConfig,
-    GitHubConfig,
     RemoteContentConfig,
-    S3Config,
-    SharePointConfig,
 )
 from agno.knowledge.remote_content.remote_content import (
-    AzureBlobContent,
-    GCSContent,
-    GitHubContent,
     RemoteContent,
-    S3Content,
-    SharePointContent,
 )
+from agno.knowledge.remote_knowledge import RemoteKnowledge
 from agno.utils.http import async_fetch_with_retry
 from agno.utils.log import log_debug, log_error, log_info, log_warning
 from agno.utils.string import generate_id
@@ -49,7 +39,7 @@ class KnowledgeContentOrigin(Enum):
 
 
 @dataclass
-class Knowledge:
+class Knowledge(RemoteKnowledge):
     """Knowledge class"""
 
     name: Optional[str] = None
@@ -2091,1847 +2081,6 @@ class Knowledge:
 
         self._handle_vector_db_insert(content, read_documents, upsert)
 
-    async def _aload_from_remote_content(
-        self,
-        content: Content,
-        upsert: bool,
-        skip_if_exists: bool,
-    ):
-        if content.remote_content is None:
-            log_warning("No remote content provided for content")
-            return
-
-        remote_content = content.remote_content
-
-        # Look up config if config_id is provided
-        config = None
-        if hasattr(remote_content, "config_id") and remote_content.config_id:
-            config = self._get_remote_config_by_id(remote_content.config_id)
-            if config is None:
-                log_warning(f"No config found for config_id: {remote_content.config_id}")
-
-        if isinstance(remote_content, S3Content):
-            await self._aload_from_s3(content, upsert, skip_if_exists, config)
-
-        elif isinstance(remote_content, GCSContent):
-            await self._aload_from_gcs(content, upsert, skip_if_exists, config)
-
-        elif isinstance(remote_content, SharePointContent):
-            await self._aload_from_sharepoint(content, upsert, skip_if_exists, config)
-
-        elif isinstance(remote_content, GitHubContent):
-            await self._aload_from_github(content, upsert, skip_if_exists, config)
-
-        elif isinstance(remote_content, AzureBlobContent):
-            await self._aload_from_azure_blob(content, upsert, skip_if_exists, config)
-
-        else:
-            log_warning(f"Unsupported remote content type: {type(remote_content)}")
-
- async def _aload_from_s3(
2132
- self,
2133
- content: Content,
2134
- upsert: bool,
2135
- skip_if_exists: bool,
2136
- config: Optional[RemoteContentConfig] = None,
2137
- ):
2138
- """Load the contextual S3 content.
2139
-
2140
- Note: Uses sync boto3 calls as boto3 doesn't have an async API.
2141
-
2142
- 1. Identify objects to read
2143
- 2. Setup Content object
2144
- 3. Hash content and add it to the contents database
2145
- 4. Select reader
2146
- 5. Fetch and load the content
2147
- 6. Read the content
2148
- 7. Prepare and insert the content in the vector database
2149
- 8. Remove temporary file if needed
2150
- """
2151
- from agno.cloud.aws.s3.bucket import S3Bucket
2152
- from agno.cloud.aws.s3.object import S3Object
2153
-
2154
- # Note: S3 support has limited features compared to GitHub/SharePoint
2155
- log_warning(
2156
- "S3 content loading has limited features. "
2157
- "Recursive folder traversal, rich metadata, and improved naming are coming in a future release."
2158
- )
2159
-
2160
- remote_content: S3Content = cast(S3Content, content.remote_content)
2161
-
2162
- # Get or create bucket with credentials from config
2163
- bucket = remote_content.bucket
2164
- try:
2165
- if bucket is None and remote_content.bucket_name:
2166
- s3_config = cast(S3Config, config) if isinstance(config, S3Config) else None
2167
- bucket = S3Bucket(
2168
- name=remote_content.bucket_name,
2169
- region=s3_config.region if s3_config else None,
2170
- aws_access_key_id=s3_config.aws_access_key_id if s3_config else None,
2171
- aws_secret_access_key=s3_config.aws_secret_access_key if s3_config else None,
2172
- )
2173
- except Exception as e:
2174
- log_error(f"Error getting bucket: {e}")
2175
-
2176
- # 1. Identify objects to read
2177
- objects_to_read: List[S3Object] = []
2178
- if bucket is not None:
2179
- if remote_content.key is not None:
2180
- _object = S3Object(bucket_name=bucket.name, name=remote_content.key)
2181
- objects_to_read.append(_object)
2182
- elif remote_content.object is not None:
2183
- objects_to_read.append(remote_content.object)
2184
- elif remote_content.prefix is not None:
2185
- objects_to_read.extend(bucket.get_objects(prefix=remote_content.prefix))
2186
- else:
2187
- objects_to_read.extend(bucket.get_objects())
2188
-
2189
- for s3_object in objects_to_read:
2190
- # 2. Setup Content object
2191
- content_name = content.name or ""
2192
- content_name += "_" + (s3_object.name or "")
2193
- content_entry = Content(
2194
- name=content_name,
2195
- description=content.description,
2196
- status=ContentStatus.PROCESSING,
2197
- metadata=content.metadata,
2198
- file_type="s3",
2199
- )
2200
-
2201
- # 3. Hash content and add it to the contents database
2202
- content_entry.content_hash = self._build_content_hash(content_entry)
2203
- content_entry.id = generate_id(content_entry.content_hash)
2204
- await self._ainsert_contents_db(content_entry)
2205
- if self._should_skip(content_entry.content_hash, skip_if_exists):
2206
- content_entry.status = ContentStatus.COMPLETED
2207
- await self._aupdate_content(content_entry)
2208
- continue
2209
-
2210
- # 4. Select reader
2211
- reader = self._select_reader_by_uri(s3_object.uri, content.reader)
2212
- reader = cast(Reader, reader)
2213
-
2214
- # 5. Fetch and load the content
2215
- temporary_file = None
2216
- obj_name = content_name or s3_object.name.split("/")[-1]
2217
- readable_content: Optional[Union[BytesIO, Path]] = None
2218
- if s3_object.uri.endswith(".pdf"):
2219
- readable_content = BytesIO(s3_object.get_resource().get()["Body"].read())
2220
- else:
2221
- temporary_file = Path("storage").joinpath(obj_name)
2222
- readable_content = temporary_file
2223
- s3_object.download(readable_content) # type: ignore
2224
-
2225
- # 6. Read the content
2226
- read_documents = await reader.async_read(readable_content, name=obj_name)
2227
-
2228
- # 7. Prepare and insert the content in the vector database
2229
- self._prepare_documents_for_insert(read_documents, content_entry.id)
2230
- await self._ahandle_vector_db_insert(content_entry, read_documents, upsert)
2231
-
2232
- # 8. Remove temporary file if needed
2233
- if temporary_file:
2234
- temporary_file.unlink()
2235
-
-    async def _aload_from_gcs(
-        self,
-        content: Content,
-        upsert: bool,
-        skip_if_exists: bool,
-        config: Optional[RemoteContentConfig] = None,
-    ):
-        """Load the contextual GCS content.
-
-        Note: Uses sync google-cloud-storage calls as it doesn't have an async API.
-
-        1. Identify objects to read
-        2. Setup Content object
-        3. Hash content and add it to the contents database
-        4. Select reader
-        5. Fetch and load the content
-        6. Read the content
-        7. Prepare and insert the content in the vector database
-        """
-        try:
-            from google.cloud import storage  # type: ignore
-        except ImportError:
-            raise ImportError(
-                "The `google-cloud-storage` package is not installed. "
-                "Please install it via `pip install google-cloud-storage`."
-            )
-
-        # Note: GCS support has limited features compared to GitHub/SharePoint
-        log_warning(
-            "GCS content loading has limited features. "
-            "Recursive folder traversal, rich metadata, and improved naming are coming in a future release."
-        )
-
-        remote_content: GCSContent = cast(GCSContent, content.remote_content)
-
-        # Get or create bucket with credentials from config
-        bucket = remote_content.bucket
-        if bucket is None and remote_content.bucket_name:
-            gcs_config = cast(GcsConfig, config) if isinstance(config, GcsConfig) else None
-            if gcs_config and gcs_config.credentials_path:
-                client = storage.Client.from_service_account_json(gcs_config.credentials_path)
-            elif gcs_config and gcs_config.project:
-                client = storage.Client(project=gcs_config.project)
-            else:
-                client = storage.Client()
-            bucket = client.bucket(remote_content.bucket_name)
-
-        # 1. Identify objects to read
-        objects_to_read = []
-        if remote_content.blob_name is not None:
-            objects_to_read.append(bucket.blob(remote_content.blob_name))  # type: ignore
-        elif remote_content.prefix is not None:
-            objects_to_read.extend(bucket.list_blobs(prefix=remote_content.prefix))  # type: ignore
-        else:
-            objects_to_read.extend(bucket.list_blobs())  # type: ignore
-
-        for gcs_object in objects_to_read:
-            # 2. Setup Content object
-            name = (content.name or "content") + "_" + gcs_object.name
-            content_entry = Content(
-                name=name,
-                description=content.description,
-                status=ContentStatus.PROCESSING,
-                metadata=content.metadata,
-                file_type="gcs",
-            )
-
-            # 3. Hash content and add it to the contents database
-            content_entry.content_hash = self._build_content_hash(content_entry)
-            content_entry.id = generate_id(content_entry.content_hash)
-            await self._ainsert_contents_db(content_entry)
-            if self._should_skip(content_entry.content_hash, skip_if_exists):
-                content_entry.status = ContentStatus.COMPLETED
-                await self._aupdate_content(content_entry)
-                continue
-
-            # 4. Select reader
-            reader = self._select_reader_by_uri(gcs_object.name, content.reader)
-            reader = cast(Reader, reader)
-
-            # 5. Fetch and load the content
-            readable_content = BytesIO(gcs_object.download_as_bytes())
-
-            # 6. Read the content
-            read_documents = await reader.async_read(readable_content, name=name)
-
-            # 7. Prepare and insert the content in the vector database
-            self._prepare_documents_for_insert(read_documents, content_entry.id)
-            await self._ahandle_vector_db_insert(content_entry, read_documents, upsert)
-
2326
- def _load_from_remote_content(
2327
- self,
2328
- content: Content,
2329
- upsert: bool,
2330
- skip_if_exists: bool,
2331
- ):
2332
- """Synchronous version of _load_from_remote_content."""
2333
- if content.remote_content is None:
2334
- log_warning("No remote content provided for content")
2335
- return
2336
-
2337
- remote_content = content.remote_content
2338
-
2339
- # Look up config if config_id is provided
2340
- config = None
2341
- if hasattr(remote_content, "config_id") and remote_content.config_id:
2342
- config = self._get_remote_config_by_id(remote_content.config_id)
2343
- if config is None:
2344
- log_warning(f"No config found for config_id: {remote_content.config_id}")
2345
-
2346
- if isinstance(remote_content, S3Content):
2347
- self._load_from_s3(content, upsert, skip_if_exists, config)
2348
-
2349
- elif isinstance(remote_content, GCSContent):
2350
- self._load_from_gcs(content, upsert, skip_if_exists, config)
2351
-
2352
- elif isinstance(remote_content, SharePointContent):
2353
- self._load_from_sharepoint(content, upsert, skip_if_exists, config)
2354
-
2355
- elif isinstance(remote_content, GitHubContent):
2356
- self._load_from_github(content, upsert, skip_if_exists, config)
2357
-
2358
- elif isinstance(remote_content, AzureBlobContent):
2359
- self._load_from_azure_blob(content, upsert, skip_if_exists, config)
2360
-
2361
- else:
2362
- log_warning(f"Unsupported remote content type: {type(remote_content)}")
2363
-
2364
- def _load_from_s3(
2365
- self,
2366
- content: Content,
2367
- upsert: bool,
2368
- skip_if_exists: bool,
2369
- config: Optional[RemoteContentConfig] = None,
2370
- ):
2371
- """Synchronous version of _load_from_s3.
2372
-
2373
- Load the contextual S3 content:
2374
- 1. Identify objects to read
2375
- 2. Setup Content object
2376
- 3. Hash content and add it to the contents database
2377
- 4. Select reader
2378
- 5. Fetch and load the content
2379
- 6. Read the content
2380
- 7. Prepare and insert the content in the vector database
2381
- 8. Remove temporary file if needed
2382
- """
2383
- from agno.cloud.aws.s3.bucket import S3Bucket
2384
- from agno.cloud.aws.s3.object import S3Object
2385
-
2386
- # Note: S3 support has limited features compared to GitHub/SharePoint
2387
- log_warning(
2388
- "S3 content loading has limited features. "
2389
- "Recursive folder traversal, rich metadata, and improved naming are coming in a future release."
2390
- )
2391
-
2392
- remote_content: S3Content = cast(S3Content, content.remote_content)
2393
-
2394
- # Get or create bucket with credentials from config
2395
- bucket = remote_content.bucket
2396
- if bucket is None and remote_content.bucket_name:
2397
- s3_config = cast(S3Config, config) if isinstance(config, S3Config) else None
2398
- bucket = S3Bucket(
2399
- name=remote_content.bucket_name,
2400
- region=s3_config.region if s3_config else None,
2401
- aws_access_key_id=s3_config.aws_access_key_id if s3_config else None,
2402
- aws_secret_access_key=s3_config.aws_secret_access_key if s3_config else None,
2403
- )
2404
-
2405
- # 1. Identify objects to read
2406
- objects_to_read: List[S3Object] = []
2407
- if bucket is not None:
2408
- if remote_content.key is not None:
2409
- _object = S3Object(bucket_name=bucket.name, name=remote_content.key)
2410
- objects_to_read.append(_object)
2411
- elif remote_content.object is not None:
2412
- objects_to_read.append(remote_content.object)
2413
- elif remote_content.prefix is not None:
2414
- objects_to_read.extend(bucket.get_objects(prefix=remote_content.prefix))
2415
- else:
2416
- objects_to_read.extend(bucket.get_objects())
2417
-
2418
- for s3_object in objects_to_read:
2419
- # 2. Setup Content object
2420
- content_name = content.name or ""
2421
- content_name += "_" + (s3_object.name or "")
2422
- content_entry = Content(
2423
- name=content_name,
2424
- description=content.description,
2425
- status=ContentStatus.PROCESSING,
2426
- metadata=content.metadata,
2427
- file_type="s3",
2428
- )
2429
-
2430
- # 3. Hash content and add it to the contents database
2431
- content_entry.content_hash = self._build_content_hash(content_entry)
2432
- content_entry.id = generate_id(content_entry.content_hash)
2433
- self._insert_contents_db(content_entry)
2434
- if self._should_skip(content_entry.content_hash, skip_if_exists):
2435
- content_entry.status = ContentStatus.COMPLETED
2436
- self._update_content(content_entry)
2437
- continue
2438
-
2439
- # 4. Select reader
2440
- reader = self._select_reader_by_uri(s3_object.uri, content.reader)
2441
- reader = cast(Reader, reader)
2442
-
2443
- # 5. Fetch and load the content
2444
- temporary_file = None
2445
- obj_name = content_name or s3_object.name.split("/")[-1]
2446
- readable_content: Optional[Union[BytesIO, Path]] = None
2447
- if s3_object.uri.endswith(".pdf"):
2448
- readable_content = BytesIO(s3_object.get_resource().get()["Body"].read())
2449
- else:
2450
- temporary_file = Path("storage").joinpath(obj_name)
2451
- readable_content = temporary_file
2452
- s3_object.download(readable_content) # type: ignore
2453
-
2454
- # 6. Read the content
2455
- read_documents = reader.read(readable_content, name=obj_name)
2456
-
2457
- # 7. Prepare and insert the content in the vector database
2458
- self._prepare_documents_for_insert(read_documents, content_entry.id)
2459
- self._handle_vector_db_insert(content_entry, read_documents, upsert)
2460
-
2461
- # 8. Remove temporary file if needed
2462
- if temporary_file:
2463
- temporary_file.unlink()
2464
-
2465
- def _load_from_gcs(
2466
- self,
2467
- content: Content,
2468
- upsert: bool,
2469
- skip_if_exists: bool,
2470
- config: Optional[RemoteContentConfig] = None,
2471
- ):
2472
- """Synchronous version of _load_from_gcs.
2473
-
2474
- Load the contextual GCS content:
2475
- 1. Identify objects to read
2476
- 2. Setup Content object
2477
- 3. Hash content and add it to the contents database
2478
- 4. Select reader
2479
- 5. Fetch and load the content
2480
- 6. Read the content
2481
- 7. Prepare and insert the content in the vector database
2482
- """
2483
- try:
2484
- from google.cloud import storage # type: ignore
2485
- except ImportError:
2486
- raise ImportError(
2487
- "The `google-cloud-storage` package is not installed. "
2488
- "Please install it via `pip install google-cloud-storage`."
2489
- )
2490
-
2491
- # Note: GCS support has limited features compared to GitHub/SharePoint
2492
- log_warning(
2493
- "GCS content loading has limited features. "
2494
- "Recursive folder traversal, rich metadata, and improved naming are coming in a future release."
2495
- )
2496
-
2497
- remote_content: GCSContent = cast(GCSContent, content.remote_content)
2498
-
2499
- # Get or create bucket with credentials from config
2500
- bucket = remote_content.bucket
2501
- if bucket is None and remote_content.bucket_name:
2502
- gcs_config = cast(GcsConfig, config) if isinstance(config, GcsConfig) else None
2503
- if gcs_config and gcs_config.credentials_path:
2504
- client = storage.Client.from_service_account_json(gcs_config.credentials_path)
2505
- elif gcs_config and gcs_config.project:
2506
- client = storage.Client(project=gcs_config.project)
2507
- else:
2508
- client = storage.Client()
2509
- bucket = client.bucket(remote_content.bucket_name)
2510
-
2511
- # 1. Identify objects to read
2512
- objects_to_read = []
2513
- if remote_content.blob_name is not None:
2514
- objects_to_read.append(bucket.blob(remote_content.blob_name)) # type: ignore
2515
- elif remote_content.prefix is not None:
2516
- objects_to_read.extend(bucket.list_blobs(prefix=remote_content.prefix)) # type: ignore
2517
- else:
2518
- objects_to_read.extend(bucket.list_blobs()) # type: ignore
2519
-
2520
- for gcs_object in objects_to_read:
2521
- # 2. Setup Content object
2522
- name = (content.name or "content") + "_" + gcs_object.name
2523
- content_entry = Content(
2524
- name=name,
2525
- description=content.description,
2526
- status=ContentStatus.PROCESSING,
2527
- metadata=content.metadata,
2528
- file_type="gcs",
2529
- )
2530
-
2531
- # 3. Hash content and add it to the contents database
2532
- content_entry.content_hash = self._build_content_hash(content_entry)
2533
- content_entry.id = generate_id(content_entry.content_hash)
2534
- self._insert_contents_db(content_entry)
2535
- if self._should_skip(content_entry.content_hash, skip_if_exists):
2536
- content_entry.status = ContentStatus.COMPLETED
2537
- self._update_content(content_entry)
2538
- continue
2539
-
2540
- # 4. Select reader
2541
- reader = self._select_reader_by_uri(gcs_object.name, content.reader)
2542
- reader = cast(Reader, reader)
2543
-
2544
- # 5. Fetch and load the content
2545
- readable_content = BytesIO(gcs_object.download_as_bytes())
2546
-
2547
- # 6. Read the content
2548
- read_documents = reader.read(readable_content, name=name)
2549
-
2550
- # 7. Prepare and insert the content in the vector database
2551
- self._prepare_documents_for_insert(read_documents, content_entry.id)
2552
- self._handle_vector_db_insert(content_entry, read_documents, upsert)
2553
-
2554
- # --- SharePoint loaders ---
2555
-
2556
- def _get_sharepoint_access_token(self, sp_config: SharePointConfig) -> Optional[str]:
2557
- """Get an access token for Microsoft Graph API using client credentials flow.
2558
-
2559
- Requires the `msal` package: pip install msal
2560
- """
2561
- try:
2562
- from msal import ConfidentialClientApplication # type: ignore
2563
- except ImportError:
2564
- raise ImportError("The `msal` package is not installed. Please install it via `pip install msal`.")
2565
-
2566
- authority = f"https://login.microsoftonline.com/{sp_config.tenant_id}"
2567
- app = ConfidentialClientApplication(
2568
- sp_config.client_id,
2569
- authority=authority,
2570
- client_credential=sp_config.client_secret,
2571
- )
2572
-
2573
- # Acquire token for Microsoft Graph
2574
- scopes = ["https://graph.microsoft.com/.default"]
2575
- result = app.acquire_token_for_client(scopes=scopes)
2576
-
2577
- if "access_token" in result:
2578
- return result["access_token"]
2579
- else:
2580
- log_error(f"Failed to acquire SharePoint token: {result.get('error_description', result.get('error'))}")
2581
- return None
2582
-
2583
- def _get_sharepoint_site_id(self, hostname: str, site_path: Optional[str], access_token: str) -> Optional[str]:
2584
- """Get the SharePoint site ID using Microsoft Graph API."""
2585
- import httpx
2586
-
2587
- if site_path:
2588
- url = f"https://graph.microsoft.com/v1.0/sites/{hostname}:/{site_path}"
2589
- else:
2590
- url = f"https://graph.microsoft.com/v1.0/sites/{hostname}"
2591
-
2592
- headers = {"Authorization": f"Bearer {access_token}"}
2593
-
2594
- try:
2595
- response = httpx.get(url, headers=headers)
2596
- response.raise_for_status()
2597
- return response.json().get("id")
2598
- except httpx.HTTPStatusError as e:
2599
- log_error(f"Failed to get SharePoint site ID: {e.response.status_code} - {e.response.text}")
2600
- return None
2601
-
2602
- def _list_sharepoint_folder_items(self, site_id: str, folder_path: str, access_token: str) -> List[dict]:
2603
- """List all items in a SharePoint folder."""
2604
- import httpx
2605
-
2606
- # Strip leading slashes to avoid double-slash in URL
2607
- folder_path = folder_path.lstrip("/")
2608
- url = f"https://graph.microsoft.com/v1.0/sites/{site_id}/drive/root:/{folder_path}:/children"
2609
- headers = {"Authorization": f"Bearer {access_token}"}
2610
- items: List[dict] = []
2611
-
2612
- try:
2613
- while url:
2614
- response = httpx.get(url, headers=headers)
2615
- response.raise_for_status()
2616
- data = response.json()
2617
- items.extend(data.get("value", []))
2618
- url = data.get("@odata.nextLink")
2619
- except httpx.HTTPStatusError as e:
2620
- log_error(f"Failed to list SharePoint folder: {e.response.status_code} - {e.response.text}")
2621
-
2622
- return items
2623
-
2624
- def _download_sharepoint_file(self, site_id: str, file_path: str, access_token: str) -> Optional[BytesIO]:
2625
- """Download a file from SharePoint."""
2626
- import httpx
2627
-
2628
- # Strip leading slashes to avoid double-slash in URL
2629
- file_path = file_path.lstrip("/")
2630
- url = f"https://graph.microsoft.com/v1.0/sites/{site_id}/drive/root:/{file_path}:/content"
2631
- headers = {"Authorization": f"Bearer {access_token}"}
2632
-
2633
- try:
2634
- response = httpx.get(url, headers=headers, follow_redirects=True)
2635
- response.raise_for_status()
2636
- return BytesIO(response.content)
2637
- except httpx.HTTPStatusError as e:
2638
- log_error(f"Failed to download SharePoint file {file_path}: {e.response.status_code} - {e.response.text}")
2639
- return None
2640
-
2641
- async def _aget_sharepoint_site_id(
2642
- self, hostname: str, site_path: Optional[str], access_token: str
2643
- ) -> Optional[str]:
2644
- """Get the SharePoint site ID using Microsoft Graph API (async)."""
2645
- import httpx
2646
-
2647
- if site_path:
2648
- url = f"https://graph.microsoft.com/v1.0/sites/{hostname}:/{site_path}"
2649
- else:
2650
- url = f"https://graph.microsoft.com/v1.0/sites/{hostname}"
2651
-
2652
- headers = {"Authorization": f"Bearer {access_token}"}
2653
-
2654
- try:
2655
- async with httpx.AsyncClient() as client:
2656
- response = await client.get(url, headers=headers)
2657
- response.raise_for_status()
2658
- return response.json().get("id")
2659
- except httpx.HTTPStatusError as e:
2660
- log_error(f"Failed to get SharePoint site ID: {e.response.status_code} - {e.response.text}")
2661
- return None
2662
-
2663
- async def _alist_sharepoint_folder_items(self, site_id: str, folder_path: str, access_token: str) -> List[dict]:
2664
- """List all items in a SharePoint folder (async)."""
2665
- import httpx
2666
-
2667
- # Strip leading slashes to avoid double-slash in URL
2668
- folder_path = folder_path.lstrip("/")
2669
- url: Optional[str] = f"https://graph.microsoft.com/v1.0/sites/{site_id}/drive/root:/{folder_path}:/children"
2670
- headers = {"Authorization": f"Bearer {access_token}"}
2671
- items: List[dict] = []
2672
-
2673
- try:
2674
- async with httpx.AsyncClient() as client:
2675
- while url:
2676
- response = await client.get(url, headers=headers)
2677
- response.raise_for_status()
2678
- data = response.json()
2679
- items.extend(data.get("value", []))
2680
- url = data.get("@odata.nextLink")
2681
- except httpx.HTTPStatusError as e:
2682
- log_error(f"Failed to list SharePoint folder: {e.response.status_code} - {e.response.text}")
2683
-
2684
- return items
2685
-
2686
- async def _adownload_sharepoint_file(self, site_id: str, file_path: str, access_token: str) -> Optional[BytesIO]:
2687
- """Download a file from SharePoint (async)."""
2688
- import httpx
2689
-
2690
- # Strip leading slashes to avoid double-slash in URL
2691
- file_path = file_path.lstrip("/")
2692
- url = f"https://graph.microsoft.com/v1.0/sites/{site_id}/drive/root:/{file_path}:/content"
2693
- headers = {"Authorization": f"Bearer {access_token}"}
2694
-
2695
- try:
2696
- async with httpx.AsyncClient() as client:
2697
- response = await client.get(url, headers=headers, follow_redirects=True)
2698
- response.raise_for_status()
2699
- return BytesIO(response.content)
2700
- except httpx.HTTPStatusError as e:
2701
- log_error(f"Failed to download SharePoint file {file_path}: {e.response.status_code} - {e.response.text}")
2702
- return None
2703
-
2704
- async def _aload_from_sharepoint(
2705
- self,
2706
- content: Content,
2707
- upsert: bool,
2708
- skip_if_exists: bool,
2709
- config: Optional[RemoteContentConfig] = None,
2710
- ):
2711
- """Load content from SharePoint.
2712
-
2713
- Requires the SharePoint config to contain tenant_id, client_id, client_secret, and hostname.
2714
-
2715
- 1. Authenticate with Microsoft Graph using client credentials
2716
- 2. Get site ID from hostname/site_path
2717
- 3. Download file(s) from file_path or folder_path
2718
- 4. Process through reader and insert to vector db
2719
- """
2720
- remote_content: SharePointContent = cast(SharePointContent, content.remote_content)
2721
- sp_config = cast(SharePointConfig, config) if isinstance(config, SharePointConfig) else None
2722
-
2723
- if sp_config is None:
2724
- log_error(f"SharePoint config not found for config_id: {remote_content.config_id}")
2725
- return
2726
-
2727
- # 1. Get access token
2728
- access_token = self._get_sharepoint_access_token(sp_config)
2729
- if not access_token:
2730
- return
2731
-
2732
- # 2. Get site ID - use config value if provided, otherwise fetch via API
2733
- site_id: Optional[str] = sp_config.site_id
2734
- if not site_id:
2735
- site_path = remote_content.site_path or sp_config.site_path
2736
- site_id = await self._aget_sharepoint_site_id(sp_config.hostname, site_path, access_token)
2737
- if not site_id:
2738
- log_error(f"Failed to get SharePoint site ID for {sp_config.hostname}/{site_path}")
2739
- return
2740
-
2741
- # 3. Identify files to download
2742
- files_to_process: List[tuple] = [] # List of (file_path, file_name)
2743
-
2744
- # Helper function to recursively list all files in a folder
2745
- async def list_files_recursive(folder: str) -> List[tuple]:
2746
- """Recursively list all files in a SharePoint folder."""
2747
- files: List[tuple] = []
2748
- items = await self._alist_sharepoint_folder_items(site_id, folder, access_token)
2749
- for item in items:
2750
- if "file" in item: # It's a file
2751
- item_path = f"{folder}/{item['name']}"
2752
- files.append((item_path, item["name"]))
2753
- elif "folder" in item: # It's a folder - recurse
2754
- subdir_path = f"{folder}/{item['name']}"
2755
- subdir_files = await list_files_recursive(subdir_path)
2756
- files.extend(subdir_files)
2757
- return files
2758
-
2759
- # Get the path to process (file_path or folder_path)
2760
- path_to_process = (remote_content.file_path or remote_content.folder_path or "").strip("/")
2761
-
2762
- if path_to_process:
2763
- # Check if path is a file or folder by getting item metadata
2764
- try:
2765
- async with AsyncClient() as client:
2766
- url = f"https://graph.microsoft.com/v1.0/sites/{site_id}/drive/root:/{path_to_process}"
2767
- headers = {"Authorization": f"Bearer {access_token}"}
2768
- response = await client.get(url, headers=headers, timeout=30.0)
2769
- response.raise_for_status()
2770
- item_data = response.json()
2771
-
2772
- if "folder" in item_data:
2773
- # It's a folder - recursively list all files
2774
- files_to_process = await list_files_recursive(path_to_process)
2775
- elif "file" in item_data:
2776
- # It's a single file
2777
- files_to_process.append((path_to_process, item_data["name"]))
2778
- else:
2779
- log_warning(f"SharePoint path {path_to_process} is neither file nor folder")
2780
- return
2781
- except Exception as e:
2782
- log_error(f"Error checking SharePoint path {path_to_process}: {e}")
2783
- return
2784
-
2785
- if not files_to_process:
2786
- log_warning(f"No files found at SharePoint path: {path_to_process}")
2787
- return
2788
-
2789
- # 4. Process each file
2790
- for file_path, file_name in files_to_process:
2791
- # Build a unique virtual path for hashing (ensures different files don't collide)
2792
- virtual_path = f"sharepoint://{sp_config.hostname}/{site_id}/{file_path}"
2793
-
2794
- # Build metadata with all info needed to re-fetch the file
2795
- sharepoint_metadata = {
2796
- "source_type": "sharepoint",
2797
- "source_config_id": sp_config.id,
2798
- "source_config_name": sp_config.name,
2799
- "sharepoint_hostname": sp_config.hostname,
2800
- "sharepoint_site_id": site_id,
2801
- "sharepoint_path": file_path,
2802
- "sharepoint_filename": file_name,
2803
- }
2804
- # Merge with user-provided metadata (user metadata takes precedence)
2805
- merged_metadata = {**sharepoint_metadata, **(content.metadata or {})}
2806
-
2807
- # Setup Content object
2808
- # Naming: for folders, use relative path; for single files, use user name or filename
2809
- is_folder_upload = len(files_to_process) > 1
2810
- if is_folder_upload:
2811
- # Compute relative path from the upload root
2812
- relative_path = file_path
2813
- if path_to_process and file_path.startswith(path_to_process + "/"):
2814
- relative_path = file_path[len(path_to_process) + 1 :]
2815
- # If user provided a name, prefix it; otherwise use full file path
2816
- content_name = f"{content.name}/{relative_path}" if content.name else file_path
2817
- else:
2818
- # Single file: use user's name or the filename
2819
- content_name = content.name or file_name
2820
- content_entry = Content(
2821
- name=content_name,
2822
- description=content.description,
2823
- path=virtual_path, # Include path for unique hashing
2824
- status=ContentStatus.PROCESSING,
2825
- metadata=merged_metadata,
2826
- file_type="sharepoint",
2827
- )
2828
-
2829
- # Hash content and add to contents database
2830
- content_entry.content_hash = self._build_content_hash(content_entry)
2831
- content_entry.id = generate_id(content_entry.content_hash)
2832
- await self._ainsert_contents_db(content_entry)
2833
- if self._should_skip(content_entry.content_hash, skip_if_exists):
2834
- content_entry.status = ContentStatus.COMPLETED
2835
- await self._aupdate_content(content_entry)
2836
- continue
2837
-
2838
- # Select reader based on file extension
2839
- reader = self._select_reader_by_uri(file_name, content.reader)
2840
- reader = cast(Reader, reader)
2841
-
2842
- # Download file
2843
- file_content = await self._adownload_sharepoint_file(site_id, file_path, access_token)
2844
- if not file_content:
2845
- content_entry.status = ContentStatus.FAILED
2846
- await self._aupdate_content(content_entry)
2847
- continue
2848
-
2849
- # Read the content
2850
- read_documents = await reader.async_read(file_content, name=file_name)
2851
-
2852
- # Prepare and insert to vector database
2853
- self._prepare_documents_for_insert(read_documents, content_entry.id)
2854
- await self._ahandle_vector_db_insert(content_entry, read_documents, upsert)
2855
-
2856
- def _load_from_sharepoint(
2857
- self,
2858
- content: Content,
2859
- upsert: bool,
2860
- skip_if_exists: bool,
2861
- config: Optional[RemoteContentConfig] = None,
2862
- ):
2863
- """Synchronous version of _load_from_sharepoint.
2864
-
2865
- Load content from SharePoint:
2866
- 1. Authenticate with Microsoft Graph using client credentials
2867
- 2. Get site ID from hostname/site_path
2868
- 3. Download file(s) from file_path or folder_path
2869
- 4. Process through reader and insert to vector db
2870
- """
2871
- remote_content: SharePointContent = cast(SharePointContent, content.remote_content)
2872
- sp_config = cast(SharePointConfig, config) if isinstance(config, SharePointConfig) else None
2873
-
2874
- if sp_config is None:
2875
- log_error(f"SharePoint config not found for config_id: {remote_content.config_id}")
2876
- return
2877
-
2878
- # 1. Get access token
2879
- access_token = self._get_sharepoint_access_token(sp_config)
2880
- if not access_token:
2881
- return
2882
-
2883
- # 2. Get site ID - use config value if provided, otherwise fetch via API
2884
- site_id: Optional[str] = sp_config.site_id
2885
- if not site_id:
2886
- site_path = remote_content.site_path or sp_config.site_path
2887
- site_id = self._get_sharepoint_site_id(sp_config.hostname, site_path, access_token)
2888
- if not site_id:
2889
- log_error(f"Failed to get SharePoint site ID for {sp_config.hostname}/{site_path}")
2890
- return
2891
-
2892
- # 3. Identify files to download
2893
- files_to_process: List[tuple] = [] # List of (file_path, file_name)
2894
-
2895
- # Helper function to recursively list all files in a folder
2896
- def list_files_recursive(folder: str) -> List[tuple]:
2897
- """Recursively list all files in a SharePoint folder."""
2898
- files: List[tuple] = []
2899
- items = self._list_sharepoint_folder_items(site_id, folder, access_token)
2900
- for item in items:
2901
- if "file" in item: # It's a file
2902
- item_path = f"{folder}/{item['name']}"
2903
- files.append((item_path, item["name"]))
2904
- elif "folder" in item: # It's a folder - recurse
2905
- subdir_path = f"{folder}/{item['name']}"
2906
- subdir_files = list_files_recursive(subdir_path)
2907
- files.extend(subdir_files)
2908
- return files
2909
-
2910
- # Get the path to process (file_path or folder_path)
2911
- path_to_process = (remote_content.file_path or remote_content.folder_path or "").strip("/")
2912
-
2913
- if path_to_process:
2914
- # Check if path is a file or folder by getting item metadata
2915
- try:
2916
- with httpx.Client() as client:
2917
- url = f"https://graph.microsoft.com/v1.0/sites/{site_id}/drive/root:/{path_to_process}"
2918
- headers = {"Authorization": f"Bearer {access_token}"}
2919
- response = client.get(url, headers=headers, timeout=30.0)
2920
- response.raise_for_status()
2921
- item_data = response.json()
2922
-
2923
- if "folder" in item_data:
2924
- # It's a folder - recursively list all files
2925
- files_to_process = list_files_recursive(path_to_process)
2926
- elif "file" in item_data:
2927
- # It's a single file
2928
- files_to_process.append((path_to_process, item_data["name"]))
2929
- else:
2930
- log_warning(f"SharePoint path {path_to_process} is neither file nor folder")
2931
- return
2932
- except Exception as e:
2933
- log_error(f"Error checking SharePoint path {path_to_process}: {e}")
2934
- return
2935
-
2936
- if not files_to_process:
2937
- log_warning(f"No files found at SharePoint path: {path_to_process}")
2938
- return
2939
-
2940
- # 4. Process each file
2941
- for file_path, file_name in files_to_process:
2942
- # Build a unique virtual path for hashing (ensures different files don't collide)
2943
- virtual_path = f"sharepoint://{sp_config.hostname}/{site_id}/{file_path}"
2944
-
2945
- # Build metadata with all info needed to re-fetch the file
2946
- sharepoint_metadata = {
2947
- "source_type": "sharepoint",
2948
- "source_config_id": sp_config.id,
2949
- "source_config_name": sp_config.name,
2950
- "sharepoint_hostname": sp_config.hostname,
2951
- "sharepoint_site_id": site_id,
2952
- "sharepoint_path": file_path,
2953
- "sharepoint_filename": file_name,
2954
- }
2955
- # Merge with user-provided metadata (user metadata takes precedence)
2956
- merged_metadata = {**sharepoint_metadata, **(content.metadata or {})}
2957
-
2958
- # Setup Content object
2959
- # Naming: for folders, use relative path; for single files, use user name or filename
2960
- is_folder_upload = len(files_to_process) > 1
2961
- if is_folder_upload:
2962
- # Compute relative path from the upload root
2963
- relative_path = file_path
2964
- if path_to_process and file_path.startswith(path_to_process + "/"):
2965
- relative_path = file_path[len(path_to_process) + 1 :]
2966
- # If user provided a name, prefix it; otherwise use full file path
2967
- content_name = f"{content.name}/{relative_path}" if content.name else file_path
2968
- else:
2969
- # Single file: use user's name or the filename
2970
- content_name = content.name or file_name
2971
- content_entry = Content(
2972
- name=content_name,
2973
- description=content.description,
2974
- path=virtual_path, # Include path for unique hashing
2975
- status=ContentStatus.PROCESSING,
2976
- metadata=merged_metadata,
2977
- file_type="sharepoint",
2978
- )
2979
-
2980
- # Hash content and add to contents database
2981
- content_entry.content_hash = self._build_content_hash(content_entry)
2982
- content_entry.id = generate_id(content_entry.content_hash)
2983
- self._insert_contents_db(content_entry)
2984
- if self._should_skip(content_entry.content_hash, skip_if_exists):
2985
- content_entry.status = ContentStatus.COMPLETED
2986
- self._update_content(content_entry)
2987
- continue
2988
-
2989
- # Select reader based on file extension
2990
- reader = self._select_reader_by_uri(file_name, content.reader)
2991
- reader = cast(Reader, reader)
2992
-
2993
- # Download file
2994
- file_content = self._download_sharepoint_file(site_id, file_path, access_token)
2995
- if not file_content:
2996
- content_entry.status = ContentStatus.FAILED
2997
- self._update_content(content_entry)
2998
- continue
2999
-
3000
- # Read the content
3001
- read_documents = reader.read(file_content, name=file_name)
3002
-
3003
- # Prepare and insert to vector database
3004
- self._prepare_documents_for_insert(read_documents, content_entry.id)
3005
- self._handle_vector_db_insert(content_entry, read_documents, upsert)
3006
-
3007
- # --- GitHub loaders ---
3008
-
3009
- async def _aload_from_github(
3010
- self,
3011
- content: Content,
3012
- upsert: bool,
3013
- skip_if_exists: bool,
3014
- config: Optional[RemoteContentConfig] = None,
3015
- ):
3016
- """Load content from GitHub.
3017
-
3018
- Requires the GitHub config to contain repo and optionally token for private repos.
3019
- Uses the GitHub API to fetch file contents.
3020
- """
3021
- remote_content: GitHubContent = cast(GitHubContent, content.remote_content)
3022
- gh_config = cast(GitHubConfig, config) if isinstance(config, GitHubConfig) else None
3023
-
3024
- if gh_config is None:
3025
- log_error(f"GitHub config not found for config_id: {remote_content.config_id}")
3026
- return
3027
-
3028
- # Build headers for GitHub API
3029
- headers = {
3030
- "Accept": "application/vnd.github.v3+json",
3031
- "User-Agent": "Agno-Knowledge",
3032
- }
3033
- if gh_config.token:
3034
- headers["Authorization"] = f"Bearer {gh_config.token}"
3035
-
3036
- branch = remote_content.branch or gh_config.branch or "main"
3037
-
3038
- # Get list of files to process
3039
- files_to_process: List[Dict[str, str]] = []
3040
-
3041
- async with AsyncClient() as client:
3042
- # Helper function to recursively list all files in a folder
3043
- async def list_files_recursive(folder: str) -> List[Dict[str, str]]:
3044
- """Recursively list all files in a GitHub folder."""
3045
- files: List[Dict[str, str]] = []
3046
- api_url = f"https://api.github.com/repos/{gh_config.repo}/contents/{folder}"
3047
- if branch:
3048
- api_url += f"?ref={branch}"
3049
-
3050
- try:
3051
- response = await client.get(api_url, headers=headers, timeout=30.0)
3052
- response.raise_for_status()
3053
- items = response.json()
3054
-
3055
- # If items is not a list, it's a single file response
3056
- if not isinstance(items, list):
3057
- items = [items]
3058
-
3059
- for item in items:
3060
- if item.get("type") == "file":
3061
- files.append(
3062
- {
3063
- "path": item["path"],
3064
- "name": item["name"],
3065
- }
3066
- )
3067
- elif item.get("type") == "dir":
3068
- # Recursively get files from subdirectory
3069
- subdir_files = await list_files_recursive(item["path"])
3070
- files.extend(subdir_files)
3071
- except Exception as e:
3072
- log_error(f"Error listing GitHub folder {folder}: {e}")
3073
-
3074
- return files
3075
-
3076
- # Get the path to process (file_path or folder_path)
3077
- path_to_process = (remote_content.file_path or remote_content.folder_path or "").rstrip("/")
3078
-
3079
- if path_to_process:
3080
- # Fetch the path to determine if it's a file or directory
3081
- api_url = f"https://api.github.com/repos/{gh_config.repo}/contents/{path_to_process}"
3082
- if branch:
3083
- api_url += f"?ref={branch}"
3084
-
3085
- try:
3086
- response = await client.get(api_url, headers=headers, timeout=30.0)
3087
- response.raise_for_status()
3088
- path_data = response.json()
3089
-
3090
- if isinstance(path_data, list):
3091
- # It's a directory - recursively list all files
3092
- for item in path_data:
3093
- if item.get("type") == "file":
3094
- files_to_process.append({"path": item["path"], "name": item["name"]})
3095
- elif item.get("type") == "dir":
3096
- subdir_files = await list_files_recursive(item["path"])
3097
- files_to_process.extend(subdir_files)
3098
- else:
3099
- # It's a single file
3100
- files_to_process.append(
3101
- {
3102
- "path": path_data["path"],
3103
- "name": path_data["name"],
3104
- }
3105
- )
3106
- except Exception as e:
3107
- log_error(f"Error fetching GitHub path {path_to_process}: {e}")
3108
- return
3109
-
3110
- if not files_to_process:
3111
- log_warning(f"No files found at GitHub path: {path_to_process}")
3112
- return
3113
-
3114
- # Process each file
3115
- for file_info in files_to_process:
3116
- file_path = file_info["path"]
3117
- file_name = file_info["name"]
3118
-
3119
- # Build a unique virtual path for hashing (ensures different files don't collide)
3120
- virtual_path = f"github://{gh_config.repo}/{branch}/{file_path}"
3121
-
3122
- # Build metadata with all info needed to re-fetch the file
3123
- github_metadata = {
3124
- "source_type": "github",
3125
- "source_config_id": gh_config.id,
3126
- "source_config_name": gh_config.name,
3127
- "github_repo": gh_config.repo,
3128
- "github_branch": branch,
3129
- "github_path": file_path,
3130
- "github_filename": file_name,
3131
- }
3132
- # Merge with user-provided metadata (user metadata takes precedence)
3133
- merged_metadata = {**github_metadata, **(content.metadata or {})}
3134
-
3135
- # Setup Content object
3136
- # Naming: for folders, use relative path; for single files, use user name or filename
3137
- is_folder_upload = len(files_to_process) > 1
3138
- if is_folder_upload:
3139
- # Compute relative path from the upload root
3140
- relative_path = file_path
3141
- if path_to_process and file_path.startswith(path_to_process + "/"):
3142
- relative_path = file_path[len(path_to_process) + 1 :]
3143
- # If user provided a name, prefix it; otherwise use full file path
3144
- content_name = f"{content.name}/{relative_path}" if content.name else file_path
3145
- else:
3146
- # Single file: use user's name or the filename
3147
- content_name = content.name or file_name
3148
- content_entry = Content(
3149
- name=content_name,
3150
- description=content.description,
3151
- path=virtual_path, # Include path for unique hashing
3152
- status=ContentStatus.PROCESSING,
3153
- metadata=merged_metadata,
3154
- file_type="github",
3155
- )
3156
-
3157
- # Hash content and add to contents database
3158
- content_entry.content_hash = self._build_content_hash(content_entry)
3159
- content_entry.id = generate_id(content_entry.content_hash)
3160
- await self._ainsert_contents_db(content_entry)
3161
-
3162
- if self._should_skip(content_entry.content_hash, skip_if_exists):
3163
- content_entry.status = ContentStatus.COMPLETED
3164
- await self._aupdate_content(content_entry)
3165
- continue
3166
-
3167
- # Fetch file content using GitHub API (works for private repos)
3168
- api_url = f"https://api.github.com/repos/{gh_config.repo}/contents/{file_path}"
3169
- if branch:
3170
- api_url += f"?ref={branch}"
3171
- try:
3172
- response = await client.get(api_url, headers=headers, timeout=30.0)
3173
- response.raise_for_status()
3174
- file_data = response.json()
3175
-
3176
- # GitHub API returns content as base64
3177
- if file_data.get("encoding") == "base64":
3178
- import base64
3179
-
3180
- file_content = base64.b64decode(file_data["content"])
3181
- else:
3182
- # For large files, GitHub returns a download_url
3183
- download_url = file_data.get("download_url")
3184
- if download_url:
3185
- dl_response = await client.get(download_url, headers=headers, timeout=30.0)
3186
- dl_response.raise_for_status()
3187
- file_content = dl_response.content
3188
- else:
3189
- raise ValueError("No content or download_url in response")
3190
- except Exception as e:
3191
- log_error(f"Error fetching GitHub file {file_path}: {e}")
3192
- content_entry.status = ContentStatus.FAILED
3193
- content_entry.status_message = str(e)
3194
- await self._aupdate_content(content_entry)
3195
- continue
3196
-
3197
- # Select reader and read content
3198
- reader = self._select_reader_by_uri(file_name, content.reader)
3199
- if reader is None:
3200
- log_warning(f"No reader found for file: {file_name}")
3201
- content_entry.status = ContentStatus.FAILED
3202
- content_entry.status_message = "No suitable reader found"
3203
- await self._aupdate_content(content_entry)
3204
- continue
3205
-
3206
- reader = cast(Reader, reader)
3207
- readable_content = BytesIO(file_content)
3208
- read_documents = await reader.async_read(readable_content, name=file_name)
3209
-
3210
- # Prepare and insert into vector database
3211
- if not content_entry.id:
3212
- content_entry.id = generate_id(content_entry.content_hash or "")
3213
- self._prepare_documents_for_insert(read_documents, content_entry.id)
3214
- await self._ahandle_vector_db_insert(content_entry, read_documents, upsert)
3215
-
3216
- def _load_from_github(
3217
- self,
3218
- content: Content,
3219
- upsert: bool,
3220
- skip_if_exists: bool,
3221
- config: Optional[RemoteContentConfig] = None,
3222
- ):
3223
- """Synchronous version of _load_from_github."""
3224
- import httpx
3225
-
3226
- remote_content: GitHubContent = cast(GitHubContent, content.remote_content)
3227
- gh_config = cast(GitHubConfig, config) if isinstance(config, GitHubConfig) else None
3228
-
3229
- if gh_config is None:
3230
- log_error(f"GitHub config not found for config_id: {remote_content.config_id}")
3231
- return
3232
-
3233
- # Build headers for GitHub API
3234
- headers = {
3235
- "Accept": "application/vnd.github.v3+json",
3236
- "User-Agent": "Agno-Knowledge",
3237
- }
3238
- if gh_config.token:
3239
- headers["Authorization"] = f"Bearer {gh_config.token}"
3240
-
3241
- branch = remote_content.branch or gh_config.branch or "main"
3242
-
3243
- # Get list of files to process
3244
- files_to_process: List[Dict[str, str]] = []
3245
-
3246
- with httpx.Client() as client:
3247
- # Helper function to recursively list all files in a folder
3248
- def list_files_recursive(folder: str) -> List[Dict[str, str]]:
3249
- """Recursively list all files in a GitHub folder."""
3250
- files: List[Dict[str, str]] = []
3251
- api_url = f"https://api.github.com/repos/{gh_config.repo}/contents/{folder}"
3252
- if branch:
3253
- api_url += f"?ref={branch}"
3254
-
3255
- try:
3256
- response = client.get(api_url, headers=headers, timeout=30.0)
3257
- response.raise_for_status()
3258
- items = response.json()
3259
-
3260
- # If items is not a list, it's a single file response
3261
- if not isinstance(items, list):
3262
- items = [items]
3263
-
3264
- for item in items:
3265
- if item.get("type") == "file":
3266
- files.append(
3267
- {
3268
- "path": item["path"],
3269
- "name": item["name"],
3270
- }
3271
- )
3272
- elif item.get("type") == "dir":
3273
- # Recursively get files from subdirectory
3274
- subdir_files = list_files_recursive(item["path"])
3275
- files.extend(subdir_files)
3276
- except Exception as e:
3277
- log_error(f"Error listing GitHub folder {folder}: {e}")
3278
-
3279
- return files
3280
-
3281
- # Get the path to process (file_path or folder_path)
3282
- path_to_process = (remote_content.file_path or remote_content.folder_path or "").rstrip("/")
3283
-
3284
- if path_to_process:
3285
- # Fetch the path to determine if it's a file or directory
3286
- api_url = f"https://api.github.com/repos/{gh_config.repo}/contents/{path_to_process}"
3287
- if branch:
3288
- api_url += f"?ref={branch}"
3289
-
3290
- try:
3291
- response = client.get(api_url, headers=headers, timeout=30.0)
3292
- response.raise_for_status()
3293
- path_data = response.json()
3294
-
3295
- if isinstance(path_data, list):
3296
- # It's a directory - recursively list all files
3297
- for item in path_data:
3298
- if item.get("type") == "file":
3299
- files_to_process.append({"path": item["path"], "name": item["name"]})
3300
- elif item.get("type") == "dir":
3301
- subdir_files = list_files_recursive(item["path"])
3302
- files_to_process.extend(subdir_files)
3303
- else:
3304
- # It's a single file
3305
- files_to_process.append(
3306
- {
3307
- "path": path_data["path"],
3308
- "name": path_data["name"],
3309
- }
3310
- )
3311
- except Exception as e:
3312
- log_error(f"Error fetching GitHub path {path_to_process}: {e}")
3313
- return
3314
-
3315
- if not files_to_process:
3316
- log_warning(f"No files found at GitHub path: {path_to_process}")
3317
- return
3318
-
3319
- # Process each file
3320
- for file_info in files_to_process:
3321
- file_path = file_info["path"]
3322
- file_name = file_info["name"]
3323
-
3324
- # Build a unique virtual path for hashing (ensures different files don't collide)
3325
- virtual_path = f"github://{gh_config.repo}/{branch}/{file_path}"
3326
-
3327
- # Build metadata with all info needed to re-fetch the file
3328
- github_metadata = {
3329
- "source_type": "github",
3330
- "source_config_id": gh_config.id,
3331
- "source_config_name": gh_config.name,
3332
- "github_repo": gh_config.repo,
3333
- "github_branch": branch,
3334
- "github_path": file_path,
3335
- "github_filename": file_name,
3336
- }
3337
- # Merge with user-provided metadata (user metadata takes precedence)
3338
- merged_metadata = {**github_metadata, **(content.metadata or {})}
3339
-
3340
- # Setup Content object
3341
- # Naming: for folders, use relative path; for single files, use user name or filename
3342
- is_folder_upload = len(files_to_process) > 1
3343
- if is_folder_upload:
3344
- # Compute relative path from the upload root
3345
- relative_path = file_path
3346
- if path_to_process and file_path.startswith(path_to_process + "/"):
3347
- relative_path = file_path[len(path_to_process) + 1 :]
3348
- # If user provided a name, prefix it; otherwise use full file path
3349
- content_name = f"{content.name}/{relative_path}" if content.name else file_path
3350
- else:
3351
- # Single file: use user's name or the filename
3352
- content_name = content.name or file_name
3353
- content_entry = Content(
3354
- name=content_name,
3355
- description=content.description,
3356
- path=virtual_path, # Include path for unique hashing
3357
- status=ContentStatus.PROCESSING,
3358
- metadata=merged_metadata,
3359
- file_type="github",
3360
- )
3361
-
3362
- # Hash content and add to contents database
3363
- content_entry.content_hash = self._build_content_hash(content_entry)
3364
- content_entry.id = generate_id(content_entry.content_hash)
3365
- self._insert_contents_db(content_entry)
3366
-
3367
- if self._should_skip(content_entry.content_hash, skip_if_exists):
3368
- content_entry.status = ContentStatus.COMPLETED
3369
- self._update_content(content_entry)
3370
- continue
3371
-
3372
- # Fetch file content using GitHub API (works for private repos)
3373
- api_url = f"https://api.github.com/repos/{gh_config.repo}/contents/{file_path}"
3374
- if branch:
3375
- api_url += f"?ref={branch}"
3376
- try:
3377
- response = client.get(api_url, headers=headers, timeout=30.0)
3378
- response.raise_for_status()
3379
- file_data = response.json()
3380
-
3381
- # GitHub API returns content as base64
3382
- if file_data.get("encoding") == "base64":
3383
- import base64
3384
-
3385
- file_content = base64.b64decode(file_data["content"])
3386
- else:
3387
- # For large files, GitHub returns a download_url
3388
- download_url = file_data.get("download_url")
3389
- if download_url:
3390
- dl_response = client.get(download_url, headers=headers, timeout=30.0)
3391
- dl_response.raise_for_status()
3392
- file_content = dl_response.content
3393
- else:
3394
- raise ValueError("No content or download_url in response")
3395
- except Exception as e:
3396
- log_error(f"Error fetching GitHub file {file_path}: {e}")
3397
- content_entry.status = ContentStatus.FAILED
3398
- content_entry.status_message = str(e)
3399
- self._update_content(content_entry)
3400
- continue
3401
-
3402
- # Select reader and read content
3403
- reader = self._select_reader_by_uri(file_name, content.reader)
3404
- if reader is None:
3405
- log_warning(f"No reader found for file: {file_name}")
3406
- content_entry.status = ContentStatus.FAILED
3407
- content_entry.status_message = "No suitable reader found"
3408
- self._update_content(content_entry)
3409
- continue
3410
-
3411
- reader = cast(Reader, reader)
3412
- readable_content = BytesIO(file_content)
3413
- read_documents = reader.read(readable_content, name=file_name)
3414
-
3415
- # Prepare and insert into vector database
3416
- if not content_entry.id:
3417
- content_entry.id = generate_id(content_entry.content_hash or "")
3418
- self._prepare_documents_for_insert(read_documents, content_entry.id)
3419
- self._handle_vector_db_insert(content_entry, read_documents, upsert)
3420
-
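For reference, a minimal standalone sketch of the fetch-and-decode logic removed above, assuming an httpx client and a token-bearing header; the helper name and parameters are illustrative, not part of agno:

import base64
import httpx

def fetch_github_file(repo: str, path: str, branch: str, token: str) -> bytes:
    # The contents API returns small files inline as base64 and points
    # large files at a download_url instead.
    api_url = f"https://api.github.com/repos/{repo}/contents/{path}?ref={branch}"
    headers = {"Authorization": f"Bearer {token}", "Accept": "application/vnd.github+json"}
    with httpx.Client(timeout=30.0) as client:
        response = client.get(api_url, headers=headers)
        response.raise_for_status()
        file_data = response.json()
        if file_data.get("encoding") == "base64":
            return base64.b64decode(file_data["content"])
        download_url = file_data.get("download_url")
        if not download_url:
            raise ValueError("No content or download_url in response")
        dl_response = client.get(download_url, headers=headers)
        dl_response.raise_for_status()
        return dl_response.content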
3421
- # --- Azure Blob Storage loaders ---
3422
-
3423
- def _get_azure_blob_client(self, azure_config: AzureBlobConfig):
3424
- """Get a sync Azure Blob Service Client using client credentials flow.
3425
-
3426
- Requires the `azure-identity` and `azure-storage-blob` packages.
3427
- """
3428
- try:
3429
- from azure.identity import ClientSecretCredential # type: ignore
3430
- from azure.storage.blob import BlobServiceClient # type: ignore
3431
- except ImportError:
3432
- raise ImportError(
3433
- "The `azure-identity` and `azure-storage-blob` packages are not installed. "
3434
- "Please install them via `pip install azure-identity azure-storage-blob`."
3435
- )
3436
-
3437
- credential = ClientSecretCredential(
3438
- tenant_id=azure_config.tenant_id,
3439
- client_id=azure_config.client_id,
3440
- client_secret=azure_config.client_secret,
3441
- )
3442
-
3443
- blob_service = BlobServiceClient(
3444
- account_url=f"https://{azure_config.storage_account}.blob.core.windows.net",
3445
- credential=credential,
3446
- )
3447
-
3448
- return blob_service
3449
-
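A minimal usage sketch of the sync client built above, with placeholder credentials and container names; it lists blobs under a prefix the same way the loaders below do:

from azure.identity import ClientSecretCredential
from azure.storage.blob import BlobServiceClient

credential = ClientSecretCredential(
    tenant_id="<tenant-id>",
    client_id="<client-id>",
    client_secret="<client-secret>",
)
blob_service = BlobServiceClient(
    account_url="https://<storage-account>.blob.core.windows.net",
    credential=credential,
)
container_client = blob_service.get_container_client("<container>")
# Enumerate everything under a "folder" prefix
for blob in container_client.list_blobs(name_starts_with="docs/"):
    print(blob.name, blob.size)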
3450
- def _get_azure_blob_client_async(self, azure_config: AzureBlobConfig):
3451
- """Get an async Azure Blob Service Client using client credentials flow.
3452
-
3453
- Requires the `azure-identity` and `azure-storage-blob` packages.
3454
- Uses the async versions from azure.storage.blob.aio and azure.identity.aio.
3455
- """
3456
- try:
3457
- from azure.identity.aio import ClientSecretCredential # type: ignore
3458
- from azure.storage.blob.aio import BlobServiceClient # type: ignore
3459
- except ImportError:
3460
- raise ImportError(
3461
- "The `azure-identity` and `azure-storage-blob` packages are not installed. "
3462
- "Please install them via `pip install azure-identity azure-storage-blob`."
3463
- )
3464
-
3465
- credential = ClientSecretCredential(
3466
- tenant_id=azure_config.tenant_id,
3467
- client_id=azure_config.client_id,
3468
- client_secret=azure_config.client_secret,
3469
- )
3470
-
3471
- blob_service = BlobServiceClient(
3472
- account_url=f"https://{azure_config.storage_account}.blob.core.windows.net",
3473
- credential=credential,
3474
- )
3475
-
3476
- return blob_service
3477
-
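The async flavor mirrors the sync one but should be closed explicitly; a hedged sketch of downloading a single blob (account, container, and blob names are placeholders):

import asyncio
from azure.identity.aio import ClientSecretCredential
from azure.storage.blob.aio import BlobServiceClient

async def download_one_blob() -> bytes:
    credential = ClientSecretCredential(
        tenant_id="<tenant-id>",
        client_id="<client-id>",
        client_secret="<client-secret>",
    )
    blob_service = BlobServiceClient(
        account_url="https://<storage-account>.blob.core.windows.net",
        credential=credential,
    )
    # async with closes the underlying HTTP transports when the block exits
    async with credential, blob_service:
        blob_client = blob_service.get_container_client("<container>").get_blob_client("docs/guide.pdf")
        download_stream = await blob_client.download_blob()
        return await download_stream.readall()

# asyncio.run(download_one_blob())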
3478
- async def _aload_from_azure_blob(
3479
- self,
3480
- content: Content,
3481
- upsert: bool,
3482
- skip_if_exists: bool,
3483
- config: Optional[RemoteContentConfig] = None,
3484
- ):
3485
- """Load content from Azure Blob Storage (async version).
3486
-
3487
- Requires the AzureBlobConfig to contain tenant_id, client_id, client_secret,
3488
- storage_account, and container.
3489
-
3490
- Uses the async Azure SDK to avoid blocking the event loop.
3491
-
3492
- 1. Authenticate with Azure AD using client credentials
3493
- 2. List blobs in container (by prefix or single blob)
3494
- 3. Download and process each blob
3495
- 4. Insert to vector database
3496
- """
3497
- remote_content: AzureBlobContent = cast(AzureBlobContent, content.remote_content)
3498
- azure_config = cast(AzureBlobConfig, config) if isinstance(config, AzureBlobConfig) else None
3499
-
3500
- if azure_config is None:
3501
- log_error(f"Azure Blob config not found for config_id: {remote_content.config_id}")
3502
- return
3503
-
3504
- # Get async blob service client
3505
- try:
3506
- blob_service = self._get_azure_blob_client_async(azure_config)
3507
- except ImportError as e:
3508
- log_error(str(e))
3509
- return
3510
- except Exception as e:
3511
- log_error(f"Error creating Azure Blob client: {e}")
3512
- return
3513
-
3514
- # Use async context manager for proper resource cleanup
3515
- async with blob_service:
3516
- container_client = blob_service.get_container_client(azure_config.container)
3517
-
3518
- # Helper to list blobs with a given prefix (async)
3519
- async def list_blobs_with_prefix(prefix: str) -> List[Dict[str, Any]]:
3520
- """List all blobs under a given prefix (folder)."""
3521
- results: List[Dict[str, Any]] = []
3522
- normalized_prefix = prefix.rstrip("/") + "/" if not prefix.endswith("/") else prefix
3523
- async for blob in container_client.list_blobs(name_starts_with=normalized_prefix):
3524
- # Skip "directory" markers (blobs ending with /)
3525
- if not blob.name.endswith("/"):
3526
- results.append(
3527
- {
3528
- "name": blob.name,
3529
- "size": blob.size,
3530
- "content_type": blob.content_settings.content_type if blob.content_settings else None,
3531
- }
3532
- )
3533
- return results
3534
-
3535
- # Identify blobs to process
3536
- blobs_to_process: List[Dict[str, Any]] = []
3537
-
3538
- try:
3539
- if remote_content.blob_name:
3540
- # Try to get as a single blob first
3541
- blob_client = container_client.get_blob_client(remote_content.blob_name)
3542
- try:
3543
- props = await blob_client.get_blob_properties()
3544
- blobs_to_process.append(
3545
- {
3546
- "name": remote_content.blob_name,
3547
- "size": props.size,
3548
- "content_type": props.content_settings.content_type if props.content_settings else None,
3549
- }
3550
- )
3551
- except Exception:
3552
- # Blob doesn't exist - check if it's actually a folder (prefix)
3553
- log_debug(f"Blob {remote_content.blob_name} not found, checking if it's a folder...")
3554
- blobs_to_process = await list_blobs_with_prefix(remote_content.blob_name)
3555
- if not blobs_to_process:
3556
- log_error(
3557
- f"No blob or folder found at path: {remote_content.blob_name}. "
3558
- "If this is a folder, ensure files exist inside it."
3559
- )
3560
- return
3561
- elif remote_content.prefix:
3562
- # List blobs with prefix
3563
- blobs_to_process = await list_blobs_with_prefix(remote_content.prefix)
3564
- except Exception as e:
3565
- log_error(f"Error listing Azure blobs: {e}")
3566
- return
3567
-
3568
- if not blobs_to_process:
3569
- log_warning(f"No blobs found in Azure container: {azure_config.container}")
3570
- return
3571
-
3572
- # For single file uploads, use the original content object to preserve the ID
3573
- # returned by the API. For folder uploads, create new content entries for each file.
3574
- is_folder_upload = len(blobs_to_process) > 1
3575
-
3576
- # Process each blob
3577
- for blob_info in blobs_to_process:
3578
- blob_name = blob_info["name"]
3579
- file_name = blob_name.split("/")[-1]
3580
-
3581
- # Build a unique virtual path for hashing
3582
- virtual_path = f"azure://{azure_config.storage_account}/{azure_config.container}/{blob_name}"
3583
-
3584
- # Build metadata
3585
- azure_metadata = {
3586
- "source_type": "azure_blob",
3587
- "source_config_id": azure_config.id,
3588
- "source_config_name": azure_config.name,
3589
- "azure_storage_account": azure_config.storage_account,
3590
- "azure_container": azure_config.container,
3591
- "azure_blob_name": blob_name,
3592
- "azure_filename": file_name,
3593
- }
3594
- merged_metadata = {**azure_metadata, **(content.metadata or {})}
3595
-
3596
- # Setup Content object
3597
- if is_folder_upload:
3598
- # For folder uploads, create new content entries for each file
3599
- relative_path = blob_name
3600
- if remote_content.prefix and blob_name.startswith(remote_content.prefix):
3601
- relative_path = blob_name[len(remote_content.prefix) :].lstrip("/")
3602
- content_name = f"{content.name}/{relative_path}" if content.name else blob_name
3603
-
3604
- content_entry = Content(
3605
- name=content_name,
3606
- description=content.description,
3607
- path=virtual_path,
3608
- status=ContentStatus.PROCESSING,
3609
- metadata=merged_metadata,
3610
- file_type="azure_blob",
3611
- )
3612
- content_entry.content_hash = self._build_content_hash(content_entry)
3613
- content_entry.id = generate_id(content_entry.content_hash)
3614
- else:
3615
- # For single file uploads, use the original content object to preserve ID
3616
- content_entry = content
3617
- content_entry.path = virtual_path
3618
- content_entry.status = ContentStatus.PROCESSING
3619
- content_entry.metadata = merged_metadata
3620
- content_entry.file_type = "azure_blob"
3621
- # Use existing id and content_hash from the original content if available
3622
- if not content_entry.content_hash:
3623
- content_entry.content_hash = self._build_content_hash(content_entry)
3624
- if not content_entry.id:
3625
- content_entry.id = generate_id(content_entry.content_hash)
3626
-
3627
- await self._ainsert_contents_db(content_entry)
3628
-
3629
- if self._should_skip(content_entry.content_hash, skip_if_exists):
3630
- content_entry.status = ContentStatus.COMPLETED
3631
- await self._aupdate_content(content_entry)
3632
- continue
3633
-
3634
- # Download blob (async)
3635
- try:
3636
- blob_client = container_client.get_blob_client(blob_name)
3637
- download_stream = await blob_client.download_blob()
3638
- blob_data = await download_stream.readall()
3639
- file_content = BytesIO(blob_data)
3640
- except Exception as e:
3641
- log_error(f"Error downloading Azure blob {blob_name}: {e}")
3642
- content_entry.status = ContentStatus.FAILED
3643
- content_entry.status_message = str(e)
3644
- await self._aupdate_content(content_entry)
3645
- continue
3646
-
3647
- # Select reader and read content
3648
- reader = self._select_reader_by_uri(file_name, content.reader)
3649
- if reader is None:
3650
- log_warning(f"No reader found for file: {file_name}")
3651
- content_entry.status = ContentStatus.FAILED
3652
- content_entry.status_message = "No suitable reader found"
3653
- await self._aupdate_content(content_entry)
3654
- continue
3655
-
3656
- reader = cast(Reader, reader)
3657
- read_documents = await reader.async_read(file_content, name=file_name)
3658
-
3659
- # Prepare and insert into vector database
3660
- if not content_entry.id:
3661
- content_entry.id = generate_id(content_entry.content_hash or "")
3662
- self._prepare_documents_for_insert(read_documents, content_entry.id)
3663
- await self._ahandle_vector_db_insert(content_entry, read_documents, upsert)
3664
-
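The list_blobs_with_prefix helper above normalizes the prefix so that "reports" and "reports/" enumerate the same folder, and it skips directory-marker blobs; the same behavior in isolation (function names invented for the example):

def normalize_prefix(prefix: str) -> str:
    # "reports" -> "reports/" so the listing cannot match e.g. "reports-archive/..."
    return prefix if prefix.endswith("/") else prefix.rstrip("/") + "/"

def is_directory_marker(blob_name: str) -> bool:
    # Some tools create zero-byte blobs ending in "/" to emulate folders
    return blob_name.endswith("/")

assert normalize_prefix("reports") == "reports/"
assert normalize_prefix("reports/") == "reports/"
assert is_directory_marker("reports/") and not is_directory_marker("reports/q1.pdf")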
3665
- def _load_from_azure_blob(
3666
- self,
3667
- content: Content,
3668
- upsert: bool,
3669
- skip_if_exists: bool,
3670
- config: Optional[RemoteContentConfig] = None,
3671
- ):
3672
- """Synchronous version of _load_from_azure_blob.
3673
-
3674
- Load content from Azure Blob Storage:
3675
- 1. Authenticate with Azure AD using client credentials
3676
- 2. List blobs in container (by prefix or single blob)
3677
- 3. Download and process each blob
3678
- 4. Insert to vector database
3679
- """
3680
- remote_content: AzureBlobContent = cast(AzureBlobContent, content.remote_content)
3681
- azure_config = cast(AzureBlobConfig, config) if isinstance(config, AzureBlobConfig) else None
3682
-
3683
- if azure_config is None:
3684
- log_error(f"Azure Blob config not found for config_id: {remote_content.config_id}")
3685
- return
3686
-
3687
- # Get blob service client
3688
- try:
3689
- blob_service = self._get_azure_blob_client(azure_config)
3690
- except ImportError as e:
3691
- log_error(str(e))
3692
- return
3693
- except Exception as e:
3694
- log_error(f"Error creating Azure Blob client: {e}")
3695
- return
3696
-
3697
- container_client = blob_service.get_container_client(azure_config.container)
3698
-
3699
- # Helper to list blobs with a given prefix
3700
- def list_blobs_with_prefix(prefix: str) -> List[Dict[str, Any]]:
3701
- """List all blobs under a given prefix (folder)."""
3702
- results: List[Dict[str, Any]] = []
3703
- normalized_prefix = prefix.rstrip("/") + "/" if not prefix.endswith("/") else prefix
3704
- blobs = container_client.list_blobs(name_starts_with=normalized_prefix)
3705
- for blob in blobs:
3706
- # Skip "directory" markers (blobs ending with /)
3707
- if not blob.name.endswith("/"):
3708
- results.append(
3709
- {
3710
- "name": blob.name,
3711
- "size": blob.size,
3712
- "content_type": blob.content_settings.content_type if blob.content_settings else None,
3713
- }
3714
- )
3715
- return results
3716
-
3717
- # Identify blobs to process
3718
- blobs_to_process: List[Dict[str, Any]] = []
3719
-
3720
- try:
3721
- if remote_content.blob_name:
3722
- # Try to get as a single blob first
3723
- blob_client = container_client.get_blob_client(remote_content.blob_name)
3724
- try:
3725
- props = blob_client.get_blob_properties()
3726
- blobs_to_process.append(
3727
- {
3728
- "name": remote_content.blob_name,
3729
- "size": props.size,
3730
- "content_type": props.content_settings.content_type if props.content_settings else None,
3731
- }
3732
- )
3733
- except Exception:
3734
- # Blob doesn't exist - check if it's actually a folder (prefix)
3735
- log_debug(f"Blob {remote_content.blob_name} not found, checking if it's a folder...")
3736
- blobs_to_process = list_blobs_with_prefix(remote_content.blob_name)
3737
- if not blobs_to_process:
3738
- log_error(
3739
- f"No blob or folder found at path: {remote_content.blob_name}. "
3740
- "If this is a folder, ensure files exist inside it."
3741
- )
3742
- return
3743
- elif remote_content.prefix:
3744
- # List blobs with prefix
3745
- blobs_to_process = list_blobs_with_prefix(remote_content.prefix)
3746
- except Exception as e:
3747
- log_error(f"Error listing Azure blobs: {e}")
3748
- return
3749
-
3750
- if not blobs_to_process:
3751
- log_warning(f"No blobs found in Azure container: {azure_config.container}")
3752
- return
3753
-
3754
- # For single file uploads, use the original content object to preserve the ID
3755
- # returned by the API. For folder uploads, create new content entries for each file.
3756
- is_folder_upload = len(blobs_to_process) > 1
3757
-
3758
- # Process each blob
3759
- for blob_info in blobs_to_process:
3760
- blob_name = blob_info["name"]
3761
- file_name = blob_name.split("/")[-1]
3762
-
3763
- # Build a unique virtual path for hashing
3764
- virtual_path = f"azure://{azure_config.storage_account}/{azure_config.container}/{blob_name}"
3765
-
3766
- # Build metadata
3767
- azure_metadata = {
3768
- "source_type": "azure_blob",
3769
- "source_config_id": azure_config.id,
3770
- "source_config_name": azure_config.name,
3771
- "azure_storage_account": azure_config.storage_account,
3772
- "azure_container": azure_config.container,
3773
- "azure_blob_name": blob_name,
3774
- "azure_filename": file_name,
3775
- }
3776
- merged_metadata = {**azure_metadata, **(content.metadata or {})}
3777
-
3778
- # Setup Content object
3779
- if is_folder_upload:
3780
- # For folder uploads, create new content entries for each file
3781
- relative_path = blob_name
3782
- if remote_content.prefix and blob_name.startswith(remote_content.prefix):
3783
- relative_path = blob_name[len(remote_content.prefix) :].lstrip("/")
3784
- content_name = f"{content.name}/{relative_path}" if content.name else blob_name
3785
-
3786
- content_entry = Content(
3787
- name=content_name,
3788
- description=content.description,
3789
- path=virtual_path,
3790
- status=ContentStatus.PROCESSING,
3791
- metadata=merged_metadata,
3792
- file_type="azure_blob",
3793
- )
3794
- content_entry.content_hash = self._build_content_hash(content_entry)
3795
- content_entry.id = generate_id(content_entry.content_hash)
3796
- else:
3797
- # For single file uploads, use the original content object to preserve ID
3798
- content_entry = content
3799
- content_entry.path = virtual_path
3800
- content_entry.status = ContentStatus.PROCESSING
3801
- content_entry.metadata = merged_metadata
3802
- content_entry.file_type = "azure_blob"
3803
- # Use existing id and content_hash from the original content if available
3804
- if not content_entry.content_hash:
3805
- content_entry.content_hash = self._build_content_hash(content_entry)
3806
- if not content_entry.id:
3807
- content_entry.id = generate_id(content_entry.content_hash)
3808
-
3809
- self._insert_contents_db(content_entry)
3810
-
3811
- if self._should_skip(content_entry.content_hash, skip_if_exists):
3812
- content_entry.status = ContentStatus.COMPLETED
3813
- self._update_content(content_entry)
3814
- continue
3815
-
3816
- # Download blob
3817
- try:
3818
- blob_client = container_client.get_blob_client(blob_name)
3819
- download_stream = blob_client.download_blob()
3820
- file_content = BytesIO(download_stream.readall())
3821
- except Exception as e:
3822
- log_error(f"Error downloading Azure blob {blob_name}: {e}")
3823
- content_entry.status = ContentStatus.FAILED
3824
- content_entry.status_message = str(e)
3825
- self._update_content(content_entry)
3826
- continue
3827
-
3828
- # Select reader and read content
3829
- reader = self._select_reader_by_uri(file_name, content.reader)
3830
- if reader is None:
3831
- log_warning(f"No reader found for file: {file_name}")
3832
- content_entry.status = ContentStatus.FAILED
3833
- content_entry.status_message = "No suitable reader found"
3834
- self._update_content(content_entry)
3835
- continue
3836
-
3837
- reader = cast(Reader, reader)
3838
- read_documents = reader.read(file_content, name=file_name)
3839
-
3840
- # Prepare and insert into vector database
3841
- if not content_entry.id:
3842
- content_entry.id = generate_id(content_entry.content_hash or "")
3843
- self._prepare_documents_for_insert(read_documents, content_entry.id)
3844
- self._handle_vector_db_insert(content_entry, read_documents, upsert)
3845
-
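Both loaders name folder uploads by the blob path relative to the prefix, optionally nested under the user-supplied content name; a small sketch of that convention (the helper is hypothetical):

from typing import Optional

def content_name_for(blob_name: str, prefix: str, user_name: Optional[str]) -> str:
    relative_path = blob_name
    if prefix and blob_name.startswith(prefix):
        relative_path = blob_name[len(prefix):].lstrip("/")
    # Folder entries are nested under the user's name; otherwise keep the blob path
    return f"{user_name}/{relative_path}" if user_name else blob_name

assert content_name_for("reports/2024/q1.pdf", "reports/", "finance") == "finance/2024/q1.pdf"
assert content_name_for("reports/2024/q1.pdf", "reports/", None) == "reports/2024/q1.pdf"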
3846
- async def _ahandle_vector_db_insert(self, content: Content, read_documents, upsert):
3847
- from agno.vectordb import VectorDb
3848
-
3849
- self.vector_db = cast(VectorDb, self.vector_db)
3850
-
3851
- if not self.vector_db:
3852
- log_error("No vector database configured")
3853
- content.status = ContentStatus.FAILED
3854
- content.status_message = "No vector database configured"
3855
- await self._aupdate_content(content)
3856
- return
3857
-
3858
- if self.vector_db.upsert_available() and upsert:
3859
- try:
3860
- await self.vector_db.async_upsert(content.content_hash, read_documents, content.metadata) # type: ignore[arg-type]
3861
- except Exception as e:
3862
- log_error(f"Error upserting document: {e}")
3863
- content.status = ContentStatus.FAILED
3864
- content.status_message = "Could not upsert embedding"
3865
- await self._aupdate_content(content)
3866
- return
3867
- else:
3868
- try:
3869
- await self.vector_db.async_insert(
3870
- content.content_hash, # type: ignore[arg-type]
3871
- documents=read_documents,
3872
- filters=content.metadata, # type: ignore[arg-type]
3873
- )
3874
- except Exception as e:
3875
- log_error(f"Error inserting document: {e}")
3876
- content.status = ContentStatus.FAILED
3877
- content.status_message = "Could not insert embedding"
3878
- await self._aupdate_content(content)
3879
- return
3880
-
3881
- content.status = ContentStatus.COMPLETED
3882
- await self._aupdate_content(content)
3883
-
3884
- def _handle_vector_db_insert(self, content: Content, read_documents, upsert):
3885
- """Synchronously handle vector database insertion."""
3886
- from agno.vectordb import VectorDb
3887
-
3888
- self.vector_db = cast(VectorDb, self.vector_db)
3889
-
3890
- if not self.vector_db:
3891
- log_error("No vector database configured")
3892
- content.status = ContentStatus.FAILED
3893
- content.status_message = "No vector database configured"
3894
- self._update_content(content)
3895
- return
3896
-
3897
- if self.vector_db.upsert_available() and upsert:
3898
- try:
3899
- self.vector_db.upsert(content.content_hash, read_documents, content.metadata) # type: ignore[arg-type]
3900
- except Exception as e:
3901
- log_error(f"Error upserting document: {e}")
3902
- content.status = ContentStatus.FAILED
3903
- content.status_message = "Could not upsert embedding"
3904
- self._update_content(content)
3905
- return
3906
- else:
3907
- try:
3908
- self.vector_db.insert(
3909
- content.content_hash, # type: ignore[arg-type]
3910
- documents=read_documents,
3911
- filters=content.metadata, # type: ignore[arg-type]
3912
- )
3913
- except Exception as e:
3914
- log_error(f"Error inserting document: {e}")
3915
- content.status = ContentStatus.FAILED
3916
- content.status_message = "Could not insert embedding"
3917
- self._update_content(content)
3918
- return
3919
-
3920
- content.status = ContentStatus.COMPLETED
3921
- self._update_content(content)
3922
-
3923
- # --- Remote Content Sources ---
3924
-
3925
- def _get_remote_configs(self) -> List[RemoteContentConfig]:
3926
- """Return configured remote content sources."""
3927
- return self.content_sources or []
3928
-
3929
- def _get_remote_config_by_id(self, config_id: str) -> Optional[RemoteContentConfig]:
3930
- """Get a remote content config by its ID."""
3931
- if not self.content_sources:
3932
- return None
3933
- return next((c for c in self.content_sources if c.id == config_id), None)
3934
-
3935
2084
  # ==========================================
3936
2085
  # PRIVATE - CONVERSION & DATA METHODS
3937
2086
  # ==========================================
@@ -4156,6 +2305,87 @@ class Knowledge:
4156
2305
  content_row = self._build_knowledge_row(content)
4157
2306
  self.contents_db.upsert_knowledge_content(knowledge_row=content_row)
4158
2307
 
2308
+ # --- Vector DB Insert Helpers ---
2309
+
2310
+ async def _ahandle_vector_db_insert(self, content: Content, read_documents, upsert):
2311
+ from agno.vectordb import VectorDb
2312
+
2313
+ self.vector_db = cast(VectorDb, self.vector_db)
2314
+
2315
+ if not self.vector_db:
2316
+ log_error("No vector database configured")
2317
+ content.status = ContentStatus.FAILED
2318
+ content.status_message = "No vector database configured"
2319
+ await self._aupdate_content(content)
2320
+ return
2321
+
2322
+ if self.vector_db.upsert_available() and upsert:
2323
+ try:
2324
+ await self.vector_db.async_upsert(content.content_hash, read_documents, content.metadata) # type: ignore[arg-type]
2325
+ except Exception as e:
2326
+ log_error(f"Error upserting document: {e}")
2327
+ content.status = ContentStatus.FAILED
2328
+ content.status_message = "Could not upsert embedding"
2329
+ await self._aupdate_content(content)
2330
+ return
2331
+ else:
2332
+ try:
2333
+ await self.vector_db.async_insert(
2334
+ content.content_hash, # type: ignore[arg-type]
2335
+ documents=read_documents,
2336
+ filters=content.metadata, # type: ignore[arg-type]
2337
+ )
2338
+ except Exception as e:
2339
+ log_error(f"Error inserting document: {e}")
2340
+ content.status = ContentStatus.FAILED
2341
+ content.status_message = "Could not insert embedding"
2342
+ await self._aupdate_content(content)
2343
+ return
2344
+
2345
+ content.status = ContentStatus.COMPLETED
2346
+ await self._aupdate_content(content)
2347
+
2348
+ def _handle_vector_db_insert(self, content: Content, read_documents, upsert):
2349
+ """Synchronously handle vector database insertion."""
2350
+ from agno.vectordb import VectorDb
2351
+
2352
+ self.vector_db = cast(VectorDb, self.vector_db)
2353
+
2354
+ if not self.vector_db:
2355
+ log_error("No vector database configured")
2356
+ content.status = ContentStatus.FAILED
2357
+ content.status_message = "No vector database configured"
2358
+ self._update_content(content)
2359
+ return
2360
+
2361
+ if self.vector_db.upsert_available() and upsert:
2362
+ try:
2363
+ self.vector_db.upsert(content.content_hash, read_documents, content.metadata) # type: ignore[arg-type]
2364
+ except Exception as e:
2365
+ log_error(f"Error upserting document: {e}")
2366
+ content.status = ContentStatus.FAILED
2367
+ content.status_message = "Could not upsert embedding"
2368
+ self._update_content(content)
2369
+ return
2370
+ else:
2371
+ try:
2372
+ self.vector_db.insert(
2373
+ content.content_hash, # type: ignore[arg-type]
2374
+ documents=read_documents,
2375
+ filters=content.metadata, # type: ignore[arg-type]
2376
+ )
2377
+ except Exception as e:
2378
+ log_error(f"Error inserting document: {e}")
2379
+ content.status = ContentStatus.FAILED
2380
+ content.status_message = "Could not insert embedding"
2381
+ self._update_content(content)
2382
+ return
2383
+
2384
+ content.status = ContentStatus.COMPLETED
2385
+ self._update_content(content)
2386
+
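The two helpers above prefer an idempotent upsert when the backend advertises support for it and the caller requested it, falling back to a plain insert otherwise; a duck-typed sketch of that decision (FakeVectorDb is invented for illustration):

from typing import Any, Dict, List, Optional

class FakeVectorDb:
    def upsert_available(self) -> bool:
        return True

    def upsert(self, content_hash: str, documents: List[Any], filters: Optional[Dict[str, Any]]) -> None:
        print(f"upserted {len(documents)} documents for {content_hash}")

    def insert(self, content_hash: str, documents: List[Any], filters: Optional[Dict[str, Any]]) -> None:
        print(f"inserted {len(documents)} documents for {content_hash}")

def store_documents(db: FakeVectorDb, content_hash: str, documents: List[Any], upsert: bool) -> None:
    # Upsert only when both the caller asks for it and the backend supports it
    if upsert and db.upsert_available():
        db.upsert(content_hash, documents, filters=None)
    else:
        db.insert(content_hash, documents, filters=None)

store_documents(FakeVectorDb(), "abc123", ["chunk-1", "chunk-2"], upsert=True)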
2387
+ # --- Content Update ---
2388
+
4159
2389
  def _update_content(self, content: Content) -> Optional[Dict[str, Any]]:
4160
2390
  from agno.vectordb import VectorDb
4161
2391