agno 2.4.0__py3-none-any.whl → 2.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -9,6 +9,7 @@ from os.path import basename
  from pathlib import Path
  from typing import Any, Dict, List, Optional, Set, Tuple, Union, cast, overload

+ import httpx
  from httpx import AsyncClient

  from agno.db.base import AsyncBaseDb, BaseDb
@@ -17,7 +18,20 @@ from agno.filters import FilterExpr
  from agno.knowledge.content import Content, ContentAuth, ContentStatus, FileData
  from agno.knowledge.document import Document
  from agno.knowledge.reader import Reader, ReaderFactory
- from agno.knowledge.remote_content.remote_content import GCSContent, RemoteContent, S3Content
+ from agno.knowledge.remote_content.config import (
+     GcsConfig,
+     GitHubConfig,
+     RemoteContentConfig,
+     S3Config,
+     SharePointConfig,
+ )
+ from agno.knowledge.remote_content.remote_content import (
+     GCSContent,
+     GitHubContent,
+     RemoteContent,
+     S3Content,
+     SharePointContent,
+ )
  from agno.utils.http import async_fetch_with_retry
  from agno.utils.log import log_debug, log_error, log_info, log_warning
  from agno.utils.string import generate_id
@@ -42,6 +56,7 @@ class Knowledge:
      contents_db: Optional[Union[BaseDb, AsyncBaseDb]] = None
      max_results: int = 10
      readers: Optional[Dict[str, Reader]] = None
+     content_sources: Optional[List[RemoteContentConfig]] = None

      def __post_init__(self):
          from agno.vectordb import VectorDb
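The new `content_sources` field lets a Knowledge instance carry named remote-source configurations that individual contents can later reference by `config_id`. A minimal sketch of wiring this up, assuming keyword construction of the config classes and that `id` and `name` are caller-chosen identifiers (only the attribute names used elsewhere in this diff are relied on; the exact constructor signatures and the `vector_db` argument are assumptions):

    from agno.knowledge.knowledge import Knowledge  # import path assumed
    from agno.knowledge.remote_content.config import GitHubConfig, S3Config

    knowledge = Knowledge(
        vector_db=my_vector_db,  # assumed to be configured elsewhere
        content_sources=[
            S3Config(
                id="s3-docs",
                name="Docs bucket",
                region="us-east-1",
                aws_access_key_id="AKIA...",
                aws_secret_access_key="...",
            ),
            GitHubConfig(id="gh-handbook", name="Handbook", repo="acme/handbook", branch="main", token="ghp_..."),
        ],
    )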
@@ -1161,7 +1176,7 @@ class Knowledge:
          import inspect

          read_signature = inspect.signature(reader.read)
-         if password and "password" in read_signature.parameters:
+         if password is not None and "password" in read_signature.parameters:
              if isinstance(source, BytesIO):
                  return reader.read(source, name=name, password=password)
              else:
@@ -1194,7 +1209,7 @@ class Knowledge:
          import inspect

          read_signature = inspect.signature(reader.async_read)
-         if password and "password" in read_signature.parameters:
+         if password is not None and "password" in read_signature.parameters:
              return await reader.async_read(source, name=name, password=password)
          else:
              if isinstance(source, BytesIO):
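The change from `if password` to `if password is not None` matters for falsy but valid values: an empty-string password that was set explicitly on `ContentAuth` used to be dropped before it reached the reader. A small illustration of the difference:

    password = ""                # explicitly provided, but falsy
    bool(password)               # False -> the old `if password ...` check skipped it
    password is not None         # True  -> the new check forwards it to reader.read(...)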
@@ -1285,7 +1300,7 @@ class Knowledge:
              log_debug(f"Using Reader: {reader.__class__.__name__}")

          if reader:
-             password = content.auth.password if content.auth and content.auth.password else None
+             password = content.auth.password if content.auth and content.auth.password is not None else None
              read_documents = await self._aread(reader, path, name=content.name or path.name, password=password)
          else:
              read_documents = []
@@ -1304,7 +1319,7 @@ class Knowledge:

          if not content.id:
              content.id = generate_id(content.content_hash or "")
-         self._prepare_documents_for_insert(read_documents, content.id)
+         self._prepare_documents_for_insert(read_documents, content.id, metadata=content.metadata)

          await self._ahandle_vector_db_insert(content, read_documents, upsert)

@@ -1366,7 +1381,7 @@ class Knowledge:
              log_debug(f"Using Reader: {reader.__class__.__name__}")

          if reader:
-             password = content.auth.password if content.auth and content.auth.password else None
+             password = content.auth.password if content.auth and content.auth.password is not None else None
              read_documents = self._read(reader, path, name=content.name or path.name, password=password)
          else:
              read_documents = []
@@ -1385,7 +1400,7 @@ class Knowledge:

          if not content.id:
              content.id = generate_id(content.content_hash or "")
-         self._prepare_documents_for_insert(read_documents, content.id)
+         self._prepare_documents_for_insert(read_documents, content.id, metadata=content.metadata)

          self._handle_vector_db_insert(content, read_documents, upsert)
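Both the async and sync paths now pass `content.metadata` through to `_prepare_documents_for_insert`, so metadata attached to a piece of content travels with its documents into the vector database. A rough illustration, assuming `Content` accepts a `metadata` mapping (the preparation helper itself is not shown in this diff):

    content = Content(path="docs/guide.pdf", metadata={"team": "platform", "tier": "internal"})
    # After this change the documents read from guide.pdf are prepared with both
    # content_id=content.id and metadata={"team": "platform", "tier": "internal"}.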

@@ -1485,7 +1500,7 @@ class Knowledge:
          if reader.__class__.__name__ == "YouTubeReader":
              read_documents = await reader.async_read(content.url, name=name)
          else:
-             password = content.auth.password if content.auth and content.auth.password else None
+             password = content.auth.password if content.auth and content.auth.password is not None else None
              source = bytes_content if bytes_content else content.url
              read_documents = await self._aread(reader, source, name=name, password=password)

@@ -1583,7 +1598,7 @@ class Knowledge:
          if reader.__class__.__name__ == "YouTubeReader":
              read_documents = reader.read(content.url, name=name)
          else:
-             password = content.auth.password if content.auth and content.auth.password else None
+             password = content.auth.password if content.auth and content.auth.password is not None else None
              source = bytes_content if bytes_content else content.url
              read_documents = self._read(reader, source, name=name, password=password)

@@ -1930,16 +1945,35 @@ class Knowledge:

          remote_content = content.remote_content

+         # Look up config if config_id is provided
+         config = None
+         if hasattr(remote_content, "config_id") and remote_content.config_id:
+             config = self._get_remote_config_by_id(remote_content.config_id)
+             if config is None:
+                 log_warning(f"No config found for config_id: {remote_content.config_id}")
+
          if isinstance(remote_content, S3Content):
-             await self._aload_from_s3(content, upsert, skip_if_exists)
+             await self._aload_from_s3(content, upsert, skip_if_exists, config)

          elif isinstance(remote_content, GCSContent):
-             await self._aload_from_gcs(content, upsert, skip_if_exists)
+             await self._aload_from_gcs(content, upsert, skip_if_exists, config)
+
+         elif isinstance(remote_content, SharePointContent):
+             await self._aload_from_sharepoint(content, upsert, skip_if_exists, config)
+
+         elif isinstance(remote_content, GitHubContent):
+             await self._aload_from_github(content, upsert, skip_if_exists, config)

          else:
              log_warning(f"Unsupported remote content type: {type(remote_content)}")

-     async def _aload_from_s3(self, content: Content, upsert: bool, skip_if_exists: bool):
+     async def _aload_from_s3(
+         self,
+         content: Content,
+         upsert: bool,
+         skip_if_exists: bool,
+         config: Optional[RemoteContentConfig] = None,
+     ):
          """Load the contextual S3 content.

          1. Identify objects to read
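With the dispatch above, remote content can reference one of the configured sources by `config_id` instead of carrying a pre-built bucket object. A sketch under the assumption that `S3Content` accepts `bucket_name`, `prefix`, and `config_id` keyword arguments (these attribute names are the ones read by the loader below) and that `Content` accepts a `remote_content` argument:

    from agno.knowledge.content import Content
    from agno.knowledge.remote_content.remote_content import S3Content

    reports = Content(
        name="s3-reports",
        remote_content=S3Content(
            bucket_name="acme-reports",  # resolved to an S3Bucket using the matching S3Config credentials
            prefix="2024/",              # every object under this prefix is read
            config_id="s3-docs",         # must match an S3Config.id registered in Knowledge.content_sources
        ),
    )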
@@ -1951,22 +1985,43 @@ class Knowledge:
          7. Prepare and insert the content in the vector database
          8. Remove temporary file if needed
          """
+         from agno.cloud.aws.s3.bucket import S3Bucket
          from agno.cloud.aws.s3.object import S3Object

+         # Note: S3 support has limited features compared to GitHub/SharePoint
+         log_warning(
+             "S3 content loading has limited features. "
+             "Recursive folder traversal, rich metadata, and improved naming are coming in a future release."
+         )
+
          remote_content: S3Content = cast(S3Content, content.remote_content)

+         # Get or create bucket with credentials from config
+         bucket = remote_content.bucket
+         try:
+             if bucket is None and remote_content.bucket_name:
+                 s3_config = cast(S3Config, config) if isinstance(config, S3Config) else None
+                 bucket = S3Bucket(
+                     name=remote_content.bucket_name,
+                     region=s3_config.region if s3_config else None,
+                     aws_access_key_id=s3_config.aws_access_key_id if s3_config else None,
+                     aws_secret_access_key=s3_config.aws_secret_access_key if s3_config else None,
+                 )
+         except Exception as e:
+             log_error(f"Error getting bucket: {e}")
+
          # 1. Identify objects to read
          objects_to_read: List[S3Object] = []
-         if remote_content.bucket is not None:
+         if bucket is not None:
              if remote_content.key is not None:
-                 _object = S3Object(bucket_name=remote_content.bucket.name, name=remote_content.key)
+                 _object = S3Object(bucket_name=bucket.name, name=remote_content.key)
                  objects_to_read.append(_object)
              elif remote_content.object is not None:
                  objects_to_read.append(remote_content.object)
              elif remote_content.prefix is not None:
-                 objects_to_read.extend(remote_content.bucket.get_objects(prefix=remote_content.prefix))
+                 objects_to_read.extend(bucket.get_objects(prefix=remote_content.prefix))
              else:
-                 objects_to_read.extend(remote_content.bucket.get_objects())
+                 objects_to_read.extend(bucket.get_objects())

          for s3_object in objects_to_read:
              # 2. Setup Content object
@@ -2008,16 +2063,20 @@ class Knowledge:
              read_documents = await reader.async_read(readable_content, name=obj_name)

              # 7. Prepare and insert the content in the vector database
-             if not content.id:
-                 content.id = generate_id(content.content_hash or "")
-             self._prepare_documents_for_insert(read_documents, content.id)
+             self._prepare_documents_for_insert(read_documents, content_entry.id)
              await self._ahandle_vector_db_insert(content_entry, read_documents, upsert)

              # 8. Remove temporary file if needed
              if temporary_file:
                  temporary_file.unlink()

-     async def _aload_from_gcs(self, content: Content, upsert: bool, skip_if_exists: bool):
+     async def _aload_from_gcs(
+         self,
+         content: Content,
+         upsert: bool,
+         skip_if_exists: bool,
+         config: Optional[RemoteContentConfig] = None,
+     ):
          """Load the contextual GCS content.

          1. Identify objects to read
@@ -2028,16 +2087,42 @@ class Knowledge:
          6. Read the content
          7. Prepare and insert the content in the vector database
          """
+         try:
+             from google.cloud import storage  # type: ignore
+         except ImportError:
+             raise ImportError(
+                 "The `google-cloud-storage` package is not installed. "
+                 "Please install it via `pip install google-cloud-storage`."
+             )
+
+         # Note: GCS support has limited features compared to GitHub/SharePoint
+         log_warning(
+             "GCS content loading has limited features. "
+             "Recursive folder traversal, rich metadata, and improved naming are coming in a future release."
+         )
+
          remote_content: GCSContent = cast(GCSContent, content.remote_content)

+         # Get or create bucket with credentials from config
+         bucket = remote_content.bucket
+         if bucket is None and remote_content.bucket_name:
+             gcs_config = cast(GcsConfig, config) if isinstance(config, GcsConfig) else None
+             if gcs_config and gcs_config.credentials_path:
+                 client = storage.Client.from_service_account_json(gcs_config.credentials_path)
+             elif gcs_config and gcs_config.project:
+                 client = storage.Client(project=gcs_config.project)
+             else:
+                 client = storage.Client()
+             bucket = client.bucket(remote_content.bucket_name)
+
          # 1. Identify objects to read
          objects_to_read = []
          if remote_content.blob_name is not None:
-             objects_to_read.append(remote_content.bucket.blob(remote_content.blob_name))  # type: ignore
+             objects_to_read.append(bucket.blob(remote_content.blob_name))  # type: ignore
          elif remote_content.prefix is not None:
-             objects_to_read.extend(remote_content.bucket.list_blobs(prefix=remote_content.prefix))  # type: ignore
+             objects_to_read.extend(bucket.list_blobs(prefix=remote_content.prefix))  # type: ignore
          else:
-             objects_to_read.extend(remote_content.bucket.list_blobs())  # type: ignore
+             objects_to_read.extend(bucket.list_blobs())  # type: ignore

          for gcs_object in objects_to_read:
              # 2. Setup Content object
@@ -2070,9 +2155,7 @@ class Knowledge:
              read_documents = await reader.async_read(readable_content, name=name)

              # 7. Prepare and insert the content in the vector database
-             if not content.id:
-                 content.id = generate_id(content.content_hash or "")
-             self._prepare_documents_for_insert(read_documents, content.id)
+             self._prepare_documents_for_insert(read_documents, content_entry.id)
              await self._ahandle_vector_db_insert(content_entry, read_documents, upsert)

      def _load_from_remote_content(
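For GCS the credentials now come from a `GcsConfig` rather than from a caller-supplied bucket object: a service-account JSON path is used first, then an explicit project, then application-default credentials. A minimal config sketch, assuming keyword construction and the shared `id`/`name` fields of `RemoteContentConfig`:

    from agno.knowledge.remote_content.config import GcsConfig

    gcs_source = GcsConfig(
        id="gcs-archive",
        name="Archive bucket",
        credentials_path="/secrets/gcs-service-account.json",  # used via storage.Client.from_service_account_json
        # project="acme-data",  # fallback used when no credentials_path is given
    )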
@@ -2088,16 +2171,35 @@ class Knowledge:

          remote_content = content.remote_content

+         # Look up config if config_id is provided
+         config = None
+         if hasattr(remote_content, "config_id") and remote_content.config_id:
+             config = self._get_remote_config_by_id(remote_content.config_id)
+             if config is None:
+                 log_warning(f"No config found for config_id: {remote_content.config_id}")
+
          if isinstance(remote_content, S3Content):
-             self._load_from_s3(content, upsert, skip_if_exists)
+             self._load_from_s3(content, upsert, skip_if_exists, config)

          elif isinstance(remote_content, GCSContent):
-             self._load_from_gcs(content, upsert, skip_if_exists)
+             self._load_from_gcs(content, upsert, skip_if_exists, config)
+
+         elif isinstance(remote_content, SharePointContent):
+             self._load_from_sharepoint(content, upsert, skip_if_exists, config)
+
+         elif isinstance(remote_content, GitHubContent):
+             self._load_from_github(content, upsert, skip_if_exists, config)

          else:
              log_warning(f"Unsupported remote content type: {type(remote_content)}")

-     def _load_from_s3(self, content: Content, upsert: bool, skip_if_exists: bool):
+     def _load_from_s3(
+         self,
+         content: Content,
+         upsert: bool,
+         skip_if_exists: bool,
+         config: Optional[RemoteContentConfig] = None,
+     ):
          """Synchronous version of _load_from_s3.

          Load the contextual S3 content:
@@ -2110,22 +2212,40 @@ class Knowledge:
          7. Prepare and insert the content in the vector database
          8. Remove temporary file if needed
          """
+         from agno.cloud.aws.s3.bucket import S3Bucket
          from agno.cloud.aws.s3.object import S3Object

+         # Note: S3 support has limited features compared to GitHub/SharePoint
+         log_warning(
+             "S3 content loading has limited features. "
+             "Recursive folder traversal, rich metadata, and improved naming are coming in a future release."
+         )
+
          remote_content: S3Content = cast(S3Content, content.remote_content)

+         # Get or create bucket with credentials from config
+         bucket = remote_content.bucket
+         if bucket is None and remote_content.bucket_name:
+             s3_config = cast(S3Config, config) if isinstance(config, S3Config) else None
+             bucket = S3Bucket(
+                 name=remote_content.bucket_name,
+                 region=s3_config.region if s3_config else None,
+                 aws_access_key_id=s3_config.aws_access_key_id if s3_config else None,
+                 aws_secret_access_key=s3_config.aws_secret_access_key if s3_config else None,
+             )
+
          # 1. Identify objects to read
          objects_to_read: List[S3Object] = []
-         if remote_content.bucket is not None:
+         if bucket is not None:
              if remote_content.key is not None:
-                 _object = S3Object(bucket_name=remote_content.bucket.name, name=remote_content.key)
+                 _object = S3Object(bucket_name=bucket.name, name=remote_content.key)
                  objects_to_read.append(_object)
              elif remote_content.object is not None:
                  objects_to_read.append(remote_content.object)
              elif remote_content.prefix is not None:
-                 objects_to_read.extend(remote_content.bucket.get_objects(prefix=remote_content.prefix))
+                 objects_to_read.extend(bucket.get_objects(prefix=remote_content.prefix))
              else:
-                 objects_to_read.extend(remote_content.bucket.get_objects())
+                 objects_to_read.extend(bucket.get_objects())

          for s3_object in objects_to_read:
              # 2. Setup Content object
@@ -2167,16 +2287,20 @@ class Knowledge:
              read_documents = reader.read(readable_content, name=obj_name)

              # 7. Prepare and insert the content in the vector database
-             if not content.id:
-                 content.id = generate_id(content.content_hash or "")
-             self._prepare_documents_for_insert(read_documents, content.id)
+             self._prepare_documents_for_insert(read_documents, content_entry.id)
              self._handle_vector_db_insert(content_entry, read_documents, upsert)

              # 8. Remove temporary file if needed
              if temporary_file:
                  temporary_file.unlink()

-     def _load_from_gcs(self, content: Content, upsert: bool, skip_if_exists: bool):
+     def _load_from_gcs(
+         self,
+         content: Content,
+         upsert: bool,
+         skip_if_exists: bool,
+         config: Optional[RemoteContentConfig] = None,
+     ):
          """Synchronous version of _load_from_gcs.

          Load the contextual GCS content:
@@ -2188,16 +2312,42 @@ class Knowledge:
          6. Read the content
          7. Prepare and insert the content in the vector database
          """
+         try:
+             from google.cloud import storage  # type: ignore
+         except ImportError:
+             raise ImportError(
+                 "The `google-cloud-storage` package is not installed. "
+                 "Please install it via `pip install google-cloud-storage`."
+             )
+
+         # Note: GCS support has limited features compared to GitHub/SharePoint
+         log_warning(
+             "GCS content loading has limited features. "
+             "Recursive folder traversal, rich metadata, and improved naming are coming in a future release."
+         )
+
          remote_content: GCSContent = cast(GCSContent, content.remote_content)

+         # Get or create bucket with credentials from config
+         bucket = remote_content.bucket
+         if bucket is None and remote_content.bucket_name:
+             gcs_config = cast(GcsConfig, config) if isinstance(config, GcsConfig) else None
+             if gcs_config and gcs_config.credentials_path:
+                 client = storage.Client.from_service_account_json(gcs_config.credentials_path)
+             elif gcs_config and gcs_config.project:
+                 client = storage.Client(project=gcs_config.project)
+             else:
+                 client = storage.Client()
+             bucket = client.bucket(remote_content.bucket_name)
+
          # 1. Identify objects to read
          objects_to_read = []
          if remote_content.blob_name is not None:
-             objects_to_read.append(remote_content.bucket.blob(remote_content.blob_name))  # type: ignore
+             objects_to_read.append(bucket.blob(remote_content.blob_name))  # type: ignore
          elif remote_content.prefix is not None:
-             objects_to_read.extend(remote_content.bucket.list_blobs(prefix=remote_content.prefix))  # type: ignore
+             objects_to_read.extend(bucket.list_blobs(prefix=remote_content.prefix))  # type: ignore
          else:
-             objects_to_read.extend(remote_content.bucket.list_blobs())  # type: ignore
+             objects_to_read.extend(bucket.list_blobs())  # type: ignore

          for gcs_object in objects_to_read:
              # 2. Setup Content object
@@ -2230,11 +2380,876 @@ class Knowledge:
              read_documents = reader.read(readable_content, name=name)

              # 7. Prepare and insert the content in the vector database
-             if not content.id:
-                 content.id = generate_id(content.content_hash or "")
-             self._prepare_documents_for_insert(read_documents, content.id)
+             self._prepare_documents_for_insert(read_documents, content_entry.id)
              self._handle_vector_db_insert(content_entry, read_documents, upsert)

+     # --- SharePoint loaders ---
+
+     def _get_sharepoint_access_token(self, sp_config: SharePointConfig) -> Optional[str]:
+         """Get an access token for Microsoft Graph API using client credentials flow.
+
+         Requires the `msal` package: pip install msal
+         """
+         try:
+             from msal import ConfidentialClientApplication  # type: ignore
+         except ImportError:
+             raise ImportError("The `msal` package is not installed. Please install it via `pip install msal`.")
+
+         authority = f"https://login.microsoftonline.com/{sp_config.tenant_id}"
+         app = ConfidentialClientApplication(
+             sp_config.client_id,
+             authority=authority,
+             client_credential=sp_config.client_secret,
+         )
+
+         # Acquire token for Microsoft Graph
+         scopes = ["https://graph.microsoft.com/.default"]
+         result = app.acquire_token_for_client(scopes=scopes)
+
+         if "access_token" in result:
+             return result["access_token"]
+         else:
+             log_error(f"Failed to acquire SharePoint token: {result.get('error_description', result.get('error'))}")
+             return None
+
+     def _get_sharepoint_site_id(self, hostname: str, site_path: Optional[str], access_token: str) -> Optional[str]:
+         """Get the SharePoint site ID using Microsoft Graph API."""
+         import httpx
+
+         if site_path:
+             url = f"https://graph.microsoft.com/v1.0/sites/{hostname}:/{site_path}"
+         else:
+             url = f"https://graph.microsoft.com/v1.0/sites/{hostname}"
+
+         headers = {"Authorization": f"Bearer {access_token}"}
+
+         try:
+             response = httpx.get(url, headers=headers)
+             response.raise_for_status()
+             return response.json().get("id")
+         except httpx.HTTPStatusError as e:
+             log_error(f"Failed to get SharePoint site ID: {e.response.status_code} - {e.response.text}")
+             return None
+
+     def _list_sharepoint_folder_items(self, site_id: str, folder_path: str, access_token: str) -> List[dict]:
+         """List all items in a SharePoint folder."""
+         import httpx
+
+         # Strip leading slashes to avoid double-slash in URL
+         folder_path = folder_path.lstrip("/")
+         url = f"https://graph.microsoft.com/v1.0/sites/{site_id}/drive/root:/{folder_path}:/children"
+         headers = {"Authorization": f"Bearer {access_token}"}
+         items: List[dict] = []
+
+         try:
+             while url:
+                 response = httpx.get(url, headers=headers)
+                 response.raise_for_status()
+                 data = response.json()
+                 items.extend(data.get("value", []))
+                 url = data.get("@odata.nextLink")
+         except httpx.HTTPStatusError as e:
+             log_error(f"Failed to list SharePoint folder: {e.response.status_code} - {e.response.text}")
+
+         return items
+
+     def _download_sharepoint_file(self, site_id: str, file_path: str, access_token: str) -> Optional[BytesIO]:
+         """Download a file from SharePoint."""
+         import httpx
+
+         # Strip leading slashes to avoid double-slash in URL
+         file_path = file_path.lstrip("/")
+         url = f"https://graph.microsoft.com/v1.0/sites/{site_id}/drive/root:/{file_path}:/content"
+         headers = {"Authorization": f"Bearer {access_token}"}
+
+         try:
+             response = httpx.get(url, headers=headers, follow_redirects=True)
+             response.raise_for_status()
+             return BytesIO(response.content)
+         except httpx.HTTPStatusError as e:
+             log_error(f"Failed to download SharePoint file {file_path}: {e.response.status_code} - {e.response.text}")
+             return None
+
+     async def _aget_sharepoint_site_id(
+         self, hostname: str, site_path: Optional[str], access_token: str
+     ) -> Optional[str]:
+         """Get the SharePoint site ID using Microsoft Graph API (async)."""
+         import httpx
+
+         if site_path:
+             url = f"https://graph.microsoft.com/v1.0/sites/{hostname}:/{site_path}"
+         else:
+             url = f"https://graph.microsoft.com/v1.0/sites/{hostname}"
+
+         headers = {"Authorization": f"Bearer {access_token}"}
+
+         try:
+             async with httpx.AsyncClient() as client:
+                 response = await client.get(url, headers=headers)
+                 response.raise_for_status()
+                 return response.json().get("id")
+         except httpx.HTTPStatusError as e:
+             log_error(f"Failed to get SharePoint site ID: {e.response.status_code} - {e.response.text}")
+             return None
+
+     async def _alist_sharepoint_folder_items(self, site_id: str, folder_path: str, access_token: str) -> List[dict]:
+         """List all items in a SharePoint folder (async)."""
+         import httpx
+
+         # Strip leading slashes to avoid double-slash in URL
+         folder_path = folder_path.lstrip("/")
+         url: Optional[str] = f"https://graph.microsoft.com/v1.0/sites/{site_id}/drive/root:/{folder_path}:/children"
+         headers = {"Authorization": f"Bearer {access_token}"}
+         items: List[dict] = []
+
+         try:
+             async with httpx.AsyncClient() as client:
+                 while url:
+                     response = await client.get(url, headers=headers)
+                     response.raise_for_status()
+                     data = response.json()
+                     items.extend(data.get("value", []))
+                     url = data.get("@odata.nextLink")
+         except httpx.HTTPStatusError as e:
+             log_error(f"Failed to list SharePoint folder: {e.response.status_code} - {e.response.text}")
+
+         return items
+
+     async def _adownload_sharepoint_file(self, site_id: str, file_path: str, access_token: str) -> Optional[BytesIO]:
+         """Download a file from SharePoint (async)."""
+         import httpx
+
+         # Strip leading slashes to avoid double-slash in URL
+         file_path = file_path.lstrip("/")
+         url = f"https://graph.microsoft.com/v1.0/sites/{site_id}/drive/root:/{file_path}:/content"
+         headers = {"Authorization": f"Bearer {access_token}"}
+
+         try:
+             async with httpx.AsyncClient() as client:
+                 response = await client.get(url, headers=headers, follow_redirects=True)
+                 response.raise_for_status()
+                 return BytesIO(response.content)
+         except httpx.HTTPStatusError as e:
+             log_error(f"Failed to download SharePoint file {file_path}: {e.response.status_code} - {e.response.text}")
+             return None
+
+     async def _aload_from_sharepoint(
+         self,
+         content: Content,
+         upsert: bool,
+         skip_if_exists: bool,
+         config: Optional[RemoteContentConfig] = None,
+     ):
+         """Load content from SharePoint.
+
+         Requires the SharePoint config to contain tenant_id, client_id, client_secret, and hostname.
+
+         1. Authenticate with Microsoft Graph using client credentials
+         2. Get site ID from hostname/site_path
+         3. Download file(s) from file_path or folder_path
+         4. Process through reader and insert to vector db
+         """
+         remote_content: SharePointContent = cast(SharePointContent, content.remote_content)
+         sp_config = cast(SharePointConfig, config) if isinstance(config, SharePointConfig) else None
+
+         if sp_config is None:
+             log_error(f"SharePoint config not found for config_id: {remote_content.config_id}")
+             return
+
+         # 1. Get access token
+         access_token = self._get_sharepoint_access_token(sp_config)
+         if not access_token:
+             return
+
+         # 2. Get site ID - use config value if provided, otherwise fetch via API
+         site_id: Optional[str] = sp_config.site_id
+         if not site_id:
+             site_path = remote_content.site_path or sp_config.site_path
+             site_id = await self._aget_sharepoint_site_id(sp_config.hostname, site_path, access_token)
+             if not site_id:
+                 log_error(f"Failed to get SharePoint site ID for {sp_config.hostname}/{site_path}")
+                 return
+
+         # 3. Identify files to download
+         files_to_process: List[tuple] = []  # List of (file_path, file_name)
+
+         # Helper function to recursively list all files in a folder
+         async def list_files_recursive(folder: str) -> List[tuple]:
+             """Recursively list all files in a SharePoint folder."""
+             files: List[tuple] = []
+             items = await self._alist_sharepoint_folder_items(site_id, folder, access_token)
+             for item in items:
+                 if "file" in item:  # It's a file
+                     item_path = f"{folder}/{item['name']}"
+                     files.append((item_path, item["name"]))
+                 elif "folder" in item:  # It's a folder - recurse
+                     subdir_path = f"{folder}/{item['name']}"
+                     subdir_files = await list_files_recursive(subdir_path)
+                     files.extend(subdir_files)
+             return files
+
+         # Get the path to process (file_path or folder_path)
+         path_to_process = (remote_content.file_path or remote_content.folder_path or "").strip("/")
+
+         if path_to_process:
+             # Check if path is a file or folder by getting item metadata
+             try:
+                 async with AsyncClient() as client:
+                     url = f"https://graph.microsoft.com/v1.0/sites/{site_id}/drive/root:/{path_to_process}"
+                     headers = {"Authorization": f"Bearer {access_token}"}
+                     response = await client.get(url, headers=headers, timeout=30.0)
+                     response.raise_for_status()
+                     item_data = response.json()
+
+                     if "folder" in item_data:
+                         # It's a folder - recursively list all files
+                         files_to_process = await list_files_recursive(path_to_process)
+                     elif "file" in item_data:
+                         # It's a single file
+                         files_to_process.append((path_to_process, item_data["name"]))
+                     else:
+                         log_warning(f"SharePoint path {path_to_process} is neither file nor folder")
+                         return
+             except Exception as e:
+                 log_error(f"Error checking SharePoint path {path_to_process}: {e}")
+                 return
+
+         if not files_to_process:
+             log_warning(f"No files found at SharePoint path: {path_to_process}")
+             return
+
+         # 4. Process each file
+         for file_path, file_name in files_to_process:
+             # Build a unique virtual path for hashing (ensures different files don't collide)
+             virtual_path = f"sharepoint://{sp_config.hostname}/{site_id}/{file_path}"
+
+             # Build metadata with all info needed to re-fetch the file
+             sharepoint_metadata = {
+                 "source_type": "sharepoint",
+                 "source_config_id": sp_config.id,
+                 "source_config_name": sp_config.name,
+                 "sharepoint_hostname": sp_config.hostname,
+                 "sharepoint_site_id": site_id,
+                 "sharepoint_path": file_path,
+                 "sharepoint_filename": file_name,
+             }
+             # Merge with user-provided metadata (user metadata takes precedence)
+             merged_metadata = {**sharepoint_metadata, **(content.metadata or {})}
+
+             # Setup Content object
+             # Naming: for folders, use relative path; for single files, use user name or filename
+             is_folder_upload = len(files_to_process) > 1
+             if is_folder_upload:
+                 # Compute relative path from the upload root
+                 relative_path = file_path
+                 if path_to_process and file_path.startswith(path_to_process + "/"):
+                     relative_path = file_path[len(path_to_process) + 1 :]
+                 # If user provided a name, prefix it; otherwise use full file path
+                 content_name = f"{content.name}/{relative_path}" if content.name else file_path
+             else:
+                 # Single file: use user's name or the filename
+                 content_name = content.name or file_name
+             content_entry = Content(
+                 name=content_name,
+                 description=content.description,
+                 path=virtual_path,  # Include path for unique hashing
+                 status=ContentStatus.PROCESSING,
+                 metadata=merged_metadata,
+                 file_type="sharepoint",
+             )
+
+             # Hash content and add to contents database
+             content_entry.content_hash = self._build_content_hash(content_entry)
+             content_entry.id = generate_id(content_entry.content_hash)
+             await self._ainsert_contents_db(content_entry)
+             if self._should_skip(content_entry.content_hash, skip_if_exists):
+                 content_entry.status = ContentStatus.COMPLETED
+                 await self._aupdate_content(content_entry)
+                 continue
+
+             # Select reader based on file extension
+             reader = self._select_reader_by_uri(file_name, content.reader)
+             reader = cast(Reader, reader)
+
+             # Download file
+             file_content = await self._adownload_sharepoint_file(site_id, file_path, access_token)
+             if not file_content:
+                 content_entry.status = ContentStatus.FAILED
+                 await self._aupdate_content(content_entry)
+                 continue
+
+             # Read the content
+             read_documents = await reader.async_read(file_content, name=file_name)
+
+             # Prepare and insert to vector database
+             self._prepare_documents_for_insert(read_documents, content_entry.id)
+             await self._ahandle_vector_db_insert(content_entry, read_documents, upsert)
+
+     def _load_from_sharepoint(
+         self,
+         content: Content,
+         upsert: bool,
+         skip_if_exists: bool,
+         config: Optional[RemoteContentConfig] = None,
+     ):
+         """Synchronous version of _load_from_sharepoint.
+
+         Load content from SharePoint:
+         1. Authenticate with Microsoft Graph using client credentials
+         2. Get site ID from hostname/site_path
+         3. Download file(s) from file_path or folder_path
+         4. Process through reader and insert to vector db
+         """
+         remote_content: SharePointContent = cast(SharePointContent, content.remote_content)
+         sp_config = cast(SharePointConfig, config) if isinstance(config, SharePointConfig) else None
+
+         if sp_config is None:
+             log_error(f"SharePoint config not found for config_id: {remote_content.config_id}")
+             return
+
+         # 1. Get access token
+         access_token = self._get_sharepoint_access_token(sp_config)
+         if not access_token:
+             return
+
+         # 2. Get site ID - use config value if provided, otherwise fetch via API
+         site_id: Optional[str] = sp_config.site_id
+         if not site_id:
+             site_path = remote_content.site_path or sp_config.site_path
+             site_id = self._get_sharepoint_site_id(sp_config.hostname, site_path, access_token)
+             if not site_id:
+                 log_error(f"Failed to get SharePoint site ID for {sp_config.hostname}/{site_path}")
+                 return
+
+         # 3. Identify files to download
+         files_to_process: List[tuple] = []  # List of (file_path, file_name)
+
+         # Helper function to recursively list all files in a folder
+         def list_files_recursive(folder: str) -> List[tuple]:
+             """Recursively list all files in a SharePoint folder."""
+             files: List[tuple] = []
+             items = self._list_sharepoint_folder_items(site_id, folder, access_token)
+             for item in items:
+                 if "file" in item:  # It's a file
+                     item_path = f"{folder}/{item['name']}"
+                     files.append((item_path, item["name"]))
+                 elif "folder" in item:  # It's a folder - recurse
+                     subdir_path = f"{folder}/{item['name']}"
+                     subdir_files = list_files_recursive(subdir_path)
+                     files.extend(subdir_files)
+             return files
+
+         # Get the path to process (file_path or folder_path)
+         path_to_process = (remote_content.file_path or remote_content.folder_path or "").strip("/")
+
+         if path_to_process:
+             # Check if path is a file or folder by getting item metadata
+             try:
+                 with httpx.Client() as client:
+                     url = f"https://graph.microsoft.com/v1.0/sites/{site_id}/drive/root:/{path_to_process}"
+                     headers = {"Authorization": f"Bearer {access_token}"}
+                     response = client.get(url, headers=headers, timeout=30.0)
+                     response.raise_for_status()
+                     item_data = response.json()
+
+                     if "folder" in item_data:
+                         # It's a folder - recursively list all files
+                         files_to_process = list_files_recursive(path_to_process)
+                     elif "file" in item_data:
+                         # It's a single file
+                         files_to_process.append((path_to_process, item_data["name"]))
+                     else:
+                         log_warning(f"SharePoint path {path_to_process} is neither file nor folder")
+                         return
+             except Exception as e:
+                 log_error(f"Error checking SharePoint path {path_to_process}: {e}")
+                 return
+
+         if not files_to_process:
+             log_warning(f"No files found at SharePoint path: {path_to_process}")
+             return
+
+         # 4. Process each file
+         for file_path, file_name in files_to_process:
+             # Build a unique virtual path for hashing (ensures different files don't collide)
+             virtual_path = f"sharepoint://{sp_config.hostname}/{site_id}/{file_path}"
+
+             # Build metadata with all info needed to re-fetch the file
+             sharepoint_metadata = {
+                 "source_type": "sharepoint",
+                 "source_config_id": sp_config.id,
+                 "source_config_name": sp_config.name,
+                 "sharepoint_hostname": sp_config.hostname,
+                 "sharepoint_site_id": site_id,
+                 "sharepoint_path": file_path,
+                 "sharepoint_filename": file_name,
+             }
+             # Merge with user-provided metadata (user metadata takes precedence)
+             merged_metadata = {**sharepoint_metadata, **(content.metadata or {})}
+
+             # Setup Content object
+             # Naming: for folders, use relative path; for single files, use user name or filename
+             is_folder_upload = len(files_to_process) > 1
+             if is_folder_upload:
+                 # Compute relative path from the upload root
+                 relative_path = file_path
+                 if path_to_process and file_path.startswith(path_to_process + "/"):
+                     relative_path = file_path[len(path_to_process) + 1 :]
+                 # If user provided a name, prefix it; otherwise use full file path
+                 content_name = f"{content.name}/{relative_path}" if content.name else file_path
+             else:
+                 # Single file: use user's name or the filename
+                 content_name = content.name or file_name
+             content_entry = Content(
+                 name=content_name,
+                 description=content.description,
+                 path=virtual_path,  # Include path for unique hashing
+                 status=ContentStatus.PROCESSING,
+                 metadata=merged_metadata,
+                 file_type="sharepoint",
+             )
+
+             # Hash content and add to contents database
+             content_entry.content_hash = self._build_content_hash(content_entry)
+             content_entry.id = generate_id(content_entry.content_hash)
+             self._insert_contents_db(content_entry)
+             if self._should_skip(content_entry.content_hash, skip_if_exists):
+                 content_entry.status = ContentStatus.COMPLETED
+                 self._update_content(content_entry)
+                 continue
+
+             # Select reader based on file extension
+             reader = self._select_reader_by_uri(file_name, content.reader)
+             reader = cast(Reader, reader)
+
+             # Download file
+             file_content = self._download_sharepoint_file(site_id, file_path, access_token)
+             if not file_content:
+                 content_entry.status = ContentStatus.FAILED
+                 self._update_content(content_entry)
+                 continue
+
+             # Read the content
+             read_documents = reader.read(file_content, name=file_name)
+
+             # Prepare and insert to vector database
+             self._prepare_documents_for_insert(read_documents, content_entry.id)
+             self._handle_vector_db_insert(content_entry, read_documents, upsert)
+
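Taken together, the helpers above authenticate against Microsoft Graph with the client-credentials flow, resolve the site ID, then walk a file or folder path recursively. A usage sketch, assuming keyword construction of `SharePointConfig` and `SharePointContent` with the attribute names referenced in these loaders (how the resulting `Content` object is submitted for loading is not shown in this diff):

    sharepoint_source = SharePointConfig(
        id="sp-policies",
        name="Policies site",
        tenant_id="<tenant-guid>",
        client_id="<app-registration-id>",
        client_secret="<client-secret>",
        hostname="acme.sharepoint.com",
        site_path="sites/policies",  # used only when site_id is not set on the config
    )

    policies = Content(
        name="policies",
        remote_content=SharePointContent(
            config_id="sp-policies",
            folder_path="Shared Documents/Policies",  # folders are walked recursively
        ),
    )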
+     # --- GitHub loaders ---
+
+     async def _aload_from_github(
+         self,
+         content: Content,
+         upsert: bool,
+         skip_if_exists: bool,
+         config: Optional[RemoteContentConfig] = None,
+     ):
+         """Load content from GitHub.
+
+         Requires the GitHub config to contain repo and optionally token for private repos.
+         Uses the GitHub API to fetch file contents.
+         """
+         remote_content: GitHubContent = cast(GitHubContent, content.remote_content)
+         gh_config = cast(GitHubConfig, config) if isinstance(config, GitHubConfig) else None
+
+         if gh_config is None:
+             log_error(f"GitHub config not found for config_id: {remote_content.config_id}")
+             return
+
+         # Build headers for GitHub API
+         headers = {
+             "Accept": "application/vnd.github.v3+json",
+             "User-Agent": "Agno-Knowledge",
+         }
+         if gh_config.token:
+             headers["Authorization"] = f"Bearer {gh_config.token}"
+
+         branch = remote_content.branch or gh_config.branch or "main"
+
+         # Get list of files to process
+         files_to_process: List[Dict[str, str]] = []
+
+         async with AsyncClient() as client:
+             # Helper function to recursively list all files in a folder
+             async def list_files_recursive(folder: str) -> List[Dict[str, str]]:
+                 """Recursively list all files in a GitHub folder."""
+                 files: List[Dict[str, str]] = []
+                 api_url = f"https://api.github.com/repos/{gh_config.repo}/contents/{folder}"
+                 if branch:
+                     api_url += f"?ref={branch}"
+
+                 try:
+                     response = await client.get(api_url, headers=headers, timeout=30.0)
+                     response.raise_for_status()
+                     items = response.json()
+
+                     # If items is not a list, it's a single file response
+                     if not isinstance(items, list):
+                         items = [items]
+
+                     for item in items:
+                         if item.get("type") == "file":
+                             files.append(
+                                 {
+                                     "path": item["path"],
+                                     "name": item["name"],
+                                 }
+                             )
+                         elif item.get("type") == "dir":
+                             # Recursively get files from subdirectory
+                             subdir_files = await list_files_recursive(item["path"])
+                             files.extend(subdir_files)
+                 except Exception as e:
+                     log_error(f"Error listing GitHub folder {folder}: {e}")
+
+                 return files
+
+             # Get the path to process (file_path or folder_path)
+             path_to_process = (remote_content.file_path or remote_content.folder_path or "").rstrip("/")
+
+             if path_to_process:
+                 # Fetch the path to determine if it's a file or directory
+                 api_url = f"https://api.github.com/repos/{gh_config.repo}/contents/{path_to_process}"
+                 if branch:
+                     api_url += f"?ref={branch}"
+
+                 try:
+                     response = await client.get(api_url, headers=headers, timeout=30.0)
+                     response.raise_for_status()
+                     path_data = response.json()
+
+                     if isinstance(path_data, list):
+                         # It's a directory - recursively list all files
+                         for item in path_data:
+                             if item.get("type") == "file":
+                                 files_to_process.append({"path": item["path"], "name": item["name"]})
+                             elif item.get("type") == "dir":
+                                 subdir_files = await list_files_recursive(item["path"])
+                                 files_to_process.extend(subdir_files)
+                     else:
+                         # It's a single file
+                         files_to_process.append(
+                             {
+                                 "path": path_data["path"],
+                                 "name": path_data["name"],
+                             }
+                         )
+                 except Exception as e:
+                     log_error(f"Error fetching GitHub path {path_to_process}: {e}")
+                     return
+
+             if not files_to_process:
+                 log_warning(f"No files found at GitHub path: {path_to_process}")
+                 return
+
+             # Process each file
+             for file_info in files_to_process:
+                 file_path = file_info["path"]
+                 file_name = file_info["name"]
+
+                 # Build a unique virtual path for hashing (ensures different files don't collide)
+                 virtual_path = f"github://{gh_config.repo}/{branch}/{file_path}"
+
+                 # Build metadata with all info needed to re-fetch the file
+                 github_metadata = {
+                     "source_type": "github",
+                     "source_config_id": gh_config.id,
+                     "source_config_name": gh_config.name,
+                     "github_repo": gh_config.repo,
+                     "github_branch": branch,
+                     "github_path": file_path,
+                     "github_filename": file_name,
+                 }
+                 # Merge with user-provided metadata (user metadata takes precedence)
+                 merged_metadata = {**github_metadata, **(content.metadata or {})}
+
+                 # Setup Content object
+                 # Naming: for folders, use relative path; for single files, use user name or filename
+                 is_folder_upload = len(files_to_process) > 1
+                 if is_folder_upload:
+                     # Compute relative path from the upload root
+                     relative_path = file_path
+                     if path_to_process and file_path.startswith(path_to_process + "/"):
+                         relative_path = file_path[len(path_to_process) + 1 :]
+                     # If user provided a name, prefix it; otherwise use full file path
+                     content_name = f"{content.name}/{relative_path}" if content.name else file_path
+                 else:
+                     # Single file: use user's name or the filename
+                     content_name = content.name or file_name
+                 content_entry = Content(
+                     name=content_name,
+                     description=content.description,
+                     path=virtual_path,  # Include path for unique hashing
+                     status=ContentStatus.PROCESSING,
+                     metadata=merged_metadata,
+                     file_type="github",
+                 )
+
+                 # Hash content and add to contents database
+                 content_entry.content_hash = self._build_content_hash(content_entry)
+                 content_entry.id = generate_id(content_entry.content_hash)
+                 await self._ainsert_contents_db(content_entry)
+
+                 if self._should_skip(content_entry.content_hash, skip_if_exists):
+                     content_entry.status = ContentStatus.COMPLETED
+                     await self._aupdate_content(content_entry)
+                     continue
+
+                 # Fetch file content using GitHub API (works for private repos)
+                 api_url = f"https://api.github.com/repos/{gh_config.repo}/contents/{file_path}"
+                 if branch:
+                     api_url += f"?ref={branch}"
+                 try:
+                     response = await client.get(api_url, headers=headers, timeout=30.0)
+                     response.raise_for_status()
+                     file_data = response.json()
+
+                     # GitHub API returns content as base64
+                     if file_data.get("encoding") == "base64":
+                         import base64
+
+                         file_content = base64.b64decode(file_data["content"])
+                     else:
+                         # For large files, GitHub returns a download_url
+                         download_url = file_data.get("download_url")
+                         if download_url:
+                             dl_response = await client.get(download_url, headers=headers, timeout=30.0)
+                             dl_response.raise_for_status()
+                             file_content = dl_response.content
+                         else:
+                             raise ValueError("No content or download_url in response")
+                 except Exception as e:
+                     log_error(f"Error fetching GitHub file {file_path}: {e}")
+                     content_entry.status = ContentStatus.FAILED
+                     content_entry.status_message = str(e)
+                     await self._aupdate_content(content_entry)
+                     continue
+
+                 # Select reader and read content
+                 reader = self._select_reader_by_uri(file_name, content.reader)
+                 if reader is None:
+                     log_warning(f"No reader found for file: {file_name}")
+                     content_entry.status = ContentStatus.FAILED
+                     content_entry.status_message = "No suitable reader found"
+                     await self._aupdate_content(content_entry)
+                     continue
+
+                 reader = cast(Reader, reader)
+                 readable_content = BytesIO(file_content)
+                 read_documents = await reader.async_read(readable_content, name=file_name)
+
+                 # Prepare and insert into vector database
+                 if not content_entry.id:
+                     content_entry.id = generate_id(content_entry.content_hash or "")
+                 self._prepare_documents_for_insert(read_documents, content_entry.id)
+                 await self._ahandle_vector_db_insert(content_entry, read_documents, upsert)
+
+     def _load_from_github(
+         self,
+         content: Content,
+         upsert: bool,
+         skip_if_exists: bool,
+         config: Optional[RemoteContentConfig] = None,
+     ):
+         """Synchronous version of _load_from_github."""
+         import httpx
+
+         remote_content: GitHubContent = cast(GitHubContent, content.remote_content)
+         gh_config = cast(GitHubConfig, config) if isinstance(config, GitHubConfig) else None
+
+         if gh_config is None:
+             log_error(f"GitHub config not found for config_id: {remote_content.config_id}")
+             return
+
+         # Build headers for GitHub API
+         headers = {
+             "Accept": "application/vnd.github.v3+json",
+             "User-Agent": "Agno-Knowledge",
+         }
+         if gh_config.token:
+             headers["Authorization"] = f"Bearer {gh_config.token}"
+
+         branch = remote_content.branch or gh_config.branch or "main"
+
+         # Get list of files to process
+         files_to_process: List[Dict[str, str]] = []
+
+         with httpx.Client() as client:
+             # Helper function to recursively list all files in a folder
+             def list_files_recursive(folder: str) -> List[Dict[str, str]]:
+                 """Recursively list all files in a GitHub folder."""
+                 files: List[Dict[str, str]] = []
+                 api_url = f"https://api.github.com/repos/{gh_config.repo}/contents/{folder}"
+                 if branch:
+                     api_url += f"?ref={branch}"
+
+                 try:
+                     response = client.get(api_url, headers=headers, timeout=30.0)
+                     response.raise_for_status()
+                     items = response.json()
+
+                     # If items is not a list, it's a single file response
+                     if not isinstance(items, list):
+                         items = [items]
+
+                     for item in items:
+                         if item.get("type") == "file":
+                             files.append(
+                                 {
+                                     "path": item["path"],
+                                     "name": item["name"],
+                                 }
+                             )
+                         elif item.get("type") == "dir":
+                             # Recursively get files from subdirectory
+                             subdir_files = list_files_recursive(item["path"])
+                             files.extend(subdir_files)
+                 except Exception as e:
+                     log_error(f"Error listing GitHub folder {folder}: {e}")
+
+                 return files
+
+             # Get the path to process (file_path or folder_path)
+             path_to_process = (remote_content.file_path or remote_content.folder_path or "").rstrip("/")
+
+             if path_to_process:
+                 # Fetch the path to determine if it's a file or directory
+                 api_url = f"https://api.github.com/repos/{gh_config.repo}/contents/{path_to_process}"
+                 if branch:
+                     api_url += f"?ref={branch}"
+
+                 try:
+                     response = client.get(api_url, headers=headers, timeout=30.0)
+                     response.raise_for_status()
+                     path_data = response.json()
+
+                     if isinstance(path_data, list):
+                         # It's a directory - recursively list all files
+                         for item in path_data:
+                             if item.get("type") == "file":
+                                 files_to_process.append({"path": item["path"], "name": item["name"]})
+                             elif item.get("type") == "dir":
+                                 subdir_files = list_files_recursive(item["path"])
+                                 files_to_process.extend(subdir_files)
+                     else:
+                         # It's a single file
+                         files_to_process.append(
+                             {
+                                 "path": path_data["path"],
+                                 "name": path_data["name"],
+                             }
+                         )
+                 except Exception as e:
+                     log_error(f"Error fetching GitHub path {path_to_process}: {e}")
+                     return
+
+             if not files_to_process:
+                 log_warning(f"No files found at GitHub path: {path_to_process}")
+                 return
+
+             # Process each file
+             for file_info in files_to_process:
+                 file_path = file_info["path"]
+                 file_name = file_info["name"]
+
+                 # Build a unique virtual path for hashing (ensures different files don't collide)
+                 virtual_path = f"github://{gh_config.repo}/{branch}/{file_path}"
+
+                 # Build metadata with all info needed to re-fetch the file
+                 github_metadata = {
+                     "source_type": "github",
+                     "source_config_id": gh_config.id,
+                     "source_config_name": gh_config.name,
+                     "github_repo": gh_config.repo,
+                     "github_branch": branch,
+                     "github_path": file_path,
+                     "github_filename": file_name,
+                 }
+                 # Merge with user-provided metadata (user metadata takes precedence)
+                 merged_metadata = {**github_metadata, **(content.metadata or {})}
+
+                 # Setup Content object
+                 # Naming: for folders, use relative path; for single files, use user name or filename
+                 is_folder_upload = len(files_to_process) > 1
+                 if is_folder_upload:
+                     # Compute relative path from the upload root
+                     relative_path = file_path
+                     if path_to_process and file_path.startswith(path_to_process + "/"):
+                         relative_path = file_path[len(path_to_process) + 1 :]
+                     # If user provided a name, prefix it; otherwise use full file path
+                     content_name = f"{content.name}/{relative_path}" if content.name else file_path
+                 else:
+                     # Single file: use user's name or the filename
+                     content_name = content.name or file_name
+                 content_entry = Content(
+                     name=content_name,
+                     description=content.description,
+                     path=virtual_path,  # Include path for unique hashing
+                     status=ContentStatus.PROCESSING,
+                     metadata=merged_metadata,
+                     file_type="github",
+                 )
+
+                 # Hash content and add to contents database
+                 content_entry.content_hash = self._build_content_hash(content_entry)
+                 content_entry.id = generate_id(content_entry.content_hash)
+                 self._insert_contents_db(content_entry)
+
+                 if self._should_skip(content_entry.content_hash, skip_if_exists):
+                     content_entry.status = ContentStatus.COMPLETED
+                     self._update_content(content_entry)
+                     continue
+
+                 # Fetch file content using GitHub API (works for private repos)
+                 api_url = f"https://api.github.com/repos/{gh_config.repo}/contents/{file_path}"
+                 if branch:
+                     api_url += f"?ref={branch}"
+                 try:
+                     response = client.get(api_url, headers=headers, timeout=30.0)
+                     response.raise_for_status()
+                     file_data = response.json()
+
+                     # GitHub API returns content as base64
+                     if file_data.get("encoding") == "base64":
+                         import base64
+
+                         file_content = base64.b64decode(file_data["content"])
+                     else:
+                         # For large files, GitHub returns a download_url
+                         download_url = file_data.get("download_url")
+                         if download_url:
+                             dl_response = client.get(download_url, headers=headers, timeout=30.0)
+                             dl_response.raise_for_status()
+                             file_content = dl_response.content
+                         else:
+                             raise ValueError("No content or download_url in response")
+                 except Exception as e:
+                     log_error(f"Error fetching GitHub file {file_path}: {e}")
+                     content_entry.status = ContentStatus.FAILED
+                     content_entry.status_message = str(e)
+                     self._update_content(content_entry)
+                     continue
+
+                 # Select reader and read content
+                 reader = self._select_reader_by_uri(file_name, content.reader)
+                 if reader is None:
+                     log_warning(f"No reader found for file: {file_name}")
+                     content_entry.status = ContentStatus.FAILED
+                     content_entry.status_message = "No suitable reader found"
+                     self._update_content(content_entry)
+                     continue
+
+                 reader = cast(Reader, reader)
+                 readable_content = BytesIO(file_content)
+                 read_documents = reader.read(readable_content, name=file_name)
+
+                 # Prepare and insert into vector database
+                 if not content_entry.id:
+                     content_entry.id = generate_id(content_entry.content_hash or "")
+                 self._prepare_documents_for_insert(read_documents, content_entry.id)
+                 self._handle_vector_db_insert(content_entry, read_documents, upsert)
+
      async def _ahandle_vector_db_insert(self, content: Content, read_documents, upsert):
          from agno.vectordb import VectorDb

@@ -2312,6 +3327,18 @@ class Knowledge:
          content.status = ContentStatus.COMPLETED
          self._update_content(content)

+     # --- Remote Content Sources ---
+
+     def _get_remote_configs(self) -> List[RemoteContentConfig]:
+         """Return configured remote content sources."""
+         return self.content_sources or []
+
+     def _get_remote_config_by_id(self, config_id: str) -> Optional[RemoteContentConfig]:
+         """Get a remote content config by its ID."""
+         if not self.content_sources:
+             return None
+         return next((c for c in self.content_sources if c.id == config_id), None)
+
      # ==========================================
      # PRIVATE - CONVERSION & DATA METHODS
      # ==========================================
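These two helpers are how the loaders resolve a `config_id` against `content_sources`. A closing sketch for the GitHub path, again assuming keyword constructors; the `add_content_async` call is a hypothetical entry point for submitting content and is not shown in this diff:

    knowledge = Knowledge(
        vector_db=my_vector_db,  # assumed to be configured elsewhere
        content_sources=[
            GitHubConfig(id="gh-handbook", name="Handbook", repo="acme/handbook", branch="main", token="ghp_..."),
        ],
    )

    handbook = Content(
        name="handbook",
        remote_content=GitHubContent(config_id="gh-handbook", folder_path="docs"),
    )

    assert knowledge._get_remote_config_by_id("gh-handbook") is knowledge.content_sources[0]
    # await knowledge.add_content_async(handbook)  # hypothetical; the public entry point is not part of this diff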