agno 2.4.0__py3-none-any.whl → 2.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. agno/db/firestore/firestore.py +58 -65
  2. agno/db/mysql/async_mysql.py +47 -55
  3. agno/db/postgres/async_postgres.py +52 -61
  4. agno/db/postgres/postgres.py +25 -12
  5. agno/db/sqlite/async_sqlite.py +52 -61
  6. agno/db/sqlite/sqlite.py +24 -11
  7. agno/integrations/discord/client.py +12 -1
  8. agno/knowledge/knowledge.py +1511 -47
  9. agno/knowledge/reader/csv_reader.py +231 -8
  10. agno/knowledge/reader/field_labeled_csv_reader.py +167 -3
  11. agno/knowledge/reader/reader_factory.py +8 -1
  12. agno/knowledge/remote_content/__init__.py +33 -0
  13. agno/knowledge/remote_content/config.py +266 -0
  14. agno/knowledge/remote_content/remote_content.py +105 -17
  15. agno/models/base.py +12 -2
  16. agno/models/cerebras/cerebras.py +34 -2
  17. agno/models/n1n/__init__.py +3 -0
  18. agno/models/n1n/n1n.py +57 -0
  19. agno/models/ollama/__init__.py +2 -0
  20. agno/models/ollama/responses.py +100 -0
  21. agno/models/openai/__init__.py +2 -0
  22. agno/models/openai/chat.py +18 -1
  23. agno/models/openai/open_responses.py +46 -0
  24. agno/models/openrouter/__init__.py +2 -0
  25. agno/models/openrouter/responses.py +146 -0
  26. agno/models/perplexity/perplexity.py +2 -0
  27. agno/os/interfaces/slack/router.py +10 -1
  28. agno/os/interfaces/whatsapp/router.py +6 -0
  29. agno/os/routers/components/components.py +10 -1
  30. agno/os/routers/knowledge/knowledge.py +125 -0
  31. agno/os/routers/knowledge/schemas.py +12 -0
  32. agno/run/agent.py +2 -0
  33. agno/team/team.py +20 -4
  34. agno/vectordb/lightrag/lightrag.py +7 -6
  35. agno/vectordb/milvus/milvus.py +79 -48
  36. agno/vectordb/pgvector/pgvector.py +3 -3
  37. {agno-2.4.0.dist-info → agno-2.4.2.dist-info}/METADATA +4 -1
  38. {agno-2.4.0.dist-info → agno-2.4.2.dist-info}/RECORD +41 -35
  39. {agno-2.4.0.dist-info → agno-2.4.2.dist-info}/WHEEL +1 -1
  40. {agno-2.4.0.dist-info → agno-2.4.2.dist-info}/licenses/LICENSE +0 -0
  41. {agno-2.4.0.dist-info → agno-2.4.2.dist-info}/top_level.txt +0 -0
@@ -9,6 +9,7 @@ from os.path import basename
  from pathlib import Path
  from typing import Any, Dict, List, Optional, Set, Tuple, Union, cast, overload

+ import httpx
  from httpx import AsyncClient

  from agno.db.base import AsyncBaseDb, BaseDb
@@ -17,7 +18,22 @@ from agno.filters import FilterExpr
  from agno.knowledge.content import Content, ContentAuth, ContentStatus, FileData
  from agno.knowledge.document import Document
  from agno.knowledge.reader import Reader, ReaderFactory
- from agno.knowledge.remote_content.remote_content import GCSContent, RemoteContent, S3Content
+ from agno.knowledge.remote_content.config import (
+ AzureBlobConfig,
+ GcsConfig,
+ GitHubConfig,
+ RemoteContentConfig,
+ S3Config,
+ SharePointConfig,
+ )
+ from agno.knowledge.remote_content.remote_content import (
+ AzureBlobContent,
+ GCSContent,
+ GitHubContent,
+ RemoteContent,
+ S3Content,
+ SharePointContent,
+ )
  from agno.utils.http import async_fetch_with_retry
  from agno.utils.log import log_debug, log_error, log_info, log_warning
  from agno.utils.string import generate_id
@@ -42,6 +58,7 @@ class Knowledge:
  contents_db: Optional[Union[BaseDb, AsyncBaseDb]] = None
  max_results: int = 10
  readers: Optional[Dict[str, Reader]] = None
+ content_sources: Optional[List[RemoteContentConfig]] = None

  def __post_init__(self):
  from agno.vectordb import VectorDb
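The headline change in agno/knowledge/knowledge.py is the new content_sources field together with the per-provider config and content classes imported in the earlier hunk. Below is a minimal sketch (not part of the diff) of how these pieces appear to fit together. It uses only attribute names visible in this diff; the exact constructor signatures and the public entry point that eventually calls the new loaders are assumptions.

# Illustrative sketch only. Field names (region, repo, branch, bucket_name, prefix,
# folder_path, config_id) come from this diff; constructor signatures are assumptions.
from agno.knowledge.knowledge import Knowledge
from agno.knowledge.remote_content.config import GitHubConfig, S3Config
from agno.knowledge.remote_content.remote_content import GitHubContent, S3Content

knowledge = Knowledge(
    content_sources=[
        S3Config(id="docs-s3", name="Docs bucket", region="us-east-1"),
        GitHubConfig(id="docs-repo", name="Docs repo", repo="acme/docs", branch="main"),
    ],
)

# Remote content items reference a registered config via config_id; the new dispatch code
# resolves it with self._get_remote_config_by_id(...) and logs a warning if it is missing.
s3_item = S3Content(bucket_name="my-bucket", prefix="guides/", config_id="docs-s3")
gh_item = GitHubContent(folder_path="docs", config_id="docs-repo")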
@@ -1161,7 +1178,7 @@ class Knowledge:
  import inspect

  read_signature = inspect.signature(reader.read)
- if password and "password" in read_signature.parameters:
+ if password is not None and "password" in read_signature.parameters:
  if isinstance(source, BytesIO):
  return reader.read(source, name=name, password=password)
  else:
@@ -1194,7 +1211,7 @@ class Knowledge:
  import inspect

  read_signature = inspect.signature(reader.async_read)
- if password and "password" in read_signature.parameters:
+ if password is not None and "password" in read_signature.parameters:
  return await reader.async_read(source, name=name, password=password)
  else:
  if isinstance(source, BytesIO):
@@ -1285,7 +1302,7 @@ class Knowledge:
  log_debug(f"Using Reader: {reader.__class__.__name__}")

  if reader:
- password = content.auth.password if content.auth and content.auth.password else None
+ password = content.auth.password if content.auth and content.auth.password is not None else None
  read_documents = await self._aread(reader, path, name=content.name or path.name, password=password)
  else:
  read_documents = []
@@ -1304,7 +1321,7 @@ class Knowledge:

  if not content.id:
  content.id = generate_id(content.content_hash or "")
- self._prepare_documents_for_insert(read_documents, content.id)
+ self._prepare_documents_for_insert(read_documents, content.id, metadata=content.metadata)

  await self._ahandle_vector_db_insert(content, read_documents, upsert)

@@ -1366,7 +1383,7 @@ class Knowledge:
  log_debug(f"Using Reader: {reader.__class__.__name__}")

  if reader:
- password = content.auth.password if content.auth and content.auth.password else None
+ password = content.auth.password if content.auth and content.auth.password is not None else None
  read_documents = self._read(reader, path, name=content.name or path.name, password=password)
  else:
  read_documents = []
@@ -1385,7 +1402,7 @@ class Knowledge:

  if not content.id:
  content.id = generate_id(content.content_hash or "")
- self._prepare_documents_for_insert(read_documents, content.id)
+ self._prepare_documents_for_insert(read_documents, content.id, metadata=content.metadata)

  self._handle_vector_db_insert(content, read_documents, upsert)

@@ -1485,7 +1502,7 @@ class Knowledge:
  if reader.__class__.__name__ == "YouTubeReader":
  read_documents = await reader.async_read(content.url, name=name)
  else:
- password = content.auth.password if content.auth and content.auth.password else None
+ password = content.auth.password if content.auth and content.auth.password is not None else None
  source = bytes_content if bytes_content else content.url
  read_documents = await self._aread(reader, source, name=name, password=password)

@@ -1583,7 +1600,7 @@ class Knowledge:
  if reader.__class__.__name__ == "YouTubeReader":
  read_documents = reader.read(content.url, name=name)
  else:
- password = content.auth.password if content.auth and content.auth.password else None
+ password = content.auth.password if content.auth and content.auth.password is not None else None
  source = bytes_content if bytes_content else content.url
  read_documents = self._read(reader, source, name=name, password=password)

@@ -1930,18 +1947,42 @@ class Knowledge:

  remote_content = content.remote_content

+ # Look up config if config_id is provided
+ config = None
+ if hasattr(remote_content, "config_id") and remote_content.config_id:
+ config = self._get_remote_config_by_id(remote_content.config_id)
+ if config is None:
+ log_warning(f"No config found for config_id: {remote_content.config_id}")
+
  if isinstance(remote_content, S3Content):
- await self._aload_from_s3(content, upsert, skip_if_exists)
+ await self._aload_from_s3(content, upsert, skip_if_exists, config)

  elif isinstance(remote_content, GCSContent):
- await self._aload_from_gcs(content, upsert, skip_if_exists)
+ await self._aload_from_gcs(content, upsert, skip_if_exists, config)
+
+ elif isinstance(remote_content, SharePointContent):
+ await self._aload_from_sharepoint(content, upsert, skip_if_exists, config)
+
+ elif isinstance(remote_content, GitHubContent):
+ await self._aload_from_github(content, upsert, skip_if_exists, config)
+
+ elif isinstance(remote_content, AzureBlobContent):
+ await self._aload_from_azure_blob(content, upsert, skip_if_exists, config)

  else:
  log_warning(f"Unsupported remote content type: {type(remote_content)}")

- async def _aload_from_s3(self, content: Content, upsert: bool, skip_if_exists: bool):
+ async def _aload_from_s3(
+ self,
+ content: Content,
+ upsert: bool,
+ skip_if_exists: bool,
+ config: Optional[RemoteContentConfig] = None,
+ ):
  """Load the contextual S3 content.

+ Note: Uses sync boto3 calls as boto3 doesn't have an async API.
+
  1. Identify objects to read
  2. Setup Content object
  3. Hash content and add it to the contents database
@@ -1951,22 +1992,43 @@ class Knowledge:
  7. Prepare and insert the content in the vector database
  8. Remove temporary file if needed
  """
+ from agno.cloud.aws.s3.bucket import S3Bucket
  from agno.cloud.aws.s3.object import S3Object

+ # Note: S3 support has limited features compared to GitHub/SharePoint
+ log_warning(
+ "S3 content loading has limited features. "
+ "Recursive folder traversal, rich metadata, and improved naming are coming in a future release."
+ )
+
  remote_content: S3Content = cast(S3Content, content.remote_content)

+ # Get or create bucket with credentials from config
+ bucket = remote_content.bucket
+ try:
+ if bucket is None and remote_content.bucket_name:
+ s3_config = cast(S3Config, config) if isinstance(config, S3Config) else None
+ bucket = S3Bucket(
+ name=remote_content.bucket_name,
+ region=s3_config.region if s3_config else None,
+ aws_access_key_id=s3_config.aws_access_key_id if s3_config else None,
+ aws_secret_access_key=s3_config.aws_secret_access_key if s3_config else None,
+ )
+ except Exception as e:
+ log_error(f"Error getting bucket: {e}")
+
  # 1. Identify objects to read
  objects_to_read: List[S3Object] = []
- if remote_content.bucket is not None:
+ if bucket is not None:
  if remote_content.key is not None:
- _object = S3Object(bucket_name=remote_content.bucket.name, name=remote_content.key)
+ _object = S3Object(bucket_name=bucket.name, name=remote_content.key)
  objects_to_read.append(_object)
  elif remote_content.object is not None:
  objects_to_read.append(remote_content.object)
  elif remote_content.prefix is not None:
- objects_to_read.extend(remote_content.bucket.get_objects(prefix=remote_content.prefix))
+ objects_to_read.extend(bucket.get_objects(prefix=remote_content.prefix))
  else:
- objects_to_read.extend(remote_content.bucket.get_objects())
+ objects_to_read.extend(bucket.get_objects())

  for s3_object in objects_to_read:
  # 2. Setup Content object
@@ -1987,7 +2049,7 @@ class Knowledge:
  if self._should_skip(content_entry.content_hash, skip_if_exists):
  content_entry.status = ContentStatus.COMPLETED
  await self._aupdate_content(content_entry)
- return
+ continue

  # 4. Select reader
  reader = self._select_reader_by_uri(s3_object.uri, content.reader)
@@ -2008,18 +2070,24 @@ class Knowledge:
  read_documents = await reader.async_read(readable_content, name=obj_name)

  # 7. Prepare and insert the content in the vector database
- if not content.id:
- content.id = generate_id(content.content_hash or "")
- self._prepare_documents_for_insert(read_documents, content.id)
+ self._prepare_documents_for_insert(read_documents, content_entry.id)
  await self._ahandle_vector_db_insert(content_entry, read_documents, upsert)

  # 8. Remove temporary file if needed
  if temporary_file:
  temporary_file.unlink()

- async def _aload_from_gcs(self, content: Content, upsert: bool, skip_if_exists: bool):
+ async def _aload_from_gcs(
+ self,
+ content: Content,
+ upsert: bool,
+ skip_if_exists: bool,
+ config: Optional[RemoteContentConfig] = None,
+ ):
  """Load the contextual GCS content.

+ Note: Uses sync google-cloud-storage calls as it doesn't have an async API.
+
  1. Identify objects to read
  2. Setup Content object
  3. Hash content and add it to the contents database
@@ -2028,16 +2096,42 @@ class Knowledge:
  6. Read the content
  7. Prepare and insert the content in the vector database
  """
+ try:
+ from google.cloud import storage # type: ignore
+ except ImportError:
+ raise ImportError(
+ "The `google-cloud-storage` package is not installed. "
+ "Please install it via `pip install google-cloud-storage`."
+ )
+
+ # Note: GCS support has limited features compared to GitHub/SharePoint
+ log_warning(
+ "GCS content loading has limited features. "
+ "Recursive folder traversal, rich metadata, and improved naming are coming in a future release."
+ )
+
  remote_content: GCSContent = cast(GCSContent, content.remote_content)

+ # Get or create bucket with credentials from config
+ bucket = remote_content.bucket
+ if bucket is None and remote_content.bucket_name:
+ gcs_config = cast(GcsConfig, config) if isinstance(config, GcsConfig) else None
+ if gcs_config and gcs_config.credentials_path:
+ client = storage.Client.from_service_account_json(gcs_config.credentials_path)
+ elif gcs_config and gcs_config.project:
+ client = storage.Client(project=gcs_config.project)
+ else:
+ client = storage.Client()
+ bucket = client.bucket(remote_content.bucket_name)
+
  # 1. Identify objects to read
  objects_to_read = []
  if remote_content.blob_name is not None:
- objects_to_read.append(remote_content.bucket.blob(remote_content.blob_name)) # type: ignore
+ objects_to_read.append(bucket.blob(remote_content.blob_name)) # type: ignore
  elif remote_content.prefix is not None:
- objects_to_read.extend(remote_content.bucket.list_blobs(prefix=remote_content.prefix)) # type: ignore
+ objects_to_read.extend(bucket.list_blobs(prefix=remote_content.prefix)) # type: ignore
  else:
- objects_to_read.extend(remote_content.bucket.list_blobs()) # type: ignore
+ objects_to_read.extend(bucket.list_blobs()) # type: ignore

  for gcs_object in objects_to_read:
  # 2. Setup Content object
@@ -2057,7 +2151,7 @@ class Knowledge:
  if self._should_skip(content_entry.content_hash, skip_if_exists):
  content_entry.status = ContentStatus.COMPLETED
  await self._aupdate_content(content_entry)
- return
+ continue

  # 4. Select reader
  reader = self._select_reader_by_uri(gcs_object.name, content.reader)
@@ -2070,9 +2164,7 @@ class Knowledge:
  read_documents = await reader.async_read(readable_content, name=name)

  # 7. Prepare and insert the content in the vector database
- if not content.id:
- content.id = generate_id(content.content_hash or "")
- self._prepare_documents_for_insert(read_documents, content.id)
+ self._prepare_documents_for_insert(read_documents, content_entry.id)
  await self._ahandle_vector_db_insert(content_entry, read_documents, upsert)

  def _load_from_remote_content(
@@ -2088,16 +2180,38 @@ class Knowledge:
2088
2180
 
2089
2181
  remote_content = content.remote_content
2090
2182
 
2183
+ # Look up config if config_id is provided
2184
+ config = None
2185
+ if hasattr(remote_content, "config_id") and remote_content.config_id:
2186
+ config = self._get_remote_config_by_id(remote_content.config_id)
2187
+ if config is None:
2188
+ log_warning(f"No config found for config_id: {remote_content.config_id}")
2189
+
2091
2190
  if isinstance(remote_content, S3Content):
2092
- self._load_from_s3(content, upsert, skip_if_exists)
2191
+ self._load_from_s3(content, upsert, skip_if_exists, config)
2093
2192
 
2094
2193
  elif isinstance(remote_content, GCSContent):
2095
- self._load_from_gcs(content, upsert, skip_if_exists)
2194
+ self._load_from_gcs(content, upsert, skip_if_exists, config)
2195
+
2196
+ elif isinstance(remote_content, SharePointContent):
2197
+ self._load_from_sharepoint(content, upsert, skip_if_exists, config)
2198
+
2199
+ elif isinstance(remote_content, GitHubContent):
2200
+ self._load_from_github(content, upsert, skip_if_exists, config)
2201
+
2202
+ elif isinstance(remote_content, AzureBlobContent):
2203
+ self._load_from_azure_blob(content, upsert, skip_if_exists, config)
2096
2204
 
2097
2205
  else:
2098
2206
  log_warning(f"Unsupported remote content type: {type(remote_content)}")
2099
2207
 
2100
- def _load_from_s3(self, content: Content, upsert: bool, skip_if_exists: bool):
2208
+ def _load_from_s3(
2209
+ self,
2210
+ content: Content,
2211
+ upsert: bool,
2212
+ skip_if_exists: bool,
2213
+ config: Optional[RemoteContentConfig] = None,
2214
+ ):
2101
2215
  """Synchronous version of _load_from_s3.
2102
2216
 
2103
2217
  Load the contextual S3 content:
@@ -2110,22 +2224,40 @@ class Knowledge:
2110
2224
  7. Prepare and insert the content in the vector database
2111
2225
  8. Remove temporary file if needed
2112
2226
  """
2227
+ from agno.cloud.aws.s3.bucket import S3Bucket
2113
2228
  from agno.cloud.aws.s3.object import S3Object
2114
2229
 
2230
+ # Note: S3 support has limited features compared to GitHub/SharePoint
2231
+ log_warning(
2232
+ "S3 content loading has limited features. "
2233
+ "Recursive folder traversal, rich metadata, and improved naming are coming in a future release."
2234
+ )
2235
+
2115
2236
  remote_content: S3Content = cast(S3Content, content.remote_content)
2116
2237
 
2238
+ # Get or create bucket with credentials from config
2239
+ bucket = remote_content.bucket
2240
+ if bucket is None and remote_content.bucket_name:
2241
+ s3_config = cast(S3Config, config) if isinstance(config, S3Config) else None
2242
+ bucket = S3Bucket(
2243
+ name=remote_content.bucket_name,
2244
+ region=s3_config.region if s3_config else None,
2245
+ aws_access_key_id=s3_config.aws_access_key_id if s3_config else None,
2246
+ aws_secret_access_key=s3_config.aws_secret_access_key if s3_config else None,
2247
+ )
2248
+
2117
2249
  # 1. Identify objects to read
2118
2250
  objects_to_read: List[S3Object] = []
2119
- if remote_content.bucket is not None:
2251
+ if bucket is not None:
2120
2252
  if remote_content.key is not None:
2121
- _object = S3Object(bucket_name=remote_content.bucket.name, name=remote_content.key)
2253
+ _object = S3Object(bucket_name=bucket.name, name=remote_content.key)
2122
2254
  objects_to_read.append(_object)
2123
2255
  elif remote_content.object is not None:
2124
2256
  objects_to_read.append(remote_content.object)
2125
2257
  elif remote_content.prefix is not None:
2126
- objects_to_read.extend(remote_content.bucket.get_objects(prefix=remote_content.prefix))
2258
+ objects_to_read.extend(bucket.get_objects(prefix=remote_content.prefix))
2127
2259
  else:
2128
- objects_to_read.extend(remote_content.bucket.get_objects())
2260
+ objects_to_read.extend(bucket.get_objects())
2129
2261
 
2130
2262
  for s3_object in objects_to_read:
2131
2263
  # 2. Setup Content object
@@ -2146,7 +2278,7 @@ class Knowledge:
2146
2278
  if self._should_skip(content_entry.content_hash, skip_if_exists):
2147
2279
  content_entry.status = ContentStatus.COMPLETED
2148
2280
  self._update_content(content_entry)
2149
- return
2281
+ continue
2150
2282
 
2151
2283
  # 4. Select reader
2152
2284
  reader = self._select_reader_by_uri(s3_object.uri, content.reader)
@@ -2167,16 +2299,20 @@ class Knowledge:
2167
2299
  read_documents = reader.read(readable_content, name=obj_name)
2168
2300
 
2169
2301
  # 7. Prepare and insert the content in the vector database
2170
- if not content.id:
2171
- content.id = generate_id(content.content_hash or "")
2172
- self._prepare_documents_for_insert(read_documents, content.id)
2302
+ self._prepare_documents_for_insert(read_documents, content_entry.id)
2173
2303
  self._handle_vector_db_insert(content_entry, read_documents, upsert)
2174
2304
 
2175
2305
  # 8. Remove temporary file if needed
2176
2306
  if temporary_file:
2177
2307
  temporary_file.unlink()
2178
2308
 
2179
- def _load_from_gcs(self, content: Content, upsert: bool, skip_if_exists: bool):
2309
+ def _load_from_gcs(
2310
+ self,
2311
+ content: Content,
2312
+ upsert: bool,
2313
+ skip_if_exists: bool,
2314
+ config: Optional[RemoteContentConfig] = None,
2315
+ ):
2180
2316
  """Synchronous version of _load_from_gcs.
2181
2317
 
2182
2318
  Load the contextual GCS content:
@@ -2188,16 +2324,42 @@ class Knowledge:
2188
2324
  6. Read the content
2189
2325
  7. Prepare and insert the content in the vector database
2190
2326
  """
2327
+ try:
2328
+ from google.cloud import storage # type: ignore
2329
+ except ImportError:
2330
+ raise ImportError(
2331
+ "The `google-cloud-storage` package is not installed. "
2332
+ "Please install it via `pip install google-cloud-storage`."
2333
+ )
2334
+
2335
+ # Note: GCS support has limited features compared to GitHub/SharePoint
2336
+ log_warning(
2337
+ "GCS content loading has limited features. "
2338
+ "Recursive folder traversal, rich metadata, and improved naming are coming in a future release."
2339
+ )
2340
+
2191
2341
  remote_content: GCSContent = cast(GCSContent, content.remote_content)
2192
2342
 
2343
+ # Get or create bucket with credentials from config
2344
+ bucket = remote_content.bucket
2345
+ if bucket is None and remote_content.bucket_name:
2346
+ gcs_config = cast(GcsConfig, config) if isinstance(config, GcsConfig) else None
2347
+ if gcs_config and gcs_config.credentials_path:
2348
+ client = storage.Client.from_service_account_json(gcs_config.credentials_path)
2349
+ elif gcs_config and gcs_config.project:
2350
+ client = storage.Client(project=gcs_config.project)
2351
+ else:
2352
+ client = storage.Client()
2353
+ bucket = client.bucket(remote_content.bucket_name)
2354
+
2193
2355
  # 1. Identify objects to read
2194
2356
  objects_to_read = []
2195
2357
  if remote_content.blob_name is not None:
2196
- objects_to_read.append(remote_content.bucket.blob(remote_content.blob_name)) # type: ignore
2358
+ objects_to_read.append(bucket.blob(remote_content.blob_name)) # type: ignore
2197
2359
  elif remote_content.prefix is not None:
2198
- objects_to_read.extend(remote_content.bucket.list_blobs(prefix=remote_content.prefix)) # type: ignore
2360
+ objects_to_read.extend(bucket.list_blobs(prefix=remote_content.prefix)) # type: ignore
2199
2361
  else:
2200
- objects_to_read.extend(remote_content.bucket.list_blobs()) # type: ignore
2362
+ objects_to_read.extend(bucket.list_blobs()) # type: ignore
2201
2363
 
2202
2364
  for gcs_object in objects_to_read:
2203
2365
  # 2. Setup Content object
@@ -2217,7 +2379,7 @@ class Knowledge:
2217
2379
  if self._should_skip(content_entry.content_hash, skip_if_exists):
2218
2380
  content_entry.status = ContentStatus.COMPLETED
2219
2381
  self._update_content(content_entry)
2220
- return
2382
+ continue
2221
2383
 
2222
2384
  # 4. Select reader
2223
2385
  reader = self._select_reader_by_uri(gcs_object.name, content.reader)
@@ -2230,9 +2392,1299 @@ class Knowledge:
2230
2392
  read_documents = reader.read(readable_content, name=name)
2231
2393
 
2232
2394
  # 7. Prepare and insert the content in the vector database
2233
- if not content.id:
2234
- content.id = generate_id(content.content_hash or "")
2235
- self._prepare_documents_for_insert(read_documents, content.id)
2395
+ self._prepare_documents_for_insert(read_documents, content_entry.id)
2396
+ self._handle_vector_db_insert(content_entry, read_documents, upsert)
2397
+
2398
+ # --- SharePoint loaders ---
2399
+
2400
+ def _get_sharepoint_access_token(self, sp_config: SharePointConfig) -> Optional[str]:
2401
+ """Get an access token for Microsoft Graph API using client credentials flow.
2402
+
2403
+ Requires the `msal` package: pip install msal
2404
+ """
2405
+ try:
2406
+ from msal import ConfidentialClientApplication # type: ignore
2407
+ except ImportError:
2408
+ raise ImportError("The `msal` package is not installed. Please install it via `pip install msal`.")
2409
+
2410
+ authority = f"https://login.microsoftonline.com/{sp_config.tenant_id}"
2411
+ app = ConfidentialClientApplication(
2412
+ sp_config.client_id,
2413
+ authority=authority,
2414
+ client_credential=sp_config.client_secret,
2415
+ )
2416
+
2417
+ # Acquire token for Microsoft Graph
2418
+ scopes = ["https://graph.microsoft.com/.default"]
2419
+ result = app.acquire_token_for_client(scopes=scopes)
2420
+
2421
+ if "access_token" in result:
2422
+ return result["access_token"]
2423
+ else:
2424
+ log_error(f"Failed to acquire SharePoint token: {result.get('error_description', result.get('error'))}")
2425
+ return None
2426
+
2427
+ def _get_sharepoint_site_id(self, hostname: str, site_path: Optional[str], access_token: str) -> Optional[str]:
2428
+ """Get the SharePoint site ID using Microsoft Graph API."""
2429
+ import httpx
2430
+
2431
+ if site_path:
2432
+ url = f"https://graph.microsoft.com/v1.0/sites/{hostname}:/{site_path}"
2433
+ else:
2434
+ url = f"https://graph.microsoft.com/v1.0/sites/{hostname}"
2435
+
2436
+ headers = {"Authorization": f"Bearer {access_token}"}
2437
+
2438
+ try:
2439
+ response = httpx.get(url, headers=headers)
2440
+ response.raise_for_status()
2441
+ return response.json().get("id")
2442
+ except httpx.HTTPStatusError as e:
2443
+ log_error(f"Failed to get SharePoint site ID: {e.response.status_code} - {e.response.text}")
2444
+ return None
2445
+
2446
+ def _list_sharepoint_folder_items(self, site_id: str, folder_path: str, access_token: str) -> List[dict]:
2447
+ """List all items in a SharePoint folder."""
2448
+ import httpx
2449
+
2450
+ # Strip leading slashes to avoid double-slash in URL
2451
+ folder_path = folder_path.lstrip("/")
2452
+ url = f"https://graph.microsoft.com/v1.0/sites/{site_id}/drive/root:/{folder_path}:/children"
2453
+ headers = {"Authorization": f"Bearer {access_token}"}
2454
+ items: List[dict] = []
2455
+
2456
+ try:
2457
+ while url:
2458
+ response = httpx.get(url, headers=headers)
2459
+ response.raise_for_status()
2460
+ data = response.json()
2461
+ items.extend(data.get("value", []))
2462
+ url = data.get("@odata.nextLink")
2463
+ except httpx.HTTPStatusError as e:
2464
+ log_error(f"Failed to list SharePoint folder: {e.response.status_code} - {e.response.text}")
2465
+
2466
+ return items
2467
+
2468
+ def _download_sharepoint_file(self, site_id: str, file_path: str, access_token: str) -> Optional[BytesIO]:
2469
+ """Download a file from SharePoint."""
2470
+ import httpx
2471
+
2472
+ # Strip leading slashes to avoid double-slash in URL
2473
+ file_path = file_path.lstrip("/")
2474
+ url = f"https://graph.microsoft.com/v1.0/sites/{site_id}/drive/root:/{file_path}:/content"
2475
+ headers = {"Authorization": f"Bearer {access_token}"}
2476
+
2477
+ try:
2478
+ response = httpx.get(url, headers=headers, follow_redirects=True)
2479
+ response.raise_for_status()
2480
+ return BytesIO(response.content)
2481
+ except httpx.HTTPStatusError as e:
2482
+ log_error(f"Failed to download SharePoint file {file_path}: {e.response.status_code} - {e.response.text}")
2483
+ return None
2484
+
2485
+ async def _aget_sharepoint_site_id(
2486
+ self, hostname: str, site_path: Optional[str], access_token: str
2487
+ ) -> Optional[str]:
2488
+ """Get the SharePoint site ID using Microsoft Graph API (async)."""
2489
+ import httpx
2490
+
2491
+ if site_path:
2492
+ url = f"https://graph.microsoft.com/v1.0/sites/{hostname}:/{site_path}"
2493
+ else:
2494
+ url = f"https://graph.microsoft.com/v1.0/sites/{hostname}"
2495
+
2496
+ headers = {"Authorization": f"Bearer {access_token}"}
2497
+
2498
+ try:
2499
+ async with httpx.AsyncClient() as client:
2500
+ response = await client.get(url, headers=headers)
2501
+ response.raise_for_status()
2502
+ return response.json().get("id")
2503
+ except httpx.HTTPStatusError as e:
2504
+ log_error(f"Failed to get SharePoint site ID: {e.response.status_code} - {e.response.text}")
2505
+ return None
2506
+
2507
+ async def _alist_sharepoint_folder_items(self, site_id: str, folder_path: str, access_token: str) -> List[dict]:
2508
+ """List all items in a SharePoint folder (async)."""
2509
+ import httpx
2510
+
2511
+ # Strip leading slashes to avoid double-slash in URL
2512
+ folder_path = folder_path.lstrip("/")
2513
+ url: Optional[str] = f"https://graph.microsoft.com/v1.0/sites/{site_id}/drive/root:/{folder_path}:/children"
2514
+ headers = {"Authorization": f"Bearer {access_token}"}
2515
+ items: List[dict] = []
2516
+
2517
+ try:
2518
+ async with httpx.AsyncClient() as client:
2519
+ while url:
2520
+ response = await client.get(url, headers=headers)
2521
+ response.raise_for_status()
2522
+ data = response.json()
2523
+ items.extend(data.get("value", []))
2524
+ url = data.get("@odata.nextLink")
2525
+ except httpx.HTTPStatusError as e:
2526
+ log_error(f"Failed to list SharePoint folder: {e.response.status_code} - {e.response.text}")
2527
+
2528
+ return items
2529
+
2530
+ async def _adownload_sharepoint_file(self, site_id: str, file_path: str, access_token: str) -> Optional[BytesIO]:
2531
+ """Download a file from SharePoint (async)."""
2532
+ import httpx
2533
+
2534
+ # Strip leading slashes to avoid double-slash in URL
2535
+ file_path = file_path.lstrip("/")
2536
+ url = f"https://graph.microsoft.com/v1.0/sites/{site_id}/drive/root:/{file_path}:/content"
2537
+ headers = {"Authorization": f"Bearer {access_token}"}
2538
+
2539
+ try:
2540
+ async with httpx.AsyncClient() as client:
2541
+ response = await client.get(url, headers=headers, follow_redirects=True)
2542
+ response.raise_for_status()
2543
+ return BytesIO(response.content)
2544
+ except httpx.HTTPStatusError as e:
2545
+ log_error(f"Failed to download SharePoint file {file_path}: {e.response.status_code} - {e.response.text}")
2546
+ return None
2547
+
2548
+ async def _aload_from_sharepoint(
2549
+ self,
2550
+ content: Content,
2551
+ upsert: bool,
2552
+ skip_if_exists: bool,
2553
+ config: Optional[RemoteContentConfig] = None,
2554
+ ):
2555
+ """Load content from SharePoint.
2556
+
2557
+ Requires the SharePoint config to contain tenant_id, client_id, client_secret, and hostname.
2558
+
2559
+ 1. Authenticate with Microsoft Graph using client credentials
2560
+ 2. Get site ID from hostname/site_path
2561
+ 3. Download file(s) from file_path or folder_path
2562
+ 4. Process through reader and insert to vector db
2563
+ """
2564
+ remote_content: SharePointContent = cast(SharePointContent, content.remote_content)
2565
+ sp_config = cast(SharePointConfig, config) if isinstance(config, SharePointConfig) else None
2566
+
2567
+ if sp_config is None:
2568
+ log_error(f"SharePoint config not found for config_id: {remote_content.config_id}")
2569
+ return
2570
+
2571
+ # 1. Get access token
2572
+ access_token = self._get_sharepoint_access_token(sp_config)
2573
+ if not access_token:
2574
+ return
2575
+
2576
+ # 2. Get site ID - use config value if provided, otherwise fetch via API
2577
+ site_id: Optional[str] = sp_config.site_id
2578
+ if not site_id:
2579
+ site_path = remote_content.site_path or sp_config.site_path
2580
+ site_id = await self._aget_sharepoint_site_id(sp_config.hostname, site_path, access_token)
2581
+ if not site_id:
2582
+ log_error(f"Failed to get SharePoint site ID for {sp_config.hostname}/{site_path}")
2583
+ return
2584
+
2585
+ # 3. Identify files to download
2586
+ files_to_process: List[tuple] = [] # List of (file_path, file_name)
2587
+
2588
+ # Helper function to recursively list all files in a folder
2589
+ async def list_files_recursive(folder: str) -> List[tuple]:
2590
+ """Recursively list all files in a SharePoint folder."""
2591
+ files: List[tuple] = []
2592
+ items = await self._alist_sharepoint_folder_items(site_id, folder, access_token)
2593
+ for item in items:
2594
+ if "file" in item: # It's a file
2595
+ item_path = f"{folder}/{item['name']}"
2596
+ files.append((item_path, item["name"]))
2597
+ elif "folder" in item: # It's a folder - recurse
2598
+ subdir_path = f"{folder}/{item['name']}"
2599
+ subdir_files = await list_files_recursive(subdir_path)
2600
+ files.extend(subdir_files)
2601
+ return files
2602
+
2603
+ # Get the path to process (file_path or folder_path)
2604
+ path_to_process = (remote_content.file_path or remote_content.folder_path or "").strip("/")
2605
+
2606
+ if path_to_process:
2607
+ # Check if path is a file or folder by getting item metadata
2608
+ try:
2609
+ async with AsyncClient() as client:
2610
+ url = f"https://graph.microsoft.com/v1.0/sites/{site_id}/drive/root:/{path_to_process}"
2611
+ headers = {"Authorization": f"Bearer {access_token}"}
2612
+ response = await client.get(url, headers=headers, timeout=30.0)
2613
+ response.raise_for_status()
2614
+ item_data = response.json()
2615
+
2616
+ if "folder" in item_data:
2617
+ # It's a folder - recursively list all files
2618
+ files_to_process = await list_files_recursive(path_to_process)
2619
+ elif "file" in item_data:
2620
+ # It's a single file
2621
+ files_to_process.append((path_to_process, item_data["name"]))
2622
+ else:
2623
+ log_warning(f"SharePoint path {path_to_process} is neither file nor folder")
2624
+ return
2625
+ except Exception as e:
2626
+ log_error(f"Error checking SharePoint path {path_to_process}: {e}")
2627
+ return
2628
+
2629
+ if not files_to_process:
2630
+ log_warning(f"No files found at SharePoint path: {path_to_process}")
2631
+ return
2632
+
2633
+ # 4. Process each file
2634
+ for file_path, file_name in files_to_process:
2635
+ # Build a unique virtual path for hashing (ensures different files don't collide)
2636
+ virtual_path = f"sharepoint://{sp_config.hostname}/{site_id}/{file_path}"
2637
+
2638
+ # Build metadata with all info needed to re-fetch the file
2639
+ sharepoint_metadata = {
2640
+ "source_type": "sharepoint",
2641
+ "source_config_id": sp_config.id,
2642
+ "source_config_name": sp_config.name,
2643
+ "sharepoint_hostname": sp_config.hostname,
2644
+ "sharepoint_site_id": site_id,
2645
+ "sharepoint_path": file_path,
2646
+ "sharepoint_filename": file_name,
2647
+ }
2648
+ # Merge with user-provided metadata (user metadata takes precedence)
2649
+ merged_metadata = {**sharepoint_metadata, **(content.metadata or {})}
2650
+
2651
+ # Setup Content object
2652
+ # Naming: for folders, use relative path; for single files, use user name or filename
2653
+ is_folder_upload = len(files_to_process) > 1
2654
+ if is_folder_upload:
2655
+ # Compute relative path from the upload root
2656
+ relative_path = file_path
2657
+ if path_to_process and file_path.startswith(path_to_process + "/"):
2658
+ relative_path = file_path[len(path_to_process) + 1 :]
2659
+ # If user provided a name, prefix it; otherwise use full file path
2660
+ content_name = f"{content.name}/{relative_path}" if content.name else file_path
2661
+ else:
2662
+ # Single file: use user's name or the filename
2663
+ content_name = content.name or file_name
2664
+ content_entry = Content(
2665
+ name=content_name,
2666
+ description=content.description,
2667
+ path=virtual_path, # Include path for unique hashing
2668
+ status=ContentStatus.PROCESSING,
2669
+ metadata=merged_metadata,
2670
+ file_type="sharepoint",
2671
+ )
2672
+
2673
+ # Hash content and add to contents database
2674
+ content_entry.content_hash = self._build_content_hash(content_entry)
2675
+ content_entry.id = generate_id(content_entry.content_hash)
2676
+ await self._ainsert_contents_db(content_entry)
2677
+ if self._should_skip(content_entry.content_hash, skip_if_exists):
2678
+ content_entry.status = ContentStatus.COMPLETED
2679
+ await self._aupdate_content(content_entry)
2680
+ continue
2681
+
2682
+ # Select reader based on file extension
2683
+ reader = self._select_reader_by_uri(file_name, content.reader)
2684
+ reader = cast(Reader, reader)
2685
+
2686
+ # Download file
2687
+ file_content = await self._adownload_sharepoint_file(site_id, file_path, access_token)
2688
+ if not file_content:
2689
+ content_entry.status = ContentStatus.FAILED
2690
+ await self._aupdate_content(content_entry)
2691
+ continue
2692
+
2693
+ # Read the content
2694
+ read_documents = await reader.async_read(file_content, name=file_name)
2695
+
2696
+ # Prepare and insert to vector database
2697
+ self._prepare_documents_for_insert(read_documents, content_entry.id)
2698
+ await self._ahandle_vector_db_insert(content_entry, read_documents, upsert)
2699
+
2700
+ def _load_from_sharepoint(
2701
+ self,
2702
+ content: Content,
2703
+ upsert: bool,
2704
+ skip_if_exists: bool,
2705
+ config: Optional[RemoteContentConfig] = None,
2706
+ ):
2707
+ """Synchronous version of _load_from_sharepoint.
2708
+
2709
+ Load content from SharePoint:
2710
+ 1. Authenticate with Microsoft Graph using client credentials
2711
+ 2. Get site ID from hostname/site_path
2712
+ 3. Download file(s) from file_path or folder_path
2713
+ 4. Process through reader and insert to vector db
2714
+ """
2715
+ remote_content: SharePointContent = cast(SharePointContent, content.remote_content)
2716
+ sp_config = cast(SharePointConfig, config) if isinstance(config, SharePointConfig) else None
2717
+
2718
+ if sp_config is None:
2719
+ log_error(f"SharePoint config not found for config_id: {remote_content.config_id}")
2720
+ return
2721
+
2722
+ # 1. Get access token
2723
+ access_token = self._get_sharepoint_access_token(sp_config)
2724
+ if not access_token:
2725
+ return
2726
+
2727
+ # 2. Get site ID - use config value if provided, otherwise fetch via API
2728
+ site_id: Optional[str] = sp_config.site_id
2729
+ if not site_id:
2730
+ site_path = remote_content.site_path or sp_config.site_path
2731
+ site_id = self._get_sharepoint_site_id(sp_config.hostname, site_path, access_token)
2732
+ if not site_id:
2733
+ log_error(f"Failed to get SharePoint site ID for {sp_config.hostname}/{site_path}")
2734
+ return
2735
+
2736
+ # 3. Identify files to download
2737
+ files_to_process: List[tuple] = [] # List of (file_path, file_name)
2738
+
2739
+ # Helper function to recursively list all files in a folder
2740
+ def list_files_recursive(folder: str) -> List[tuple]:
2741
+ """Recursively list all files in a SharePoint folder."""
2742
+ files: List[tuple] = []
2743
+ items = self._list_sharepoint_folder_items(site_id, folder, access_token)
2744
+ for item in items:
2745
+ if "file" in item: # It's a file
2746
+ item_path = f"{folder}/{item['name']}"
2747
+ files.append((item_path, item["name"]))
2748
+ elif "folder" in item: # It's a folder - recurse
2749
+ subdir_path = f"{folder}/{item['name']}"
2750
+ subdir_files = list_files_recursive(subdir_path)
2751
+ files.extend(subdir_files)
2752
+ return files
2753
+
2754
+ # Get the path to process (file_path or folder_path)
2755
+ path_to_process = (remote_content.file_path or remote_content.folder_path or "").strip("/")
2756
+
2757
+ if path_to_process:
2758
+ # Check if path is a file or folder by getting item metadata
2759
+ try:
2760
+ with httpx.Client() as client:
2761
+ url = f"https://graph.microsoft.com/v1.0/sites/{site_id}/drive/root:/{path_to_process}"
2762
+ headers = {"Authorization": f"Bearer {access_token}"}
2763
+ response = client.get(url, headers=headers, timeout=30.0)
2764
+ response.raise_for_status()
2765
+ item_data = response.json()
2766
+
2767
+ if "folder" in item_data:
2768
+ # It's a folder - recursively list all files
2769
+ files_to_process = list_files_recursive(path_to_process)
2770
+ elif "file" in item_data:
2771
+ # It's a single file
2772
+ files_to_process.append((path_to_process, item_data["name"]))
2773
+ else:
2774
+ log_warning(f"SharePoint path {path_to_process} is neither file nor folder")
2775
+ return
2776
+ except Exception as e:
2777
+ log_error(f"Error checking SharePoint path {path_to_process}: {e}")
2778
+ return
2779
+
2780
+ if not files_to_process:
2781
+ log_warning(f"No files found at SharePoint path: {path_to_process}")
2782
+ return
2783
+
2784
+ # 4. Process each file
2785
+ for file_path, file_name in files_to_process:
2786
+ # Build a unique virtual path for hashing (ensures different files don't collide)
2787
+ virtual_path = f"sharepoint://{sp_config.hostname}/{site_id}/{file_path}"
2788
+
2789
+ # Build metadata with all info needed to re-fetch the file
2790
+ sharepoint_metadata = {
2791
+ "source_type": "sharepoint",
2792
+ "source_config_id": sp_config.id,
2793
+ "source_config_name": sp_config.name,
2794
+ "sharepoint_hostname": sp_config.hostname,
2795
+ "sharepoint_site_id": site_id,
2796
+ "sharepoint_path": file_path,
2797
+ "sharepoint_filename": file_name,
2798
+ }
2799
+ # Merge with user-provided metadata (user metadata takes precedence)
2800
+ merged_metadata = {**sharepoint_metadata, **(content.metadata or {})}
2801
+
2802
+ # Setup Content object
2803
+ # Naming: for folders, use relative path; for single files, use user name or filename
2804
+ is_folder_upload = len(files_to_process) > 1
2805
+ if is_folder_upload:
2806
+ # Compute relative path from the upload root
2807
+ relative_path = file_path
2808
+ if path_to_process and file_path.startswith(path_to_process + "/"):
2809
+ relative_path = file_path[len(path_to_process) + 1 :]
2810
+ # If user provided a name, prefix it; otherwise use full file path
2811
+ content_name = f"{content.name}/{relative_path}" if content.name else file_path
2812
+ else:
2813
+ # Single file: use user's name or the filename
2814
+ content_name = content.name or file_name
2815
+ content_entry = Content(
2816
+ name=content_name,
2817
+ description=content.description,
2818
+ path=virtual_path, # Include path for unique hashing
2819
+ status=ContentStatus.PROCESSING,
2820
+ metadata=merged_metadata,
2821
+ file_type="sharepoint",
2822
+ )
2823
+
2824
+ # Hash content and add to contents database
2825
+ content_entry.content_hash = self._build_content_hash(content_entry)
2826
+ content_entry.id = generate_id(content_entry.content_hash)
2827
+ self._insert_contents_db(content_entry)
2828
+ if self._should_skip(content_entry.content_hash, skip_if_exists):
2829
+ content_entry.status = ContentStatus.COMPLETED
2830
+ self._update_content(content_entry)
2831
+ continue
2832
+
2833
+ # Select reader based on file extension
2834
+ reader = self._select_reader_by_uri(file_name, content.reader)
2835
+ reader = cast(Reader, reader)
2836
+
2837
+ # Download file
2838
+ file_content = self._download_sharepoint_file(site_id, file_path, access_token)
2839
+ if not file_content:
2840
+ content_entry.status = ContentStatus.FAILED
2841
+ self._update_content(content_entry)
2842
+ continue
2843
+
2844
+ # Read the content
2845
+ read_documents = reader.read(file_content, name=file_name)
2846
+
2847
+ # Prepare and insert to vector database
2848
+ self._prepare_documents_for_insert(read_documents, content_entry.id)
2849
+ self._handle_vector_db_insert(content_entry, read_documents, upsert)
2850
+
2851
+ # --- GitHub loaders ---
2852
+
2853
+ async def _aload_from_github(
2854
+ self,
2855
+ content: Content,
2856
+ upsert: bool,
2857
+ skip_if_exists: bool,
2858
+ config: Optional[RemoteContentConfig] = None,
2859
+ ):
2860
+ """Load content from GitHub.
2861
+
2862
+ Requires the GitHub config to contain repo and optionally token for private repos.
2863
+ Uses the GitHub API to fetch file contents.
2864
+ """
2865
+ remote_content: GitHubContent = cast(GitHubContent, content.remote_content)
2866
+ gh_config = cast(GitHubConfig, config) if isinstance(config, GitHubConfig) else None
2867
+
2868
+ if gh_config is None:
2869
+ log_error(f"GitHub config not found for config_id: {remote_content.config_id}")
2870
+ return
2871
+
2872
+ # Build headers for GitHub API
2873
+ headers = {
2874
+ "Accept": "application/vnd.github.v3+json",
2875
+ "User-Agent": "Agno-Knowledge",
2876
+ }
2877
+ if gh_config.token:
2878
+ headers["Authorization"] = f"Bearer {gh_config.token}"
2879
+
2880
+ branch = remote_content.branch or gh_config.branch or "main"
2881
+
2882
+ # Get list of files to process
2883
+ files_to_process: List[Dict[str, str]] = []
2884
+
2885
+ async with AsyncClient() as client:
2886
+ # Helper function to recursively list all files in a folder
2887
+ async def list_files_recursive(folder: str) -> List[Dict[str, str]]:
2888
+ """Recursively list all files in a GitHub folder."""
2889
+ files: List[Dict[str, str]] = []
2890
+ api_url = f"https://api.github.com/repos/{gh_config.repo}/contents/{folder}"
2891
+ if branch:
2892
+ api_url += f"?ref={branch}"
2893
+
2894
+ try:
2895
+ response = await client.get(api_url, headers=headers, timeout=30.0)
2896
+ response.raise_for_status()
2897
+ items = response.json()
2898
+
2899
+ # If items is not a list, it's a single file response
2900
+ if not isinstance(items, list):
2901
+ items = [items]
2902
+
2903
+ for item in items:
2904
+ if item.get("type") == "file":
2905
+ files.append(
2906
+ {
2907
+ "path": item["path"],
2908
+ "name": item["name"],
2909
+ }
2910
+ )
2911
+ elif item.get("type") == "dir":
2912
+ # Recursively get files from subdirectory
2913
+ subdir_files = await list_files_recursive(item["path"])
2914
+ files.extend(subdir_files)
2915
+ except Exception as e:
2916
+ log_error(f"Error listing GitHub folder {folder}: {e}")
2917
+
2918
+ return files
2919
+
2920
+ # Get the path to process (file_path or folder_path)
2921
+ path_to_process = (remote_content.file_path or remote_content.folder_path or "").rstrip("/")
2922
+
2923
+ if path_to_process:
2924
+ # Fetch the path to determine if it's a file or directory
2925
+ api_url = f"https://api.github.com/repos/{gh_config.repo}/contents/{path_to_process}"
2926
+ if branch:
2927
+ api_url += f"?ref={branch}"
2928
+
2929
+ try:
2930
+ response = await client.get(api_url, headers=headers, timeout=30.0)
2931
+ response.raise_for_status()
2932
+ path_data = response.json()
2933
+
2934
+ if isinstance(path_data, list):
2935
+ # It's a directory - recursively list all files
2936
+ for item in path_data:
2937
+ if item.get("type") == "file":
2938
+ files_to_process.append({"path": item["path"], "name": item["name"]})
2939
+ elif item.get("type") == "dir":
2940
+ subdir_files = await list_files_recursive(item["path"])
2941
+ files_to_process.extend(subdir_files)
2942
+ else:
2943
+ # It's a single file
2944
+ files_to_process.append(
2945
+ {
2946
+ "path": path_data["path"],
2947
+ "name": path_data["name"],
2948
+ }
2949
+ )
2950
+ except Exception as e:
2951
+ log_error(f"Error fetching GitHub path {path_to_process}: {e}")
2952
+ return
2953
+
2954
+ if not files_to_process:
2955
+ log_warning(f"No files found at GitHub path: {path_to_process}")
2956
+ return
2957
+
2958
+ # Process each file
2959
+ for file_info in files_to_process:
2960
+ file_path = file_info["path"]
2961
+ file_name = file_info["name"]
2962
+
2963
+ # Build a unique virtual path for hashing (ensures different files don't collide)
2964
+ virtual_path = f"github://{gh_config.repo}/{branch}/{file_path}"
2965
+
2966
+ # Build metadata with all info needed to re-fetch the file
2967
+ github_metadata = {
2968
+ "source_type": "github",
2969
+ "source_config_id": gh_config.id,
2970
+ "source_config_name": gh_config.name,
2971
+ "github_repo": gh_config.repo,
2972
+ "github_branch": branch,
2973
+ "github_path": file_path,
2974
+ "github_filename": file_name,
2975
+ }
2976
+ # Merge with user-provided metadata (user metadata takes precedence)
2977
+ merged_metadata = {**github_metadata, **(content.metadata or {})}
2978
+
2979
+ # Setup Content object
2980
+ # Naming: for folders, use relative path; for single files, use user name or filename
2981
+ is_folder_upload = len(files_to_process) > 1
2982
+ if is_folder_upload:
2983
+ # Compute relative path from the upload root
2984
+ relative_path = file_path
2985
+ if path_to_process and file_path.startswith(path_to_process + "/"):
2986
+ relative_path = file_path[len(path_to_process) + 1 :]
2987
+ # If user provided a name, prefix it; otherwise use full file path
2988
+ content_name = f"{content.name}/{relative_path}" if content.name else file_path
2989
+ else:
2990
+ # Single file: use user's name or the filename
2991
+ content_name = content.name or file_name
2992
+ content_entry = Content(
2993
+ name=content_name,
2994
+ description=content.description,
2995
+ path=virtual_path, # Include path for unique hashing
2996
+ status=ContentStatus.PROCESSING,
2997
+ metadata=merged_metadata,
2998
+ file_type="github",
2999
+ )
3000
+
3001
+ # Hash content and add to contents database
3002
+ content_entry.content_hash = self._build_content_hash(content_entry)
3003
+ content_entry.id = generate_id(content_entry.content_hash)
3004
+ await self._ainsert_contents_db(content_entry)
3005
+
3006
+ if self._should_skip(content_entry.content_hash, skip_if_exists):
3007
+ content_entry.status = ContentStatus.COMPLETED
3008
+ await self._aupdate_content(content_entry)
3009
+ continue
3010
+
3011
+ # Fetch file content using GitHub API (works for private repos)
3012
+ api_url = f"https://api.github.com/repos/{gh_config.repo}/contents/{file_path}"
3013
+ if branch:
3014
+ api_url += f"?ref={branch}"
3015
+ try:
3016
+ response = await client.get(api_url, headers=headers, timeout=30.0)
3017
+ response.raise_for_status()
3018
+ file_data = response.json()
3019
+
3020
+ # GitHub API returns content as base64
3021
+ if file_data.get("encoding") == "base64":
3022
+ import base64
3023
+
3024
+ file_content = base64.b64decode(file_data["content"])
3025
+ else:
3026
+ # For large files, GitHub returns a download_url
3027
+ download_url = file_data.get("download_url")
3028
+ if download_url:
3029
+ dl_response = await client.get(download_url, headers=headers, timeout=30.0)
3030
+ dl_response.raise_for_status()
3031
+ file_content = dl_response.content
3032
+ else:
3033
+ raise ValueError("No content or download_url in response")
3034
+ except Exception as e:
3035
+ log_error(f"Error fetching GitHub file {file_path}: {e}")
3036
+ content_entry.status = ContentStatus.FAILED
3037
+ content_entry.status_message = str(e)
3038
+ await self._aupdate_content(content_entry)
3039
+ continue
3040
+
3041
+ # Select reader and read content
3042
+ reader = self._select_reader_by_uri(file_name, content.reader)
3043
+ if reader is None:
3044
+ log_warning(f"No reader found for file: {file_name}")
3045
+ content_entry.status = ContentStatus.FAILED
3046
+ content_entry.status_message = "No suitable reader found"
3047
+ await self._aupdate_content(content_entry)
3048
+ continue
3049
+
3050
+ reader = cast(Reader, reader)
3051
+ readable_content = BytesIO(file_content)
3052
+ read_documents = await reader.async_read(readable_content, name=file_name)
3053
+
3054
+ # Prepare and insert into vector database
3055
+ if not content_entry.id:
3056
+ content_entry.id = generate_id(content_entry.content_hash or "")
3057
+ self._prepare_documents_for_insert(read_documents, content_entry.id)
3058
+ await self._ahandle_vector_db_insert(content_entry, read_documents, upsert)
3059
+
3060
+ def _load_from_github(
3061
+ self,
3062
+ content: Content,
3063
+ upsert: bool,
3064
+ skip_if_exists: bool,
3065
+ config: Optional[RemoteContentConfig] = None,
3066
+ ):
3067
+ """Synchronous version of _load_from_github."""
3068
+ import httpx
3069
+
3070
+ remote_content: GitHubContent = cast(GitHubContent, content.remote_content)
3071
+ gh_config = cast(GitHubConfig, config) if isinstance(config, GitHubConfig) else None
3072
+
3073
+ if gh_config is None:
3074
+ log_error(f"GitHub config not found for config_id: {remote_content.config_id}")
3075
+ return
3076
+
3077
+ # Build headers for GitHub API
3078
+ headers = {
3079
+ "Accept": "application/vnd.github.v3+json",
3080
+ "User-Agent": "Agno-Knowledge",
3081
+ }
3082
+ if gh_config.token:
3083
+ headers["Authorization"] = f"Bearer {gh_config.token}"
3084
+
3085
+ branch = remote_content.branch or gh_config.branch or "main"
3086
+
3087
+ # Get list of files to process
3088
+ files_to_process: List[Dict[str, str]] = []
3089
+
3090
+ with httpx.Client() as client:
3091
+ # Helper function to recursively list all files in a folder
3092
+ def list_files_recursive(folder: str) -> List[Dict[str, str]]:
3093
+ """Recursively list all files in a GitHub folder."""
3094
+ files: List[Dict[str, str]] = []
3095
+ api_url = f"https://api.github.com/repos/{gh_config.repo}/contents/{folder}"
3096
+ if branch:
3097
+ api_url += f"?ref={branch}"
3098
+
3099
+ try:
3100
+ response = client.get(api_url, headers=headers, timeout=30.0)
3101
+ response.raise_for_status()
3102
+ items = response.json()
3103
+
3104
+ # If items is not a list, it's a single file response
3105
+ if not isinstance(items, list):
3106
+ items = [items]
3107
+
3108
+ for item in items:
3109
+ if item.get("type") == "file":
3110
+ files.append(
3111
+ {
3112
+ "path": item["path"],
3113
+ "name": item["name"],
3114
+ }
3115
+ )
3116
+ elif item.get("type") == "dir":
3117
+ # Recursively get files from subdirectory
3118
+ subdir_files = list_files_recursive(item["path"])
3119
+ files.extend(subdir_files)
3120
+ except Exception as e:
3121
+ log_error(f"Error listing GitHub folder {folder}: {e}")
3122
+
3123
+ return files
3124
+
3125
+ # Get the path to process (file_path or folder_path)
3126
+ path_to_process = (remote_content.file_path or remote_content.folder_path or "").rstrip("/")
3127
+
3128
+ if path_to_process:
3129
+ # Fetch the path to determine if it's a file or directory
3130
+ api_url = f"https://api.github.com/repos/{gh_config.repo}/contents/{path_to_process}"
3131
+ if branch:
3132
+ api_url += f"?ref={branch}"
3133
+
3134
+ try:
3135
+ response = client.get(api_url, headers=headers, timeout=30.0)
3136
+ response.raise_for_status()
3137
+ path_data = response.json()
3138
+
3139
+ if isinstance(path_data, list):
3140
+ # It's a directory - recursively list all files
3141
+ for item in path_data:
3142
+ if item.get("type") == "file":
3143
+ files_to_process.append({"path": item["path"], "name": item["name"]})
3144
+ elif item.get("type") == "dir":
3145
+ subdir_files = list_files_recursive(item["path"])
3146
+ files_to_process.extend(subdir_files)
3147
+ else:
3148
+ # It's a single file
3149
+ files_to_process.append(
3150
+ {
3151
+ "path": path_data["path"],
3152
+ "name": path_data["name"],
3153
+ }
3154
+ )
3155
+ except Exception as e:
3156
+ log_error(f"Error fetching GitHub path {path_to_process}: {e}")
3157
+ return
3158
+
3159
+ if not files_to_process:
3160
+ log_warning(f"No files found at GitHub path: {path_to_process}")
3161
+ return
3162
+
3163
+ # Process each file
3164
+ for file_info in files_to_process:
3165
+ file_path = file_info["path"]
3166
+ file_name = file_info["name"]
3167
+
3168
+ # Build a unique virtual path for hashing (ensures different files don't collide)
3169
+ virtual_path = f"github://{gh_config.repo}/{branch}/{file_path}"
3170
+
3171
+ # Build metadata with all info needed to re-fetch the file
3172
+ github_metadata = {
3173
+ "source_type": "github",
3174
+ "source_config_id": gh_config.id,
3175
+ "source_config_name": gh_config.name,
3176
+ "github_repo": gh_config.repo,
3177
+ "github_branch": branch,
3178
+ "github_path": file_path,
3179
+ "github_filename": file_name,
3180
+ }
3181
+ # Merge with user-provided metadata (user metadata takes precedence)
3182
+ merged_metadata = {**github_metadata, **(content.metadata or {})}
3183
+
3184
+ # Setup Content object
3185
+ # Naming: for folders, use relative path; for single files, use user name or filename
3186
+ is_folder_upload = len(files_to_process) > 1
3187
+ if is_folder_upload:
3188
+ # Compute relative path from the upload root
3189
+ relative_path = file_path
3190
+ if path_to_process and file_path.startswith(path_to_process + "/"):
3191
+ relative_path = file_path[len(path_to_process) + 1 :]
3192
+ # If user provided a name, prefix it; otherwise use full file path
3193
+ content_name = f"{content.name}/{relative_path}" if content.name else file_path
3194
+ else:
3195
+ # Single file: use user's name or the filename
3196
+ content_name = content.name or file_name
3197
+ content_entry = Content(
3198
+ name=content_name,
3199
+ description=content.description,
3200
+ path=virtual_path, # Include path for unique hashing
3201
+ status=ContentStatus.PROCESSING,
3202
+ metadata=merged_metadata,
3203
+ file_type="github",
3204
+ )
3205
+
3206
+ # Hash content and add to contents database
3207
+ content_entry.content_hash = self._build_content_hash(content_entry)
3208
+ content_entry.id = generate_id(content_entry.content_hash)
3209
+ self._insert_contents_db(content_entry)
3210
+
3211
+ if self._should_skip(content_entry.content_hash, skip_if_exists):
3212
+ content_entry.status = ContentStatus.COMPLETED
3213
+ self._update_content(content_entry)
3214
+ continue
3215
+
3216
+ # Fetch file content using GitHub API (works for private repos)
3217
+ api_url = f"https://api.github.com/repos/{gh_config.repo}/contents/{file_path}"
3218
+ if branch:
3219
+ api_url += f"?ref={branch}"
3220
+ try:
3221
+ response = client.get(api_url, headers=headers, timeout=30.0)
3222
+ response.raise_for_status()
3223
+ file_data = response.json()
3224
+
3225
+ # GitHub API returns content as base64
3226
+ if file_data.get("encoding") == "base64":
3227
+ import base64
3228
+
3229
+ file_content = base64.b64decode(file_data["content"])
3230
+ else:
3231
+ # For large files, GitHub returns a download_url
3232
+ download_url = file_data.get("download_url")
3233
+ if download_url:
3234
+ dl_response = client.get(download_url, headers=headers, timeout=30.0)
3235
+ dl_response.raise_for_status()
3236
+ file_content = dl_response.content
3237
+ else:
3238
+ raise ValueError("No content or download_url in response")
3239
+ except Exception as e:
3240
+ log_error(f"Error fetching GitHub file {file_path}: {e}")
3241
+ content_entry.status = ContentStatus.FAILED
3242
+ content_entry.status_message = str(e)
3243
+ self._update_content(content_entry)
3244
+ continue
3245
+
3246
+ # Select reader and read content
3247
+ reader = self._select_reader_by_uri(file_name, content.reader)
3248
+ if reader is None:
3249
+ log_warning(f"No reader found for file: {file_name}")
3250
+ content_entry.status = ContentStatus.FAILED
3251
+ content_entry.status_message = "No suitable reader found"
3252
+ self._update_content(content_entry)
3253
+ continue
3254
+
3255
+ reader = cast(Reader, reader)
3256
+ readable_content = BytesIO(file_content)
3257
+ read_documents = reader.read(readable_content, name=file_name)
3258
+
3259
+ # Prepare and insert into vector database
3260
+ if not content_entry.id:
3261
+ content_entry.id = generate_id(content_entry.content_hash or "")
3262
+ self._prepare_documents_for_insert(read_documents, content_entry.id)
3263
+ self._handle_vector_db_insert(content_entry, read_documents, upsert)
3264
+
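The GitHub loader above resolves a file or folder through the contents API, decodes the base64 payload, and falls back to `download_url` for large files. A minimal standalone sketch of that fetch step, with a hypothetical repo, path, and token (the endpoint, the `ref` query parameter, and the base64 `content` field are standard GitHub REST v3 behavior):

import base64
from typing import Optional

import httpx


def fetch_github_file(repo: str, path: str, branch: str = "main", token: Optional[str] = None) -> bytes:
    """Fetch a single file's bytes via the GitHub contents API (illustrative sketch)."""
    headers = {"Accept": "application/vnd.github.v3+json", "User-Agent": "Agno-Knowledge"}
    if token:
        headers["Authorization"] = f"Bearer {token}"
    url = f"https://api.github.com/repos/{repo}/contents/{path}?ref={branch}"
    with httpx.Client() as client:
        response = client.get(url, headers=headers, timeout=30.0)
        response.raise_for_status()
        data = response.json()
        if data.get("encoding") == "base64":
            return base64.b64decode(data["content"])
        # Large files omit inline content; fetch the raw bytes from download_url instead
        download = client.get(data["download_url"], headers=headers, timeout=30.0)
        download.raise_for_status()
        return download.content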
3265
+ # --- Azure Blob Storage loaders ---
3266
+
3267
+ def _get_azure_blob_client(self, azure_config: AzureBlobConfig):
3268
+ """Get a sync Azure Blob Service Client using client credentials flow.
3269
+
3270
+ Requires the `azure-identity` and `azure-storage-blob` packages.
3271
+ """
3272
+ try:
3273
+ from azure.identity import ClientSecretCredential # type: ignore
3274
+ from azure.storage.blob import BlobServiceClient # type: ignore
3275
+ except ImportError:
3276
+ raise ImportError(
3277
+ "The `azure-identity` and `azure-storage-blob` packages are not installed. "
3278
+ "Please install them via `pip install azure-identity azure-storage-blob`."
3279
+ )
3280
+
3281
+ credential = ClientSecretCredential(
3282
+ tenant_id=azure_config.tenant_id,
3283
+ client_id=azure_config.client_id,
3284
+ client_secret=azure_config.client_secret,
3285
+ )
3286
+
3287
+ blob_service = BlobServiceClient(
3288
+ account_url=f"https://{azure_config.storage_account}.blob.core.windows.net",
3289
+ credential=credential,
3290
+ )
3291
+
3292
+ return blob_service
3293
+
3294
+ def _get_azure_blob_client_async(self, azure_config: AzureBlobConfig):
3295
+ """Get an async Azure Blob Service Client using client credentials flow.
3296
+
3297
+ Requires the `azure-identity` and `azure-storage-blob` packages.
3298
+ Uses the async versions from azure.storage.blob.aio and azure.identity.aio.
3299
+ """
3300
+ try:
3301
+ from azure.identity.aio import ClientSecretCredential # type: ignore
3302
+ from azure.storage.blob.aio import BlobServiceClient # type: ignore
3303
+ except ImportError:
3304
+ raise ImportError(
3305
+ "The `azure-identity` and `azure-storage-blob` packages are not installed. "
3306
+ "Please install them via `pip install azure-identity azure-storage-blob`."
3307
+ )
3308
+
3309
+ credential = ClientSecretCredential(
3310
+ tenant_id=azure_config.tenant_id,
3311
+ client_id=azure_config.client_id,
3312
+ client_secret=azure_config.client_secret,
3313
+ )
3314
+
3315
+ blob_service = BlobServiceClient(
3316
+ account_url=f"https://{azure_config.storage_account}.blob.core.windows.net",
3317
+ credential=credential,
3318
+ )
3319
+
3320
+ return blob_service
3321
+
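Both helpers wrap the same Azure AD client-credentials flow from `azure-identity` and hand the resulting credential to a `BlobServiceClient`. A minimal sketch of that wiring with placeholder identifiers (the account URL format and SDK calls are the standard `azure-storage-blob` API):

from azure.identity import ClientSecretCredential
from azure.storage.blob import BlobServiceClient

# Placeholder identifiers for illustration only
credential = ClientSecretCredential(
    tenant_id="<tenant-id>",
    client_id="<client-id>",
    client_secret="<client-secret>",
)
blob_service = BlobServiceClient(
    account_url="https://<storage-account>.blob.core.windows.net",
    credential=credential,
)
# The container client is the entry point the loaders below use for listing and downloading blobs
container_client = blob_service.get_container_client("<container>")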
3322
+ async def _aload_from_azure_blob(
3323
+ self,
3324
+ content: Content,
3325
+ upsert: bool,
3326
+ skip_if_exists: bool,
3327
+ config: Optional[RemoteContentConfig] = None,
3328
+ ):
3329
+ """Load content from Azure Blob Storage (async version).
3330
+
3331
+ Requires the AzureBlobConfig to contain tenant_id, client_id, client_secret,
3332
+ storage_account, and container.
3333
+
3334
+ Uses the async Azure SDK to avoid blocking the event loop.
3335
+
3336
+ 1. Authenticate with Azure AD using client credentials
3337
+ 2. List blobs in container (by prefix or single blob)
3338
+ 3. Download and process each blob
3339
+ 4. Insert to vector database
3340
+ """
3341
+ remote_content: AzureBlobContent = cast(AzureBlobContent, content.remote_content)
3342
+ azure_config = cast(AzureBlobConfig, config) if isinstance(config, AzureBlobConfig) else None
3343
+
3344
+ if azure_config is None:
3345
+ log_error(f"Azure Blob config not found for config_id: {remote_content.config_id}")
3346
+ return
3347
+
3348
+ # Get async blob service client
3349
+ try:
3350
+ blob_service = self._get_azure_blob_client_async(azure_config)
3351
+ except ImportError as e:
3352
+ log_error(str(e))
3353
+ return
3354
+ except Exception as e:
3355
+ log_error(f"Error creating Azure Blob client: {e}")
3356
+ return
3357
+
3358
+ # Use async context manager for proper resource cleanup
3359
+ async with blob_service:
3360
+ container_client = blob_service.get_container_client(azure_config.container)
3361
+
3362
+ # Helper to list blobs with a given prefix (async)
3363
+ async def list_blobs_with_prefix(prefix: str) -> List[Dict[str, Any]]:
3364
+ """List all blobs under a given prefix (folder)."""
3365
+ results: List[Dict[str, Any]] = []
3366
+ normalized_prefix = prefix.rstrip("/") + "/" if not prefix.endswith("/") else prefix
3367
+ async for blob in container_client.list_blobs(name_starts_with=normalized_prefix):
3368
+ # Skip "directory" markers (blobs ending with /)
3369
+ if not blob.name.endswith("/"):
3370
+ results.append(
3371
+ {
3372
+ "name": blob.name,
3373
+ "size": blob.size,
3374
+ "content_type": blob.content_settings.content_type if blob.content_settings else None,
3375
+ }
3376
+ )
3377
+ return results
3378
+
3379
+ # Identify blobs to process
3380
+ blobs_to_process: List[Dict[str, Any]] = []
3381
+
3382
+ try:
3383
+ if remote_content.blob_name:
3384
+ # Try to get as a single blob first
3385
+ blob_client = container_client.get_blob_client(remote_content.blob_name)
3386
+ try:
3387
+ props = await blob_client.get_blob_properties()
3388
+ blobs_to_process.append(
3389
+ {
3390
+ "name": remote_content.blob_name,
3391
+ "size": props.size,
3392
+ "content_type": props.content_settings.content_type if props.content_settings else None,
3393
+ }
3394
+ )
3395
+ except Exception:
3396
+ # Blob doesn't exist - check if it's actually a folder (prefix)
3397
+ log_debug(f"Blob {remote_content.blob_name} not found, checking if it's a folder...")
3398
+ blobs_to_process = await list_blobs_with_prefix(remote_content.blob_name)
3399
+ if not blobs_to_process:
3400
+ log_error(
3401
+ f"No blob or folder found at path: {remote_content.blob_name}. "
3402
+ "If this is a folder, ensure files exist inside it."
3403
+ )
3404
+ return
3405
+ elif remote_content.prefix:
3406
+ # List blobs with prefix
3407
+ blobs_to_process = await list_blobs_with_prefix(remote_content.prefix)
3408
+ except Exception as e:
3409
+ log_error(f"Error listing Azure blobs: {e}")
3410
+ return
3411
+
3412
+ if not blobs_to_process:
3413
+ log_warning(f"No blobs found in Azure container: {azure_config.container}")
3414
+ return
3415
+
3416
+ # For single file uploads, use the original content object to preserve the ID
3417
+ # returned by the API. For folder uploads, create new content entries for each file.
3418
+ is_folder_upload = len(blobs_to_process) > 1
3419
+
3420
+ # Process each blob
3421
+ for blob_info in blobs_to_process:
3422
+ blob_name = blob_info["name"]
3423
+ file_name = blob_name.split("/")[-1]
3424
+
3425
+ # Build a unique virtual path for hashing
3426
+ virtual_path = f"azure://{azure_config.storage_account}/{azure_config.container}/{blob_name}"
3427
+
3428
+ # Build metadata
3429
+ azure_metadata = {
3430
+ "source_type": "azure_blob",
3431
+ "source_config_id": azure_config.id,
3432
+ "source_config_name": azure_config.name,
3433
+ "azure_storage_account": azure_config.storage_account,
3434
+ "azure_container": azure_config.container,
3435
+ "azure_blob_name": blob_name,
3436
+ "azure_filename": file_name,
3437
+ }
3438
+ merged_metadata = {**azure_metadata, **(content.metadata or {})}
3439
+
3440
+ # Setup Content object
3441
+ if is_folder_upload:
3442
+ # For folder uploads, create new content entries for each file
3443
+ relative_path = blob_name
3444
+ if remote_content.prefix and blob_name.startswith(remote_content.prefix):
3445
+ relative_path = blob_name[len(remote_content.prefix) :].lstrip("/")
3446
+ content_name = f"{content.name}/{relative_path}" if content.name else blob_name
3447
+
3448
+ content_entry = Content(
3449
+ name=content_name,
3450
+ description=content.description,
3451
+ path=virtual_path,
3452
+ status=ContentStatus.PROCESSING,
3453
+ metadata=merged_metadata,
3454
+ file_type="azure_blob",
3455
+ )
3456
+ content_entry.content_hash = self._build_content_hash(content_entry)
3457
+ content_entry.id = generate_id(content_entry.content_hash)
3458
+ else:
3459
+ # For single file uploads, use the original content object to preserve ID
3460
+ content_entry = content
3461
+ content_entry.path = virtual_path
3462
+ content_entry.status = ContentStatus.PROCESSING
3463
+ content_entry.metadata = merged_metadata
3464
+ content_entry.file_type = "azure_blob"
3465
+ # Use existing id and content_hash from the original content if available
3466
+ if not content_entry.content_hash:
3467
+ content_entry.content_hash = self._build_content_hash(content_entry)
3468
+ if not content_entry.id:
3469
+ content_entry.id = generate_id(content_entry.content_hash)
3470
+
3471
+ await self._ainsert_contents_db(content_entry)
3472
+
3473
+ if self._should_skip(content_entry.content_hash, skip_if_exists):
3474
+ content_entry.status = ContentStatus.COMPLETED
3475
+ await self._aupdate_content(content_entry)
3476
+ continue
3477
+
3478
+ # Download blob (async)
3479
+ try:
3480
+ blob_client = container_client.get_blob_client(blob_name)
3481
+ download_stream = await blob_client.download_blob()
3482
+ blob_data = await download_stream.readall()
3483
+ file_content = BytesIO(blob_data)
3484
+ except Exception as e:
3485
+ log_error(f"Error downloading Azure blob {blob_name}: {e}")
3486
+ content_entry.status = ContentStatus.FAILED
3487
+ content_entry.status_message = str(e)
3488
+ await self._aupdate_content(content_entry)
3489
+ continue
3490
+
3491
+ # Select reader and read content
3492
+ reader = self._select_reader_by_uri(file_name, content.reader)
3493
+ if reader is None:
3494
+ log_warning(f"No reader found for file: {file_name}")
3495
+ content_entry.status = ContentStatus.FAILED
3496
+ content_entry.status_message = "No suitable reader found"
3497
+ await self._aupdate_content(content_entry)
3498
+ continue
3499
+
3500
+ reader = cast(Reader, reader)
3501
+ read_documents = await reader.async_read(file_content, name=file_name)
3502
+
3503
+ # Prepare and insert into vector database
3504
+ if not content_entry.id:
3505
+ content_entry.id = generate_id(content_entry.content_hash or "")
3506
+ self._prepare_documents_for_insert(read_documents, content_entry.id)
3507
+ await self._ahandle_vector_db_insert(content_entry, read_documents, upsert)
3508
+
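The async loader lists blobs under a normalized prefix and downloads each one without blocking the event loop. A small sketch of that list-and-download pattern using the `aio` clients, with placeholder names (it assumes a `blob_service` created as in the async helper above):

from typing import Dict

from azure.storage.blob.aio import BlobServiceClient


async def download_blobs_under_prefix(blob_service: BlobServiceClient, container: str, prefix: str) -> Dict[str, bytes]:
    """Illustrative sketch: collect blob bytes under a prefix, skipping folder markers."""
    normalized = prefix if prefix.endswith("/") else prefix.rstrip("/") + "/"
    results: Dict[str, bytes] = {}
    async with blob_service:
        container_client = blob_service.get_container_client(container)
        async for blob in container_client.list_blobs(name_starts_with=normalized):
            if blob.name.endswith("/"):
                continue  # skip "directory" markers, as the loader above does
            stream = await container_client.get_blob_client(blob.name).download_blob()
            results[blob.name] = await stream.readall()
    return results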
3509
+ def _load_from_azure_blob(
3510
+ self,
3511
+ content: Content,
3512
+ upsert: bool,
3513
+ skip_if_exists: bool,
3514
+ config: Optional[RemoteContentConfig] = None,
3515
+ ):
3516
+ """Synchronous version of _load_from_azure_blob.
3517
+
3518
+ Load content from Azure Blob Storage:
3519
+ 1. Authenticate with Azure AD using client credentials
3520
+ 2. List blobs in container (by prefix or single blob)
3521
+ 3. Download and process each blob
3522
+ 4. Insert to vector database
3523
+ """
3524
+ remote_content: AzureBlobContent = cast(AzureBlobContent, content.remote_content)
3525
+ azure_config = cast(AzureBlobConfig, config) if isinstance(config, AzureBlobConfig) else None
3526
+
3527
+ if azure_config is None:
3528
+ log_error(f"Azure Blob config not found for config_id: {remote_content.config_id}")
3529
+ return
3530
+
3531
+ # Get blob service client
3532
+ try:
3533
+ blob_service = self._get_azure_blob_client(azure_config)
3534
+ except ImportError as e:
3535
+ log_error(str(e))
3536
+ return
3537
+ except Exception as e:
3538
+ log_error(f"Error creating Azure Blob client: {e}")
3539
+ return
3540
+
3541
+ container_client = blob_service.get_container_client(azure_config.container)
3542
+
3543
+ # Helper to list blobs with a given prefix
3544
+ def list_blobs_with_prefix(prefix: str) -> List[Dict[str, Any]]:
3545
+ """List all blobs under a given prefix (folder)."""
3546
+ results: List[Dict[str, Any]] = []
3547
+ normalized_prefix = prefix.rstrip("/") + "/" if not prefix.endswith("/") else prefix
3548
+ blobs = container_client.list_blobs(name_starts_with=normalized_prefix)
3549
+ for blob in blobs:
3550
+ # Skip "directory" markers (blobs ending with /)
3551
+ if not blob.name.endswith("/"):
3552
+ results.append(
3553
+ {
3554
+ "name": blob.name,
3555
+ "size": blob.size,
3556
+ "content_type": blob.content_settings.content_type if blob.content_settings else None,
3557
+ }
3558
+ )
3559
+ return results
3560
+
3561
+ # Identify blobs to process
3562
+ blobs_to_process: List[Dict[str, Any]] = []
3563
+
3564
+ try:
3565
+ if remote_content.blob_name:
3566
+ # Try to get as a single blob first
3567
+ blob_client = container_client.get_blob_client(remote_content.blob_name)
3568
+ try:
3569
+ props = blob_client.get_blob_properties()
3570
+ blobs_to_process.append(
3571
+ {
3572
+ "name": remote_content.blob_name,
3573
+ "size": props.size,
3574
+ "content_type": props.content_settings.content_type if props.content_settings else None,
3575
+ }
3576
+ )
3577
+ except Exception:
3578
+ # Blob doesn't exist - check if it's actually a folder (prefix)
3579
+ log_debug(f"Blob {remote_content.blob_name} not found, checking if it's a folder...")
3580
+ blobs_to_process = list_blobs_with_prefix(remote_content.blob_name)
3581
+ if not blobs_to_process:
3582
+ log_error(
3583
+ f"No blob or folder found at path: {remote_content.blob_name}. "
3584
+ "If this is a folder, ensure files exist inside it."
3585
+ )
3586
+ return
3587
+ elif remote_content.prefix:
3588
+ # List blobs with prefix
3589
+ blobs_to_process = list_blobs_with_prefix(remote_content.prefix)
3590
+ except Exception as e:
3591
+ log_error(f"Error listing Azure blobs: {e}")
3592
+ return
3593
+
3594
+ if not blobs_to_process:
3595
+ log_warning(f"No blobs found in Azure container: {azure_config.container}")
3596
+ return
3597
+
3598
+ # For single file uploads, use the original content object to preserve the ID
3599
+ # returned by the API. For folder uploads, create new content entries for each file.
3600
+ is_folder_upload = len(blobs_to_process) > 1
3601
+
3602
+ # Process each blob
3603
+ for blob_info in blobs_to_process:
3604
+ blob_name = blob_info["name"]
3605
+ file_name = blob_name.split("/")[-1]
3606
+
3607
+ # Build a unique virtual path for hashing
3608
+ virtual_path = f"azure://{azure_config.storage_account}/{azure_config.container}/{blob_name}"
3609
+
3610
+ # Build metadata
3611
+ azure_metadata = {
3612
+ "source_type": "azure_blob",
3613
+ "source_config_id": azure_config.id,
3614
+ "source_config_name": azure_config.name,
3615
+ "azure_storage_account": azure_config.storage_account,
3616
+ "azure_container": azure_config.container,
3617
+ "azure_blob_name": blob_name,
3618
+ "azure_filename": file_name,
3619
+ }
3620
+ merged_metadata = {**azure_metadata, **(content.metadata or {})}
3621
+
3622
+ # Setup Content object
3623
+ if is_folder_upload:
3624
+ # For folder uploads, create new content entries for each file
3625
+ relative_path = blob_name
3626
+ if remote_content.prefix and blob_name.startswith(remote_content.prefix):
3627
+ relative_path = blob_name[len(remote_content.prefix) :].lstrip("/")
3628
+ content_name = f"{content.name}/{relative_path}" if content.name else blob_name
3629
+
3630
+ content_entry = Content(
3631
+ name=content_name,
3632
+ description=content.description,
3633
+ path=virtual_path,
3634
+ status=ContentStatus.PROCESSING,
3635
+ metadata=merged_metadata,
3636
+ file_type="azure_blob",
3637
+ )
3638
+ content_entry.content_hash = self._build_content_hash(content_entry)
3639
+ content_entry.id = generate_id(content_entry.content_hash)
3640
+ else:
3641
+ # For single file uploads, use the original content object to preserve ID
3642
+ content_entry = content
3643
+ content_entry.path = virtual_path
3644
+ content_entry.status = ContentStatus.PROCESSING
3645
+ content_entry.metadata = merged_metadata
3646
+ content_entry.file_type = "azure_blob"
3647
+ # Use existing id and content_hash from the original content if available
3648
+ if not content_entry.content_hash:
3649
+ content_entry.content_hash = self._build_content_hash(content_entry)
3650
+ if not content_entry.id:
3651
+ content_entry.id = generate_id(content_entry.content_hash)
3652
+
3653
+ self._insert_contents_db(content_entry)
3654
+
3655
+ if self._should_skip(content_entry.content_hash, skip_if_exists):
3656
+ content_entry.status = ContentStatus.COMPLETED
3657
+ self._update_content(content_entry)
3658
+ continue
3659
+
3660
+ # Download blob
3661
+ try:
3662
+ blob_client = container_client.get_blob_client(blob_name)
3663
+ download_stream = blob_client.download_blob()
3664
+ file_content = BytesIO(download_stream.readall())
3665
+ except Exception as e:
3666
+ log_error(f"Error downloading Azure blob {blob_name}: {e}")
3667
+ content_entry.status = ContentStatus.FAILED
3668
+ content_entry.status_message = str(e)
3669
+ self._update_content(content_entry)
3670
+ continue
3671
+
3672
+ # Select reader and read content
3673
+ reader = self._select_reader_by_uri(file_name, content.reader)
3674
+ if reader is None:
3675
+ log_warning(f"No reader found for file: {file_name}")
3676
+ content_entry.status = ContentStatus.FAILED
3677
+ content_entry.status_message = "No suitable reader found"
3678
+ self._update_content(content_entry)
3679
+ continue
3680
+
3681
+ reader = cast(Reader, reader)
3682
+ read_documents = reader.read(file_content, name=file_name)
3683
+
3684
+ # Prepare and insert into vector database
3685
+ if not content_entry.id:
3686
+ content_entry.id = generate_id(content_entry.content_hash or "")
3687
+ self._prepare_documents_for_insert(read_documents, content_entry.id)
2236
3688
  self._handle_vector_db_insert(content_entry, read_documents, upsert)
2237
3689
 
2238
3690
  async def _ahandle_vector_db_insert(self, content: Content, read_documents, upsert):
@@ -2312,6 +3764,18 @@ class Knowledge:
2312
3764
  content.status = ContentStatus.COMPLETED
2313
3765
  self._update_content(content)
2314
3766
 
3767
+ # --- Remote Content Sources ---
3768
+
3769
+ def _get_remote_configs(self) -> List[RemoteContentConfig]:
3770
+ """Return configured remote content sources."""
3771
+ return self.content_sources or []
3772
+
3773
+ def _get_remote_config_by_id(self, config_id: str) -> Optional[RemoteContentConfig]:
3774
+ """Get a remote content config by its ID."""
3775
+ if not self.content_sources:
3776
+ return None
3777
+ return next((c for c in self.content_sources if c.id == config_id), None)
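These helpers back remote-content lookups: a content item carries a `config_id`, and the matching source is resolved from `content_sources`. A sketch of that resolution with hypothetical values; the constructor keyword arguments mirror the attributes read above (`id`, `name`, `repo`, `token`, `branch`, `config_id`, `folder_path`) and are an assumption about how the config and content classes are built:

from agno.knowledge.remote_content.config import GitHubConfig
from agno.knowledge.remote_content.remote_content import GitHubContent

# Hypothetical source registration (field names assumed from the attribute access above)
sources = [
    GitHubConfig(id="gh-docs", name="Docs repo", repo="acme/docs", token="<token>", branch="main"),
]
remote = GitHubContent(config_id="gh-docs", folder_path="guides/")

# Same lookup performed by _get_remote_config_by_id
config = next((c for c in sources if c.id == remote.config_id), None)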
3778
+
2315
3779
  # ==========================================
2316
3780
  # PRIVATE - CONVERSION & DATA METHODS
2317
3781
  # ==========================================