agno 2.4.0__py3-none-any.whl → 2.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agno/db/postgres/postgres.py +25 -12
- agno/db/sqlite/sqlite.py +24 -11
- agno/integrations/discord/client.py +12 -1
- agno/knowledge/knowledge.py +1070 -43
- agno/knowledge/reader/csv_reader.py +231 -8
- agno/knowledge/reader/field_labeled_csv_reader.py +167 -3
- agno/knowledge/reader/reader_factory.py +8 -1
- agno/knowledge/remote_content/__init__.py +29 -0
- agno/knowledge/remote_content/config.py +204 -0
- agno/knowledge/remote_content/remote_content.py +74 -17
- agno/models/base.py +12 -2
- agno/models/cerebras/cerebras.py +34 -2
- agno/models/n1n/__init__.py +3 -0
- agno/models/n1n/n1n.py +57 -0
- agno/models/openai/chat.py +18 -1
- agno/models/perplexity/perplexity.py +2 -0
- agno/os/interfaces/slack/router.py +10 -1
- agno/os/interfaces/whatsapp/router.py +6 -0
- agno/os/routers/components/components.py +10 -1
- agno/os/routers/knowledge/knowledge.py +125 -0
- agno/os/routers/knowledge/schemas.py +12 -0
- agno/run/agent.py +2 -0
- agno/team/team.py +20 -4
- agno/vectordb/pgvector/pgvector.py +3 -3
- {agno-2.4.0.dist-info → agno-2.4.1.dist-info}/METADATA +4 -1
- {agno-2.4.0.dist-info → agno-2.4.1.dist-info}/RECORD +29 -26
- {agno-2.4.0.dist-info → agno-2.4.1.dist-info}/WHEEL +1 -1
- {agno-2.4.0.dist-info → agno-2.4.1.dist-info}/licenses/LICENSE +0 -0
- {agno-2.4.0.dist-info → agno-2.4.1.dist-info}/top_level.txt +0 -0
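The largest change is in agno/knowledge/knowledge.py, which adds remote content sources (S3, GCS, SharePoint, GitHub) resolved through a new content_sources field and per-source config classes. A minimal sketch of how these pieces appear to fit together, based only on the names introduced in this diff; constructor keyword arguments on the config classes are assumptions, not confirmed API:

    from agno.knowledge.knowledge import Knowledge
    from agno.knowledge.remote_content.config import GitHubConfig, S3Config

    # Hypothetical field values; the diff only shows that S3Config exposes region and
    # AWS credentials, and that GitHubConfig exposes id, name, repo, branch and token.
    knowledge = Knowledge(
        content_sources=[
            S3Config(id="docs-bucket", region="us-east-1"),
            GitHubConfig(id="docs-repo", repo="my-org/my-docs", branch="main"),
        ],
    )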
agno/knowledge/knowledge.py
CHANGED
@@ -9,6 +9,7 @@ from os.path import basename
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Set, Tuple, Union, cast, overload
 
+import httpx
 from httpx import AsyncClient
 
 from agno.db.base import AsyncBaseDb, BaseDb
@@ -17,7 +18,20 @@ from agno.filters import FilterExpr
 from agno.knowledge.content import Content, ContentAuth, ContentStatus, FileData
 from agno.knowledge.document import Document
 from agno.knowledge.reader import Reader, ReaderFactory
-from agno.knowledge.remote_content.
+from agno.knowledge.remote_content.config import (
+    GcsConfig,
+    GitHubConfig,
+    RemoteContentConfig,
+    S3Config,
+    SharePointConfig,
+)
+from agno.knowledge.remote_content.remote_content import (
+    GCSContent,
+    GitHubContent,
+    RemoteContent,
+    S3Content,
+    SharePointContent,
+)
 from agno.utils.http import async_fetch_with_retry
 from agno.utils.log import log_debug, log_error, log_info, log_warning
 from agno.utils.string import generate_id
@@ -42,6 +56,7 @@ class Knowledge:
     contents_db: Optional[Union[BaseDb, AsyncBaseDb]] = None
     max_results: int = 10
     readers: Optional[Dict[str, Reader]] = None
+    content_sources: Optional[List[RemoteContentConfig]] = None
 
     def __post_init__(self):
         from agno.vectordb import VectorDb
@@ -1161,7 +1176,7 @@ class Knowledge:
         import inspect
 
         read_signature = inspect.signature(reader.read)
-        if password and "password" in read_signature.parameters:
+        if password is not None and "password" in read_signature.parameters:
             if isinstance(source, BytesIO):
                 return reader.read(source, name=name, password=password)
             else:
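The password checks in this and the following hunks switch from a truthiness test to an explicit None comparison; the practical difference is that an empty-string password is falsy and was previously dropped. A small illustrative snippet, not part of the diff:

    # "" is falsy, so `if password:` skipped readers that accept a password argument;
    # `password is not None` only filters out the "no password supplied" case.
    for password in [None, "", "secret"]:
        print(repr(password), "old:", bool(password), "new:", password is not None)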
@@ -1194,7 +1209,7 @@ class Knowledge:
         import inspect
 
         read_signature = inspect.signature(reader.async_read)
-        if password and "password" in read_signature.parameters:
+        if password is not None and "password" in read_signature.parameters:
             return await reader.async_read(source, name=name, password=password)
         else:
             if isinstance(source, BytesIO):
@@ -1285,7 +1300,7 @@ class Knowledge:
         log_debug(f"Using Reader: {reader.__class__.__name__}")
 
         if reader:
-            password = content.auth.password if content.auth and content.auth.password else None
+            password = content.auth.password if content.auth and content.auth.password is not None else None
             read_documents = await self._aread(reader, path, name=content.name or path.name, password=password)
         else:
             read_documents = []
@@ -1304,7 +1319,7 @@ class Knowledge:
 
         if not content.id:
             content.id = generate_id(content.content_hash or "")
-        self._prepare_documents_for_insert(read_documents, content.id)
+        self._prepare_documents_for_insert(read_documents, content.id, metadata=content.metadata)
 
         await self._ahandle_vector_db_insert(content, read_documents, upsert)
 
@@ -1366,7 +1381,7 @@ class Knowledge:
         log_debug(f"Using Reader: {reader.__class__.__name__}")
 
         if reader:
-            password = content.auth.password if content.auth and content.auth.password else None
+            password = content.auth.password if content.auth and content.auth.password is not None else None
             read_documents = self._read(reader, path, name=content.name or path.name, password=password)
         else:
             read_documents = []
@@ -1385,7 +1400,7 @@ class Knowledge:
 
         if not content.id:
             content.id = generate_id(content.content_hash or "")
-        self._prepare_documents_for_insert(read_documents, content.id)
+        self._prepare_documents_for_insert(read_documents, content.id, metadata=content.metadata)
 
         self._handle_vector_db_insert(content, read_documents, upsert)
 
@@ -1485,7 +1500,7 @@ class Knowledge:
         if reader.__class__.__name__ == "YouTubeReader":
             read_documents = await reader.async_read(content.url, name=name)
         else:
-            password = content.auth.password if content.auth and content.auth.password else None
+            password = content.auth.password if content.auth and content.auth.password is not None else None
             source = bytes_content if bytes_content else content.url
             read_documents = await self._aread(reader, source, name=name, password=password)
 
@@ -1583,7 +1598,7 @@ class Knowledge:
         if reader.__class__.__name__ == "YouTubeReader":
             read_documents = reader.read(content.url, name=name)
         else:
-            password = content.auth.password if content.auth and content.auth.password else None
+            password = content.auth.password if content.auth and content.auth.password is not None else None
             source = bytes_content if bytes_content else content.url
             read_documents = self._read(reader, source, name=name, password=password)
 
@@ -1930,16 +1945,35 @@ class Knowledge:
 
         remote_content = content.remote_content
 
+        # Look up config if config_id is provided
+        config = None
+        if hasattr(remote_content, "config_id") and remote_content.config_id:
+            config = self._get_remote_config_by_id(remote_content.config_id)
+            if config is None:
+                log_warning(f"No config found for config_id: {remote_content.config_id}")
+
         if isinstance(remote_content, S3Content):
-            await self._aload_from_s3(content, upsert, skip_if_exists)
+            await self._aload_from_s3(content, upsert, skip_if_exists, config)
 
         elif isinstance(remote_content, GCSContent):
-            await self._aload_from_gcs(content, upsert, skip_if_exists)
+            await self._aload_from_gcs(content, upsert, skip_if_exists, config)
+
+        elif isinstance(remote_content, SharePointContent):
+            await self._aload_from_sharepoint(content, upsert, skip_if_exists, config)
+
+        elif isinstance(remote_content, GitHubContent):
+            await self._aload_from_github(content, upsert, skip_if_exists, config)
 
         else:
             log_warning(f"Unsupported remote content type: {type(remote_content)}")
 
-    async def _aload_from_s3(
+    async def _aload_from_s3(
+        self,
+        content: Content,
+        upsert: bool,
+        skip_if_exists: bool,
+        config: Optional[RemoteContentConfig] = None,
+    ):
         """Load the contextual S3 content.
 
         1. Identify objects to read
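The dispatch above resolves credentials by looking up remote_content.config_id against the configured content sources before calling the per-provider loaders. A rough sketch of what attaching such a source to a Content item might look like, assuming the constructors accept the attribute names read in this diff (config_id, folder_path, and branch are shown as attributes, not confirmed keyword arguments):

    from agno.knowledge.content import Content
    from agno.knowledge.remote_content.remote_content import GitHubContent

    content = Content(
        name="architecture-docs",
        remote_content=GitHubContent(
            config_id="docs-repo",  # matches the id of a GitHubConfig in content_sources
            folder_path="docs/",
            branch="main",
        ),
    )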
@@ -1951,22 +1985,43 @@ class Knowledge:
         7. Prepare and insert the content in the vector database
         8. Remove temporary file if needed
         """
+        from agno.cloud.aws.s3.bucket import S3Bucket
         from agno.cloud.aws.s3.object import S3Object
 
+        # Note: S3 support has limited features compared to GitHub/SharePoint
+        log_warning(
+            "S3 content loading has limited features. "
+            "Recursive folder traversal, rich metadata, and improved naming are coming in a future release."
+        )
+
         remote_content: S3Content = cast(S3Content, content.remote_content)
 
+        # Get or create bucket with credentials from config
+        bucket = remote_content.bucket
+        try:
+            if bucket is None and remote_content.bucket_name:
+                s3_config = cast(S3Config, config) if isinstance(config, S3Config) else None
+                bucket = S3Bucket(
+                    name=remote_content.bucket_name,
+                    region=s3_config.region if s3_config else None,
+                    aws_access_key_id=s3_config.aws_access_key_id if s3_config else None,
+                    aws_secret_access_key=s3_config.aws_secret_access_key if s3_config else None,
+                )
+        except Exception as e:
+            log_error(f"Error getting bucket: {e}")
+
         # 1. Identify objects to read
         objects_to_read: List[S3Object] = []
-        if
+        if bucket is not None:
             if remote_content.key is not None:
-                _object = S3Object(bucket_name=
+                _object = S3Object(bucket_name=bucket.name, name=remote_content.key)
                 objects_to_read.append(_object)
             elif remote_content.object is not None:
                 objects_to_read.append(remote_content.object)
             elif remote_content.prefix is not None:
-                objects_to_read.extend(
+                objects_to_read.extend(bucket.get_objects(prefix=remote_content.prefix))
             else:
-                objects_to_read.extend(
+                objects_to_read.extend(bucket.get_objects())
 
         for s3_object in objects_to_read:
             # 2. Setup Content object
@@ -2008,16 +2063,20 @@ class Knowledge:
             read_documents = await reader.async_read(readable_content, name=obj_name)
 
             # 7. Prepare and insert the content in the vector database
-
-            content.id = generate_id(content.content_hash or "")
-            self._prepare_documents_for_insert(read_documents, content.id)
+            self._prepare_documents_for_insert(read_documents, content_entry.id)
             await self._ahandle_vector_db_insert(content_entry, read_documents, upsert)
 
             # 8. Remove temporary file if needed
             if temporary_file:
                 temporary_file.unlink()
 
-    async def _aload_from_gcs(
+    async def _aload_from_gcs(
+        self,
+        content: Content,
+        upsert: bool,
+        skip_if_exists: bool,
+        config: Optional[RemoteContentConfig] = None,
+    ):
         """Load the contextual GCS content.
 
         1. Identify objects to read
@@ -2028,16 +2087,42 @@ class Knowledge:
         6. Read the content
         7. Prepare and insert the content in the vector database
         """
+        try:
+            from google.cloud import storage  # type: ignore
+        except ImportError:
+            raise ImportError(
+                "The `google-cloud-storage` package is not installed. "
+                "Please install it via `pip install google-cloud-storage`."
+            )
+
+        # Note: GCS support has limited features compared to GitHub/SharePoint
+        log_warning(
+            "GCS content loading has limited features. "
+            "Recursive folder traversal, rich metadata, and improved naming are coming in a future release."
+        )
+
         remote_content: GCSContent = cast(GCSContent, content.remote_content)
 
+        # Get or create bucket with credentials from config
+        bucket = remote_content.bucket
+        if bucket is None and remote_content.bucket_name:
+            gcs_config = cast(GcsConfig, config) if isinstance(config, GcsConfig) else None
+            if gcs_config and gcs_config.credentials_path:
+                client = storage.Client.from_service_account_json(gcs_config.credentials_path)
+            elif gcs_config and gcs_config.project:
+                client = storage.Client(project=gcs_config.project)
+            else:
+                client = storage.Client()
+            bucket = client.bucket(remote_content.bucket_name)
+
         # 1. Identify objects to read
         objects_to_read = []
         if remote_content.blob_name is not None:
-            objects_to_read.append(
+            objects_to_read.append(bucket.blob(remote_content.blob_name))  # type: ignore
         elif remote_content.prefix is not None:
-            objects_to_read.extend(
+            objects_to_read.extend(bucket.list_blobs(prefix=remote_content.prefix))  # type: ignore
         else:
-            objects_to_read.extend(
+            objects_to_read.extend(bucket.list_blobs())  # type: ignore
 
         for gcs_object in objects_to_read:
             # 2. Setup Content object
@@ -2070,9 +2155,7 @@ class Knowledge:
             read_documents = await reader.async_read(readable_content, name=name)
 
             # 7. Prepare and insert the content in the vector database
-
-            content.id = generate_id(content.content_hash or "")
-            self._prepare_documents_for_insert(read_documents, content.id)
+            self._prepare_documents_for_insert(read_documents, content_entry.id)
             await self._ahandle_vector_db_insert(content_entry, read_documents, upsert)
 
     def _load_from_remote_content(
@@ -2088,16 +2171,35 @@ class Knowledge:
 
         remote_content = content.remote_content
 
+        # Look up config if config_id is provided
+        config = None
+        if hasattr(remote_content, "config_id") and remote_content.config_id:
+            config = self._get_remote_config_by_id(remote_content.config_id)
+            if config is None:
+                log_warning(f"No config found for config_id: {remote_content.config_id}")
+
         if isinstance(remote_content, S3Content):
-            self._load_from_s3(content, upsert, skip_if_exists)
+            self._load_from_s3(content, upsert, skip_if_exists, config)
 
         elif isinstance(remote_content, GCSContent):
-            self._load_from_gcs(content, upsert, skip_if_exists)
+            self._load_from_gcs(content, upsert, skip_if_exists, config)
+
+        elif isinstance(remote_content, SharePointContent):
+            self._load_from_sharepoint(content, upsert, skip_if_exists, config)
+
+        elif isinstance(remote_content, GitHubContent):
+            self._load_from_github(content, upsert, skip_if_exists, config)
 
         else:
             log_warning(f"Unsupported remote content type: {type(remote_content)}")
 
-    def _load_from_s3(
+    def _load_from_s3(
+        self,
+        content: Content,
+        upsert: bool,
+        skip_if_exists: bool,
+        config: Optional[RemoteContentConfig] = None,
+    ):
         """Synchronous version of _load_from_s3.
 
         Load the contextual S3 content:
@@ -2110,22 +2212,40 @@ class Knowledge:
         7. Prepare and insert the content in the vector database
         8. Remove temporary file if needed
         """
+        from agno.cloud.aws.s3.bucket import S3Bucket
         from agno.cloud.aws.s3.object import S3Object
 
+        # Note: S3 support has limited features compared to GitHub/SharePoint
+        log_warning(
+            "S3 content loading has limited features. "
+            "Recursive folder traversal, rich metadata, and improved naming are coming in a future release."
+        )
+
         remote_content: S3Content = cast(S3Content, content.remote_content)
 
+        # Get or create bucket with credentials from config
+        bucket = remote_content.bucket
+        if bucket is None and remote_content.bucket_name:
+            s3_config = cast(S3Config, config) if isinstance(config, S3Config) else None
+            bucket = S3Bucket(
+                name=remote_content.bucket_name,
+                region=s3_config.region if s3_config else None,
+                aws_access_key_id=s3_config.aws_access_key_id if s3_config else None,
+                aws_secret_access_key=s3_config.aws_secret_access_key if s3_config else None,
+            )
+
         # 1. Identify objects to read
         objects_to_read: List[S3Object] = []
-        if
+        if bucket is not None:
             if remote_content.key is not None:
-                _object = S3Object(bucket_name=
+                _object = S3Object(bucket_name=bucket.name, name=remote_content.key)
                 objects_to_read.append(_object)
             elif remote_content.object is not None:
                 objects_to_read.append(remote_content.object)
             elif remote_content.prefix is not None:
-                objects_to_read.extend(
+                objects_to_read.extend(bucket.get_objects(prefix=remote_content.prefix))
             else:
-                objects_to_read.extend(
+                objects_to_read.extend(bucket.get_objects())
 
         for s3_object in objects_to_read:
             # 2. Setup Content object
@@ -2167,16 +2287,20 @@ class Knowledge:
             read_documents = reader.read(readable_content, name=obj_name)
 
             # 7. Prepare and insert the content in the vector database
-
-            content.id = generate_id(content.content_hash or "")
-            self._prepare_documents_for_insert(read_documents, content.id)
+            self._prepare_documents_for_insert(read_documents, content_entry.id)
             self._handle_vector_db_insert(content_entry, read_documents, upsert)
 
             # 8. Remove temporary file if needed
             if temporary_file:
                 temporary_file.unlink()
 
-    def _load_from_gcs(
+    def _load_from_gcs(
+        self,
+        content: Content,
+        upsert: bool,
+        skip_if_exists: bool,
+        config: Optional[RemoteContentConfig] = None,
+    ):
         """Synchronous version of _load_from_gcs.
 
         Load the contextual GCS content:
@@ -2188,16 +2312,42 @@ class Knowledge:
         6. Read the content
         7. Prepare and insert the content in the vector database
         """
+        try:
+            from google.cloud import storage  # type: ignore
+        except ImportError:
+            raise ImportError(
+                "The `google-cloud-storage` package is not installed. "
+                "Please install it via `pip install google-cloud-storage`."
+            )
+
+        # Note: GCS support has limited features compared to GitHub/SharePoint
+        log_warning(
+            "GCS content loading has limited features. "
+            "Recursive folder traversal, rich metadata, and improved naming are coming in a future release."
+        )
+
         remote_content: GCSContent = cast(GCSContent, content.remote_content)
 
+        # Get or create bucket with credentials from config
+        bucket = remote_content.bucket
+        if bucket is None and remote_content.bucket_name:
+            gcs_config = cast(GcsConfig, config) if isinstance(config, GcsConfig) else None
+            if gcs_config and gcs_config.credentials_path:
+                client = storage.Client.from_service_account_json(gcs_config.credentials_path)
+            elif gcs_config and gcs_config.project:
+                client = storage.Client(project=gcs_config.project)
+            else:
+                client = storage.Client()
+            bucket = client.bucket(remote_content.bucket_name)
+
         # 1. Identify objects to read
         objects_to_read = []
         if remote_content.blob_name is not None:
-            objects_to_read.append(
+            objects_to_read.append(bucket.blob(remote_content.blob_name))  # type: ignore
         elif remote_content.prefix is not None:
-            objects_to_read.extend(
+            objects_to_read.extend(bucket.list_blobs(prefix=remote_content.prefix))  # type: ignore
         else:
-            objects_to_read.extend(
+            objects_to_read.extend(bucket.list_blobs())  # type: ignore
 
         for gcs_object in objects_to_read:
             # 2. Setup Content object
@@ -2230,11 +2380,876 @@ class Knowledge:
             read_documents = reader.read(readable_content, name=name)
 
             # 7. Prepare and insert the content in the vector database
-
-            content.id = generate_id(content.content_hash or "")
-            self._prepare_documents_for_insert(read_documents, content.id)
+            self._prepare_documents_for_insert(read_documents, content_entry.id)
             self._handle_vector_db_insert(content_entry, read_documents, upsert)
 
+    # --- SharePoint loaders ---
+
+    def _get_sharepoint_access_token(self, sp_config: SharePointConfig) -> Optional[str]:
+        """Get an access token for Microsoft Graph API using client credentials flow.
+
+        Requires the `msal` package: pip install msal
+        """
+        try:
+            from msal import ConfidentialClientApplication  # type: ignore
+        except ImportError:
+            raise ImportError("The `msal` package is not installed. Please install it via `pip install msal`.")
+
+        authority = f"https://login.microsoftonline.com/{sp_config.tenant_id}"
+        app = ConfidentialClientApplication(
+            sp_config.client_id,
+            authority=authority,
+            client_credential=sp_config.client_secret,
+        )
+
+        # Acquire token for Microsoft Graph
+        scopes = ["https://graph.microsoft.com/.default"]
+        result = app.acquire_token_for_client(scopes=scopes)
+
+        if "access_token" in result:
+            return result["access_token"]
+        else:
+            log_error(f"Failed to acquire SharePoint token: {result.get('error_description', result.get('error'))}")
+            return None
+
+    def _get_sharepoint_site_id(self, hostname: str, site_path: Optional[str], access_token: str) -> Optional[str]:
+        """Get the SharePoint site ID using Microsoft Graph API."""
+        import httpx
+
+        if site_path:
+            url = f"https://graph.microsoft.com/v1.0/sites/{hostname}:/{site_path}"
+        else:
+            url = f"https://graph.microsoft.com/v1.0/sites/{hostname}"
+
+        headers = {"Authorization": f"Bearer {access_token}"}
+
+        try:
+            response = httpx.get(url, headers=headers)
+            response.raise_for_status()
+            return response.json().get("id")
+        except httpx.HTTPStatusError as e:
+            log_error(f"Failed to get SharePoint site ID: {e.response.status_code} - {e.response.text}")
+            return None
+
+    def _list_sharepoint_folder_items(self, site_id: str, folder_path: str, access_token: str) -> List[dict]:
+        """List all items in a SharePoint folder."""
+        import httpx
+
+        # Strip leading slashes to avoid double-slash in URL
+        folder_path = folder_path.lstrip("/")
+        url = f"https://graph.microsoft.com/v1.0/sites/{site_id}/drive/root:/{folder_path}:/children"
+        headers = {"Authorization": f"Bearer {access_token}"}
+        items: List[dict] = []
+
+        try:
+            while url:
+                response = httpx.get(url, headers=headers)
+                response.raise_for_status()
+                data = response.json()
+                items.extend(data.get("value", []))
+                url = data.get("@odata.nextLink")
+        except httpx.HTTPStatusError as e:
+            log_error(f"Failed to list SharePoint folder: {e.response.status_code} - {e.response.text}")
+
+        return items
+
+    def _download_sharepoint_file(self, site_id: str, file_path: str, access_token: str) -> Optional[BytesIO]:
+        """Download a file from SharePoint."""
+        import httpx
+
+        # Strip leading slashes to avoid double-slash in URL
+        file_path = file_path.lstrip("/")
+        url = f"https://graph.microsoft.com/v1.0/sites/{site_id}/drive/root:/{file_path}:/content"
+        headers = {"Authorization": f"Bearer {access_token}"}
+
+        try:
+            response = httpx.get(url, headers=headers, follow_redirects=True)
+            response.raise_for_status()
+            return BytesIO(response.content)
+        except httpx.HTTPStatusError as e:
+            log_error(f"Failed to download SharePoint file {file_path}: {e.response.status_code} - {e.response.text}")
+            return None
+
+    async def _aget_sharepoint_site_id(
+        self, hostname: str, site_path: Optional[str], access_token: str
+    ) -> Optional[str]:
+        """Get the SharePoint site ID using Microsoft Graph API (async)."""
+        import httpx
+
+        if site_path:
+            url = f"https://graph.microsoft.com/v1.0/sites/{hostname}:/{site_path}"
+        else:
+            url = f"https://graph.microsoft.com/v1.0/sites/{hostname}"
+
+        headers = {"Authorization": f"Bearer {access_token}"}
+
+        try:
+            async with httpx.AsyncClient() as client:
+                response = await client.get(url, headers=headers)
+                response.raise_for_status()
+                return response.json().get("id")
+        except httpx.HTTPStatusError as e:
+            log_error(f"Failed to get SharePoint site ID: {e.response.status_code} - {e.response.text}")
+            return None
+
+    async def _alist_sharepoint_folder_items(self, site_id: str, folder_path: str, access_token: str) -> List[dict]:
+        """List all items in a SharePoint folder (async)."""
+        import httpx
+
+        # Strip leading slashes to avoid double-slash in URL
+        folder_path = folder_path.lstrip("/")
+        url: Optional[str] = f"https://graph.microsoft.com/v1.0/sites/{site_id}/drive/root:/{folder_path}:/children"
+        headers = {"Authorization": f"Bearer {access_token}"}
+        items: List[dict] = []
+
+        try:
+            async with httpx.AsyncClient() as client:
+                while url:
+                    response = await client.get(url, headers=headers)
+                    response.raise_for_status()
+                    data = response.json()
+                    items.extend(data.get("value", []))
+                    url = data.get("@odata.nextLink")
+        except httpx.HTTPStatusError as e:
+            log_error(f"Failed to list SharePoint folder: {e.response.status_code} - {e.response.text}")
+
+        return items
+
+    async def _adownload_sharepoint_file(self, site_id: str, file_path: str, access_token: str) -> Optional[BytesIO]:
+        """Download a file from SharePoint (async)."""
+        import httpx
+
+        # Strip leading slashes to avoid double-slash in URL
+        file_path = file_path.lstrip("/")
+        url = f"https://graph.microsoft.com/v1.0/sites/{site_id}/drive/root:/{file_path}:/content"
+        headers = {"Authorization": f"Bearer {access_token}"}
+
+        try:
+            async with httpx.AsyncClient() as client:
+                response = await client.get(url, headers=headers, follow_redirects=True)
+                response.raise_for_status()
+                return BytesIO(response.content)
+        except httpx.HTTPStatusError as e:
+            log_error(f"Failed to download SharePoint file {file_path}: {e.response.status_code} - {e.response.text}")
+            return None
+
+    async def _aload_from_sharepoint(
+        self,
+        content: Content,
+        upsert: bool,
+        skip_if_exists: bool,
+        config: Optional[RemoteContentConfig] = None,
+    ):
+        """Load content from SharePoint.
+
+        Requires the SharePoint config to contain tenant_id, client_id, client_secret, and hostname.
+
+        1. Authenticate with Microsoft Graph using client credentials
+        2. Get site ID from hostname/site_path
+        3. Download file(s) from file_path or folder_path
+        4. Process through reader and insert to vector db
+        """
+        remote_content: SharePointContent = cast(SharePointContent, content.remote_content)
+        sp_config = cast(SharePointConfig, config) if isinstance(config, SharePointConfig) else None
+
+        if sp_config is None:
+            log_error(f"SharePoint config not found for config_id: {remote_content.config_id}")
+            return
+
+        # 1. Get access token
+        access_token = self._get_sharepoint_access_token(sp_config)
+        if not access_token:
+            return
+
+        # 2. Get site ID - use config value if provided, otherwise fetch via API
+        site_id: Optional[str] = sp_config.site_id
+        if not site_id:
+            site_path = remote_content.site_path or sp_config.site_path
+            site_id = await self._aget_sharepoint_site_id(sp_config.hostname, site_path, access_token)
+            if not site_id:
+                log_error(f"Failed to get SharePoint site ID for {sp_config.hostname}/{site_path}")
+                return
+
+        # 3. Identify files to download
+        files_to_process: List[tuple] = []  # List of (file_path, file_name)
+
+        # Helper function to recursively list all files in a folder
+        async def list_files_recursive(folder: str) -> List[tuple]:
+            """Recursively list all files in a SharePoint folder."""
+            files: List[tuple] = []
+            items = await self._alist_sharepoint_folder_items(site_id, folder, access_token)
+            for item in items:
+                if "file" in item:  # It's a file
+                    item_path = f"{folder}/{item['name']}"
+                    files.append((item_path, item["name"]))
+                elif "folder" in item:  # It's a folder - recurse
+                    subdir_path = f"{folder}/{item['name']}"
+                    subdir_files = await list_files_recursive(subdir_path)
+                    files.extend(subdir_files)
+            return files
+
+        # Get the path to process (file_path or folder_path)
+        path_to_process = (remote_content.file_path or remote_content.folder_path or "").strip("/")
+
+        if path_to_process:
+            # Check if path is a file or folder by getting item metadata
+            try:
+                async with AsyncClient() as client:
+                    url = f"https://graph.microsoft.com/v1.0/sites/{site_id}/drive/root:/{path_to_process}"
+                    headers = {"Authorization": f"Bearer {access_token}"}
+                    response = await client.get(url, headers=headers, timeout=30.0)
+                    response.raise_for_status()
+                    item_data = response.json()
+
+                if "folder" in item_data:
+                    # It's a folder - recursively list all files
+                    files_to_process = await list_files_recursive(path_to_process)
+                elif "file" in item_data:
+                    # It's a single file
+                    files_to_process.append((path_to_process, item_data["name"]))
+                else:
+                    log_warning(f"SharePoint path {path_to_process} is neither file nor folder")
+                    return
+            except Exception as e:
+                log_error(f"Error checking SharePoint path {path_to_process}: {e}")
+                return
+
+        if not files_to_process:
+            log_warning(f"No files found at SharePoint path: {path_to_process}")
+            return
+
+        # 4. Process each file
+        for file_path, file_name in files_to_process:
+            # Build a unique virtual path for hashing (ensures different files don't collide)
+            virtual_path = f"sharepoint://{sp_config.hostname}/{site_id}/{file_path}"
+
+            # Build metadata with all info needed to re-fetch the file
+            sharepoint_metadata = {
+                "source_type": "sharepoint",
+                "source_config_id": sp_config.id,
+                "source_config_name": sp_config.name,
+                "sharepoint_hostname": sp_config.hostname,
+                "sharepoint_site_id": site_id,
+                "sharepoint_path": file_path,
+                "sharepoint_filename": file_name,
+            }
+            # Merge with user-provided metadata (user metadata takes precedence)
+            merged_metadata = {**sharepoint_metadata, **(content.metadata or {})}
+
+            # Setup Content object
+            # Naming: for folders, use relative path; for single files, use user name or filename
+            is_folder_upload = len(files_to_process) > 1
+            if is_folder_upload:
+                # Compute relative path from the upload root
+                relative_path = file_path
+                if path_to_process and file_path.startswith(path_to_process + "/"):
+                    relative_path = file_path[len(path_to_process) + 1 :]
+                # If user provided a name, prefix it; otherwise use full file path
+                content_name = f"{content.name}/{relative_path}" if content.name else file_path
+            else:
+                # Single file: use user's name or the filename
+                content_name = content.name or file_name
+            content_entry = Content(
+                name=content_name,
+                description=content.description,
+                path=virtual_path,  # Include path for unique hashing
+                status=ContentStatus.PROCESSING,
+                metadata=merged_metadata,
+                file_type="sharepoint",
+            )
+
+            # Hash content and add to contents database
+            content_entry.content_hash = self._build_content_hash(content_entry)
+            content_entry.id = generate_id(content_entry.content_hash)
+            await self._ainsert_contents_db(content_entry)
+            if self._should_skip(content_entry.content_hash, skip_if_exists):
+                content_entry.status = ContentStatus.COMPLETED
+                await self._aupdate_content(content_entry)
+                continue
+
+            # Select reader based on file extension
+            reader = self._select_reader_by_uri(file_name, content.reader)
+            reader = cast(Reader, reader)
+
+            # Download file
+            file_content = await self._adownload_sharepoint_file(site_id, file_path, access_token)
+            if not file_content:
+                content_entry.status = ContentStatus.FAILED
+                await self._aupdate_content(content_entry)
+                continue
+
+            # Read the content
+            read_documents = await reader.async_read(file_content, name=file_name)
+
+            # Prepare and insert to vector database
+            self._prepare_documents_for_insert(read_documents, content_entry.id)
+            await self._ahandle_vector_db_insert(content_entry, read_documents, upsert)
+
+    def _load_from_sharepoint(
+        self,
+        content: Content,
+        upsert: bool,
+        skip_if_exists: bool,
+        config: Optional[RemoteContentConfig] = None,
+    ):
+        """Synchronous version of _load_from_sharepoint.
+
+        Load content from SharePoint:
+        1. Authenticate with Microsoft Graph using client credentials
+        2. Get site ID from hostname/site_path
+        3. Download file(s) from file_path or folder_path
+        4. Process through reader and insert to vector db
+        """
+        remote_content: SharePointContent = cast(SharePointContent, content.remote_content)
+        sp_config = cast(SharePointConfig, config) if isinstance(config, SharePointConfig) else None
+
+        if sp_config is None:
+            log_error(f"SharePoint config not found for config_id: {remote_content.config_id}")
+            return
+
+        # 1. Get access token
+        access_token = self._get_sharepoint_access_token(sp_config)
+        if not access_token:
+            return
+
+        # 2. Get site ID - use config value if provided, otherwise fetch via API
+        site_id: Optional[str] = sp_config.site_id
+        if not site_id:
+            site_path = remote_content.site_path or sp_config.site_path
+            site_id = self._get_sharepoint_site_id(sp_config.hostname, site_path, access_token)
+            if not site_id:
+                log_error(f"Failed to get SharePoint site ID for {sp_config.hostname}/{site_path}")
+                return
+
+        # 3. Identify files to download
+        files_to_process: List[tuple] = []  # List of (file_path, file_name)
+
+        # Helper function to recursively list all files in a folder
+        def list_files_recursive(folder: str) -> List[tuple]:
+            """Recursively list all files in a SharePoint folder."""
+            files: List[tuple] = []
+            items = self._list_sharepoint_folder_items(site_id, folder, access_token)
+            for item in items:
+                if "file" in item:  # It's a file
+                    item_path = f"{folder}/{item['name']}"
+                    files.append((item_path, item["name"]))
+                elif "folder" in item:  # It's a folder - recurse
+                    subdir_path = f"{folder}/{item['name']}"
+                    subdir_files = list_files_recursive(subdir_path)
+                    files.extend(subdir_files)
+            return files
+
+        # Get the path to process (file_path or folder_path)
+        path_to_process = (remote_content.file_path or remote_content.folder_path or "").strip("/")
+
+        if path_to_process:
+            # Check if path is a file or folder by getting item metadata
+            try:
+                with httpx.Client() as client:
+                    url = f"https://graph.microsoft.com/v1.0/sites/{site_id}/drive/root:/{path_to_process}"
+                    headers = {"Authorization": f"Bearer {access_token}"}
+                    response = client.get(url, headers=headers, timeout=30.0)
+                    response.raise_for_status()
+                    item_data = response.json()
+
+                if "folder" in item_data:
+                    # It's a folder - recursively list all files
+                    files_to_process = list_files_recursive(path_to_process)
+                elif "file" in item_data:
+                    # It's a single file
+                    files_to_process.append((path_to_process, item_data["name"]))
+                else:
+                    log_warning(f"SharePoint path {path_to_process} is neither file nor folder")
+                    return
+            except Exception as e:
+                log_error(f"Error checking SharePoint path {path_to_process}: {e}")
+                return
+
+        if not files_to_process:
+            log_warning(f"No files found at SharePoint path: {path_to_process}")
+            return
+
+        # 4. Process each file
+        for file_path, file_name in files_to_process:
+            # Build a unique virtual path for hashing (ensures different files don't collide)
+            virtual_path = f"sharepoint://{sp_config.hostname}/{site_id}/{file_path}"
+
+            # Build metadata with all info needed to re-fetch the file
+            sharepoint_metadata = {
+                "source_type": "sharepoint",
+                "source_config_id": sp_config.id,
+                "source_config_name": sp_config.name,
+                "sharepoint_hostname": sp_config.hostname,
+                "sharepoint_site_id": site_id,
+                "sharepoint_path": file_path,
+                "sharepoint_filename": file_name,
+            }
+            # Merge with user-provided metadata (user metadata takes precedence)
+            merged_metadata = {**sharepoint_metadata, **(content.metadata or {})}
+
+            # Setup Content object
+            # Naming: for folders, use relative path; for single files, use user name or filename
+            is_folder_upload = len(files_to_process) > 1
+            if is_folder_upload:
+                # Compute relative path from the upload root
+                relative_path = file_path
+                if path_to_process and file_path.startswith(path_to_process + "/"):
+                    relative_path = file_path[len(path_to_process) + 1 :]
+                # If user provided a name, prefix it; otherwise use full file path
+                content_name = f"{content.name}/{relative_path}" if content.name else file_path
+            else:
+                # Single file: use user's name or the filename
+                content_name = content.name or file_name
+            content_entry = Content(
+                name=content_name,
+                description=content.description,
+                path=virtual_path,  # Include path for unique hashing
+                status=ContentStatus.PROCESSING,
+                metadata=merged_metadata,
+                file_type="sharepoint",
+            )
+
+            # Hash content and add to contents database
+            content_entry.content_hash = self._build_content_hash(content_entry)
+            content_entry.id = generate_id(content_entry.content_hash)
+            self._insert_contents_db(content_entry)
+            if self._should_skip(content_entry.content_hash, skip_if_exists):
+                content_entry.status = ContentStatus.COMPLETED
+                self._update_content(content_entry)
+                continue
+
+            # Select reader based on file extension
+            reader = self._select_reader_by_uri(file_name, content.reader)
+            reader = cast(Reader, reader)
+
+            # Download file
+            file_content = self._download_sharepoint_file(site_id, file_path, access_token)
+            if not file_content:
+                content_entry.status = ContentStatus.FAILED
+                self._update_content(content_entry)
+                continue
+
+            # Read the content
+            read_documents = reader.read(file_content, name=file_name)
+
+            # Prepare and insert to vector database
+            self._prepare_documents_for_insert(read_documents, content_entry.id)
+            self._handle_vector_db_insert(content_entry, read_documents, upsert)
+
+    # --- GitHub loaders ---
+
+    async def _aload_from_github(
+        self,
+        content: Content,
+        upsert: bool,
+        skip_if_exists: bool,
+        config: Optional[RemoteContentConfig] = None,
+    ):
+        """Load content from GitHub.
+
+        Requires the GitHub config to contain repo and optionally token for private repos.
+        Uses the GitHub API to fetch file contents.
+        """
+        remote_content: GitHubContent = cast(GitHubContent, content.remote_content)
+        gh_config = cast(GitHubConfig, config) if isinstance(config, GitHubConfig) else None
+
+        if gh_config is None:
+            log_error(f"GitHub config not found for config_id: {remote_content.config_id}")
+            return
+
+        # Build headers for GitHub API
+        headers = {
+            "Accept": "application/vnd.github.v3+json",
+            "User-Agent": "Agno-Knowledge",
+        }
+        if gh_config.token:
+            headers["Authorization"] = f"Bearer {gh_config.token}"
+
+        branch = remote_content.branch or gh_config.branch or "main"
+
+        # Get list of files to process
+        files_to_process: List[Dict[str, str]] = []
+
+        async with AsyncClient() as client:
+            # Helper function to recursively list all files in a folder
+            async def list_files_recursive(folder: str) -> List[Dict[str, str]]:
+                """Recursively list all files in a GitHub folder."""
+                files: List[Dict[str, str]] = []
+                api_url = f"https://api.github.com/repos/{gh_config.repo}/contents/{folder}"
+                if branch:
+                    api_url += f"?ref={branch}"
+
+                try:
+                    response = await client.get(api_url, headers=headers, timeout=30.0)
+                    response.raise_for_status()
+                    items = response.json()
+
+                    # If items is not a list, it's a single file response
+                    if not isinstance(items, list):
+                        items = [items]
+
+                    for item in items:
+                        if item.get("type") == "file":
+                            files.append(
+                                {
+                                    "path": item["path"],
+                                    "name": item["name"],
+                                }
+                            )
+                        elif item.get("type") == "dir":
+                            # Recursively get files from subdirectory
+                            subdir_files = await list_files_recursive(item["path"])
+                            files.extend(subdir_files)
+                except Exception as e:
+                    log_error(f"Error listing GitHub folder {folder}: {e}")
+
+                return files
+
+            # Get the path to process (file_path or folder_path)
+            path_to_process = (remote_content.file_path or remote_content.folder_path or "").rstrip("/")
+
+            if path_to_process:
+                # Fetch the path to determine if it's a file or directory
+                api_url = f"https://api.github.com/repos/{gh_config.repo}/contents/{path_to_process}"
+                if branch:
+                    api_url += f"?ref={branch}"
+
+                try:
+                    response = await client.get(api_url, headers=headers, timeout=30.0)
+                    response.raise_for_status()
+                    path_data = response.json()
+
+                    if isinstance(path_data, list):
+                        # It's a directory - recursively list all files
+                        for item in path_data:
+                            if item.get("type") == "file":
+                                files_to_process.append({"path": item["path"], "name": item["name"]})
+                            elif item.get("type") == "dir":
+                                subdir_files = await list_files_recursive(item["path"])
+                                files_to_process.extend(subdir_files)
+                    else:
+                        # It's a single file
+                        files_to_process.append(
+                            {
+                                "path": path_data["path"],
+                                "name": path_data["name"],
+                            }
+                        )
+                except Exception as e:
+                    log_error(f"Error fetching GitHub path {path_to_process}: {e}")
+                    return
+
+            if not files_to_process:
+                log_warning(f"No files found at GitHub path: {path_to_process}")
+                return
+
+            # Process each file
+            for file_info in files_to_process:
+                file_path = file_info["path"]
+                file_name = file_info["name"]
+
+                # Build a unique virtual path for hashing (ensures different files don't collide)
+                virtual_path = f"github://{gh_config.repo}/{branch}/{file_path}"
+
+                # Build metadata with all info needed to re-fetch the file
+                github_metadata = {
+                    "source_type": "github",
+                    "source_config_id": gh_config.id,
+                    "source_config_name": gh_config.name,
+                    "github_repo": gh_config.repo,
+                    "github_branch": branch,
+                    "github_path": file_path,
+                    "github_filename": file_name,
+                }
+                # Merge with user-provided metadata (user metadata takes precedence)
+                merged_metadata = {**github_metadata, **(content.metadata or {})}
+
+                # Setup Content object
+                # Naming: for folders, use relative path; for single files, use user name or filename
+                is_folder_upload = len(files_to_process) > 1
+                if is_folder_upload:
+                    # Compute relative path from the upload root
+                    relative_path = file_path
+                    if path_to_process and file_path.startswith(path_to_process + "/"):
+                        relative_path = file_path[len(path_to_process) + 1 :]
+                    # If user provided a name, prefix it; otherwise use full file path
+                    content_name = f"{content.name}/{relative_path}" if content.name else file_path
+                else:
+                    # Single file: use user's name or the filename
+                    content_name = content.name or file_name
+                content_entry = Content(
+                    name=content_name,
+                    description=content.description,
+                    path=virtual_path,  # Include path for unique hashing
+                    status=ContentStatus.PROCESSING,
+                    metadata=merged_metadata,
+                    file_type="github",
+                )
+
+                # Hash content and add to contents database
+                content_entry.content_hash = self._build_content_hash(content_entry)
+                content_entry.id = generate_id(content_entry.content_hash)
+                await self._ainsert_contents_db(content_entry)
+
+                if self._should_skip(content_entry.content_hash, skip_if_exists):
+                    content_entry.status = ContentStatus.COMPLETED
+                    await self._aupdate_content(content_entry)
+                    continue
+
+                # Fetch file content using GitHub API (works for private repos)
+                api_url = f"https://api.github.com/repos/{gh_config.repo}/contents/{file_path}"
+                if branch:
+                    api_url += f"?ref={branch}"
+                try:
+                    response = await client.get(api_url, headers=headers, timeout=30.0)
+                    response.raise_for_status()
+                    file_data = response.json()
+
+                    # GitHub API returns content as base64
+                    if file_data.get("encoding") == "base64":
+                        import base64
+
+                        file_content = base64.b64decode(file_data["content"])
+                    else:
+                        # For large files, GitHub returns a download_url
+                        download_url = file_data.get("download_url")
+                        if download_url:
+                            dl_response = await client.get(download_url, headers=headers, timeout=30.0)
+                            dl_response.raise_for_status()
+                            file_content = dl_response.content
+                        else:
+                            raise ValueError("No content or download_url in response")
+                except Exception as e:
+                    log_error(f"Error fetching GitHub file {file_path}: {e}")
+                    content_entry.status = ContentStatus.FAILED
+                    content_entry.status_message = str(e)
+                    await self._aupdate_content(content_entry)
+                    continue
+
+                # Select reader and read content
+                reader = self._select_reader_by_uri(file_name, content.reader)
+                if reader is None:
+                    log_warning(f"No reader found for file: {file_name}")
+                    content_entry.status = ContentStatus.FAILED
+                    content_entry.status_message = "No suitable reader found"
+                    await self._aupdate_content(content_entry)
+                    continue
+
+                reader = cast(Reader, reader)
+                readable_content = BytesIO(file_content)
+                read_documents = await reader.async_read(readable_content, name=file_name)
+
+                # Prepare and insert into vector database
+                if not content_entry.id:
+                    content_entry.id = generate_id(content_entry.content_hash or "")
+                self._prepare_documents_for_insert(read_documents, content_entry.id)
+                await self._ahandle_vector_db_insert(content_entry, read_documents, upsert)
+
+    def _load_from_github(
+        self,
+        content: Content,
+        upsert: bool,
+        skip_if_exists: bool,
+        config: Optional[RemoteContentConfig] = None,
+    ):
+        """Synchronous version of _load_from_github."""
+        import httpx
+
+        remote_content: GitHubContent = cast(GitHubContent, content.remote_content)
+        gh_config = cast(GitHubConfig, config) if isinstance(config, GitHubConfig) else None
+
+        if gh_config is None:
+            log_error(f"GitHub config not found for config_id: {remote_content.config_id}")
+            return
+
+        # Build headers for GitHub API
+        headers = {
+            "Accept": "application/vnd.github.v3+json",
+            "User-Agent": "Agno-Knowledge",
+        }
+        if gh_config.token:
+            headers["Authorization"] = f"Bearer {gh_config.token}"
+
+        branch = remote_content.branch or gh_config.branch or "main"
+
+        # Get list of files to process
+        files_to_process: List[Dict[str, str]] = []
+
+        with httpx.Client() as client:
+            # Helper function to recursively list all files in a folder
+            def list_files_recursive(folder: str) -> List[Dict[str, str]]:
+                """Recursively list all files in a GitHub folder."""
+                files: List[Dict[str, str]] = []
+                api_url = f"https://api.github.com/repos/{gh_config.repo}/contents/{folder}"
+                if branch:
+                    api_url += f"?ref={branch}"
+
+                try:
+                    response = client.get(api_url, headers=headers, timeout=30.0)
+                    response.raise_for_status()
+                    items = response.json()
+
+                    # If items is not a list, it's a single file response
+                    if not isinstance(items, list):
+                        items = [items]
+
+                    for item in items:
+                        if item.get("type") == "file":
+                            files.append(
+                                {
+                                    "path": item["path"],
+                                    "name": item["name"],
+                                }
+                            )
+                        elif item.get("type") == "dir":
+                            # Recursively get files from subdirectory
+                            subdir_files = list_files_recursive(item["path"])
+                            files.extend(subdir_files)
+                except Exception as e:
+                    log_error(f"Error listing GitHub folder {folder}: {e}")
+
+                return files
+
+            # Get the path to process (file_path or folder_path)
+            path_to_process = (remote_content.file_path or remote_content.folder_path or "").rstrip("/")
+
+            if path_to_process:
+                # Fetch the path to determine if it's a file or directory
+                api_url = f"https://api.github.com/repos/{gh_config.repo}/contents/{path_to_process}"
+                if branch:
+                    api_url += f"?ref={branch}"
+
+                try:
+                    response = client.get(api_url, headers=headers, timeout=30.0)
+                    response.raise_for_status()
+                    path_data = response.json()
+
+                    if isinstance(path_data, list):
+                        # It's a directory - recursively list all files
+                        for item in path_data:
+                            if item.get("type") == "file":
+                                files_to_process.append({"path": item["path"], "name": item["name"]})
+                            elif item.get("type") == "dir":
+                                subdir_files = list_files_recursive(item["path"])
+                                files_to_process.extend(subdir_files)
+                    else:
+                        # It's a single file
+                        files_to_process.append(
+                            {
+                                "path": path_data["path"],
+                                "name": path_data["name"],
+                            }
+                        )
+                except Exception as e:
+                    log_error(f"Error fetching GitHub path {path_to_process}: {e}")
+                    return
+
+            if not files_to_process:
+                log_warning(f"No files found at GitHub path: {path_to_process}")
+                return
+
+            # Process each file
+            for file_info in files_to_process:
+                file_path = file_info["path"]
+                file_name = file_info["name"]
+
+                # Build a unique virtual path for hashing (ensures different files don't collide)
+                virtual_path = f"github://{gh_config.repo}/{branch}/{file_path}"
+
+                # Build metadata with all info needed to re-fetch the file
+                github_metadata = {
+                    "source_type": "github",
+                    "source_config_id": gh_config.id,
+                    "source_config_name": gh_config.name,
+                    "github_repo": gh_config.repo,
+                    "github_branch": branch,
+                    "github_path": file_path,
+                    "github_filename": file_name,
+                }
+                # Merge with user-provided metadata (user metadata takes precedence)
+                merged_metadata = {**github_metadata, **(content.metadata or {})}
+
+                # Setup Content object
+                # Naming: for folders, use relative path; for single files, use user name or filename
+                is_folder_upload = len(files_to_process) > 1
+                if is_folder_upload:
+                    # Compute relative path from the upload root
+                    relative_path = file_path
+                    if path_to_process and file_path.startswith(path_to_process + "/"):
|
|
3179
|
+
relative_path = file_path[len(path_to_process) + 1 :]
|
|
3180
|
+
# If user provided a name, prefix it; otherwise use full file path
|
|
3181
|
+
content_name = f"{content.name}/{relative_path}" if content.name else file_path
|
|
3182
|
+
else:
|
|
3183
|
+
# Single file: use user's name or the filename
|
|
3184
|
+
content_name = content.name or file_name
|
|
3185
|
+
content_entry = Content(
|
|
3186
|
+
name=content_name,
|
|
3187
|
+
description=content.description,
|
|
3188
|
+
path=virtual_path, # Include path for unique hashing
|
|
3189
|
+
status=ContentStatus.PROCESSING,
|
|
3190
|
+
metadata=merged_metadata,
|
|
3191
|
+
file_type="github",
|
|
3192
|
+
)
|
|
3193
|
+
|
|
3194
|
+
# Hash content and add to contents database
|
|
3195
|
+
content_entry.content_hash = self._build_content_hash(content_entry)
|
|
3196
|
+
content_entry.id = generate_id(content_entry.content_hash)
|
|
3197
|
+
self._insert_contents_db(content_entry)
|
|
3198
|
+
|
|
3199
|
+
if self._should_skip(content_entry.content_hash, skip_if_exists):
|
|
3200
|
+
content_entry.status = ContentStatus.COMPLETED
|
|
3201
|
+
self._update_content(content_entry)
|
|
3202
|
+
continue
|
|
3203
|
+
|
|
3204
|
+
# Fetch file content using GitHub API (works for private repos)
|
|
3205
|
+
api_url = f"https://api.github.com/repos/{gh_config.repo}/contents/{file_path}"
|
|
3206
|
+
if branch:
|
|
3207
|
+
api_url += f"?ref={branch}"
|
|
3208
|
+
try:
|
|
3209
|
+
response = client.get(api_url, headers=headers, timeout=30.0)
|
|
3210
|
+
response.raise_for_status()
|
|
3211
|
+
file_data = response.json()
|
|
3212
|
+
|
|
3213
|
+
# GitHub API returns content as base64
|
|
3214
|
+
if file_data.get("encoding") == "base64":
|
|
3215
|
+
import base64
|
|
3216
|
+
|
|
3217
|
+
file_content = base64.b64decode(file_data["content"])
|
|
3218
|
+
else:
|
|
3219
|
+
# For large files, GitHub returns a download_url
|
|
3220
|
+
download_url = file_data.get("download_url")
|
|
3221
|
+
if download_url:
|
|
3222
|
+
dl_response = client.get(download_url, headers=headers, timeout=30.0)
|
|
3223
|
+
dl_response.raise_for_status()
|
|
3224
|
+
file_content = dl_response.content
|
|
3225
|
+
else:
|
|
3226
|
+
raise ValueError("No content or download_url in response")
|
|
3227
|
+
except Exception as e:
|
|
3228
|
+
log_error(f"Error fetching GitHub file {file_path}: {e}")
|
|
3229
|
+
content_entry.status = ContentStatus.FAILED
|
|
3230
|
+
content_entry.status_message = str(e)
|
|
3231
|
+
self._update_content(content_entry)
|
|
3232
|
+
continue
|
|
3233
|
+
|
|
3234
|
+
# Select reader and read content
|
|
3235
|
+
reader = self._select_reader_by_uri(file_name, content.reader)
|
|
3236
|
+
if reader is None:
|
|
3237
|
+
log_warning(f"No reader found for file: {file_name}")
|
|
3238
|
+
content_entry.status = ContentStatus.FAILED
|
|
3239
|
+
content_entry.status_message = "No suitable reader found"
|
|
3240
|
+
self._update_content(content_entry)
|
|
3241
|
+
continue
|
|
3242
|
+
|
|
3243
|
+
reader = cast(Reader, reader)
|
|
3244
|
+
readable_content = BytesIO(file_content)
|
|
3245
|
+
read_documents = reader.read(readable_content, name=file_name)
|
|
3246
|
+
|
|
3247
|
+
# Prepare and insert into vector database
|
|
3248
|
+
if not content_entry.id:
|
|
3249
|
+
content_entry.id = generate_id(content_entry.content_hash or "")
|
|
3250
|
+
self._prepare_documents_for_insert(read_documents, content_entry.id)
|
|
3251
|
+
self._handle_vector_db_insert(content_entry, read_documents, upsert)
|
|
3252
|
+
     async def _ahandle_vector_db_insert(self, content: Content, read_documents, upsert):
         from agno.vectordb import VectorDb

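This hunk only adds the private loaders, so the public entry point for GitHub-backed content is not visible here. As a sketch only, assuming `GitHubConfig`, `GitHubContent`, and `Content` accept the fields referenced in the diff as constructor arguments, wiring a source into `Knowledge` and exercising the sync loader could look roughly like this:

```python
# Sketch only. Assumes GitHubConfig / GitHubContent / Content accept the fields
# referenced in this diff (repo, branch, token, config_id, folder_path, ...) as
# constructor arguments; the released public API may wrap this differently.
from agno.knowledge.content import Content
from agno.knowledge.knowledge import Knowledge
from agno.knowledge.remote_content.config import GitHubConfig
from agno.knowledge.remote_content.remote_content import GitHubContent

gh_config = GitHubConfig(id="gh-docs", name="Docs repo", repo="agno-agi/agno", branch="main", token=None)
knowledge = Knowledge(content_sources=[gh_config])  # vector/content DBs omitted for brevity

content = Content(
    name="agno-cookbook",
    remote_content=GitHubContent(config_id="gh-docs", folder_path="cookbook"),
)

# Exercise the private sync loader added in this hunk.
knowledge._load_from_github(content, upsert=True, skip_if_exists=True, config=gh_config)
```

The helpers added further down suggest that in normal use the config is resolved from `content_sources` by `config_id` rather than passed by hand.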
@@ -2312,6 +3327,18 @@ class Knowledge:
         content.status = ContentStatus.COMPLETED
         self._update_content(content)

+    # --- Remote Content Sources ---
+
+    def _get_remote_configs(self) -> List[RemoteContentConfig]:
+        """Return configured remote content sources."""
+        return self.content_sources or []
+
+    def _get_remote_config_by_id(self, config_id: str) -> Optional[RemoteContentConfig]:
+        """Get a remote content config by its ID."""
+        if not self.content_sources:
+            return None
+        return next((c for c in self.content_sources if c.id == config_id), None)
+
     # ==========================================
     # PRIVATE - CONVERSION & DATA METHODS
     # ==========================================
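The two helpers above make `content_sources` the single lookup table for remote source configs: `_get_remote_configs` returns the configured list (or an empty one), and `_get_remote_config_by_id` returns the first config whose `id` matches, or `None`. A standalone sketch of that lookup behaviour, using a stand-in config type so it runs without the rest of the package:

```python
# Standalone illustration of the lookup performed over content_sources;
# SimpleConfig is a hypothetical stand-in for RemoteContentConfig.
from dataclasses import dataclass
from typing import List, Optional


@dataclass
class SimpleConfig:
    id: str
    name: str


def get_config_by_id(sources: Optional[List[SimpleConfig]], config_id: str) -> Optional[SimpleConfig]:
    # Mirrors Knowledge._get_remote_config_by_id: first config whose id matches, else None.
    if not sources:
        return None
    return next((c for c in sources if c.id == config_id), None)


sources = [SimpleConfig(id="gh-docs", name="Docs repo"), SimpleConfig(id="s3-raw", name="Raw bucket")]
assert get_config_by_id(sources, "s3-raw").name == "Raw bucket"
assert get_config_by_id(sources, "missing") is None
assert get_config_by_id(None, "gh-docs") is None
```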