agno 2.4.6__py3-none-any.whl → 2.4.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agno/agent/agent.py +5 -1
- agno/db/base.py +2 -0
- agno/db/postgres/postgres.py +5 -5
- agno/db/singlestore/singlestore.py +4 -5
- agno/db/sqlite/sqlite.py +4 -4
- agno/knowledge/embedder/aws_bedrock.py +325 -106
- agno/knowledge/knowledge.py +83 -1853
- agno/knowledge/loaders/__init__.py +29 -0
- agno/knowledge/loaders/azure_blob.py +423 -0
- agno/knowledge/loaders/base.py +187 -0
- agno/knowledge/loaders/gcs.py +267 -0
- agno/knowledge/loaders/github.py +415 -0
- agno/knowledge/loaders/s3.py +281 -0
- agno/knowledge/loaders/sharepoint.py +439 -0
- agno/knowledge/reader/website_reader.py +2 -2
- agno/knowledge/remote_knowledge.py +151 -0
- agno/knowledge/reranker/aws_bedrock.py +299 -0
- agno/learn/machine.py +5 -6
- agno/learn/stores/session_context.py +10 -2
- agno/models/azure/openai_chat.py +6 -11
- agno/models/neosantara/__init__.py +5 -0
- agno/models/neosantara/neosantara.py +42 -0
- agno/models/utils.py +5 -0
- agno/os/app.py +4 -1
- agno/os/interfaces/agui/router.py +1 -1
- agno/os/routers/components/components.py +2 -0
- agno/os/routers/knowledge/knowledge.py +0 -1
- agno/os/routers/registry/registry.py +340 -192
- agno/os/routers/workflows/router.py +7 -1
- agno/os/schema.py +104 -0
- agno/registry/registry.py +4 -0
- agno/run/workflow.py +3 -0
- agno/session/workflow.py +1 -1
- agno/skills/utils.py +100 -2
- agno/team/team.py +6 -3
- agno/tools/mcp/mcp.py +26 -1
- agno/vectordb/lancedb/lance_db.py +22 -7
- agno/workflow/__init__.py +4 -0
- agno/workflow/cel.py +299 -0
- agno/workflow/condition.py +280 -58
- agno/workflow/loop.py +177 -46
- agno/workflow/parallel.py +75 -4
- agno/workflow/router.py +260 -44
- agno/workflow/step.py +14 -7
- agno/workflow/steps.py +43 -0
- agno/workflow/workflow.py +104 -46
- {agno-2.4.6.dist-info → agno-2.4.8.dist-info}/METADATA +25 -37
- {agno-2.4.6.dist-info → agno-2.4.8.dist-info}/RECORD +51 -39
- {agno-2.4.6.dist-info → agno-2.4.8.dist-info}/WHEEL +0 -0
- {agno-2.4.6.dist-info → agno-2.4.8.dist-info}/licenses/LICENSE +0 -0
- {agno-2.4.6.dist-info → agno-2.4.8.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,439 @@
|
|
|
1
|
+
"""SharePoint content loader for Knowledge.
|
|
2
|
+
|
|
3
|
+
Provides methods for loading content from Microsoft SharePoint.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
# mypy: disable-error-code="attr-defined"
|
|
7
|
+
|
|
8
|
+
from io import BytesIO
|
|
9
|
+
from typing import Dict, List, Optional, cast
|
|
10
|
+
|
|
11
|
+
import httpx
|
|
12
|
+
from httpx import AsyncClient
|
|
13
|
+
|
|
14
|
+
from agno.knowledge.content import Content, ContentStatus
|
|
15
|
+
from agno.knowledge.loaders.base import BaseLoader
|
|
16
|
+
from agno.knowledge.reader import Reader
|
|
17
|
+
from agno.knowledge.remote_content.config import RemoteContentConfig, SharePointConfig
|
|
18
|
+
from agno.knowledge.remote_content.remote_content import SharePointContent
|
|
19
|
+
from agno.utils.log import log_error, log_info, log_warning
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class SharePointLoader(BaseLoader):
|
|
23
|
+
"""Loader for SharePoint content."""
|
|
24
|
+
|
|
25
|
+
# ==========================================
|
|
26
|
+
# SHAREPOINT HELPERS (shared between sync/async)
|
|
27
|
+
# ==========================================
|
|
28
|
+
|
|
29
|
+
def _validate_sharepoint_config(
    self,
    content: Content,
    config: Optional[RemoteContentConfig],
) -> Optional[SharePointConfig]:
    """Check that *config* is a SharePoint config and return it.

    Logs an error and returns ``None`` when the supplied config is missing
    or is not a ``SharePointConfig`` instance.
    """
    remote_content: SharePointContent = cast(SharePointContent, content.remote_content)
    if isinstance(config, SharePointConfig):
        return cast(SharePointConfig, config)

    log_error(f"SharePoint config not found for config_id: {remote_content.config_id}")
    return None
|
|
47
|
+
|
|
48
|
+
def _get_sharepoint_access_token(self, sp_config: SharePointConfig) -> Optional[str]:
    """Acquire a Microsoft Graph bearer token via the client-credentials flow.

    Uses the tenant/client credentials carried by *sp_config*. Returns the
    token string, or ``None`` when acquisition fails (the failure is logged).

    Raises:
        ImportError: if the optional ``msal`` dependency is not installed.
    """
    try:
        from msal import ConfidentialClientApplication  # type: ignore
    except ImportError:
        raise ImportError("The `msal` package is not installed. Please install it via `pip install msal`.")

    client_app = ConfidentialClientApplication(
        sp_config.client_id,
        authority=f"https://login.microsoftonline.com/{sp_config.tenant_id}",
        client_credential=sp_config.client_secret,
    )
    # ".default" requests all statically-consented Graph application permissions
    token_response = client_app.acquire_token_for_client(scopes=["https://graph.microsoft.com/.default"])

    if "access_token" not in token_response:
        log_error(f"Failed to acquire SharePoint token: {token_response.get('error_description', token_response.get('error'))}")
        return None
    return token_response["access_token"]
|
|
73
|
+
|
|
74
|
+
def _get_sharepoint_site_id(self, hostname: str, site_path: Optional[str], access_token: str) -> Optional[str]:
    """Resolve a SharePoint site to its Graph site ID (sync).

    Returns ``None`` (after logging) when the Graph API request fails.
    """
    base = f"https://graph.microsoft.com/v1.0/sites/{hostname}"
    endpoint = f"{base}:/{site_path}" if site_path else base
    auth_headers = {"Authorization": f"Bearer {access_token}"}

    try:
        resp = httpx.get(endpoint, headers=auth_headers)
        resp.raise_for_status()
    except httpx.HTTPStatusError as e:
        log_error(f"Failed to get SharePoint site ID: {e.response.status_code} - {e.response.text}")
        return None
    return resp.json().get("id")
|
|
90
|
+
|
|
91
|
+
async def _aget_sharepoint_site_id(
    self, hostname: str, site_path: Optional[str], access_token: str
) -> Optional[str]:
    """Resolve a SharePoint site to its Graph site ID (async).

    Returns ``None`` (after logging) when the Graph API request fails.
    """
    base = f"https://graph.microsoft.com/v1.0/sites/{hostname}"
    endpoint = f"{base}:/{site_path}" if site_path else base
    auth_headers = {"Authorization": f"Bearer {access_token}"}

    try:
        async with httpx.AsyncClient() as client:
            resp = await client.get(endpoint, headers=auth_headers)
            resp.raise_for_status()
            return resp.json().get("id")
    except httpx.HTTPStatusError as e:
        log_error(f"Failed to get SharePoint site ID: {e.response.status_code} - {e.response.text}")
        return None
|
|
110
|
+
|
|
111
|
+
def _list_sharepoint_folder_items(self, site_id: str, folder_path: str, access_token: str) -> List[dict]:
    """Enumerate the children of a SharePoint folder (sync), following pagination.

    On an HTTP error the pages fetched so far are returned (best effort).
    """
    relative_path = folder_path.lstrip("/")
    next_url: Optional[str] = (
        f"https://graph.microsoft.com/v1.0/sites/{site_id}/drive/root:/{relative_path}:/children"
    )
    auth_headers = {"Authorization": f"Bearer {access_token}"}
    collected: List[dict] = []

    try:
        # Graph paginates folder listings via @odata.nextLink
        while next_url:
            resp = httpx.get(next_url, headers=auth_headers)
            resp.raise_for_status()
            page = resp.json()
            collected.extend(page.get("value", []))
            next_url = page.get("@odata.nextLink")
    except httpx.HTTPStatusError as e:
        log_error(f"Failed to list SharePoint folder: {e.response.status_code} - {e.response.text}")

    return collected
|
|
129
|
+
|
|
130
|
+
async def _alist_sharepoint_folder_items(self, site_id: str, folder_path: str, access_token: str) -> List[dict]:
    """Enumerate the children of a SharePoint folder (async), following pagination.

    On an HTTP error the pages fetched so far are returned (best effort).
    """
    relative_path = folder_path.lstrip("/")
    next_url: Optional[str] = (
        f"https://graph.microsoft.com/v1.0/sites/{site_id}/drive/root:/{relative_path}:/children"
    )
    auth_headers = {"Authorization": f"Bearer {access_token}"}
    collected: List[dict] = []

    try:
        async with httpx.AsyncClient() as client:
            # Graph paginates folder listings via @odata.nextLink
            while next_url:
                resp = await client.get(next_url, headers=auth_headers)
                resp.raise_for_status()
                page = resp.json()
                collected.extend(page.get("value", []))
                next_url = page.get("@odata.nextLink")
    except httpx.HTTPStatusError as e:
        log_error(f"Failed to list SharePoint folder: {e.response.status_code} - {e.response.text}")

    return collected
|
|
149
|
+
|
|
150
|
+
def _download_sharepoint_file(self, site_id: str, file_path: str, access_token: str) -> Optional[BytesIO]:
    """Fetch a file's bytes from SharePoint (sync).

    Returns the payload wrapped in a ``BytesIO``, or ``None`` on HTTP failure.
    """
    relative_path = file_path.lstrip("/")
    content_url = f"https://graph.microsoft.com/v1.0/sites/{site_id}/drive/root:/{relative_path}:/content"
    auth_headers = {"Authorization": f"Bearer {access_token}"}

    try:
        # follow_redirects: Graph answers /content with a redirect to the download URL
        resp = httpx.get(content_url, headers=auth_headers, follow_redirects=True)
        resp.raise_for_status()
    except httpx.HTTPStatusError as e:
        log_error(f"Failed to download SharePoint file {relative_path}: {e.response.status_code} - {e.response.text}")
        return None
    return BytesIO(resp.content)
|
|
163
|
+
|
|
164
|
+
async def _adownload_sharepoint_file(self, site_id: str, file_path: str, access_token: str) -> Optional[BytesIO]:
    """Fetch a file's bytes from SharePoint (async).

    Returns the payload wrapped in a ``BytesIO``, or ``None`` on HTTP failure.
    """
    relative_path = file_path.lstrip("/")
    content_url = f"https://graph.microsoft.com/v1.0/sites/{site_id}/drive/root:/{relative_path}:/content"
    auth_headers = {"Authorization": f"Bearer {access_token}"}

    try:
        async with httpx.AsyncClient() as client:
            # follow_redirects: Graph answers /content with a redirect to the download URL
            resp = await client.get(content_url, headers=auth_headers, follow_redirects=True)
            resp.raise_for_status()
            return BytesIO(resp.content)
    except httpx.HTTPStatusError as e:
        log_error(f"Failed to download SharePoint file {relative_path}: {e.response.status_code} - {e.response.text}")
        return None
|
|
178
|
+
|
|
179
|
+
def _build_sharepoint_metadata(
|
|
180
|
+
self,
|
|
181
|
+
sp_config: SharePointConfig,
|
|
182
|
+
site_id: str,
|
|
183
|
+
file_path: str,
|
|
184
|
+
file_name: str,
|
|
185
|
+
) -> Dict[str, str]:
|
|
186
|
+
"""Build SharePoint-specific metadata dictionary."""
|
|
187
|
+
return {
|
|
188
|
+
"source_type": "sharepoint",
|
|
189
|
+
"source_config_id": sp_config.id,
|
|
190
|
+
"source_config_name": sp_config.name,
|
|
191
|
+
"sharepoint_hostname": sp_config.hostname,
|
|
192
|
+
"sharepoint_site_id": site_id,
|
|
193
|
+
"sharepoint_path": file_path,
|
|
194
|
+
"sharepoint_filename": file_name,
|
|
195
|
+
}
|
|
196
|
+
|
|
197
|
+
def _build_sharepoint_virtual_path(self, hostname: str, site_id: str, file_path: str) -> str:
|
|
198
|
+
"""Build virtual path for SharePoint content."""
|
|
199
|
+
return f"sharepoint://{hostname}/{site_id}/{file_path}"
|
|
200
|
+
|
|
201
|
+
def _get_sharepoint_path_to_process(self, remote_content: SharePointContent) -> str:
|
|
202
|
+
"""Get the path to process from remote content."""
|
|
203
|
+
return (remote_content.file_path or remote_content.folder_path or "").strip("/")
|
|
204
|
+
|
|
205
|
+
# ==========================================
|
|
206
|
+
# SHAREPOINT LOADERS
|
|
207
|
+
# ==========================================
|
|
208
|
+
|
|
209
|
+
async def _aload_from_sharepoint(
    self,
    content: Content,
    upsert: bool,
    skip_if_exists: bool,
    config: Optional[RemoteContentConfig] = None,
):
    """Load content from SharePoint (async).

    Resolves the target site, enumerates the configured file or folder
    (recursively), downloads each file via Microsoft Graph, reads it into
    documents and inserts them into the vector database.

    Args:
        content: Content whose ``remote_content`` is a SharePointContent.
        upsert: Whether to upsert rather than plain-insert into the vector db.
        skip_if_exists: Skip files whose content hash is already stored.
        config: Pre-resolved remote source config; must be a SharePointConfig.

    Requires the SharePoint config to contain tenant_id, client_id, client_secret, and hostname.
    """
    remote_content: SharePointContent = cast(SharePointContent, content.remote_content)
    sp_config = self._validate_sharepoint_config(content, config)
    if sp_config is None:
        # Validation failure has already been logged by the validator
        return

    # Get access token (client-credentials flow; failure is logged by the helper)
    access_token = self._get_sharepoint_access_token(sp_config)
    if not access_token:
        return

    # Get site ID — prefer an explicitly configured site_id, otherwise resolve
    # hostname/site_path through the Graph API
    site_id: Optional[str] = sp_config.site_id
    if not site_id:
        site_path = remote_content.site_path or sp_config.site_path
        site_id = await self._aget_sharepoint_site_id(sp_config.hostname, site_path, access_token)
        if not site_id:
            log_error(f"Failed to get SharePoint site ID for {sp_config.hostname}/{site_path}")
            return

    # Identify files to download: collected as (path, name) tuples
    files_to_process: List[tuple] = []
    path_to_process = self._get_sharepoint_path_to_process(remote_content)

    # Helper function to recursively list all files in a folder
    async def list_files_recursive(folder: str) -> List[tuple]:
        """Recursively list all files in a SharePoint folder."""
        files: List[tuple] = []
        items = await self._alist_sharepoint_folder_items(site_id, folder, access_token)  # type: ignore
        for item in items:
            # Graph drive items expose a "file" or "folder" facet key
            if "file" in item:
                item_path = f"{folder}/{item['name']}"
                files.append((item_path, item["name"]))
            elif "folder" in item:
                subdir_path = f"{folder}/{item['name']}"
                subdir_files = await list_files_recursive(subdir_path)
                files.extend(subdir_files)
        return files

    if path_to_process:
        try:
            async with AsyncClient() as client:
                # Probe the path once to learn whether it is a file or a folder
                url = f"https://graph.microsoft.com/v1.0/sites/{site_id}/drive/root:/{path_to_process}"
                headers = {"Authorization": f"Bearer {access_token}"}
                response = await client.get(url, headers=headers, timeout=30.0)
                response.raise_for_status()
                item_data = response.json()

                if "folder" in item_data:
                    files_to_process = await list_files_recursive(path_to_process)
                elif "file" in item_data:
                    files_to_process.append((path_to_process, item_data["name"]))
                else:
                    log_warning(f"SharePoint path {path_to_process} is neither file nor folder")
                    return
        except Exception as e:
            log_error(f"Error checking SharePoint path {path_to_process}: {e}")
            return

    if not files_to_process:
        log_warning(f"No files found at SharePoint path: {path_to_process}")
        return

    log_info(f"Processing {len(files_to_process)} file(s) from SharePoint")
    # More than one file implies a folder-style upload; affects naming below
    is_folder_upload = len(files_to_process) > 1

    for file_path, file_name in files_to_process:
        # Build metadata and virtual path using helpers
        virtual_path = self._build_sharepoint_virtual_path(sp_config.hostname, site_id, file_path)
        sharepoint_metadata = self._build_sharepoint_metadata(sp_config, site_id, file_path, file_name)
        merged_metadata = self._merge_metadata(sharepoint_metadata, content.metadata)

        # Compute content name using base helper
        content_name = self._compute_content_name(
            file_path, file_name, content.name, path_to_process, is_folder_upload
        )

        # Create content entry using base helper
        content_entry = self._create_content_entry(
            content, content_name, virtual_path, merged_metadata, "sharepoint", is_folder_upload
        )

        await self._ainsert_contents_db(content_entry)

        # Skip unchanged content when requested (content-hash match)
        if self._should_skip(content_entry.content_hash, skip_if_exists):
            content_entry.status = ContentStatus.COMPLETED
            await self._aupdate_content(content_entry)
            continue

        # Select reader and download file
        reader = self._select_reader_by_uri(file_name, content.reader)
        # NOTE(review): cast assumes a reader was found — confirm _select_reader_by_uri never returns None here
        reader = cast(Reader, reader)

        file_content = await self._adownload_sharepoint_file(site_id, file_path, access_token)
        if not file_content:
            # Download failure: mark entry failed and move on to the next file
            content_entry.status = ContentStatus.FAILED
            await self._aupdate_content(content_entry)
            continue

        # Read the content
        read_documents = await reader.async_read(file_content, name=file_name)

        # Prepare and insert to vector database
        self._prepare_documents_for_insert(read_documents, content_entry.id)
        await self._ahandle_vector_db_insert(content_entry, read_documents, upsert)
|
|
324
|
+
|
|
325
|
+
def _load_from_sharepoint(
    self,
    content: Content,
    upsert: bool,
    skip_if_exists: bool,
    config: Optional[RemoteContentConfig] = None,
):
    """Load content from SharePoint (sync).

    Synchronous counterpart of ``_aload_from_sharepoint``: resolves the site,
    enumerates the configured file or folder (recursively), downloads each
    file via Microsoft Graph, reads it into documents and inserts them into
    the vector database.

    Args:
        content: Content whose ``remote_content`` is a SharePointContent.
        upsert: Whether to upsert rather than plain-insert into the vector db.
        skip_if_exists: Skip files whose content hash is already stored.
        config: Pre-resolved remote source config; must be a SharePointConfig.

    Requires the SharePoint config to contain tenant_id, client_id, client_secret, and hostname.
    """
    remote_content: SharePointContent = cast(SharePointContent, content.remote_content)
    sp_config = self._validate_sharepoint_config(content, config)
    if sp_config is None:
        # Validation failure has already been logged by the validator
        return

    # Get access token (client-credentials flow; failure is logged by the helper)
    access_token = self._get_sharepoint_access_token(sp_config)
    if not access_token:
        return

    # Get site ID — prefer an explicitly configured site_id, otherwise resolve
    # hostname/site_path through the Graph API
    site_id: Optional[str] = sp_config.site_id
    if not site_id:
        site_path = remote_content.site_path or sp_config.site_path
        site_id = self._get_sharepoint_site_id(sp_config.hostname, site_path, access_token)
        if not site_id:
            log_error(f"Failed to get SharePoint site ID for {sp_config.hostname}/{site_path}")
            return

    # Identify files to download: collected as (path, name) tuples
    files_to_process: List[tuple] = []
    path_to_process = self._get_sharepoint_path_to_process(remote_content)

    # Helper function to recursively list all files in a folder
    def list_files_recursive(folder: str) -> List[tuple]:
        """Recursively list all files in a SharePoint folder."""
        files: List[tuple] = []
        items = self._list_sharepoint_folder_items(site_id, folder, access_token)  # type: ignore
        for item in items:
            # Graph drive items expose a "file" or "folder" facet key
            if "file" in item:
                item_path = f"{folder}/{item['name']}"
                files.append((item_path, item["name"]))
            elif "folder" in item:
                subdir_path = f"{folder}/{item['name']}"
                subdir_files = list_files_recursive(subdir_path)
                files.extend(subdir_files)
        return files

    if path_to_process:
        try:
            with httpx.Client() as client:
                # Probe the path once to learn whether it is a file or a folder
                url = f"https://graph.microsoft.com/v1.0/sites/{site_id}/drive/root:/{path_to_process}"
                headers = {"Authorization": f"Bearer {access_token}"}
                response = client.get(url, headers=headers, timeout=30.0)
                response.raise_for_status()
                item_data = response.json()

                if "folder" in item_data:
                    files_to_process = list_files_recursive(path_to_process)
                elif "file" in item_data:
                    files_to_process.append((path_to_process, item_data["name"]))
                else:
                    log_warning(f"SharePoint path {path_to_process} is neither file nor folder")
                    return
        except Exception as e:
            log_error(f"Error checking SharePoint path {path_to_process}: {e}")
            return

    if not files_to_process:
        log_warning(f"No files found at SharePoint path: {path_to_process}")
        return

    log_info(f"Processing {len(files_to_process)} file(s) from SharePoint")
    # More than one file implies a folder-style upload; affects naming below
    is_folder_upload = len(files_to_process) > 1

    for file_path, file_name in files_to_process:
        # Build metadata and virtual path using helpers
        virtual_path = self._build_sharepoint_virtual_path(sp_config.hostname, site_id, file_path)
        sharepoint_metadata = self._build_sharepoint_metadata(sp_config, site_id, file_path, file_name)
        merged_metadata = self._merge_metadata(sharepoint_metadata, content.metadata)

        # Compute content name using base helper
        content_name = self._compute_content_name(
            file_path, file_name, content.name, path_to_process, is_folder_upload
        )

        # Create content entry using base helper
        content_entry = self._create_content_entry(
            content, content_name, virtual_path, merged_metadata, "sharepoint", is_folder_upload
        )

        self._insert_contents_db(content_entry)

        # Skip unchanged content when requested (content-hash match)
        if self._should_skip(content_entry.content_hash, skip_if_exists):
            content_entry.status = ContentStatus.COMPLETED
            self._update_content(content_entry)
            continue

        # Select reader and download file
        reader = self._select_reader_by_uri(file_name, content.reader)
        # NOTE(review): cast assumes a reader was found — confirm _select_reader_by_uri never returns None here
        reader = cast(Reader, reader)

        file_content = self._download_sharepoint_file(site_id, file_path, access_token)
        if not file_content:
            # Download failure: mark entry failed and move on to the next file
            content_entry.status = ContentStatus.FAILED
            self._update_content(content_entry)
            continue

        # Read the content
        read_documents = reader.read(file_content, name=file_name)

        # Prepare and insert to vector database
        self._prepare_documents_for_insert(read_documents, content_entry.id)
        self._handle_vector_db_insert(content_entry, read_documents, upsert)
|
|
@@ -7,7 +7,7 @@ from urllib.parse import urljoin, urlparse
|
|
|
7
7
|
|
|
8
8
|
import httpx
|
|
9
9
|
|
|
10
|
-
from agno.knowledge.chunking.
|
|
10
|
+
from agno.knowledge.chunking.fixed import FixedSizeChunking
|
|
11
11
|
from agno.knowledge.chunking.strategy import ChunkingStrategy, ChunkingStrategyType
|
|
12
12
|
from agno.knowledge.document.base import Document
|
|
13
13
|
from agno.knowledge.reader.base import Reader
|
|
@@ -32,7 +32,7 @@ class WebsiteReader(Reader):
|
|
|
32
32
|
|
|
33
33
|
def __init__(
|
|
34
34
|
self,
|
|
35
|
-
chunking_strategy: Optional[ChunkingStrategy] =
|
|
35
|
+
chunking_strategy: Optional[ChunkingStrategy] = FixedSizeChunking(),
|
|
36
36
|
max_depth: int = 3,
|
|
37
37
|
max_links: int = 10,
|
|
38
38
|
timeout: int = 10,
|
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
"""Remote content loading for Knowledge.
|
|
2
|
+
|
|
3
|
+
Provides methods for loading content from cloud storage providers:
|
|
4
|
+
- S3, GCS, SharePoint, GitHub, Azure Blob Storage
|
|
5
|
+
|
|
6
|
+
This module contains the RemoteKnowledge class which combines all loader
|
|
7
|
+
capabilities through inheritance. The Knowledge class inherits from this
|
|
8
|
+
to gain remote content loading capabilities.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from typing import List, Optional
|
|
12
|
+
|
|
13
|
+
from agno.knowledge.content import Content
|
|
14
|
+
from agno.knowledge.loaders.azure_blob import AzureBlobLoader
|
|
15
|
+
from agno.knowledge.loaders.gcs import GCSLoader
|
|
16
|
+
from agno.knowledge.loaders.github import GitHubLoader
|
|
17
|
+
from agno.knowledge.loaders.s3 import S3Loader
|
|
18
|
+
from agno.knowledge.loaders.sharepoint import SharePointLoader
|
|
19
|
+
from agno.knowledge.remote_content.config import RemoteContentConfig
|
|
20
|
+
from agno.knowledge.remote_content.remote_content import (
|
|
21
|
+
AzureBlobContent,
|
|
22
|
+
GCSContent,
|
|
23
|
+
GitHubContent,
|
|
24
|
+
S3Content,
|
|
25
|
+
SharePointContent,
|
|
26
|
+
)
|
|
27
|
+
from agno.utils.log import log_warning
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class RemoteKnowledge(S3Loader, GCSLoader, SharePointLoader, GitHubLoader, AzureBlobLoader):
|
|
31
|
+
"""Base class providing remote content loading capabilities.
|
|
32
|
+
|
|
33
|
+
Inherits from all provider-specific loaders:
|
|
34
|
+
- S3Loader: AWS S3 content loading
|
|
35
|
+
- GCSLoader: Google Cloud Storage content loading
|
|
36
|
+
- SharePointLoader: Microsoft SharePoint content loading
|
|
37
|
+
- GitHubLoader: GitHub repository content loading
|
|
38
|
+
- AzureBlobLoader: Azure Blob Storage content loading
|
|
39
|
+
|
|
40
|
+
Knowledge inherits from this class and provides:
|
|
41
|
+
- content_sources: List[RemoteContentConfig]
|
|
42
|
+
- vector_db, contents_db attributes
|
|
43
|
+
- _should_skip(), _select_reader_by_uri(), _prepare_documents_for_insert() methods
|
|
44
|
+
- _ahandle_vector_db_insert(), _handle_vector_db_insert() methods
|
|
45
|
+
- _ainsert_contents_db(), _insert_contents_db() methods
|
|
46
|
+
- _aupdate_content(), _update_content() methods
|
|
47
|
+
- _build_content_hash() method
|
|
48
|
+
"""
|
|
49
|
+
|
|
50
|
+
# These attributes are provided by the Knowledge subclass
|
|
51
|
+
content_sources: Optional[List[RemoteContentConfig]]
|
|
52
|
+
|
|
53
|
+
# ==========================================
|
|
54
|
+
# REMOTE CONTENT DISPATCHERS
|
|
55
|
+
# ==========================================
|
|
56
|
+
|
|
57
|
+
async def _aload_from_remote_content(
    self,
    content: Content,
    upsert: bool,
    skip_if_exists: bool,
):
    """Route remote content to its provider-specific async loader.

    Logs a warning and returns when the content has no remote source or the
    source type is not recognised.
    """
    if content.remote_content is None:
        log_warning("No remote content provided for content")
        return

    remote_content = content.remote_content

    # Resolve the configured remote source referenced by config_id, if any
    config = None
    if getattr(remote_content, "config_id", None):
        config = self._get_remote_config_by_id(remote_content.config_id)
        if config is None:
            log_warning(f"No config found for config_id: {remote_content.config_id}")

    # Ordered (type, handler) pairs — first isinstance match wins
    dispatch = (
        (S3Content, self._aload_from_s3),
        (GCSContent, self._aload_from_gcs),
        (SharePointContent, self._aload_from_sharepoint),
        (GitHubContent, self._aload_from_github),
        (AzureBlobContent, self._aload_from_azure_blob),
    )
    for content_type, handler in dispatch:
        if isinstance(remote_content, content_type):
            await handler(content, upsert, skip_if_exists, config)
            return

    log_warning(f"Unsupported remote content type: {type(remote_content)}")
|
|
97
|
+
|
|
98
|
+
def _load_from_remote_content(
    self,
    content: Content,
    upsert: bool,
    skip_if_exists: bool,
):
    """Route remote content to its provider-specific sync loader.

    Logs a warning and returns when the content has no remote source or the
    source type is not recognised.
    """
    if content.remote_content is None:
        log_warning("No remote content provided for content")
        return

    remote_content = content.remote_content

    # Resolve the configured remote source referenced by config_id, if any
    config = None
    if getattr(remote_content, "config_id", None):
        config = self._get_remote_config_by_id(remote_content.config_id)
        if config is None:
            log_warning(f"No config found for config_id: {remote_content.config_id}")

    # Ordered (type, handler) pairs — first isinstance match wins
    dispatch = (
        (S3Content, self._load_from_s3),
        (GCSContent, self._load_from_gcs),
        (SharePointContent, self._load_from_sharepoint),
        (GitHubContent, self._load_from_github),
        (AzureBlobContent, self._load_from_azure_blob),
    )
    for content_type, handler in dispatch:
        if isinstance(remote_content, content_type):
            handler(content, upsert, skip_if_exists, config)
            return

    log_warning(f"Unsupported remote content type: {type(remote_content)}")
|
|
138
|
+
|
|
139
|
+
# ==========================================
|
|
140
|
+
# REMOTE CONFIG HELPERS
|
|
141
|
+
# ==========================================
|
|
142
|
+
|
|
143
|
+
def _get_remote_configs(self) -> List[RemoteContentConfig]:
|
|
144
|
+
"""Return configured remote content sources."""
|
|
145
|
+
return self.content_sources or []
|
|
146
|
+
|
|
147
|
+
def _get_remote_config_by_id(self, config_id: str) -> Optional[RemoteContentConfig]:
|
|
148
|
+
"""Get a remote content config by its ID."""
|
|
149
|
+
if not self.content_sources:
|
|
150
|
+
return None
|
|
151
|
+
return next((c for c in self.content_sources if c.id == config_id), None)
|