agno 2.4.6__py3-none-any.whl → 2.4.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agno/agent/agent.py +5 -1
- agno/db/base.py +2 -0
- agno/db/postgres/postgres.py +5 -5
- agno/db/singlestore/singlestore.py +4 -5
- agno/db/sqlite/sqlite.py +4 -4
- agno/knowledge/embedder/aws_bedrock.py +325 -106
- agno/knowledge/knowledge.py +83 -1853
- agno/knowledge/loaders/__init__.py +29 -0
- agno/knowledge/loaders/azure_blob.py +423 -0
- agno/knowledge/loaders/base.py +187 -0
- agno/knowledge/loaders/gcs.py +267 -0
- agno/knowledge/loaders/github.py +415 -0
- agno/knowledge/loaders/s3.py +281 -0
- agno/knowledge/loaders/sharepoint.py +439 -0
- agno/knowledge/reader/website_reader.py +2 -2
- agno/knowledge/remote_knowledge.py +151 -0
- agno/knowledge/reranker/aws_bedrock.py +299 -0
- agno/learn/machine.py +5 -6
- agno/learn/stores/session_context.py +10 -2
- agno/models/azure/openai_chat.py +6 -11
- agno/models/neosantara/__init__.py +5 -0
- agno/models/neosantara/neosantara.py +42 -0
- agno/models/utils.py +5 -0
- agno/os/app.py +4 -1
- agno/os/interfaces/agui/router.py +1 -1
- agno/os/routers/components/components.py +2 -0
- agno/os/routers/knowledge/knowledge.py +0 -1
- agno/os/routers/registry/registry.py +340 -192
- agno/os/routers/workflows/router.py +7 -1
- agno/os/schema.py +104 -0
- agno/registry/registry.py +4 -0
- agno/run/workflow.py +3 -0
- agno/session/workflow.py +1 -1
- agno/skills/utils.py +100 -2
- agno/team/team.py +6 -3
- agno/tools/mcp/mcp.py +26 -1
- agno/vectordb/lancedb/lance_db.py +22 -7
- agno/workflow/__init__.py +4 -0
- agno/workflow/cel.py +299 -0
- agno/workflow/condition.py +280 -58
- agno/workflow/loop.py +177 -46
- agno/workflow/parallel.py +75 -4
- agno/workflow/router.py +260 -44
- agno/workflow/step.py +14 -7
- agno/workflow/steps.py +43 -0
- agno/workflow/workflow.py +104 -46
- {agno-2.4.6.dist-info → agno-2.4.8.dist-info}/METADATA +25 -37
- {agno-2.4.6.dist-info → agno-2.4.8.dist-info}/RECORD +51 -39
- {agno-2.4.6.dist-info → agno-2.4.8.dist-info}/WHEEL +0 -0
- {agno-2.4.6.dist-info → agno-2.4.8.dist-info}/licenses/LICENSE +0 -0
- {agno-2.4.6.dist-info → agno-2.4.8.dist-info}/top_level.txt +0 -0
agno/knowledge/knowledge.py
CHANGED
@@ -9,7 +9,6 @@ from os.path import basename
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Set, Tuple, Union, cast, overload
 
-import httpx
 from httpx import AsyncClient
 
 from agno.db.base import AsyncBaseDb, BaseDb
@@ -19,21 +18,12 @@ from agno.knowledge.content import Content, ContentAuth, ContentStatus, FileData
 from agno.knowledge.document import Document
 from agno.knowledge.reader import Reader, ReaderFactory
 from agno.knowledge.remote_content.config import (
-    AzureBlobConfig,
-    GcsConfig,
-    GitHubConfig,
     RemoteContentConfig,
-    S3Config,
-    SharePointConfig,
 )
 from agno.knowledge.remote_content.remote_content import (
-    AzureBlobContent,
-    GCSContent,
-    GitHubContent,
     RemoteContent,
-    S3Content,
-    SharePointContent,
 )
+from agno.knowledge.remote_knowledge import RemoteKnowledge
 from agno.utils.http import async_fetch_with_retry
 from agno.utils.log import log_debug, log_error, log_info, log_warning
 from agno.utils.string import generate_id
@@ -49,7 +39,7 @@ class KnowledgeContentOrigin(Enum):
 
 
 @dataclass
-class Knowledge:
+class Knowledge(RemoteKnowledge):
     """Knowledge class"""
 
     name: Optional[str] = None
@@ -2091,1847 +2081,6 @@ class Knowledge:
 
         self._handle_vector_db_insert(content, read_documents, upsert)
 
-    async def _aload_from_remote_content(
-        self,
-        content: Content,
-        upsert: bool,
-        skip_if_exists: bool,
-    ):
-        if content.remote_content is None:
-            log_warning("No remote content provided for content")
-            return
-
-        remote_content = content.remote_content
-
-        # Look up config if config_id is provided
-        config = None
-        if hasattr(remote_content, "config_id") and remote_content.config_id:
-            config = self._get_remote_config_by_id(remote_content.config_id)
-            if config is None:
-                log_warning(f"No config found for config_id: {remote_content.config_id}")
-
-        if isinstance(remote_content, S3Content):
-            await self._aload_from_s3(content, upsert, skip_if_exists, config)
-
-        elif isinstance(remote_content, GCSContent):
-            await self._aload_from_gcs(content, upsert, skip_if_exists, config)
-
-        elif isinstance(remote_content, SharePointContent):
-            await self._aload_from_sharepoint(content, upsert, skip_if_exists, config)
-
-        elif isinstance(remote_content, GitHubContent):
-            await self._aload_from_github(content, upsert, skip_if_exists, config)
-
-        elif isinstance(remote_content, AzureBlobContent):
-            await self._aload_from_azure_blob(content, upsert, skip_if_exists, config)
-
-        else:
-            log_warning(f"Unsupported remote content type: {type(remote_content)}")
-

The remaining removed lines of this hunk (original lines 2131-3937) delete the rest of the in-class remote content loading machinery from Knowledge: _aload_from_s3, _aload_from_gcs, the synchronous _load_from_remote_content, _load_from_s3, and _load_from_gcs, the SharePoint helpers (_get_sharepoint_access_token, _get_sharepoint_site_id, _list_sharepoint_folder_items, _download_sharepoint_file, and their async counterparts), _aload_from_sharepoint, _load_from_sharepoint, _aload_from_github, and _load_from_github. The source listing is cut off partway through _load_from_github (around original line 3300). This functionality appears to move into the new agno/knowledge/loaders/ modules and agno/knowledge/remote_knowledge.py listed above.
3301
|
-
subdir_files = list_files_recursive(item["path"])
|
|
3302
|
-
files_to_process.extend(subdir_files)
|
|
3303
|
-
else:
|
|
3304
|
-
# It's a single file
|
|
3305
|
-
files_to_process.append(
|
|
3306
|
-
{
|
|
3307
|
-
"path": path_data["path"],
|
|
3308
|
-
"name": path_data["name"],
|
|
3309
|
-
}
|
|
3310
|
-
)
|
|
3311
|
-
except Exception as e:
|
|
3312
|
-
log_error(f"Error fetching GitHub path {path_to_process}: {e}")
|
|
3313
|
-
return
|
|
3314
|
-
|
|
3315
|
-
if not files_to_process:
|
|
3316
|
-
log_warning(f"No files found at GitHub path: {path_to_process}")
|
|
3317
|
-
return
|
|
3318
|
-
|
|
3319
|
-
# Process each file
|
|
3320
|
-
for file_info in files_to_process:
|
|
3321
|
-
file_path = file_info["path"]
|
|
3322
|
-
file_name = file_info["name"]
|
|
3323
|
-
|
|
3324
|
-
# Build a unique virtual path for hashing (ensures different files don't collide)
|
|
3325
|
-
virtual_path = f"github://{gh_config.repo}/{branch}/{file_path}"
|
|
3326
|
-
|
|
3327
|
-
# Build metadata with all info needed to re-fetch the file
|
|
3328
|
-
github_metadata = {
|
|
3329
|
-
"source_type": "github",
|
|
3330
|
-
"source_config_id": gh_config.id,
|
|
3331
|
-
"source_config_name": gh_config.name,
|
|
3332
|
-
"github_repo": gh_config.repo,
|
|
3333
|
-
"github_branch": branch,
|
|
3334
|
-
"github_path": file_path,
|
|
3335
|
-
"github_filename": file_name,
|
|
3336
|
-
}
|
|
3337
|
-
# Merge with user-provided metadata (user metadata takes precedence)
|
|
3338
|
-
merged_metadata = {**github_metadata, **(content.metadata or {})}
|
|
3339
|
-
|
|
3340
|
-
# Setup Content object
|
|
3341
|
-
# Naming: for folders, use relative path; for single files, use user name or filename
|
|
3342
|
-
is_folder_upload = len(files_to_process) > 1
|
|
3343
|
-
if is_folder_upload:
|
|
3344
|
-
# Compute relative path from the upload root
|
|
3345
|
-
relative_path = file_path
|
|
3346
|
-
if path_to_process and file_path.startswith(path_to_process + "/"):
|
|
3347
|
-
relative_path = file_path[len(path_to_process) + 1 :]
|
|
3348
|
-
# If user provided a name, prefix it; otherwise use full file path
|
|
3349
|
-
content_name = f"{content.name}/{relative_path}" if content.name else file_path
|
|
3350
|
-
else:
|
|
3351
|
-
# Single file: use user's name or the filename
|
|
3352
|
-
content_name = content.name or file_name
|
|
3353
|
-
content_entry = Content(
|
|
3354
|
-
name=content_name,
|
|
3355
|
-
description=content.description,
|
|
3356
|
-
path=virtual_path, # Include path for unique hashing
|
|
3357
|
-
status=ContentStatus.PROCESSING,
|
|
3358
|
-
metadata=merged_metadata,
|
|
3359
|
-
file_type="github",
|
|
3360
|
-
)
|
|
3361
|
-
|
|
3362
|
-
# Hash content and add to contents database
|
|
3363
|
-
content_entry.content_hash = self._build_content_hash(content_entry)
|
|
3364
|
-
content_entry.id = generate_id(content_entry.content_hash)
|
|
3365
|
-
self._insert_contents_db(content_entry)
|
|
3366
|
-
|
|
3367
|
-
if self._should_skip(content_entry.content_hash, skip_if_exists):
|
|
3368
|
-
content_entry.status = ContentStatus.COMPLETED
|
|
3369
|
-
self._update_content(content_entry)
|
|
3370
|
-
continue
|
|
3371
|
-
|
|
3372
|
-
# Fetch file content using GitHub API (works for private repos)
|
|
3373
|
-
api_url = f"https://api.github.com/repos/{gh_config.repo}/contents/{file_path}"
|
|
3374
|
-
if branch:
|
|
3375
|
-
api_url += f"?ref={branch}"
|
|
3376
|
-
try:
|
|
3377
|
-
response = client.get(api_url, headers=headers, timeout=30.0)
|
|
3378
|
-
response.raise_for_status()
|
|
3379
|
-
file_data = response.json()
|
|
3380
|
-
|
|
3381
|
-
# GitHub API returns content as base64
|
|
3382
|
-
if file_data.get("encoding") == "base64":
|
|
3383
|
-
import base64
|
|
3384
|
-
|
|
3385
|
-
file_content = base64.b64decode(file_data["content"])
|
|
3386
|
-
else:
|
|
3387
|
-
# For large files, GitHub returns a download_url
|
|
3388
|
-
download_url = file_data.get("download_url")
|
|
3389
|
-
if download_url:
|
|
3390
|
-
dl_response = client.get(download_url, headers=headers, timeout=30.0)
|
|
3391
|
-
dl_response.raise_for_status()
|
|
3392
|
-
file_content = dl_response.content
|
|
3393
|
-
else:
|
|
3394
|
-
raise ValueError("No content or download_url in response")
|
|
3395
|
-
except Exception as e:
|
|
3396
|
-
log_error(f"Error fetching GitHub file {file_path}: {e}")
|
|
3397
|
-
content_entry.status = ContentStatus.FAILED
|
|
3398
|
-
content_entry.status_message = str(e)
|
|
3399
|
-
self._update_content(content_entry)
|
|
3400
|
-
continue
|
|
3401
|
-
|
|
3402
|
-
# Select reader and read content
|
|
3403
|
-
reader = self._select_reader_by_uri(file_name, content.reader)
|
|
3404
|
-
if reader is None:
|
|
3405
|
-
log_warning(f"No reader found for file: {file_name}")
|
|
3406
|
-
content_entry.status = ContentStatus.FAILED
|
|
3407
|
-
content_entry.status_message = "No suitable reader found"
|
|
3408
|
-
self._update_content(content_entry)
|
|
3409
|
-
continue
|
|
3410
|
-
|
|
3411
|
-
reader = cast(Reader, reader)
|
|
3412
|
-
readable_content = BytesIO(file_content)
|
|
3413
|
-
read_documents = reader.read(readable_content, name=file_name)
|
|
3414
|
-
|
|
3415
|
-
# Prepare and insert into vector database
|
|
3416
|
-
if not content_entry.id:
|
|
3417
|
-
content_entry.id = generate_id(content_entry.content_hash or "")
|
|
3418
|
-
self._prepare_documents_for_insert(read_documents, content_entry.id)
|
|
3419
|
-
self._handle_vector_db_insert(content_entry, read_documents, upsert)
|
|
3420
|
-
|
|
3421
|
-
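Note: the removed loaders fetch files through the GitHub contents API, decoding the inline base64 payload for small files and falling back to the download_url for large ones. A minimal standalone sketch of that fetch pattern (the repo/path/token values and the helper name are illustrative, not part of the package):

    import base64
    import httpx

    def fetch_github_file(repo: str, path: str, token: str, branch: str = "main") -> bytes:
        headers = {"Accept": "application/vnd.github.v3+json", "Authorization": f"Bearer {token}"}
        resp = httpx.get(f"https://api.github.com/repos/{repo}/contents/{path}?ref={branch}", headers=headers)
        resp.raise_for_status()
        data = resp.json()
        if data.get("encoding") == "base64":
            return base64.b64decode(data["content"])  # small files come back inline, base64-encoded
        return httpx.get(data["download_url"], headers=headers).content  # large files expose a raw URL instead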
-    # --- Azure Blob Storage loaders ---
-
-    def _get_azure_blob_client(self, azure_config: AzureBlobConfig):
-        """Get a sync Azure Blob Service Client using client credentials flow.
-
-        Requires the `azure-identity` and `azure-storage-blob` packages.
-        """
-        try:
-            from azure.identity import ClientSecretCredential  # type: ignore
-            from azure.storage.blob import BlobServiceClient  # type: ignore
-        except ImportError:
-            raise ImportError(
-                "The `azure-identity` and `azure-storage-blob` packages are not installed. "
-                "Please install them via `pip install azure-identity azure-storage-blob`."
-            )
-
-        credential = ClientSecretCredential(
-            tenant_id=azure_config.tenant_id,
-            client_id=azure_config.client_id,
-            client_secret=azure_config.client_secret,
-        )
-
-        blob_service = BlobServiceClient(
-            account_url=f"https://{azure_config.storage_account}.blob.core.windows.net",
-            credential=credential,
-        )
-
-        return blob_service
-
-    def _get_azure_blob_client_async(self, azure_config: AzureBlobConfig):
-        """Get an async Azure Blob Service Client using client credentials flow.
-
-        Requires the `azure-identity` and `azure-storage-blob` packages.
-        Uses the async versions from azure.storage.blob.aio and azure.identity.aio.
-        """
-        try:
-            from azure.identity.aio import ClientSecretCredential  # type: ignore
-            from azure.storage.blob.aio import BlobServiceClient  # type: ignore
-        except ImportError:
-            raise ImportError(
-                "The `azure-identity` and `azure-storage-blob` packages are not installed. "
-                "Please install them via `pip install azure-identity azure-storage-blob`."
-            )
-
-        credential = ClientSecretCredential(
-            tenant_id=azure_config.tenant_id,
-            client_id=azure_config.client_id,
-            client_secret=azure_config.client_secret,
-        )
-
-        blob_service = BlobServiceClient(
-            account_url=f"https://{azure_config.storage_account}.blob.core.windows.net",
-            credential=credential,
-        )
-
-        return blob_service
-
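Note: both helpers above authenticate with an Azure AD app registration (client credentials flow) rather than a connection string. A minimal sketch of the same flow outside the class, assuming the azure-identity and azure-storage-blob packages are installed (all values are placeholders):

    from azure.identity import ClientSecretCredential
    from azure.storage.blob import BlobServiceClient

    credential = ClientSecretCredential(tenant_id="<tenant>", client_id="<app-id>", client_secret="<secret>")
    service = BlobServiceClient(account_url="https://<account>.blob.core.windows.net", credential=credential)
    container = service.get_container_client("<container>")  # ready for list_blobs / download_blob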
-    async def _aload_from_azure_blob(
-        self,
-        content: Content,
-        upsert: bool,
-        skip_if_exists: bool,
-        config: Optional[RemoteContentConfig] = None,
-    ):
-        """Load content from Azure Blob Storage (async version).
-
-        Requires the AzureBlobConfig to contain tenant_id, client_id, client_secret,
-        storage_account, and container.
-
-        Uses the async Azure SDK to avoid blocking the event loop.
-
-        1. Authenticate with Azure AD using client credentials
-        2. List blobs in container (by prefix or single blob)
-        3. Download and process each blob
-        4. Insert to vector database
-        """
-        remote_content: AzureBlobContent = cast(AzureBlobContent, content.remote_content)
-        azure_config = cast(AzureBlobConfig, config) if isinstance(config, AzureBlobConfig) else None
-
-        if azure_config is None:
-            log_error(f"Azure Blob config not found for config_id: {remote_content.config_id}")
-            return
-
-        # Get async blob service client
-        try:
-            blob_service = self._get_azure_blob_client_async(azure_config)
-        except ImportError as e:
-            log_error(str(e))
-            return
-        except Exception as e:
-            log_error(f"Error creating Azure Blob client: {e}")
-            return
-
-        # Use async context manager for proper resource cleanup
-        async with blob_service:
-            container_client = blob_service.get_container_client(azure_config.container)
-
-            # Helper to list blobs with a given prefix (async)
-            async def list_blobs_with_prefix(prefix: str) -> List[Dict[str, Any]]:
-                """List all blobs under a given prefix (folder)."""
-                results: List[Dict[str, Any]] = []
-                normalized_prefix = prefix.rstrip("/") + "/" if not prefix.endswith("/") else prefix
-                async for blob in container_client.list_blobs(name_starts_with=normalized_prefix):
-                    # Skip "directory" markers (blobs ending with /)
-                    if not blob.name.endswith("/"):
-                        results.append(
-                            {
-                                "name": blob.name,
-                                "size": blob.size,
-                                "content_type": blob.content_settings.content_type if blob.content_settings else None,
-                            }
-                        )
-                return results
-
-            # Identify blobs to process
-            blobs_to_process: List[Dict[str, Any]] = []
-
-            try:
-                if remote_content.blob_name:
-                    # Try to get as a single blob first
-                    blob_client = container_client.get_blob_client(remote_content.blob_name)
-                    try:
-                        props = await blob_client.get_blob_properties()
-                        blobs_to_process.append(
-                            {
-                                "name": remote_content.blob_name,
-                                "size": props.size,
-                                "content_type": props.content_settings.content_type if props.content_settings else None,
-                            }
-                        )
-                    except Exception:
-                        # Blob doesn't exist - check if it's actually a folder (prefix)
-                        log_debug(f"Blob {remote_content.blob_name} not found, checking if it's a folder...")
-                        blobs_to_process = await list_blobs_with_prefix(remote_content.blob_name)
-                        if not blobs_to_process:
-                            log_error(
-                                f"No blob or folder found at path: {remote_content.blob_name}. "
-                                "If this is a folder, ensure files exist inside it."
-                            )
-                            return
-                elif remote_content.prefix:
-                    # List blobs with prefix
-                    blobs_to_process = await list_blobs_with_prefix(remote_content.prefix)
-            except Exception as e:
-                log_error(f"Error listing Azure blobs: {e}")
-                return
-
-            if not blobs_to_process:
-                log_warning(f"No blobs found in Azure container: {azure_config.container}")
-                return
-
-            # For single file uploads, use the original content object to preserve the ID
-            # returned by the API. For folder uploads, create new content entries for each file.
-            is_folder_upload = len(blobs_to_process) > 1
-
-            # Process each blob
-            for blob_info in blobs_to_process:
-                blob_name = blob_info["name"]
-                file_name = blob_name.split("/")[-1]
-
-                # Build a unique virtual path for hashing
-                virtual_path = f"azure://{azure_config.storage_account}/{azure_config.container}/{blob_name}"
-
-                # Build metadata
-                azure_metadata = {
-                    "source_type": "azure_blob",
-                    "source_config_id": azure_config.id,
-                    "source_config_name": azure_config.name,
-                    "azure_storage_account": azure_config.storage_account,
-                    "azure_container": azure_config.container,
-                    "azure_blob_name": blob_name,
-                    "azure_filename": file_name,
-                }
-                merged_metadata = {**azure_metadata, **(content.metadata or {})}
-
-                # Setup Content object
-                if is_folder_upload:
-                    # For folder uploads, create new content entries for each file
-                    relative_path = blob_name
-                    if remote_content.prefix and blob_name.startswith(remote_content.prefix):
-                        relative_path = blob_name[len(remote_content.prefix) :].lstrip("/")
-                    content_name = f"{content.name}/{relative_path}" if content.name else blob_name
-
-                    content_entry = Content(
-                        name=content_name,
-                        description=content.description,
-                        path=virtual_path,
-                        status=ContentStatus.PROCESSING,
-                        metadata=merged_metadata,
-                        file_type="azure_blob",
-                    )
-                    content_entry.content_hash = self._build_content_hash(content_entry)
-                    content_entry.id = generate_id(content_entry.content_hash)
-                else:
-                    # For single file uploads, use the original content object to preserve ID
-                    content_entry = content
-                    content_entry.path = virtual_path
-                    content_entry.status = ContentStatus.PROCESSING
-                    content_entry.metadata = merged_metadata
-                    content_entry.file_type = "azure_blob"
-                    # Use existing id and content_hash from the original content if available
-                    if not content_entry.content_hash:
-                        content_entry.content_hash = self._build_content_hash(content_entry)
-                    if not content_entry.id:
-                        content_entry.id = generate_id(content_entry.content_hash)
-
-                await self._ainsert_contents_db(content_entry)
-
-                if self._should_skip(content_entry.content_hash, skip_if_exists):
-                    content_entry.status = ContentStatus.COMPLETED
-                    await self._aupdate_content(content_entry)
-                    continue
-
-                # Download blob (async)
-                try:
-                    blob_client = container_client.get_blob_client(blob_name)
-                    download_stream = await blob_client.download_blob()
-                    blob_data = await download_stream.readall()
-                    file_content = BytesIO(blob_data)
-                except Exception as e:
-                    log_error(f"Error downloading Azure blob {blob_name}: {e}")
-                    content_entry.status = ContentStatus.FAILED
-                    content_entry.status_message = str(e)
-                    await self._aupdate_content(content_entry)
-                    continue
-
-                # Select reader and read content
-                reader = self._select_reader_by_uri(file_name, content.reader)
-                if reader is None:
-                    log_warning(f"No reader found for file: {file_name}")
-                    content_entry.status = ContentStatus.FAILED
-                    content_entry.status_message = "No suitable reader found"
-                    await self._aupdate_content(content_entry)
-                    continue
-
-                reader = cast(Reader, reader)
-                read_documents = await reader.async_read(file_content, name=file_name)
-
-                # Prepare and insert into vector database
-                if not content_entry.id:
-                    content_entry.id = generate_id(content_entry.content_hash or "")
-                self._prepare_documents_for_insert(read_documents, content_entry.id)
-                await self._ahandle_vector_db_insert(content_entry, read_documents, upsert)
-
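Note: for folder uploads the loader names each entry by its path relative to the requested prefix (optionally prefixed with the user-supplied name) and hashes a virtual azure:// path so distinct blobs never collide. A condensed sketch of that naming rule as a standalone function (illustrative only, not an API of the package):

    def blob_content_name(blob_name: str, prefix: str, user_name: str | None) -> str:
        relative = blob_name
        if prefix and blob_name.startswith(prefix):
            relative = blob_name[len(prefix):].lstrip("/")  # path relative to the uploaded folder
        return f"{user_name}/{relative}" if user_name else blob_name

    # e.g. blob_content_name("docs/guides/a.md", "docs/", "handbook") -> "handbook/guides/a.md"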
-    def _load_from_azure_blob(
-        self,
-        content: Content,
-        upsert: bool,
-        skip_if_exists: bool,
-        config: Optional[RemoteContentConfig] = None,
-    ):
-        """Synchronous version of _load_from_azure_blob.
-
-        Load content from Azure Blob Storage:
-        1. Authenticate with Azure AD using client credentials
-        2. List blobs in container (by prefix or single blob)
-        3. Download and process each blob
-        4. Insert to vector database
-        """
-        remote_content: AzureBlobContent = cast(AzureBlobContent, content.remote_content)
-        azure_config = cast(AzureBlobConfig, config) if isinstance(config, AzureBlobConfig) else None
-
-        if azure_config is None:
-            log_error(f"Azure Blob config not found for config_id: {remote_content.config_id}")
-            return
-
-        # Get blob service client
-        try:
-            blob_service = self._get_azure_blob_client(azure_config)
-        except ImportError as e:
-            log_error(str(e))
-            return
-        except Exception as e:
-            log_error(f"Error creating Azure Blob client: {e}")
-            return
-
-        container_client = blob_service.get_container_client(azure_config.container)
-
-        # Helper to list blobs with a given prefix
-        def list_blobs_with_prefix(prefix: str) -> List[Dict[str, Any]]:
-            """List all blobs under a given prefix (folder)."""
-            results: List[Dict[str, Any]] = []
-            normalized_prefix = prefix.rstrip("/") + "/" if not prefix.endswith("/") else prefix
-            blobs = container_client.list_blobs(name_starts_with=normalized_prefix)
-            for blob in blobs:
-                # Skip "directory" markers (blobs ending with /)
-                if not blob.name.endswith("/"):
-                    results.append(
-                        {
-                            "name": blob.name,
-                            "size": blob.size,
-                            "content_type": blob.content_settings.content_type if blob.content_settings else None,
-                        }
-                    )
-            return results
-
-        # Identify blobs to process
-        blobs_to_process: List[Dict[str, Any]] = []
-
-        try:
-            if remote_content.blob_name:
-                # Try to get as a single blob first
-                blob_client = container_client.get_blob_client(remote_content.blob_name)
-                try:
-                    props = blob_client.get_blob_properties()
-                    blobs_to_process.append(
-                        {
-                            "name": remote_content.blob_name,
-                            "size": props.size,
-                            "content_type": props.content_settings.content_type if props.content_settings else None,
-                        }
-                    )
-                except Exception:
-                    # Blob doesn't exist - check if it's actually a folder (prefix)
-                    log_debug(f"Blob {remote_content.blob_name} not found, checking if it's a folder...")
-                    blobs_to_process = list_blobs_with_prefix(remote_content.blob_name)
-                    if not blobs_to_process:
-                        log_error(
-                            f"No blob or folder found at path: {remote_content.blob_name}. "
-                            "If this is a folder, ensure files exist inside it."
-                        )
-                        return
-            elif remote_content.prefix:
-                # List blobs with prefix
-                blobs_to_process = list_blobs_with_prefix(remote_content.prefix)
-        except Exception as e:
-            log_error(f"Error listing Azure blobs: {e}")
-            return
-
-        if not blobs_to_process:
-            log_warning(f"No blobs found in Azure container: {azure_config.container}")
-            return
-
-        # For single file uploads, use the original content object to preserve the ID
-        # returned by the API. For folder uploads, create new content entries for each file.
-        is_folder_upload = len(blobs_to_process) > 1
-
-        # Process each blob
-        for blob_info in blobs_to_process:
-            blob_name = blob_info["name"]
-            file_name = blob_name.split("/")[-1]
-
-            # Build a unique virtual path for hashing
-            virtual_path = f"azure://{azure_config.storage_account}/{azure_config.container}/{blob_name}"
-
-            # Build metadata
-            azure_metadata = {
-                "source_type": "azure_blob",
-                "source_config_id": azure_config.id,
-                "source_config_name": azure_config.name,
-                "azure_storage_account": azure_config.storage_account,
-                "azure_container": azure_config.container,
-                "azure_blob_name": blob_name,
-                "azure_filename": file_name,
-            }
-            merged_metadata = {**azure_metadata, **(content.metadata or {})}
-
-            # Setup Content object
-            if is_folder_upload:
-                # For folder uploads, create new content entries for each file
-                relative_path = blob_name
-                if remote_content.prefix and blob_name.startswith(remote_content.prefix):
-                    relative_path = blob_name[len(remote_content.prefix) :].lstrip("/")
-                content_name = f"{content.name}/{relative_path}" if content.name else blob_name
-
-                content_entry = Content(
-                    name=content_name,
-                    description=content.description,
-                    path=virtual_path,
-                    status=ContentStatus.PROCESSING,
-                    metadata=merged_metadata,
-                    file_type="azure_blob",
-                )
-                content_entry.content_hash = self._build_content_hash(content_entry)
-                content_entry.id = generate_id(content_entry.content_hash)
-            else:
-                # For single file uploads, use the original content object to preserve ID
-                content_entry = content
-                content_entry.path = virtual_path
-                content_entry.status = ContentStatus.PROCESSING
-                content_entry.metadata = merged_metadata
-                content_entry.file_type = "azure_blob"
-                # Use existing id and content_hash from the original content if available
-                if not content_entry.content_hash:
-                    content_entry.content_hash = self._build_content_hash(content_entry)
-                if not content_entry.id:
-                    content_entry.id = generate_id(content_entry.content_hash)
-
-            self._insert_contents_db(content_entry)
-
-            if self._should_skip(content_entry.content_hash, skip_if_exists):
-                content_entry.status = ContentStatus.COMPLETED
-                self._update_content(content_entry)
-                continue
-
-            # Download blob
-            try:
-                blob_client = container_client.get_blob_client(blob_name)
-                download_stream = blob_client.download_blob()
-                file_content = BytesIO(download_stream.readall())
-            except Exception as e:
-                log_error(f"Error downloading Azure blob {blob_name}: {e}")
-                content_entry.status = ContentStatus.FAILED
-                content_entry.status_message = str(e)
-                self._update_content(content_entry)
-                continue
-
-            # Select reader and read content
-            reader = self._select_reader_by_uri(file_name, content.reader)
-            if reader is None:
-                log_warning(f"No reader found for file: {file_name}")
-                content_entry.status = ContentStatus.FAILED
-                content_entry.status_message = "No suitable reader found"
-                self._update_content(content_entry)
-                continue
-
-            reader = cast(Reader, reader)
-            read_documents = reader.read(file_content, name=file_name)
-
-            # Prepare and insert into vector database
-            if not content_entry.id:
-                content_entry.id = generate_id(content_entry.content_hash or "")
-            self._prepare_documents_for_insert(read_documents, content_entry.id)
-            self._handle_vector_db_insert(content_entry, read_documents, upsert)
-
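Note: every loader removed above follows the same dedupe pattern: hash the entry (including its virtual path), derive the id from that hash, record it, and skip the download entirely when the hash already exists and skip_if_exists is set. A schematic sketch of that order of operations, assuming knowledge is a Knowledge instance and entry a Content object (helper names mirror the methods in the diff, shown only to summarise the flow):

    entry.content_hash = knowledge._build_content_hash(entry)
    entry.id = generate_id(entry.content_hash)
    knowledge._insert_contents_db(entry)
    if knowledge._should_skip(entry.content_hash, skip_if_exists=True):
        entry.status = ContentStatus.COMPLETED   # already ingested: record and move on
        knowledge._update_content(entry)
    else:
        ...                                      # download, read, and insert into the vector db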
-    async def _ahandle_vector_db_insert(self, content: Content, read_documents, upsert):
-        from agno.vectordb import VectorDb
-
-        self.vector_db = cast(VectorDb, self.vector_db)
-
-        if not self.vector_db:
-            log_error("No vector database configured")
-            content.status = ContentStatus.FAILED
-            content.status_message = "No vector database configured"
-            await self._aupdate_content(content)
-            return
-
-        if self.vector_db.upsert_available() and upsert:
-            try:
-                await self.vector_db.async_upsert(content.content_hash, read_documents, content.metadata)  # type: ignore[arg-type]
-            except Exception as e:
-                log_error(f"Error upserting document: {e}")
-                content.status = ContentStatus.FAILED
-                content.status_message = "Could not upsert embedding"
-                await self._aupdate_content(content)
-                return
-        else:
-            try:
-                await self.vector_db.async_insert(
-                    content.content_hash,  # type: ignore[arg-type]
-                    documents=read_documents,
-                    filters=content.metadata,  # type: ignore[arg-type]
-                )
-            except Exception as e:
-                log_error(f"Error inserting document: {e}")
-                content.status = ContentStatus.FAILED
-                content.status_message = "Could not insert embedding"
-                await self._aupdate_content(content)
-                return
-
-        content.status = ContentStatus.COMPLETED
-        await self._aupdate_content(content)
-
-    def _handle_vector_db_insert(self, content: Content, read_documents, upsert):
-        """Synchronously handle vector database insertion."""
-        from agno.vectordb import VectorDb
-
-        self.vector_db = cast(VectorDb, self.vector_db)
-
-        if not self.vector_db:
-            log_error("No vector database configured")
-            content.status = ContentStatus.FAILED
-            content.status_message = "No vector database configured"
-            self._update_content(content)
-            return
-
-        if self.vector_db.upsert_available() and upsert:
-            try:
-                self.vector_db.upsert(content.content_hash, read_documents, content.metadata)  # type: ignore[arg-type]
-            except Exception as e:
-                log_error(f"Error upserting document: {e}")
-                content.status = ContentStatus.FAILED
-                content.status_message = "Could not upsert embedding"
-                self._update_content(content)
-                return
-        else:
-            try:
-                self.vector_db.insert(
-                    content.content_hash,  # type: ignore[arg-type]
-                    documents=read_documents,
-                    filters=content.metadata,  # type: ignore[arg-type]
-                )
-            except Exception as e:
-                log_error(f"Error inserting document: {e}")
-                content.status = ContentStatus.FAILED
-                content.status_message = "Could not insert embedding"
-                self._update_content(content)
-                return
-
-        content.status = ContentStatus.COMPLETED
-        self._update_content(content)
-
-    # --- Remote Content Sources ---
-
-    def _get_remote_configs(self) -> List[RemoteContentConfig]:
-        """Return configured remote content sources."""
-        return self.content_sources or []
-
-    def _get_remote_config_by_id(self, config_id: str) -> Optional[RemoteContentConfig]:
-        """Get a remote content config by its ID."""
-        if not self.content_sources:
-            return None
-        return next((c for c in self.content_sources if c.id == config_id), None)
-
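Note: the insert helpers removed here (and re-added further down in the same file) share one decision: prefer an upsert keyed on the content hash when the vector store supports it and upsert is requested, otherwise fall back to a plain insert, and mark the content row FAILED or COMPLETED either way. Schematically, with vector_db standing for any configured VectorDb implementation:

    if vector_db.upsert_available() and upsert:
        vector_db.upsert(content.content_hash, read_documents, content.metadata)
    else:
        vector_db.insert(content.content_hash, documents=read_documents, filters=content.metadata)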
     # ==========================================
     # PRIVATE - CONVERSION & DATA METHODS
     # ==========================================
@@ -4156,6 +2305,87 @@ class Knowledge:
         content_row = self._build_knowledge_row(content)
         self.contents_db.upsert_knowledge_content(knowledge_row=content_row)

+    # --- Vector DB Insert Helpers ---
+
+    async def _ahandle_vector_db_insert(self, content: Content, read_documents, upsert):
+        from agno.vectordb import VectorDb
+
+        self.vector_db = cast(VectorDb, self.vector_db)
+
+        if not self.vector_db:
+            log_error("No vector database configured")
+            content.status = ContentStatus.FAILED
+            content.status_message = "No vector database configured"
+            await self._aupdate_content(content)
+            return
+
+        if self.vector_db.upsert_available() and upsert:
+            try:
+                await self.vector_db.async_upsert(content.content_hash, read_documents, content.metadata)  # type: ignore[arg-type]
+            except Exception as e:
+                log_error(f"Error upserting document: {e}")
+                content.status = ContentStatus.FAILED
+                content.status_message = "Could not upsert embedding"
+                await self._aupdate_content(content)
+                return
+        else:
+            try:
+                await self.vector_db.async_insert(
+                    content.content_hash,  # type: ignore[arg-type]
+                    documents=read_documents,
+                    filters=content.metadata,  # type: ignore[arg-type]
+                )
+            except Exception as e:
+                log_error(f"Error inserting document: {e}")
+                content.status = ContentStatus.FAILED
+                content.status_message = "Could not insert embedding"
+                await self._aupdate_content(content)
+                return
+
+        content.status = ContentStatus.COMPLETED
+        await self._aupdate_content(content)
+
+    def _handle_vector_db_insert(self, content: Content, read_documents, upsert):
+        """Synchronously handle vector database insertion."""
+        from agno.vectordb import VectorDb
+
+        self.vector_db = cast(VectorDb, self.vector_db)
+
+        if not self.vector_db:
+            log_error("No vector database configured")
+            content.status = ContentStatus.FAILED
+            content.status_message = "No vector database configured"
+            self._update_content(content)
+            return
+
+        if self.vector_db.upsert_available() and upsert:
+            try:
+                self.vector_db.upsert(content.content_hash, read_documents, content.metadata)  # type: ignore[arg-type]
+            except Exception as e:
+                log_error(f"Error upserting document: {e}")
+                content.status = ContentStatus.FAILED
+                content.status_message = "Could not upsert embedding"
+                self._update_content(content)
+                return
+        else:
+            try:
+                self.vector_db.insert(
+                    content.content_hash,  # type: ignore[arg-type]
+                    documents=read_documents,
+                    filters=content.metadata,  # type: ignore[arg-type]
+                )
+            except Exception as e:
+                log_error(f"Error inserting document: {e}")
+                content.status = ContentStatus.FAILED
+                content.status_message = "Could not insert embedding"
+                self._update_content(content)
+                return
+
+        content.status = ContentStatus.COMPLETED
+        self._update_content(content)
+
+    # --- Content Update ---
+
     def _update_content(self, content: Content) -> Optional[Dict[str, Any]]:
         from agno.vectordb import VectorDb
