agno 2.4.7__py3-none-any.whl → 2.4.8__py3-none-any.whl
This diff shows the changes between publicly released versions of the package, as published to their respective public registries. It is provided for informational purposes only.
- agno/agent/agent.py +5 -1
- agno/db/base.py +2 -0
- agno/db/postgres/postgres.py +5 -5
- agno/db/sqlite/sqlite.py +4 -4
- agno/knowledge/knowledge.py +83 -1853
- agno/knowledge/loaders/__init__.py +29 -0
- agno/knowledge/loaders/azure_blob.py +423 -0
- agno/knowledge/loaders/base.py +187 -0
- agno/knowledge/loaders/gcs.py +267 -0
- agno/knowledge/loaders/github.py +415 -0
- agno/knowledge/loaders/s3.py +281 -0
- agno/knowledge/loaders/sharepoint.py +439 -0
- agno/knowledge/reader/website_reader.py +2 -2
- agno/knowledge/remote_knowledge.py +151 -0
- agno/learn/stores/session_context.py +10 -2
- agno/models/azure/openai_chat.py +6 -11
- agno/models/neosantara/__init__.py +5 -0
- agno/models/neosantara/neosantara.py +42 -0
- agno/models/utils.py +5 -0
- agno/os/app.py +4 -1
- agno/os/interfaces/agui/router.py +1 -1
- agno/os/routers/components/components.py +2 -0
- agno/os/routers/knowledge/knowledge.py +0 -1
- agno/os/routers/registry/registry.py +340 -192
- agno/os/routers/workflows/router.py +7 -1
- agno/os/schema.py +104 -0
- agno/registry/registry.py +4 -0
- agno/session/workflow.py +1 -1
- agno/skills/utils.py +100 -2
- agno/team/team.py +6 -3
- agno/vectordb/lancedb/lance_db.py +22 -7
- agno/workflow/__init__.py +4 -0
- agno/workflow/cel.py +299 -0
- agno/workflow/condition.py +145 -2
- agno/workflow/loop.py +177 -46
- agno/workflow/parallel.py +75 -4
- agno/workflow/router.py +260 -44
- agno/workflow/step.py +14 -7
- agno/workflow/steps.py +43 -0
- agno/workflow/workflow.py +104 -46
- {agno-2.4.7.dist-info → agno-2.4.8.dist-info}/METADATA +24 -36
- {agno-2.4.7.dist-info → agno-2.4.8.dist-info}/RECORD +45 -34
- {agno-2.4.7.dist-info → agno-2.4.8.dist-info}/WHEEL +0 -0
- {agno-2.4.7.dist-info → agno-2.4.8.dist-info}/licenses/LICENSE +0 -0
- {agno-2.4.7.dist-info → agno-2.4.8.dist-info}/top_level.txt +0 -0

agno/knowledge/loaders/gcs.py
@@ -0,0 +1,267 @@
+"""GCS content loader for Knowledge.
+
+Provides methods for loading content from Google Cloud Storage.
+"""
+
+# mypy: disable-error-code="attr-defined"
+
+from io import BytesIO
+from typing import Any, Dict, Optional, cast
+
+from agno.knowledge.content import Content, ContentStatus
+from agno.knowledge.loaders.base import BaseLoader
+from agno.knowledge.reader import Reader
+from agno.knowledge.remote_content.config import GcsConfig, RemoteContentConfig
+from agno.knowledge.remote_content.remote_content import GCSContent
+from agno.utils.log import log_info, log_warning
+from agno.utils.string import generate_id
+
+
+class GCSLoader(BaseLoader):
+    """Loader for Google Cloud Storage content."""
+
+    # ==========================================
+    # GCS HELPERS (shared between sync/async)
+    # ==========================================
+
+    def _validate_gcs_config(
+        self,
+        content: Content,
+        config: Optional[RemoteContentConfig],
+    ) -> Optional[GcsConfig]:
+        """Validate and extract GCS config.
+
+        Returns:
+            GcsConfig if valid, None otherwise (GCS can work without explicit config)
+        """
+        return cast(GcsConfig, config) if isinstance(config, GcsConfig) else None
+
+    def _get_gcs_client(self, gcs_config: Optional[GcsConfig]):
+        """Get a GCS client.
+
+        Requires the `google-cloud-storage` package.
+        """
+        try:
+            from google.cloud import storage  # type: ignore
+        except ImportError:
+            raise ImportError(
+                "The `google-cloud-storage` package is not installed. "
+                "Please install it via `pip install google-cloud-storage`."
+            )
+
+        if gcs_config and gcs_config.credentials_path:
+            return storage.Client.from_service_account_json(gcs_config.credentials_path)
+        elif gcs_config and gcs_config.project:
+            return storage.Client(project=gcs_config.project)
+        else:
+            return storage.Client()
+
+    def _build_gcs_metadata(
+        self,
+        gcs_config: Optional[GcsConfig],
+        bucket_name: str,
+        blob_name: str,
+    ) -> Dict[str, str]:
+        """Build GCS-specific metadata dictionary."""
+        metadata: Dict[str, str] = {
+            "source_type": "gcs",
+            "gcs_bucket": bucket_name,
+            "gcs_blob_name": blob_name,
+        }
+        if gcs_config:
+            metadata["source_config_id"] = gcs_config.id
+            metadata["source_config_name"] = gcs_config.name
+        return metadata
+
+    def _build_gcs_virtual_path(self, bucket_name: str, blob_name: str) -> str:
+        """Build virtual path for GCS content."""
+        return f"gcs://{bucket_name}/{blob_name}"
+
+    # ==========================================
+    # GCS LOADERS
+    # ==========================================
+
+    async def _aload_from_gcs(
+        self,
+        content: Content,
+        upsert: bool,
+        skip_if_exists: bool,
+        config: Optional[RemoteContentConfig] = None,
+    ):
+        """Load content from Google Cloud Storage (async).
+
+        Note: Uses sync google-cloud-storage calls as it doesn't have an async API.
+        """
+        try:
+            from google.cloud import storage  # type: ignore  # noqa: F401
+        except ImportError:
+            raise ImportError(
+                "The `google-cloud-storage` package is not installed. "
+                "Please install it via `pip install google-cloud-storage`."
+            )
+
+        log_warning(
+            "GCS content loading has limited features. "
+            "Recursive folder traversal, rich metadata, and improved naming are coming in a future release."
+        )
+
+        remote_content: GCSContent = cast(GCSContent, content.remote_content)
+        gcs_config = self._validate_gcs_config(content, config)
+
+        # Get or create bucket
+        bucket = remote_content.bucket
+        if bucket is None and remote_content.bucket_name:
+            client = self._get_gcs_client(gcs_config)
+            bucket = client.bucket(remote_content.bucket_name)
+
+        # Identify objects to read
+        objects_to_read = []
+        if remote_content.blob_name is not None:
+            objects_to_read.append(bucket.blob(remote_content.blob_name))  # type: ignore
+        elif remote_content.prefix is not None:
+            objects_to_read.extend(bucket.list_blobs(prefix=remote_content.prefix))  # type: ignore
+        else:
+            objects_to_read.extend(bucket.list_blobs())  # type: ignore
+
+        if objects_to_read:
+            log_info(f"Processing {len(objects_to_read)} file(s) from GCS")
+
+        bucket_name = remote_content.bucket_name or (bucket.name if bucket else "unknown")
+        is_folder_upload = len(objects_to_read) > 1
+        root_path = remote_content.prefix or ""
+
+        for gcs_object in objects_to_read:
+            blob_name = gcs_object.name
+            file_name = blob_name.split("/")[-1]
+
+            # Build metadata and virtual path using helpers
+            virtual_path = self._build_gcs_virtual_path(bucket_name, blob_name)
+            gcs_metadata = self._build_gcs_metadata(gcs_config, bucket_name, blob_name)
+            merged_metadata: Dict[str, Any] = self._merge_metadata(gcs_metadata, content.metadata)
+
+            # Compute content name using base helper
+            content_name = self._compute_content_name(blob_name, file_name, content.name, root_path, is_folder_upload)
+
+            # Create content entry
+            content_entry = Content(
+                name=content_name,
+                description=content.description,
+                path=virtual_path,
+                status=ContentStatus.PROCESSING,
+                metadata=merged_metadata,
+                file_type="gcs",
+            )
+            content_entry.content_hash = self._build_content_hash(content_entry)
+            content_entry.id = generate_id(content_entry.content_hash)
+
+            await self._ainsert_contents_db(content_entry)
+
+            if self._should_skip(content_entry.content_hash, skip_if_exists):
+                content_entry.status = ContentStatus.COMPLETED
+                await self._aupdate_content(content_entry)
+                continue
+
+            # Select reader
+            reader = self._select_reader_by_uri(gcs_object.name, content.reader)
+            reader = cast(Reader, reader)
+
+            # Fetch and load the content
+            readable_content = BytesIO(gcs_object.download_as_bytes())
+
+            # Read the content
+            read_documents = await reader.async_read(readable_content, name=file_name)
+
+            # Prepare and insert the content in the vector database
+            self._prepare_documents_for_insert(read_documents, content_entry.id)
+            await self._ahandle_vector_db_insert(content_entry, read_documents, upsert)
+
+    def _load_from_gcs(
+        self,
+        content: Content,
+        upsert: bool,
+        skip_if_exists: bool,
+        config: Optional[RemoteContentConfig] = None,
+    ):
+        """Load content from Google Cloud Storage (sync)."""
+        try:
+            from google.cloud import storage  # type: ignore  # noqa: F401
+        except ImportError:
+            raise ImportError(
+                "The `google-cloud-storage` package is not installed. "
+                "Please install it via `pip install google-cloud-storage`."
+            )
+
+        log_warning(
+            "GCS content loading has limited features. "
+            "Recursive folder traversal, rich metadata, and improved naming are coming in a future release."
+        )
+
+        remote_content: GCSContent = cast(GCSContent, content.remote_content)
+        gcs_config = self._validate_gcs_config(content, config)
+
+        # Get or create bucket
+        bucket = remote_content.bucket
+        if bucket is None and remote_content.bucket_name:
+            client = self._get_gcs_client(gcs_config)
+            bucket = client.bucket(remote_content.bucket_name)
+
+        # Identify objects to read
+        objects_to_read = []
+        if remote_content.blob_name is not None:
+            objects_to_read.append(bucket.blob(remote_content.blob_name))  # type: ignore
+        elif remote_content.prefix is not None:
+            objects_to_read.extend(bucket.list_blobs(prefix=remote_content.prefix))  # type: ignore
+        else:
+            objects_to_read.extend(bucket.list_blobs())  # type: ignore
+
+        if objects_to_read:
+            log_info(f"Processing {len(objects_to_read)} file(s) from GCS")
+
+        bucket_name = remote_content.bucket_name or (bucket.name if bucket else "unknown")
+        is_folder_upload = len(objects_to_read) > 1
+        root_path = remote_content.prefix or ""
+
+        for gcs_object in objects_to_read:
+            blob_name = gcs_object.name
+            file_name = blob_name.split("/")[-1]
+
+            # Build metadata and virtual path using helpers
+            virtual_path = self._build_gcs_virtual_path(bucket_name, blob_name)
+            gcs_metadata = self._build_gcs_metadata(gcs_config, bucket_name, blob_name)
+            merged_metadata: Dict[str, Any] = self._merge_metadata(gcs_metadata, content.metadata)
+
+            # Compute content name using base helper
+            content_name = self._compute_content_name(blob_name, file_name, content.name, root_path, is_folder_upload)
+
+            # Create content entry
+            content_entry = Content(
+                name=content_name,
+                description=content.description,
+                path=virtual_path,
+                status=ContentStatus.PROCESSING,
+                metadata=merged_metadata,
+                file_type="gcs",
+            )
+            content_entry.content_hash = self._build_content_hash(content_entry)
+            content_entry.id = generate_id(content_entry.content_hash)
+
+            self._insert_contents_db(content_entry)
+
+            if self._should_skip(content_entry.content_hash, skip_if_exists):
+                content_entry.status = ContentStatus.COMPLETED
+                self._update_content(content_entry)
+                continue
+
+            # Select reader
+            reader = self._select_reader_by_uri(gcs_object.name, content.reader)
+            reader = cast(Reader, reader)
+
+            # Fetch and load the content
+            readable_content = BytesIO(gcs_object.download_as_bytes())
+
+            # Read the content
+            read_documents = reader.read(readable_content, name=file_name)
+
+            # Prepare and insert the content in the vector database
+            self._prepare_documents_for_insert(read_documents, content_entry.id)
+            self._handle_vector_db_insert(content_entry, read_documents, upsert)
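
The object-selection branching above (an explicit `blob_name`, else a `prefix` listing, else the whole bucket) is the core of both the sync and async GCS paths. A minimal standalone sketch of that logic against the `google-cloud-storage` client, where the bucket name and prefix are hypothetical placeholders:

from typing import List, Optional

from google.cloud import storage


def select_objects(
    bucket: storage.Bucket,
    blob_name: Optional[str] = None,
    prefix: Optional[str] = None,
) -> List[storage.Blob]:
    # Mirrors the loader's precedence: an explicit blob wins over a
    # prefix listing; with neither set, the whole bucket is listed.
    if blob_name is not None:
        return [bucket.blob(blob_name)]
    if prefix is not None:
        return list(bucket.list_blobs(prefix=prefix))
    return list(bucket.list_blobs())


client = storage.Client()  # uses Application Default Credentials
bucket = client.bucket("my-example-bucket")  # hypothetical bucket
for blob in select_objects(bucket, prefix="docs/"):  # hypothetical prefix
    data = blob.download_as_bytes()
    print(f"gcs://{bucket.name}/{blob.name}: {len(data)} bytes")

Note that the loader itself then hands each downloaded byte stream to a reader chosen from the file's URI, rather than printing it as this sketch does.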

agno/knowledge/loaders/github.py
@@ -0,0 +1,415 @@
+"""GitHub content loader for Knowledge.
+
+Provides methods for loading content from GitHub repositories.
+"""
+
+# mypy: disable-error-code="attr-defined"
+
+from io import BytesIO
+from typing import Dict, List, Optional, cast
+
+import httpx
+from httpx import AsyncClient
+
+from agno.knowledge.content import Content, ContentStatus
+from agno.knowledge.loaders.base import BaseLoader
+from agno.knowledge.reader import Reader
+from agno.knowledge.remote_content.config import GitHubConfig, RemoteContentConfig
+from agno.knowledge.remote_content.remote_content import GitHubContent
+from agno.utils.log import log_error, log_info, log_warning
+from agno.utils.string import generate_id
+
+
+class GitHubLoader(BaseLoader):
+    """Loader for GitHub content."""
+
+    # ==========================================
+    # GITHUB HELPERS (shared between sync/async)
+    # ==========================================
+
+    def _validate_github_config(
+        self,
+        content: Content,
+        config: Optional[RemoteContentConfig],
+    ) -> Optional[GitHubConfig]:
+        """Validate and extract GitHub config.
+
+        Returns:
+            GitHubConfig if valid, None otherwise
+        """
+        remote_content: GitHubContent = cast(GitHubContent, content.remote_content)
+        gh_config = cast(GitHubConfig, config) if isinstance(config, GitHubConfig) else None
+
+        if gh_config is None:
+            log_error(f"GitHub config not found for config_id: {remote_content.config_id}")
+            return None
+
+        return gh_config
+
+    def _build_github_headers(self, gh_config: GitHubConfig) -> Dict[str, str]:
+        """Build headers for GitHub API requests."""
+        headers: Dict[str, str] = {
+            "Accept": "application/vnd.github.v3+json",
+            "User-Agent": "Agno-Knowledge",
+        }
+        if gh_config.token:
+            headers["Authorization"] = f"Bearer {gh_config.token}"
+        return headers
+
+    def _build_github_metadata(
+        self,
+        gh_config: GitHubConfig,
+        branch: str,
+        file_path: str,
+        file_name: str,
+    ) -> Dict[str, str]:
+        """Build GitHub-specific metadata dictionary."""
+        return {
+            "source_type": "github",
+            "source_config_id": gh_config.id,
+            "source_config_name": gh_config.name,
+            "github_repo": gh_config.repo,
+            "github_branch": branch,
+            "github_path": file_path,
+            "github_filename": file_name,
+        }
+
+    def _build_github_virtual_path(self, repo: str, branch: str, file_path: str) -> str:
+        """Build virtual path for GitHub content."""
+        return f"github://{repo}/{branch}/{file_path}"
+
+    def _get_github_branch(self, remote_content: GitHubContent, gh_config: GitHubConfig) -> str:
+        """Get the branch to use for GitHub operations."""
+        return remote_content.branch or gh_config.branch or "main"
+
+    def _get_github_path_to_process(self, remote_content: GitHubContent) -> str:
+        """Get the path to process from remote content."""
+        return (remote_content.file_path or remote_content.folder_path or "").rstrip("/")
+
+    def _process_github_file_content(
+        self,
+        file_data: dict,
+        client: httpx.Client,
+        headers: Dict[str, str],
+    ) -> bytes:
+        """Process GitHub API response and return file content (sync)."""
+        if file_data.get("encoding") == "base64":
+            import base64
+
+            return base64.b64decode(file_data["content"])
+        else:
+            download_url = file_data.get("download_url")
+            if download_url:
+                dl_response = client.get(download_url, headers=headers, timeout=30.0)
+                dl_response.raise_for_status()
+                return dl_response.content
+            else:
+                raise ValueError("No content or download_url in response")
+
+    async def _aprocess_github_file_content(
+        self,
+        file_data: dict,
+        client: AsyncClient,
+        headers: Dict[str, str],
+    ) -> bytes:
+        """Process GitHub API response and return file content (async)."""
+        if file_data.get("encoding") == "base64":
+            import base64
+
+            return base64.b64decode(file_data["content"])
+        else:
+            download_url = file_data.get("download_url")
+            if download_url:
+                dl_response = await client.get(download_url, headers=headers, timeout=30.0)
+                dl_response.raise_for_status()
+                return dl_response.content
+            else:
+                raise ValueError("No content or download_url in response")
+
+    # ==========================================
+    # GITHUB LOADERS
+    # ==========================================
+
+    async def _aload_from_github(
+        self,
+        content: Content,
+        upsert: bool,
+        skip_if_exists: bool,
+        config: Optional[RemoteContentConfig] = None,
+    ):
+        """Load content from GitHub (async).
+
+        Requires the GitHub config to contain repo and optionally token for private repos.
+        Uses the GitHub API to fetch file contents.
+        """
+        remote_content: GitHubContent = cast(GitHubContent, content.remote_content)
+        gh_config = self._validate_github_config(content, config)
+        if gh_config is None:
+            return
+
+        headers = self._build_github_headers(gh_config)
+        branch = self._get_github_branch(remote_content, gh_config)
+        path_to_process = self._get_github_path_to_process(remote_content)
+
+        files_to_process: List[Dict[str, str]] = []
+
+        async with AsyncClient() as client:
+            # Helper function to recursively list all files in a folder
+            async def list_files_recursive(folder: str) -> List[Dict[str, str]]:
+                """Recursively list all files in a GitHub folder."""
+                files: List[Dict[str, str]] = []
+                api_url = f"https://api.github.com/repos/{gh_config.repo}/contents/{folder}"
+                if branch:
+                    api_url += f"?ref={branch}"
+
+                try:
+                    response = await client.get(api_url, headers=headers, timeout=30.0)
+                    response.raise_for_status()
+                    items = response.json()
+
+                    if not isinstance(items, list):
+                        items = [items]
+
+                    for item in items:
+                        if item.get("type") == "file":
+                            files.append({"path": item["path"], "name": item["name"]})
+                        elif item.get("type") == "dir":
+                            subdir_files = await list_files_recursive(item["path"])
+                            files.extend(subdir_files)
+                except Exception as e:
+                    log_error(f"Error listing GitHub folder {folder}: {e}")
+
+                return files
+
+            if path_to_process:
+                api_url = f"https://api.github.com/repos/{gh_config.repo}/contents/{path_to_process}"
+                if branch:
+                    api_url += f"?ref={branch}"
+
+                try:
+                    response = await client.get(api_url, headers=headers, timeout=30.0)
+                    response.raise_for_status()
+                    path_data = response.json()
+
+                    if isinstance(path_data, list):
+                        for item in path_data:
+                            if item.get("type") == "file":
+                                files_to_process.append({"path": item["path"], "name": item["name"]})
+                            elif item.get("type") == "dir":
+                                subdir_files = await list_files_recursive(item["path"])
+                                files_to_process.extend(subdir_files)
+                    else:
+                        files_to_process.append({"path": path_data["path"], "name": path_data["name"]})
+                except Exception as e:
+                    log_error(f"Error fetching GitHub path {path_to_process}: {e}")
+                    return
+
+            if not files_to_process:
+                log_warning(f"No files found at GitHub path: {path_to_process}")
+                return
+
+            log_info(f"Processing {len(files_to_process)} file(s) from GitHub")
+            is_folder_upload = len(files_to_process) > 1
+
+            for file_info in files_to_process:
+                file_path = file_info["path"]
+                file_name = file_info["name"]
+
+                # Build metadata and virtual path using helpers
+                virtual_path = self._build_github_virtual_path(gh_config.repo, branch, file_path)
+                github_metadata = self._build_github_metadata(gh_config, branch, file_path, file_name)
+                merged_metadata = self._merge_metadata(github_metadata, content.metadata)
+
+                # Compute content name using base helper
+                content_name = self._compute_content_name(
+                    file_path, file_name, content.name, path_to_process, is_folder_upload
+                )
+
+                # Create content entry using base helper
+                content_entry = self._create_content_entry(
+                    content, content_name, virtual_path, merged_metadata, "github", is_folder_upload
+                )
+
+                await self._ainsert_contents_db(content_entry)
+
+                if self._should_skip(content_entry.content_hash, skip_if_exists):
+                    content_entry.status = ContentStatus.COMPLETED
+                    await self._aupdate_content(content_entry)
+                    continue
+
+                # Fetch file content
+                api_url = f"https://api.github.com/repos/{gh_config.repo}/contents/{file_path}"
+                if branch:
+                    api_url += f"?ref={branch}"
+                try:
+                    response = await client.get(api_url, headers=headers, timeout=30.0)
+                    response.raise_for_status()
+                    file_data = response.json()
+                    file_content = await self._aprocess_github_file_content(file_data, client, headers)
+                except Exception as e:
+                    log_error(f"Error fetching GitHub file {file_path}: {e}")
+                    content_entry.status = ContentStatus.FAILED
+                    content_entry.status_message = str(e)
+                    await self._aupdate_content(content_entry)
+                    continue
+
+                # Select reader and read content
+                reader = self._select_reader_by_uri(file_name, content.reader)
+                if reader is None:
+                    log_warning(f"No reader found for file: {file_name}")
+                    content_entry.status = ContentStatus.FAILED
+                    content_entry.status_message = "No suitable reader found"
+                    await self._aupdate_content(content_entry)
+                    continue
+
+                reader = cast(Reader, reader)
+                readable_content = BytesIO(file_content)
+                read_documents = await reader.async_read(readable_content, name=file_name)
+
+                # Prepare and insert into vector database
+                if not content_entry.id:
+                    content_entry.id = generate_id(content_entry.content_hash or "")
+                self._prepare_documents_for_insert(read_documents, content_entry.id)
+                await self._ahandle_vector_db_insert(content_entry, read_documents, upsert)
+
+    def _load_from_github(
+        self,
+        content: Content,
+        upsert: bool,
+        skip_if_exists: bool,
+        config: Optional[RemoteContentConfig] = None,
+    ):
+        """Load content from GitHub (sync).
+
+        Requires the GitHub config to contain repo and optionally token for private repos.
+        Uses the GitHub API to fetch file contents.
+        """
+        remote_content: GitHubContent = cast(GitHubContent, content.remote_content)
+        gh_config = self._validate_github_config(content, config)
+        if gh_config is None:
+            return
+
+        headers = self._build_github_headers(gh_config)
+        branch = self._get_github_branch(remote_content, gh_config)
+        path_to_process = self._get_github_path_to_process(remote_content)
+
+        files_to_process: List[Dict[str, str]] = []
+
+        with httpx.Client() as client:
+            # Helper function to recursively list all files in a folder
+            def list_files_recursive(folder: str) -> List[Dict[str, str]]:
+                """Recursively list all files in a GitHub folder."""
+                files: List[Dict[str, str]] = []
+                api_url = f"https://api.github.com/repos/{gh_config.repo}/contents/{folder}"
+                if branch:
+                    api_url += f"?ref={branch}"
+
+                try:
+                    response = client.get(api_url, headers=headers, timeout=30.0)
+                    response.raise_for_status()
+                    items = response.json()
+
+                    if not isinstance(items, list):
+                        items = [items]
+
+                    for item in items:
+                        if item.get("type") == "file":
+                            files.append({"path": item["path"], "name": item["name"]})
+                        elif item.get("type") == "dir":
+                            subdir_files = list_files_recursive(item["path"])
+                            files.extend(subdir_files)
+                except Exception as e:
+                    log_error(f"Error listing GitHub folder {folder}: {e}")
+
+                return files
+
+            if path_to_process:
+                api_url = f"https://api.github.com/repos/{gh_config.repo}/contents/{path_to_process}"
+                if branch:
+                    api_url += f"?ref={branch}"
+
+                try:
+                    response = client.get(api_url, headers=headers, timeout=30.0)
+                    response.raise_for_status()
+                    path_data = response.json()
+
+                    if isinstance(path_data, list):
+                        for item in path_data:
+                            if item.get("type") == "file":
+                                files_to_process.append({"path": item["path"], "name": item["name"]})
+                            elif item.get("type") == "dir":
+                                subdir_files = list_files_recursive(item["path"])
+                                files_to_process.extend(subdir_files)
+                    else:
+                        files_to_process.append({"path": path_data["path"], "name": path_data["name"]})
+                except Exception as e:
+                    log_error(f"Error fetching GitHub path {path_to_process}: {e}")
+                    return
+
+            if not files_to_process:
+                log_warning(f"No files found at GitHub path: {path_to_process}")
+                return
+
+            log_info(f"Processing {len(files_to_process)} file(s) from GitHub")
+            is_folder_upload = len(files_to_process) > 1
+
+            for file_info in files_to_process:
+                file_path = file_info["path"]
+                file_name = file_info["name"]
+
+                # Build metadata and virtual path using helpers
+                virtual_path = self._build_github_virtual_path(gh_config.repo, branch, file_path)
+                github_metadata = self._build_github_metadata(gh_config, branch, file_path, file_name)
+                merged_metadata = self._merge_metadata(github_metadata, content.metadata)
+
+                # Compute content name using base helper
+                content_name = self._compute_content_name(
+                    file_path, file_name, content.name, path_to_process, is_folder_upload
+                )
+
+                # Create content entry using base helper
+                content_entry = self._create_content_entry(
+                    content, content_name, virtual_path, merged_metadata, "github", is_folder_upload
+                )
+
+                self._insert_contents_db(content_entry)
+
+                if self._should_skip(content_entry.content_hash, skip_if_exists):
+                    content_entry.status = ContentStatus.COMPLETED
+                    self._update_content(content_entry)
+                    continue
+
+                # Fetch file content
+                api_url = f"https://api.github.com/repos/{gh_config.repo}/contents/{file_path}"
+                if branch:
+                    api_url += f"?ref={branch}"
+                try:
+                    response = client.get(api_url, headers=headers, timeout=30.0)
+                    response.raise_for_status()
+                    file_data = response.json()
+                    file_content = self._process_github_file_content(file_data, client, headers)
+                except Exception as e:
+                    log_error(f"Error fetching GitHub file {file_path}: {e}")
+                    content_entry.status = ContentStatus.FAILED
+                    content_entry.status_message = str(e)
+                    self._update_content(content_entry)
+                    continue
+
+                # Select reader and read content
+                reader = self._select_reader_by_uri(file_name, content.reader)
+                if reader is None:
+                    log_warning(f"No reader found for file: {file_name}")
+                    content_entry.status = ContentStatus.FAILED
+                    content_entry.status_message = "No suitable reader found"
+                    self._update_content(content_entry)
+                    continue
+
+                reader = cast(Reader, reader)
+                readable_content = BytesIO(file_content)
+                read_documents = reader.read(readable_content, name=file_name)
+
+                # Prepare and insert into vector database
+                if not content_entry.id:
+                    content_entry.id = generate_id(content_entry.content_hash or "")
+                self._prepare_documents_for_insert(read_documents, content_entry.id)
+                self._handle_vector_db_insert(content_entry, read_documents, upsert)
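
Both GitHub loaders walk the contents API the same way: list a path, recurse into `dir` entries, then fetch each file either from the base64 payload the API inlines for small files or from its `download_url`. A minimal standalone sketch of that pattern with `httpx`; the repo, ref, and User-Agent are hypothetical placeholders:

import base64
from typing import Dict, List

import httpx

API = "https://api.github.com/repos/{repo}/contents/{path}"


def list_files(client: httpx.Client, repo: str, path: str, ref: str, headers: Dict[str, str]) -> List[dict]:
    """List files under a repo path, recursing into directories."""
    resp = client.get(API.format(repo=repo, path=path), params={"ref": ref}, headers=headers, timeout=30.0)
    resp.raise_for_status()
    items = resp.json()
    if not isinstance(items, list):  # a single file comes back as one object
        items = [items]
    files: List[dict] = []
    for item in items:
        if item["type"] == "file":
            files.append(item)
        elif item["type"] == "dir":
            files.extend(list_files(client, repo, item["path"], ref, headers))
    return files


def fetch(client: httpx.Client, item: dict, headers: Dict[str, str]) -> bytes:
    """Decode the inlined base64 payload if present, else hit download_url."""
    if item.get("encoding") == "base64":
        return base64.b64decode(item["content"])
    resp = client.get(item["download_url"], headers=headers, timeout=30.0)
    resp.raise_for_status()
    return resp.content


# Directory listings include a download_url but no inlined content, so
# fetch() falls through to the download branch for listed files. Add an
# Authorization header with a token for private repos.
headers = {"Accept": "application/vnd.github.v3+json", "User-Agent": "example"}
with httpx.Client() as client:
    for f in list_files(client, "octocat/Hello-World", "", "master", headers):  # hypothetical repo/ref
        print(f["path"], len(fetch(client, f, headers)), "bytes")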