agno 2.4.6__py3-none-any.whl → 2.4.8__py3-none-any.whl

This diff shows the changes between two publicly released versions of this package, as they appear in their public registry. It is provided for informational purposes only.
Files changed (51)
  1. agno/agent/agent.py +5 -1
  2. agno/db/base.py +2 -0
  3. agno/db/postgres/postgres.py +5 -5
  4. agno/db/singlestore/singlestore.py +4 -5
  5. agno/db/sqlite/sqlite.py +4 -4
  6. agno/knowledge/embedder/aws_bedrock.py +325 -106
  7. agno/knowledge/knowledge.py +83 -1853
  8. agno/knowledge/loaders/__init__.py +29 -0
  9. agno/knowledge/loaders/azure_blob.py +423 -0
  10. agno/knowledge/loaders/base.py +187 -0
  11. agno/knowledge/loaders/gcs.py +267 -0
  12. agno/knowledge/loaders/github.py +415 -0
  13. agno/knowledge/loaders/s3.py +281 -0
  14. agno/knowledge/loaders/sharepoint.py +439 -0
  15. agno/knowledge/reader/website_reader.py +2 -2
  16. agno/knowledge/remote_knowledge.py +151 -0
  17. agno/knowledge/reranker/aws_bedrock.py +299 -0
  18. agno/learn/machine.py +5 -6
  19. agno/learn/stores/session_context.py +10 -2
  20. agno/models/azure/openai_chat.py +6 -11
  21. agno/models/neosantara/__init__.py +5 -0
  22. agno/models/neosantara/neosantara.py +42 -0
  23. agno/models/utils.py +5 -0
  24. agno/os/app.py +4 -1
  25. agno/os/interfaces/agui/router.py +1 -1
  26. agno/os/routers/components/components.py +2 -0
  27. agno/os/routers/knowledge/knowledge.py +0 -1
  28. agno/os/routers/registry/registry.py +340 -192
  29. agno/os/routers/workflows/router.py +7 -1
  30. agno/os/schema.py +104 -0
  31. agno/registry/registry.py +4 -0
  32. agno/run/workflow.py +3 -0
  33. agno/session/workflow.py +1 -1
  34. agno/skills/utils.py +100 -2
  35. agno/team/team.py +6 -3
  36. agno/tools/mcp/mcp.py +26 -1
  37. agno/vectordb/lancedb/lance_db.py +22 -7
  38. agno/workflow/__init__.py +4 -0
  39. agno/workflow/cel.py +299 -0
  40. agno/workflow/condition.py +280 -58
  41. agno/workflow/loop.py +177 -46
  42. agno/workflow/parallel.py +75 -4
  43. agno/workflow/router.py +260 -44
  44. agno/workflow/step.py +14 -7
  45. agno/workflow/steps.py +43 -0
  46. agno/workflow/workflow.py +104 -46
  47. {agno-2.4.6.dist-info → agno-2.4.8.dist-info}/METADATA +25 -37
  48. {agno-2.4.6.dist-info → agno-2.4.8.dist-info}/RECORD +51 -39
  49. {agno-2.4.6.dist-info → agno-2.4.8.dist-info}/WHEEL +0 -0
  50. {agno-2.4.6.dist-info → agno-2.4.8.dist-info}/licenses/LICENSE +0 -0
  51. {agno-2.4.6.dist-info → agno-2.4.8.dist-info}/top_level.txt +0 -0
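
The headline change in this release is the new agno/knowledge/loaders package: a shared BaseLoader plus dedicated loaders for Azure Blob Storage, GCS, GitHub, S3, and SharePoint, with agno/knowledge/knowledge.py slimming down by roughly 1,800 lines as per-source logic moves out. Two of the new modules, gcs.py (+267) and github.py (+415), are reproduced in full below. A minimal import sketch, assuming the new loaders/__init__.py (+29) re-exports the loader classes (an inference from the file list, not a verified API):

    # Assumed re-exports from the new loaders package; inferred from the
    # file list above, not verified against the released wheel.
    from agno.knowledge.loaders import GCSLoader, GitHubLoader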
agno/knowledge/loaders/gcs.py
@@ -0,0 +1,267 @@
+"""GCS content loader for Knowledge.
+
+Provides methods for loading content from Google Cloud Storage.
+"""
+
+# mypy: disable-error-code="attr-defined"
+
+from io import BytesIO
+from typing import Any, Dict, Optional, cast
+
+from agno.knowledge.content import Content, ContentStatus
+from agno.knowledge.loaders.base import BaseLoader
+from agno.knowledge.reader import Reader
+from agno.knowledge.remote_content.config import GcsConfig, RemoteContentConfig
+from agno.knowledge.remote_content.remote_content import GCSContent
+from agno.utils.log import log_info, log_warning
+from agno.utils.string import generate_id
+
+
+class GCSLoader(BaseLoader):
+    """Loader for Google Cloud Storage content."""
+
+    # ==========================================
+    # GCS HELPERS (shared between sync/async)
+    # ==========================================
+
+    def _validate_gcs_config(
+        self,
+        content: Content,
+        config: Optional[RemoteContentConfig],
+    ) -> Optional[GcsConfig]:
+        """Validate and extract GCS config.
+
+        Returns:
+            GcsConfig if valid, None otherwise (GCS can work without explicit config)
+        """
+        return cast(GcsConfig, config) if isinstance(config, GcsConfig) else None
+
+    def _get_gcs_client(self, gcs_config: Optional[GcsConfig]):
+        """Get a GCS client.
+
+        Requires the `google-cloud-storage` package.
+        """
+        try:
+            from google.cloud import storage  # type: ignore
+        except ImportError:
+            raise ImportError(
+                "The `google-cloud-storage` package is not installed. "
+                "Please install it via `pip install google-cloud-storage`."
+            )
+
+        if gcs_config and gcs_config.credentials_path:
+            return storage.Client.from_service_account_json(gcs_config.credentials_path)
+        elif gcs_config and gcs_config.project:
+            return storage.Client(project=gcs_config.project)
+        else:
+            return storage.Client()
+
+    def _build_gcs_metadata(
+        self,
+        gcs_config: Optional[GcsConfig],
+        bucket_name: str,
+        blob_name: str,
+    ) -> Dict[str, str]:
+        """Build GCS-specific metadata dictionary."""
+        metadata: Dict[str, str] = {
+            "source_type": "gcs",
+            "gcs_bucket": bucket_name,
+            "gcs_blob_name": blob_name,
+        }
+        if gcs_config:
+            metadata["source_config_id"] = gcs_config.id
+            metadata["source_config_name"] = gcs_config.name
+        return metadata
+
+    def _build_gcs_virtual_path(self, bucket_name: str, blob_name: str) -> str:
+        """Build virtual path for GCS content."""
+        return f"gcs://{bucket_name}/{blob_name}"
+
+    # ==========================================
+    # GCS LOADERS
+    # ==========================================
+
+    async def _aload_from_gcs(
+        self,
+        content: Content,
+        upsert: bool,
+        skip_if_exists: bool,
+        config: Optional[RemoteContentConfig] = None,
+    ):
+        """Load content from Google Cloud Storage (async).
+
+        Note: Uses sync google-cloud-storage calls as it doesn't have an async API.
+        """
+        try:
+            from google.cloud import storage  # type: ignore # noqa: F401
+        except ImportError:
+            raise ImportError(
+                "The `google-cloud-storage` package is not installed. "
+                "Please install it via `pip install google-cloud-storage`."
+            )
+
+        log_warning(
+            "GCS content loading has limited features. "
+            "Recursive folder traversal, rich metadata, and improved naming are coming in a future release."
+        )
+
+        remote_content: GCSContent = cast(GCSContent, content.remote_content)
+        gcs_config = self._validate_gcs_config(content, config)
+
+        # Get or create bucket
+        bucket = remote_content.bucket
+        if bucket is None and remote_content.bucket_name:
+            client = self._get_gcs_client(gcs_config)
+            bucket = client.bucket(remote_content.bucket_name)
+
+        # Identify objects to read
+        objects_to_read = []
+        if remote_content.blob_name is not None:
+            objects_to_read.append(bucket.blob(remote_content.blob_name))  # type: ignore
+        elif remote_content.prefix is not None:
+            objects_to_read.extend(bucket.list_blobs(prefix=remote_content.prefix))  # type: ignore
+        else:
+            objects_to_read.extend(bucket.list_blobs())  # type: ignore
+
+        if objects_to_read:
+            log_info(f"Processing {len(objects_to_read)} file(s) from GCS")
+
+        bucket_name = remote_content.bucket_name or (bucket.name if bucket else "unknown")
+        is_folder_upload = len(objects_to_read) > 1
+        root_path = remote_content.prefix or ""
+
+        for gcs_object in objects_to_read:
+            blob_name = gcs_object.name
+            file_name = blob_name.split("/")[-1]
+
+            # Build metadata and virtual path using helpers
+            virtual_path = self._build_gcs_virtual_path(bucket_name, blob_name)
+            gcs_metadata = self._build_gcs_metadata(gcs_config, bucket_name, blob_name)
+            merged_metadata: Dict[str, Any] = self._merge_metadata(gcs_metadata, content.metadata)
+
+            # Compute content name using base helper
+            content_name = self._compute_content_name(blob_name, file_name, content.name, root_path, is_folder_upload)
+
+            # Create content entry
+            content_entry = Content(
+                name=content_name,
+                description=content.description,
+                path=virtual_path,
+                status=ContentStatus.PROCESSING,
+                metadata=merged_metadata,
+                file_type="gcs",
+            )
+            content_entry.content_hash = self._build_content_hash(content_entry)
+            content_entry.id = generate_id(content_entry.content_hash)
+
+            await self._ainsert_contents_db(content_entry)
+
+            if self._should_skip(content_entry.content_hash, skip_if_exists):
+                content_entry.status = ContentStatus.COMPLETED
+                await self._aupdate_content(content_entry)
+                continue
+
+            # Select reader
+            reader = self._select_reader_by_uri(gcs_object.name, content.reader)
+            reader = cast(Reader, reader)
+
+            # Fetch and load the content
+            readable_content = BytesIO(gcs_object.download_as_bytes())
+
+            # Read the content
+            read_documents = await reader.async_read(readable_content, name=file_name)
+
+            # Prepare and insert the content in the vector database
+            self._prepare_documents_for_insert(read_documents, content_entry.id)
+            await self._ahandle_vector_db_insert(content_entry, read_documents, upsert)
+
+    def _load_from_gcs(
+        self,
+        content: Content,
+        upsert: bool,
+        skip_if_exists: bool,
+        config: Optional[RemoteContentConfig] = None,
+    ):
+        """Load content from Google Cloud Storage (sync)."""
+        try:
+            from google.cloud import storage  # type: ignore # noqa: F401
+        except ImportError:
+            raise ImportError(
+                "The `google-cloud-storage` package is not installed. "
+                "Please install it via `pip install google-cloud-storage`."
+            )
+
+        log_warning(
+            "GCS content loading has limited features. "
+            "Recursive folder traversal, rich metadata, and improved naming are coming in a future release."
+        )
+
+        remote_content: GCSContent = cast(GCSContent, content.remote_content)
+        gcs_config = self._validate_gcs_config(content, config)
+
+        # Get or create bucket
+        bucket = remote_content.bucket
+        if bucket is None and remote_content.bucket_name:
+            client = self._get_gcs_client(gcs_config)
+            bucket = client.bucket(remote_content.bucket_name)
+
+        # Identify objects to read
+        objects_to_read = []
+        if remote_content.blob_name is not None:
+            objects_to_read.append(bucket.blob(remote_content.blob_name))  # type: ignore
+        elif remote_content.prefix is not None:
+            objects_to_read.extend(bucket.list_blobs(prefix=remote_content.prefix))  # type: ignore
+        else:
+            objects_to_read.extend(bucket.list_blobs())  # type: ignore
+
+        if objects_to_read:
+            log_info(f"Processing {len(objects_to_read)} file(s) from GCS")
+
+        bucket_name = remote_content.bucket_name or (bucket.name if bucket else "unknown")
+        is_folder_upload = len(objects_to_read) > 1
+        root_path = remote_content.prefix or ""
+
+        for gcs_object in objects_to_read:
+            blob_name = gcs_object.name
+            file_name = blob_name.split("/")[-1]
+
+            # Build metadata and virtual path using helpers
+            virtual_path = self._build_gcs_virtual_path(bucket_name, blob_name)
+            gcs_metadata = self._build_gcs_metadata(gcs_config, bucket_name, blob_name)
+            merged_metadata: Dict[str, Any] = self._merge_metadata(gcs_metadata, content.metadata)
+
+            # Compute content name using base helper
+            content_name = self._compute_content_name(blob_name, file_name, content.name, root_path, is_folder_upload)
+
+            # Create content entry
+            content_entry = Content(
+                name=content_name,
+                description=content.description,
+                path=virtual_path,
+                status=ContentStatus.PROCESSING,
+                metadata=merged_metadata,
+                file_type="gcs",
+            )
+            content_entry.content_hash = self._build_content_hash(content_entry)
+            content_entry.id = generate_id(content_entry.content_hash)
+
+            self._insert_contents_db(content_entry)
+
+            if self._should_skip(content_entry.content_hash, skip_if_exists):
+                content_entry.status = ContentStatus.COMPLETED
+                self._update_content(content_entry)
+                continue
+
+            # Select reader
+            reader = self._select_reader_by_uri(gcs_object.name, content.reader)
+            reader = cast(Reader, reader)
+
+            # Fetch and load the content
+            readable_content = BytesIO(gcs_object.download_as_bytes())
+
+            # Read the content
+            read_documents = reader.read(readable_content, name=file_name)
+
+            # Prepare and insert the content in the vector database
+            self._prepare_documents_for_insert(read_documents, content_entry.id)
+            self._handle_vector_db_insert(content_entry, read_documents, upsert)
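
The GCS loader resolves objects to index in three tiers: an explicit blob_name wins, then a prefix listing, then the whole bucket. Each blob becomes one Content row whose id derives from a content hash, so re-running with skip_if_exists=True marks unchanged entries COMPLETED and skips re-reading them. A minimal standalone sketch of that selection logic against the real google-cloud-storage client (bucket and prefix names are placeholders):

    from io import BytesIO
    from typing import Optional

    from google.cloud import storage

    def iter_gcs_objects(bucket_name: str, blob_name: Optional[str] = None, prefix: Optional[str] = None):
        # Mirrors the loader's three-tier selection: single blob > prefix > whole bucket.
        bucket = storage.Client().bucket(bucket_name)
        if blob_name is not None:
            return [bucket.blob(blob_name)]
        if prefix is not None:
            return list(bucket.list_blobs(prefix=prefix))
        return list(bucket.list_blobs())

    for blob in iter_gcs_objects("my-bucket", prefix="guides/"):
        print(f"gcs://my-bucket/{blob.name}")  # the "virtual path" stored on the Content row
        data = BytesIO(blob.download_as_bytes())  # the stream handed to the selected Reader

Note that both variants make blocking download calls; as the docstring says, google-cloud-storage has no async API, so _aload_from_gcs will block the event loop on large downloads.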
agno/knowledge/loaders/github.py
@@ -0,0 +1,415 @@
+"""GitHub content loader for Knowledge.
+
+Provides methods for loading content from GitHub repositories.
+"""
+
+# mypy: disable-error-code="attr-defined"
+
+from io import BytesIO
+from typing import Dict, List, Optional, cast
+
+import httpx
+from httpx import AsyncClient
+
+from agno.knowledge.content import Content, ContentStatus
+from agno.knowledge.loaders.base import BaseLoader
+from agno.knowledge.reader import Reader
+from agno.knowledge.remote_content.config import GitHubConfig, RemoteContentConfig
+from agno.knowledge.remote_content.remote_content import GitHubContent
+from agno.utils.log import log_error, log_info, log_warning
+from agno.utils.string import generate_id
+
+
+class GitHubLoader(BaseLoader):
+    """Loader for GitHub content."""
+
+    # ==========================================
+    # GITHUB HELPERS (shared between sync/async)
+    # ==========================================
+
+    def _validate_github_config(
+        self,
+        content: Content,
+        config: Optional[RemoteContentConfig],
+    ) -> Optional[GitHubConfig]:
+        """Validate and extract GitHub config.
+
+        Returns:
+            GitHubConfig if valid, None otherwise
+        """
+        remote_content: GitHubContent = cast(GitHubContent, content.remote_content)
+        gh_config = cast(GitHubConfig, config) if isinstance(config, GitHubConfig) else None
+
+        if gh_config is None:
+            log_error(f"GitHub config not found for config_id: {remote_content.config_id}")
+            return None
+
+        return gh_config
+
+    def _build_github_headers(self, gh_config: GitHubConfig) -> Dict[str, str]:
+        """Build headers for GitHub API requests."""
+        headers: Dict[str, str] = {
+            "Accept": "application/vnd.github.v3+json",
+            "User-Agent": "Agno-Knowledge",
+        }
+        if gh_config.token:
+            headers["Authorization"] = f"Bearer {gh_config.token}"
+        return headers
+
+    def _build_github_metadata(
+        self,
+        gh_config: GitHubConfig,
+        branch: str,
+        file_path: str,
+        file_name: str,
+    ) -> Dict[str, str]:
+        """Build GitHub-specific metadata dictionary."""
+        return {
+            "source_type": "github",
+            "source_config_id": gh_config.id,
+            "source_config_name": gh_config.name,
+            "github_repo": gh_config.repo,
+            "github_branch": branch,
+            "github_path": file_path,
+            "github_filename": file_name,
+        }
+
+    def _build_github_virtual_path(self, repo: str, branch: str, file_path: str) -> str:
+        """Build virtual path for GitHub content."""
+        return f"github://{repo}/{branch}/{file_path}"
+
+    def _get_github_branch(self, remote_content: GitHubContent, gh_config: GitHubConfig) -> str:
+        """Get the branch to use for GitHub operations."""
+        return remote_content.branch or gh_config.branch or "main"
+
+    def _get_github_path_to_process(self, remote_content: GitHubContent) -> str:
+        """Get the path to process from remote content."""
+        return (remote_content.file_path or remote_content.folder_path or "").rstrip("/")
+
+    def _process_github_file_content(
+        self,
+        file_data: dict,
+        client: httpx.Client,
+        headers: Dict[str, str],
+    ) -> bytes:
+        """Process GitHub API response and return file content (sync)."""
+        if file_data.get("encoding") == "base64":
+            import base64
+
+            return base64.b64decode(file_data["content"])
+        else:
+            download_url = file_data.get("download_url")
+            if download_url:
+                dl_response = client.get(download_url, headers=headers, timeout=30.0)
+                dl_response.raise_for_status()
+                return dl_response.content
+            else:
+                raise ValueError("No content or download_url in response")
+
+    async def _aprocess_github_file_content(
+        self,
+        file_data: dict,
+        client: AsyncClient,
+        headers: Dict[str, str],
+    ) -> bytes:
+        """Process GitHub API response and return file content (async)."""
+        if file_data.get("encoding") == "base64":
+            import base64
+
+            return base64.b64decode(file_data["content"])
+        else:
+            download_url = file_data.get("download_url")
+            if download_url:
+                dl_response = await client.get(download_url, headers=headers, timeout=30.0)
+                dl_response.raise_for_status()
+                return dl_response.content
+            else:
+                raise ValueError("No content or download_url in response")
+
+    # ==========================================
+    # GITHUB LOADERS
+    # ==========================================
+
+    async def _aload_from_github(
+        self,
+        content: Content,
+        upsert: bool,
+        skip_if_exists: bool,
+        config: Optional[RemoteContentConfig] = None,
+    ):
+        """Load content from GitHub (async).
+
+        Requires the GitHub config to contain repo and optionally token for private repos.
+        Uses the GitHub API to fetch file contents.
+        """
+        remote_content: GitHubContent = cast(GitHubContent, content.remote_content)
+        gh_config = self._validate_github_config(content, config)
+        if gh_config is None:
+            return
+
+        headers = self._build_github_headers(gh_config)
+        branch = self._get_github_branch(remote_content, gh_config)
+        path_to_process = self._get_github_path_to_process(remote_content)
+
+        files_to_process: List[Dict[str, str]] = []
+
+        async with AsyncClient() as client:
+            # Helper function to recursively list all files in a folder
+            async def list_files_recursive(folder: str) -> List[Dict[str, str]]:
+                """Recursively list all files in a GitHub folder."""
+                files: List[Dict[str, str]] = []
+                api_url = f"https://api.github.com/repos/{gh_config.repo}/contents/{folder}"
+                if branch:
+                    api_url += f"?ref={branch}"
+
+                try:
+                    response = await client.get(api_url, headers=headers, timeout=30.0)
+                    response.raise_for_status()
+                    items = response.json()
+
+                    if not isinstance(items, list):
+                        items = [items]
+
+                    for item in items:
+                        if item.get("type") == "file":
+                            files.append({"path": item["path"], "name": item["name"]})
+                        elif item.get("type") == "dir":
+                            subdir_files = await list_files_recursive(item["path"])
+                            files.extend(subdir_files)
+                except Exception as e:
+                    log_error(f"Error listing GitHub folder {folder}: {e}")
+
+                return files
+
+            if path_to_process:
+                api_url = f"https://api.github.com/repos/{gh_config.repo}/contents/{path_to_process}"
+                if branch:
+                    api_url += f"?ref={branch}"
+
+                try:
+                    response = await client.get(api_url, headers=headers, timeout=30.0)
+                    response.raise_for_status()
+                    path_data = response.json()
+
+                    if isinstance(path_data, list):
+                        for item in path_data:
+                            if item.get("type") == "file":
+                                files_to_process.append({"path": item["path"], "name": item["name"]})
+                            elif item.get("type") == "dir":
+                                subdir_files = await list_files_recursive(item["path"])
+                                files_to_process.extend(subdir_files)
+                    else:
+                        files_to_process.append({"path": path_data["path"], "name": path_data["name"]})
+                except Exception as e:
+                    log_error(f"Error fetching GitHub path {path_to_process}: {e}")
+                    return
+
+            if not files_to_process:
+                log_warning(f"No files found at GitHub path: {path_to_process}")
+                return
+
+            log_info(f"Processing {len(files_to_process)} file(s) from GitHub")
+            is_folder_upload = len(files_to_process) > 1
+
+            for file_info in files_to_process:
+                file_path = file_info["path"]
+                file_name = file_info["name"]
+
+                # Build metadata and virtual path using helpers
+                virtual_path = self._build_github_virtual_path(gh_config.repo, branch, file_path)
+                github_metadata = self._build_github_metadata(gh_config, branch, file_path, file_name)
+                merged_metadata = self._merge_metadata(github_metadata, content.metadata)
+
+                # Compute content name using base helper
+                content_name = self._compute_content_name(
+                    file_path, file_name, content.name, path_to_process, is_folder_upload
+                )
+
+                # Create content entry using base helper
+                content_entry = self._create_content_entry(
+                    content, content_name, virtual_path, merged_metadata, "github", is_folder_upload
+                )
+
+                await self._ainsert_contents_db(content_entry)
+
+                if self._should_skip(content_entry.content_hash, skip_if_exists):
+                    content_entry.status = ContentStatus.COMPLETED
+                    await self._aupdate_content(content_entry)
+                    continue
+
+                # Fetch file content
+                api_url = f"https://api.github.com/repos/{gh_config.repo}/contents/{file_path}"
+                if branch:
+                    api_url += f"?ref={branch}"
+                try:
+                    response = await client.get(api_url, headers=headers, timeout=30.0)
+                    response.raise_for_status()
+                    file_data = response.json()
+                    file_content = await self._aprocess_github_file_content(file_data, client, headers)
+                except Exception as e:
+                    log_error(f"Error fetching GitHub file {file_path}: {e}")
+                    content_entry.status = ContentStatus.FAILED
+                    content_entry.status_message = str(e)
+                    await self._aupdate_content(content_entry)
+                    continue
+
+                # Select reader and read content
+                reader = self._select_reader_by_uri(file_name, content.reader)
+                if reader is None:
+                    log_warning(f"No reader found for file: {file_name}")
+                    content_entry.status = ContentStatus.FAILED
+                    content_entry.status_message = "No suitable reader found"
+                    await self._aupdate_content(content_entry)
+                    continue
+
+                reader = cast(Reader, reader)
+                readable_content = BytesIO(file_content)
+                read_documents = await reader.async_read(readable_content, name=file_name)
+
+                # Prepare and insert into vector database
+                if not content_entry.id:
+                    content_entry.id = generate_id(content_entry.content_hash or "")
+                self._prepare_documents_for_insert(read_documents, content_entry.id)
+                await self._ahandle_vector_db_insert(content_entry, read_documents, upsert)
+
+    def _load_from_github(
+        self,
+        content: Content,
+        upsert: bool,
+        skip_if_exists: bool,
+        config: Optional[RemoteContentConfig] = None,
+    ):
+        """Load content from GitHub (sync).
+
+        Requires the GitHub config to contain repo and optionally token for private repos.
+        Uses the GitHub API to fetch file contents.
+        """
+        remote_content: GitHubContent = cast(GitHubContent, content.remote_content)
+        gh_config = self._validate_github_config(content, config)
+        if gh_config is None:
+            return
+
+        headers = self._build_github_headers(gh_config)
+        branch = self._get_github_branch(remote_content, gh_config)
+        path_to_process = self._get_github_path_to_process(remote_content)
+
+        files_to_process: List[Dict[str, str]] = []
+
+        with httpx.Client() as client:
+            # Helper function to recursively list all files in a folder
+            def list_files_recursive(folder: str) -> List[Dict[str, str]]:
+                """Recursively list all files in a GitHub folder."""
+                files: List[Dict[str, str]] = []
+                api_url = f"https://api.github.com/repos/{gh_config.repo}/contents/{folder}"
+                if branch:
+                    api_url += f"?ref={branch}"
+
+                try:
+                    response = client.get(api_url, headers=headers, timeout=30.0)
+                    response.raise_for_status()
+                    items = response.json()
+
+                    if not isinstance(items, list):
+                        items = [items]
+
+                    for item in items:
+                        if item.get("type") == "file":
+                            files.append({"path": item["path"], "name": item["name"]})
+                        elif item.get("type") == "dir":
+                            subdir_files = list_files_recursive(item["path"])
+                            files.extend(subdir_files)
+                except Exception as e:
+                    log_error(f"Error listing GitHub folder {folder}: {e}")
+
+                return files
+
+            if path_to_process:
+                api_url = f"https://api.github.com/repos/{gh_config.repo}/contents/{path_to_process}"
+                if branch:
+                    api_url += f"?ref={branch}"
+
+                try:
+                    response = client.get(api_url, headers=headers, timeout=30.0)
+                    response.raise_for_status()
+                    path_data = response.json()
+
+                    if isinstance(path_data, list):
+                        for item in path_data:
+                            if item.get("type") == "file":
+                                files_to_process.append({"path": item["path"], "name": item["name"]})
+                            elif item.get("type") == "dir":
+                                subdir_files = list_files_recursive(item["path"])
+                                files_to_process.extend(subdir_files)
+                    else:
+                        files_to_process.append({"path": path_data["path"], "name": path_data["name"]})
+                except Exception as e:
+                    log_error(f"Error fetching GitHub path {path_to_process}: {e}")
+                    return
+
+            if not files_to_process:
+                log_warning(f"No files found at GitHub path: {path_to_process}")
+                return
+
+            log_info(f"Processing {len(files_to_process)} file(s) from GitHub")
+            is_folder_upload = len(files_to_process) > 1
+
+            for file_info in files_to_process:
+                file_path = file_info["path"]
+                file_name = file_info["name"]
+
+                # Build metadata and virtual path using helpers
+                virtual_path = self._build_github_virtual_path(gh_config.repo, branch, file_path)
+                github_metadata = self._build_github_metadata(gh_config, branch, file_path, file_name)
+                merged_metadata = self._merge_metadata(github_metadata, content.metadata)
+
+                # Compute content name using base helper
+                content_name = self._compute_content_name(
+                    file_path, file_name, content.name, path_to_process, is_folder_upload
+                )
+
+                # Create content entry using base helper
+                content_entry = self._create_content_entry(
+                    content, content_name, virtual_path, merged_metadata, "github", is_folder_upload
+                )
+
+                self._insert_contents_db(content_entry)
+
+                if self._should_skip(content_entry.content_hash, skip_if_exists):
+                    content_entry.status = ContentStatus.COMPLETED
+                    self._update_content(content_entry)
+                    continue
+
+                # Fetch file content
+                api_url = f"https://api.github.com/repos/{gh_config.repo}/contents/{file_path}"
+                if branch:
+                    api_url += f"?ref={branch}"
+                try:
+                    response = client.get(api_url, headers=headers, timeout=30.0)
+                    response.raise_for_status()
+                    file_data = response.json()
+                    file_content = self._process_github_file_content(file_data, client, headers)
+                except Exception as e:
+                    log_error(f"Error fetching GitHub file {file_path}: {e}")
+                    content_entry.status = ContentStatus.FAILED
+                    content_entry.status_message = str(e)
+                    self._update_content(content_entry)
+                    continue
+
+                # Select reader and read content
+                reader = self._select_reader_by_uri(file_name, content.reader)
+                if reader is None:
+                    log_warning(f"No reader found for file: {file_name}")
+                    content_entry.status = ContentStatus.FAILED
+                    content_entry.status_message = "No suitable reader found"
+                    self._update_content(content_entry)
+                    continue
+
+                reader = cast(Reader, reader)
+                readable_content = BytesIO(file_content)
+                read_documents = reader.read(readable_content, name=file_name)
+
+                # Prepare and insert into vector database
+                if not content_entry.id:
+                    content_entry.id = generate_id(content_entry.content_hash or "")
+                self._prepare_documents_for_insert(read_documents, content_entry.id)
+                self._handle_vector_db_insert(content_entry, read_documents, upsert)
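
Both GitHub paths drive the REST Contents API: directories are listed (recursing into type == "dir" entries) and each file is fetched from /repos/{repo}/contents/{path}?ref={branch}, with small files arriving base64-encoded inline and larger ones only via download_url. One practical caveat: unauthenticated requests are limited to 60 per hour, and the recursive listing costs one request per directory plus one per file, so a token is effectively required for any non-trivial repository. A minimal standalone sketch of the fetch path (the repo and path values are placeholders):

    import base64
    from typing import Dict, Optional

    import httpx

    def fetch_github_file(repo: str, path: str, branch: str = "main", token: Optional[str] = None) -> bytes:
        # Same endpoint and fallback order the loader uses: inline base64, then download_url.
        headers: Dict[str, str] = {"Accept": "application/vnd.github.v3+json", "User-Agent": "Agno-Knowledge"}
        if token:
            headers["Authorization"] = f"Bearer {token}"
        response = httpx.get(
            f"https://api.github.com/repos/{repo}/contents/{path}?ref={branch}",
            headers=headers,
            timeout=30.0,
        )
        response.raise_for_status()
        file_data = response.json()
        if file_data.get("encoding") == "base64":
            return base64.b64decode(file_data["content"])
        return httpx.get(file_data["download_url"], headers=headers, timeout=30.0).content

    readme = fetch_github_file("agno-agi/agno", "README.md")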