bioguider 0.2.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of bioguider might be problematic.
- bioguider/__init__.py +0 -0
- bioguider/agents/__init__.py +0 -0
- bioguider/agents/agent_task.py +88 -0
- bioguider/agents/agent_tools.py +147 -0
- bioguider/agents/agent_utils.py +357 -0
- bioguider/agents/collection_execute_step.py +180 -0
- bioguider/agents/collection_observe_step.py +113 -0
- bioguider/agents/collection_plan_step.py +154 -0
- bioguider/agents/collection_task.py +179 -0
- bioguider/agents/collection_task_utils.py +109 -0
- bioguider/agents/common_agent.py +159 -0
- bioguider/agents/common_agent_2step.py +126 -0
- bioguider/agents/common_step.py +85 -0
- bioguider/agents/dockergeneration_execute_step.py +186 -0
- bioguider/agents/dockergeneration_observe_step.py +153 -0
- bioguider/agents/dockergeneration_plan_step.py +158 -0
- bioguider/agents/dockergeneration_task.py +158 -0
- bioguider/agents/dockergeneration_task_utils.py +220 -0
- bioguider/agents/evaluation_task.py +269 -0
- bioguider/agents/identification_execute_step.py +179 -0
- bioguider/agents/identification_observe_step.py +92 -0
- bioguider/agents/identification_plan_step.py +135 -0
- bioguider/agents/identification_task.py +220 -0
- bioguider/agents/identification_task_utils.py +18 -0
- bioguider/agents/peo_common_step.py +64 -0
- bioguider/agents/prompt_utils.py +190 -0
- bioguider/agents/python_ast_repl_tool.py +69 -0
- bioguider/agents/rag_collection_task.py +130 -0
- bioguider/conversation.py +67 -0
- bioguider/database/summarized_file_db.py +140 -0
- bioguider/managers/evaluation_manager.py +108 -0
- bioguider/rag/__init__.py +0 -0
- bioguider/rag/config.py +117 -0
- bioguider/rag/data_pipeline.py +648 -0
- bioguider/rag/embedder.py +24 -0
- bioguider/rag/rag.py +134 -0
- bioguider/settings.py +103 -0
- bioguider/utils/constants.py +40 -0
- bioguider/utils/default.gitignore +140 -0
- bioguider/utils/file_utils.py +126 -0
- bioguider/utils/gitignore_checker.py +175 -0
- bioguider/utils/pyphen_utils.py +73 -0
- bioguider/utils/utils.py +27 -0
- bioguider-0.2.3.dist-info/LICENSE +21 -0
- bioguider-0.2.3.dist-info/METADATA +44 -0
- bioguider-0.2.3.dist-info/RECORD +47 -0
- bioguider-0.2.3.dist-info/WHEEL +4 -0
bioguider/rag/data_pipeline.py
@@ -0,0 +1,648 @@

from typing import Tuple
import adalflow as adal
from adalflow.core.types import Document, List
from adalflow.components.data_process import TextSplitter, ToEmbeddings
import os
import subprocess
import json
import tiktoken
import logging
import base64
import re
import glob

from adalflow.core.db import LocalDB
from binaryornot.check import is_binary

from ..utils.gitignore_checker import GitignoreChecker
from ..utils.file_utils import retrieve_data_root_path
from .config import configs, create_model_client, create_model_kwargs

logger = logging.getLogger(__name__)

# Maximum token limit for OpenAI embedding models
MAX_EMBEDDING_TOKENS = 8192

def count_tokens(text: str, model: str = "text-embedding-3-small") -> int:
    """
    Count the number of tokens in a text string using tiktoken.

    Args:
        text (str): The text to count tokens for.
        model (str): The model to use for tokenization.

    Returns:
        int: The number of tokens in the text.
    """
    try:
        encoding = tiktoken.encoding_for_model(model)
        return len(encoding.encode(text))
    except Exception as e:
        # Fallback to a simple approximation if tiktoken fails
        logger.warning(f"Error counting tokens with tiktoken: {e}")
        # Rough approximation: 4 characters per token
        return len(text) // 4
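
A minimal usage sketch (editorial, not part of the package): count_tokens pairs with MAX_EMBEDDING_TOKENS to pre-screen text before embedding. The import path follows the file listing above; the file name is a placeholder.

from bioguider.rag.data_pipeline import MAX_EMBEDDING_TOKENS, count_tokens

text = open("README.md", encoding="utf-8").read()  # any text file
n = count_tokens(text)  # defaults to the text-embedding-3-small tokenizer
if n > MAX_EMBEDDING_TOKENS:
    print(f"skip: {n} tokens exceeds the embedding limit")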

def download_repo(repo_url: str, local_path: str, access_token: str = None):
    """
    Downloads a Git repository (GitHub or GitLab) to a specified local path.

    Args:
        repo_url (str): The URL of the Git repository to clone.
        local_path (str): The local directory where the repository will be cloned.
        access_token (str, optional): Access token for private repositories.

    Returns:
        str: The output message from the `git` command.
    """
    try:
        # Check if Git is installed
        logger.info(f"Preparing to clone repository to {local_path}")
        subprocess.run(
            ["git", "--version"],
            check=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )

        # Check if repository already exists
        if os.path.exists(local_path) and os.listdir(local_path):
            # Directory exists and is not empty
            logger.warning(f"Repository already exists at {local_path}. Using existing repository.")
            return f"Using existing repository at {local_path}"

        # Ensure the local path exists
        os.makedirs(local_path, exist_ok=True)

        # Prepare the clone URL with access token if provided
        clone_url = repo_url
        if access_token:
            # Determine the repository type and format the URL accordingly
            if "github.com" in repo_url:
                # Format: https://{token}@github.com/owner/repo.git
                clone_url = repo_url.replace("https://", f"https://{access_token}@")
            elif "gitlab.com" in repo_url:
                # Format: https://oauth2:{token}@gitlab.com/owner/repo.git
                clone_url = repo_url.replace("https://", f"https://oauth2:{access_token}@")

            logger.info("Using access token for authentication")

        # Clone the repository
        logger.info(f"Cloning repository from {repo_url} to {local_path}")
        # We use repo_url in the log to avoid exposing the token in logs
        result = subprocess.run(
            ["git", "clone", clone_url, local_path],
            check=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )

        logger.info("Repository cloned successfully")
        return result.stdout.decode("utf-8")

    except subprocess.CalledProcessError as e:
        error_msg = e.stderr.decode('utf-8')
        # Sanitize error message to remove any tokens
        if access_token and access_token in error_msg:
            error_msg = error_msg.replace(access_token, "***TOKEN***")
        raise ValueError(f"Error during cloning: {error_msg}")
    except Exception as e:
        raise ValueError(f"An unexpected error occurred: {str(e)}")

# Alias for backward compatibility
download_github_repo = download_repo

# File extensions to look for, prioritizing code files
code_extensions = [".py", ".js", ".ts", ".java", ".cpp", ".c", ".go", ".rs",
                   ".jsx", ".tsx", ".html", ".css", ".scss", ".php", ".swift", ".cs"]
doc_extensions = [".md", ".txt", ".rst", ".json", ".yaml", ".yml"]

def get_all_valid_doc_and_code_files(dir_path: str, all_valid_files: List[str] | None = None) -> Tuple[List[str], List[str]]:
    all_valid_code_files = []
    all_valid_doc_files = []
    if all_valid_files is None:
        for ext in code_extensions:
            files = glob.glob(f"{dir_path}/**/*{ext}", recursive=True)
            all_valid_code_files.extend(files)
        for ext in doc_extensions:
            files = glob.glob(f"{dir_path}/**/*{ext}", recursive=True)
            all_valid_doc_files.extend(files)
        return all_valid_doc_files, all_valid_code_files

    for f in all_valid_files:
        _, ext = os.path.splitext(f)
        f = os.path.join(dir_path, f)
        if ext in code_extensions:
            all_valid_code_files.append(f)
        elif ext in doc_extensions:
            all_valid_doc_files.append(f)
        else:
            if not is_binary(f):
                all_valid_doc_files.append(f)

    return all_valid_doc_files, all_valid_code_files
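
A sketch of both call modes (paths are hypothetical): with no pre-filtered list the function globs the tree by extension; with a list (e.g. from GitignoreChecker) it classifies each entry by extension, falling back to binaryornot to keep non-binary files with unknown extensions as documentation.

from bioguider.rag.data_pipeline import get_all_valid_doc_and_code_files

# Glob mode: scan a directory tree directly.
doc_files, code_files = get_all_valid_doc_and_code_files("/tmp/bioguider_demo_repo")

# List mode: classify relative paths that survived .gitignore filtering.
doc_files, code_files = get_all_valid_doc_and_code_files(
    "/tmp/bioguider_demo_repo", ["README.md", "src/main.py", "LICENSE"]
)
print(len(doc_files), "doc files;", len(code_files), "code files")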

def read_all_documents(path: str) -> tuple[list[Document], list[Document]]:
    """
    Recursively reads all documents in a directory and its subdirectories.

    Args:
        path (str): The root directory path.

    Returns:
        tuple: a tuple of two lists of Document objects with metadata.
    """
    doc_documents = []
    code_documents = []

    # Get excluded files and directories from config
    excluded_dirs = configs.get("file_filters", {}).get("excluded_dirs", [".venv", "node_modules"])
    excluded_files = configs.get("file_filters", {}).get("excluded_files", ["package-lock.json"])

    logger.info(f"Reading documents from {path}")

    all_valid_files: List[str] | None = None
    if os.path.exists(os.path.join(path, ".gitignore")):
        # Use GitignoreChecker to get excluded patterns
        gitignore_checker = GitignoreChecker(
            directory=path,
            gitignore_path=os.path.join(path, ".gitignore"),
            exclude_dir_patterns=configs["file_filters"]["excluded_dirs"],
            exclude_file_patterns=configs["file_filters"]["excluded_files"],
        )
        all_valid_files = gitignore_checker.check_files_and_folders()
    doc_files, code_files = get_all_valid_doc_and_code_files(path, all_valid_files)

    # Process code files first
    for file_path in code_files:
        # Skip excluded directories and files
        is_excluded = False
        if any(excluded in file_path for excluded in excluded_dirs):
            is_excluded = True
        if not is_excluded and any(os.path.basename(file_path) == excluded for excluded in excluded_files):
            is_excluded = True
        if is_excluded:
            continue

        try:
            with open(file_path, "r", encoding="utf-8") as f:
                content = f.read()
            relative_path = os.path.relpath(file_path, path)
            _, ext = os.path.splitext(relative_path)

            # Determine if this is an implementation file
            is_implementation = (
                not relative_path.startswith("test_")
                and not relative_path.startswith("app_")
                and "test" not in relative_path.lower()
            )

            # Check token count
            token_count = count_tokens(content)
            if token_count > MAX_EMBEDDING_TOKENS:
                logger.warning(f"Skipping large file {relative_path}: Token count ({token_count}) exceeds limit")
                continue

            doc = Document(
                text=content,
                meta_data={
                    "file_path": relative_path,
                    "type": ext[1:] if len(ext) > 1 else "unknown",
                    "is_code": True,
                    "is_implementation": is_implementation,
                    "title": relative_path,
                    "token_count": token_count,
                },
            )
            code_documents.append(doc)
        except Exception as e:
            logger.error(f"Error reading {file_path}: {e}")

    # Then process documentation files
    for file_path in doc_files:
        # Skip excluded directories and files
        is_excluded = False
        if any(excluded in file_path for excluded in excluded_dirs):
            is_excluded = True
        if not is_excluded and any(os.path.basename(file_path) == excluded for excluded in excluded_files):
            is_excluded = True
        if is_excluded:
            continue

        try:
            with open(file_path, "r", encoding="utf-8") as f:
                content = f.read()
            relative_path = os.path.relpath(file_path, path)
            _, ext = os.path.splitext(relative_path)

            # Check token count
            token_count = count_tokens(content)
            if token_count > MAX_EMBEDDING_TOKENS:
                logger.warning(f"Skipping large file {relative_path}: Token count ({token_count}) exceeds limit")
                continue

            doc = Document(
                text=content,
                meta_data={
                    "file_path": relative_path,
                    "type": ext[1:] if len(ext) > 1 else "unknown",
                    "is_code": False,
                    "is_implementation": False,
                    "title": relative_path,
                    "token_count": token_count,
                },
            )
            doc_documents.append(doc)
        except Exception as e:
            logger.error(f"Error reading {file_path}: {e}")

    logger.info(f"Found {len(doc_documents)} doc documents")
    logger.info(f"Found {len(code_documents)} code documents")
    return doc_documents, code_documents
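
An illustrative call over a local checkout (path is a placeholder); each returned Document carries the metadata fields set above.

from bioguider.rag.data_pipeline import read_all_documents

doc_documents, code_documents = read_all_documents("/tmp/bioguider_demo_repo")
for doc in doc_documents[:3]:
    meta = doc.meta_data
    print(meta["file_path"], meta["type"], meta["token_count"])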

def prepare_data_pipeline():
    """Creates and returns the data transformation pipeline."""
    splitter = TextSplitter(**configs["text_splitter"])
    embedder = adal.Embedder(
        model_client=create_model_client(),
        model_kwargs=create_model_kwargs(),
    )
    embedder_transformer = ToEmbeddings(
        embedder=embedder, batch_size=configs["embedder"]["batch_size"]
    )
    data_transformer = adal.Sequential(
        splitter, embedder_transformer
    )  # sequential will chain together splitter and embedder
    return data_transformer
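
The returned adal.Sequential chains two stages: TextSplitter turns each Document into chunk Documents, and ToEmbeddings attaches vectors to those chunks in batches. A sketch, assuming adalflow's Sequential is callable like torch's (in this package the pipeline is instead registered on a LocalDB; see the next function):

from bioguider.rag.data_pipeline import prepare_data_pipeline

pipeline = prepare_data_pipeline()  # TextSplitter -> ToEmbeddings
# Assumption: applying the pipeline directly yields embedded chunk Documents.
# embedded_chunks = pipeline(documents)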

def transform_documents_and_save_to_db(
    documents: List[Document], db_path: str
) -> LocalDB:
    """
    Transforms a list of documents and saves them to a local database.

    Args:
        documents (list): A list of `Document` objects.
        db_path (str): The path to the local database file.

    Returns:
        LocalDB: The database holding the transformed documents.
    """
    # Get the data transformer
    data_transformer = prepare_data_pipeline()

    # Save the documents to a local database
    db = LocalDB()
    db.register_transformer(transformer=data_transformer, key="split_and_embed")
    db.load(documents)
    db.transform(key="split_and_embed")
    os.makedirs(os.path.dirname(db_path), exist_ok=True)
    db.save_state(filepath=db_path)
    return db
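
A hypothetical end-to-end indexing run (all paths are placeholders), retrieving the embedded chunks back with the same "split_and_embed" key used at registration:

from bioguider.rag.data_pipeline import (
    read_all_documents,
    transform_documents_and_save_to_db,
)

doc_documents, code_documents = read_all_documents("/tmp/bioguider_demo_repo")
doc_db = transform_documents_and_save_to_db(doc_documents, "/tmp/databases/demo_doc.pkl")
chunks = doc_db.get_transformed_data(key="split_and_embed")
print(f"{len(chunks)} embedded chunks persisted")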

def get_github_file_content(repo_url: str, file_path: str, access_token: str = None) -> str:
    """
    Retrieves the content of a file from a GitHub repository using the GitHub API.

    Args:
        repo_url (str): The URL of the GitHub repository (e.g., "https://github.com/username/repo")
        file_path (str): The path to the file within the repository (e.g., "src/main.py")
        access_token (str, optional): GitHub personal access token for private repositories

    Returns:
        str: The content of the file as a string

    Raises:
        ValueError: If the file cannot be fetched or if the URL is not a valid GitHub URL
    """
    try:
        # Extract owner and repo name from GitHub URL
        if not (repo_url.startswith("https://github.com/") or repo_url.startswith("http://github.com/")):
            raise ValueError("Not a valid GitHub repository URL")

        parts = repo_url.rstrip('/').split('/')
        if len(parts) < 5:
            raise ValueError("Invalid GitHub URL format")

        owner = parts[-2]
        repo = parts[-1].replace(".git", "")

        # Use GitHub API to get file content
        # The API endpoint for getting file content is: /repos/{owner}/{repo}/contents/{path}
        api_url = f"https://api.github.com/repos/{owner}/{repo}/contents/{file_path}"

        # Prepare curl command with authentication if token is provided
        curl_cmd = ["curl", "-s"]
        if access_token:
            curl_cmd.extend(["-H", f"Authorization: token {access_token}"])
        curl_cmd.append(api_url)

        logger.info(f"Fetching file content from GitHub API: {api_url}")
        result = subprocess.run(
            curl_cmd,
            check=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )

        content_data = json.loads(result.stdout.decode("utf-8"))

        # Check if we got an error response
        if "message" in content_data and "documentation_url" in content_data:
            raise ValueError(f"GitHub API error: {content_data['message']}")

        # GitHub API returns file content as base64 encoded string
        if "content" in content_data and "encoding" in content_data:
            if content_data["encoding"] == "base64":
                # The content might be split into lines, so join them first
                content_base64 = content_data["content"].replace("\n", "")
                content = base64.b64decode(content_base64).decode("utf-8")
                return content
            else:
                raise ValueError(f"Unexpected encoding: {content_data['encoding']}")
        else:
            raise ValueError("File content not found in GitHub API response")

    except subprocess.CalledProcessError as e:
        error_msg = e.stderr.decode('utf-8')
        # Sanitize error message to remove any tokens
        if access_token and access_token in error_msg:
            error_msg = error_msg.replace(access_token, "***TOKEN***")
        raise ValueError(f"Error fetching file content: {error_msg}")
    except json.JSONDecodeError:
        raise ValueError("Invalid response from GitHub API")
    except Exception as e:
        raise ValueError(f"Failed to get file content: {str(e)}")

def get_gitlab_file_content(repo_url: str, file_path: str, access_token: str = None) -> str:
    """
    Retrieves the content of a file from a GitLab repository using the GitLab API.

    Args:
        repo_url (str): The URL of the GitLab repository (e.g., "https://gitlab.com/username/repo")
        file_path (str): The path to the file within the repository (e.g., "src/main.py")
        access_token (str, optional): GitLab personal access token for private repositories

    Returns:
        str: The content of the file as a string

    Raises:
        ValueError: If the file cannot be fetched or if the URL is not a valid GitLab URL
    """
    try:
        # Extract owner and repo name from GitLab URL
        if not (repo_url.startswith("https://gitlab.com/") or repo_url.startswith("http://gitlab.com/")):
            raise ValueError("Not a valid GitLab repository URL")

        parts = repo_url.rstrip('/').split('/')
        if len(parts) < 5:
            raise ValueError("Invalid GitLab URL format")

        # For GitLab, the URL format can be:
        # - https://gitlab.com/username/repo
        # - https://gitlab.com/group/subgroup/repo
        # We need to extract the project path with namespace

        # Remove the domain part
        path_parts = parts[3:]
        # Join the remaining parts to get the project path with namespace
        project_path = '/'.join(path_parts).replace(".git", "")
        # URL encode the path for API use
        encoded_project_path = project_path.replace('/', '%2F')

        # Use GitLab API to get file content
        # The API endpoint for getting file content is: /api/v4/projects/{encoded_project_path}/repository/files/{encoded_file_path}/raw
        encoded_file_path = file_path.replace('/', '%2F')
        api_url = f"https://gitlab.com/api/v4/projects/{encoded_project_path}/repository/files/{encoded_file_path}/raw?ref=main"

        # Prepare curl command with authentication if token is provided
        curl_cmd = ["curl", "-s"]
        if access_token:
            curl_cmd.extend(["-H", f"PRIVATE-TOKEN: {access_token}"])
        curl_cmd.append(api_url)

        logger.info(f"Fetching file content from GitLab API: {api_url}")
        result = subprocess.run(
            curl_cmd,
            check=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )

        # GitLab API returns the raw file content directly
        content = result.stdout.decode("utf-8")

        # Check if we got an error response (GitLab returns JSON for errors)
        if content.startswith('{') and '"message":' in content:
            try:
                error_data = json.loads(content)
                if "message" in error_data:
                    # Try with 'master' branch if 'main' failed
                    api_url = f"https://gitlab.com/api/v4/projects/{encoded_project_path}/repository/files/{encoded_file_path}/raw?ref=master"
                    logger.info(f"Retrying with master branch: {api_url}")

                    # Prepare curl command for retry
                    curl_cmd = ["curl", "-s"]
                    if access_token:
                        curl_cmd.extend(["-H", f"PRIVATE-TOKEN: {access_token}"])
                    curl_cmd.append(api_url)

                    result = subprocess.run(
                        curl_cmd,
                        check=True,
                        stdout=subprocess.PIPE,
                        stderr=subprocess.PIPE,
                    )
                    content = result.stdout.decode("utf-8")

                    # Check again for error
                    if content.startswith('{') and '"message":' in content:
                        error_data = json.loads(content)
                        if "message" in error_data:
                            raise ValueError(f"GitLab API error: {error_data['message']}")
            except json.JSONDecodeError:
                # If it's not valid JSON, it's probably the file content
                pass

        return content

    except subprocess.CalledProcessError as e:
        error_msg = e.stderr.decode('utf-8')
        # Sanitize error message to remove any tokens
        if access_token and access_token in error_msg:
            error_msg = error_msg.replace(access_token, "***TOKEN***")
        raise ValueError(f"Error fetching file content: {error_msg}")
    except Exception as e:
        raise ValueError(f"Failed to get file content: {str(e)}")

def get_file_content(repo_url: str, file_path: str, access_token: str = None) -> str:
    """
    Retrieves the content of a file from a Git repository (GitHub or GitLab).

    Args:
        repo_url (str): The URL of the repository
        file_path (str): The path to the file within the repository
        access_token (str, optional): Access token for private repositories

    Returns:
        str: The content of the file as a string

    Raises:
        ValueError: If the file cannot be fetched or if the URL is not valid
    """
    if "github.com" in repo_url:
        return get_github_file_content(repo_url, file_path, access_token)
    elif "gitlab.com" in repo_url:
        return get_gitlab_file_content(repo_url, file_path, access_token)
    else:
        raise ValueError("Unsupported repository URL. Only GitHub and GitLab are supported.")

class DatabaseManager:
    """
    Manages the creation, loading, transformation, and persistence of LocalDB instances.
    """

    def __init__(self):
        self.doc_db = None
        self.code_db = None
        self.repo_url_or_path = None
        self.repo_paths = None

    def prepare_database(self, repo_url_or_path: str, access_token: str = None) -> Tuple[List[Document], List[Document]]:
        """
        Create a new database from the repository.

        Args:
            repo_url_or_path (str): The URL or local path of the repository
            access_token (str, optional): Access token for private repositories

        Returns:
            Tuple[List[Document], List[Document]]: Tuple of two Lists of Document objects
        """
        self.reset_database()
        self._create_repo(repo_url_or_path, access_token)
        return self.prepare_db_index()

    def reset_database(self):
        """
        Reset the database to its initial state.
        """
        self.doc_db = None
        self.code_db = None
        self.repo_url_or_path = None
        self.repo_paths = None

    def _create_repo(self, repo_url_or_path: str, access_token: str = None) -> None:
        """
        Download and prepare all paths.
        Paths:
            ~/.adalflow/repos/{repo_name} (for url, local path will be the same)
            ~/.adalflow/databases/{repo_name}.pkl

        Args:
            repo_url_or_path (str): The URL or local path of the repository
            access_token (str, optional): Access token for private repositories
        """
        logger.info(f"Preparing repo storage for {repo_url_or_path}...")

        try:
            root_path = retrieve_data_root_path()

            os.makedirs(root_path, exist_ok=True)
            # url
            if repo_url_or_path.startswith("https://") or repo_url_or_path.startswith("http://"):
                # Extract repo name based on the URL format
                if "github.com" in repo_url_or_path:
                    # GitHub URL format: https://github.com/owner/repo
                    repo_name = repo_url_or_path.split("/")[-1].replace(".git", "")
                elif "gitlab.com" in repo_url_or_path:
                    # GitLab URL format: https://gitlab.com/owner/repo or https://gitlab.com/group/subgroup/repo
                    # Use the last part of the URL as the repo name
                    repo_name = repo_url_or_path.split("/")[-1].replace(".git", "")
                else:
                    # Generic handling for other Git URLs
                    repo_name = repo_url_or_path.split("/")[-1].replace(".git", "")

                save_repo_dir = os.path.join(root_path, "repos", repo_name)

                # Check if the repository directory already exists and is not empty
                if not (os.path.exists(save_repo_dir) and os.listdir(save_repo_dir)):
                    # Only download if the repository doesn't exist or is empty
                    download_repo(repo_url_or_path, save_repo_dir, access_token)
                else:
                    logger.info(f"Repository already exists at {save_repo_dir}. Using existing repository.")
            else:  # local path
                repo_name = os.path.basename(repo_url_or_path)
                save_repo_dir = repo_url_or_path

            save_doc_db_file = os.path.join(root_path, "databases", f"{repo_name}_doc.pkl")
            save_code_db_file = os.path.join(root_path, "databases", f"{repo_name}_code.pkl")
            os.makedirs(save_repo_dir, exist_ok=True)
            os.makedirs(os.path.dirname(save_doc_db_file), exist_ok=True)

            self.repo_paths = {
                "save_repo_dir": save_repo_dir,
                "save_doc_db_file": save_doc_db_file,
                "save_code_db_file": save_code_db_file,
            }
            self.repo_url_or_path = repo_url_or_path
            logger.info(f"Repo paths: {self.repo_paths}")

        except Exception as e:
            logger.error(f"Failed to create repository structure: {e}")
            raise

    @property
    def repo_dir(self):
        if self.repo_paths and "save_repo_dir" in self.repo_paths:
            return self.repo_paths["save_repo_dir"]
        return None

    def prepare_db_index(self) -> Tuple[List[Document], List[Document]]:
        """
        Prepare the indexed database for the repository.
        :return: Tuple of two Lists of Document objects
        """
        # check the database
        if self.repo_paths and os.path.exists(self.repo_paths["save_doc_db_file"]) \
                and os.path.exists(self.repo_paths["save_code_db_file"]):
            logger.info("Loading existing database...")
            try:
                self.doc_db = LocalDB.load_state(self.repo_paths["save_doc_db_file"])
                self.code_db = LocalDB.load_state(self.repo_paths["save_code_db_file"])
                doc_documents = self.doc_db.get_transformed_data(key="split_and_embed")
                code_documents = self.code_db.get_transformed_data(key="split_and_embed")
                if doc_documents and code_documents:
                    logger.info(f"Loaded {len(doc_documents)} doc documents from existing database")
                    logger.info(f"Loaded {len(code_documents)} code documents from existing database")
                    return doc_documents, code_documents
            except Exception as e:
                logger.error(f"Error loading existing database: {e}")
                # Continue to create a new database

        # prepare the database
        logger.info("Creating new database...")
        doc_documents, code_documents = read_all_documents(self.repo_paths["save_repo_dir"])
        self.doc_db = transform_documents_and_save_to_db(
            doc_documents, self.repo_paths["save_doc_db_file"]
        )
        self.code_db = transform_documents_and_save_to_db(
            code_documents, self.repo_paths["save_code_db_file"]
        )
        logger.info(f"Total doc documents: {len(doc_documents)}")
        logger.info(f"Total code documents: {len(code_documents)}")
        transformed_doc_documents = self.doc_db.get_transformed_data(key="split_and_embed")
        transformed_code_documents = self.code_db.get_transformed_data(key="split_and_embed")
        logger.info(f"Total transformed doc documents: {len(transformed_doc_documents)}")
        logger.info(f"Total transformed code documents: {len(transformed_code_documents)}")
        return transformed_doc_documents, transformed_code_documents

    def prepare_retriever(self, repo_url_or_path: str, access_token: str = None):
        """
        Prepare the retriever for a repository.
        This is a compatibility method for the isolated API.

        Args:
            repo_url_or_path (str): The URL or local path of the repository
            access_token (str, optional): Access token for private repositories

        Returns:
            Tuple[List[Document], List[Document]]: Tuple of doc and code Document lists
        """
        return self.prepare_database(repo_url_or_path, access_token)
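
A hypothetical end-to-end run (placeholder URL): prepare_database clones or reuses the repository under the data root, builds or loads the two pickled LocalDB files, and returns the embedded doc and code chunks.

from bioguider.rag.data_pipeline import DatabaseManager

manager = DatabaseManager()
doc_chunks, code_chunks = manager.prepare_database("https://github.com/owner/repo")
print(f"{len(doc_chunks)} doc chunks, {len(code_chunks)} code chunks")
print("local checkout:", manager.repo_dir)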

bioguider/rag/embedder.py
@@ -0,0 +1,24 @@

import adalflow as adal
from .config import configs

def get_embedder():
    """
    Returns an embedder.

    Returns:
        Embedder: An instance of the Embedder class
    """
    embedder_config = configs["embedder"]

    # --- Initialize Embedder ---
    model_client_class = embedder_config["model_client"]
    if "initialize_kwargs" in embedder_config:
        model_client = model_client_class(**embedder_config["initialize_kwargs"])
    else:
        model_client = model_client_class()
    embedder = adal.Embedder(
        model_client=model_client,
        model_kwargs=embedder_config["model_kwargs"],
    )
    return embedder
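
An illustrative use of get_embedder; the input= keyword follows adalflow's Embedder call interface, so treat the exact call shape as an assumption rather than package documentation.

from bioguider.rag.embedder import get_embedder

embedder = get_embedder()
# Assumption: adalflow Embedder instances are callable with input= (str or list of str).
result = embedder(input=["embed this sentence"])
print(result)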