bioguider 0.2.52__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84)
  1. bioguider/__init__.py +0 -0
  2. bioguider/agents/__init__.py +0 -0
  3. bioguider/agents/agent_task.py +92 -0
  4. bioguider/agents/agent_tools.py +176 -0
  5. bioguider/agents/agent_utils.py +504 -0
  6. bioguider/agents/collection_execute_step.py +182 -0
  7. bioguider/agents/collection_observe_step.py +125 -0
  8. bioguider/agents/collection_plan_step.py +156 -0
  9. bioguider/agents/collection_task.py +184 -0
  10. bioguider/agents/collection_task_utils.py +142 -0
  11. bioguider/agents/common_agent.py +137 -0
  12. bioguider/agents/common_agent_2step.py +215 -0
  13. bioguider/agents/common_conversation.py +61 -0
  14. bioguider/agents/common_step.py +85 -0
  15. bioguider/agents/consistency_collection_step.py +102 -0
  16. bioguider/agents/consistency_evaluation_task.py +57 -0
  17. bioguider/agents/consistency_evaluation_task_utils.py +14 -0
  18. bioguider/agents/consistency_observe_step.py +110 -0
  19. bioguider/agents/consistency_query_step.py +77 -0
  20. bioguider/agents/dockergeneration_execute_step.py +186 -0
  21. bioguider/agents/dockergeneration_observe_step.py +154 -0
  22. bioguider/agents/dockergeneration_plan_step.py +158 -0
  23. bioguider/agents/dockergeneration_task.py +158 -0
  24. bioguider/agents/dockergeneration_task_utils.py +220 -0
  25. bioguider/agents/evaluation_installation_task.py +270 -0
  26. bioguider/agents/evaluation_readme_task.py +767 -0
  27. bioguider/agents/evaluation_submission_requirements_task.py +172 -0
  28. bioguider/agents/evaluation_task.py +206 -0
  29. bioguider/agents/evaluation_tutorial_task.py +169 -0
  30. bioguider/agents/evaluation_tutorial_task_prompts.py +187 -0
  31. bioguider/agents/evaluation_userguide_prompts.py +179 -0
  32. bioguider/agents/evaluation_userguide_task.py +154 -0
  33. bioguider/agents/evaluation_utils.py +127 -0
  34. bioguider/agents/identification_execute_step.py +181 -0
  35. bioguider/agents/identification_observe_step.py +104 -0
  36. bioguider/agents/identification_plan_step.py +140 -0
  37. bioguider/agents/identification_task.py +270 -0
  38. bioguider/agents/identification_task_utils.py +22 -0
  39. bioguider/agents/peo_common_step.py +64 -0
  40. bioguider/agents/prompt_utils.py +253 -0
  41. bioguider/agents/python_ast_repl_tool.py +69 -0
  42. bioguider/agents/rag_collection_task.py +130 -0
  43. bioguider/conversation.py +67 -0
  44. bioguider/database/code_structure_db.py +500 -0
  45. bioguider/database/summarized_file_db.py +146 -0
  46. bioguider/generation/__init__.py +39 -0
  47. bioguider/generation/benchmark_metrics.py +610 -0
  48. bioguider/generation/change_planner.py +189 -0
  49. bioguider/generation/document_renderer.py +157 -0
  50. bioguider/generation/llm_cleaner.py +67 -0
  51. bioguider/generation/llm_content_generator.py +1128 -0
  52. bioguider/generation/llm_injector.py +809 -0
  53. bioguider/generation/models.py +85 -0
  54. bioguider/generation/output_manager.py +74 -0
  55. bioguider/generation/repo_reader.py +37 -0
  56. bioguider/generation/report_loader.py +166 -0
  57. bioguider/generation/style_analyzer.py +36 -0
  58. bioguider/generation/suggestion_extractor.py +436 -0
  59. bioguider/generation/test_metrics.py +189 -0
  60. bioguider/managers/benchmark_manager.py +785 -0
  61. bioguider/managers/evaluation_manager.py +215 -0
  62. bioguider/managers/generation_manager.py +686 -0
  63. bioguider/managers/generation_test_manager.py +107 -0
  64. bioguider/managers/generation_test_manager_v2.py +525 -0
  65. bioguider/rag/__init__.py +0 -0
  66. bioguider/rag/config.py +117 -0
  67. bioguider/rag/data_pipeline.py +651 -0
  68. bioguider/rag/embedder.py +24 -0
  69. bioguider/rag/rag.py +138 -0
  70. bioguider/settings.py +103 -0
  71. bioguider/utils/code_structure_builder.py +59 -0
  72. bioguider/utils/constants.py +135 -0
  73. bioguider/utils/default.gitignore +140 -0
  74. bioguider/utils/file_utils.py +215 -0
  75. bioguider/utils/gitignore_checker.py +175 -0
  76. bioguider/utils/notebook_utils.py +117 -0
  77. bioguider/utils/pyphen_utils.py +73 -0
  78. bioguider/utils/python_file_handler.py +65 -0
  79. bioguider/utils/r_file_handler.py +551 -0
  80. bioguider/utils/utils.py +163 -0
  81. bioguider-0.2.52.dist-info/LICENSE +21 -0
  82. bioguider-0.2.52.dist-info/METADATA +51 -0
  83. bioguider-0.2.52.dist-info/RECORD +84 -0
  84. bioguider-0.2.52.dist-info/WHEEL +4 -0
bioguider/rag/data_pipeline.py
@@ -0,0 +1,651 @@
+ from typing import Tuple
+ import adalflow as adal
+ from adalflow.core.types import Document, List
+ from adalflow.components.data_process import TextSplitter, ToEmbeddings
+ import os
+ import subprocess
+ import json
+ import tiktoken
+ import logging
+ import base64
+ import re
+ import glob
+
+ from adalflow.core.db import LocalDB
+ from binaryornot.check import is_binary
+
+ from ..utils.gitignore_checker import GitignoreChecker
+ from ..utils.file_utils import retrieve_data_root_path
+ from .config import configs, create_model_client, create_model_kwargs
+
+ logger = logging.getLogger(__name__)
+
+ # Maximum token limit for OpenAI embedding models
+ MAX_EMBEDDING_TOKENS = 8192
+
+ def count_tokens(text: str, model: str = "text-embedding-3-small") -> int:
+     """
+     Count the number of tokens in a text string using tiktoken.
+
+     Args:
+         text (str): The text to count tokens for.
+         model (str): The model to use for tokenization.
+
+     Returns:
+         int: The number of tokens in the text.
+     """
+     try:
+         encoding = tiktoken.encoding_for_model(model)
+         return len(encoding.encode(text))
+     except Exception as e:
+         # Fallback to a simple approximation if tiktoken fails
+         logger.warning(f"Error counting tokens with tiktoken: {e}")
+         # Rough approximation: 4 characters per token
+         return len(text) // 4
+
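A quick sketch of the fallback behavior (assumes tiktoken is installed; unknown model names take the approximation path):

n = count_tokens("def main():\n    pass\n")                # exact tiktoken count
big = count_tokens("x" * 40_000, model="no-such-model")    # hypothetical name: tiktoken raises, so len // 4
assert big == 10_000
print(n, big, big > MAX_EMBEDDING_TOKENS)                  # 10_000 exceeds the 8192 embedding limit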
+ def download_repo(repo_url: str, local_path: str, access_token: str = None):
+     """
+     Downloads a Git repository (GitHub or GitLab) to a specified local path.
+
+     Args:
+         repo_url (str): The URL of the Git repository to clone.
+         local_path (str): The local directory where the repository will be cloned.
+         access_token (str, optional): Access token for private repositories.
+
+     Returns:
+         str: The output message from the `git` command.
+     """
+     try:
+         # Check if Git is installed
+         logger.info(f"Preparing to clone repository to {local_path}")
+         subprocess.run(
+             ["git", "--version"],
+             check=True,
+             stdout=subprocess.PIPE,
+             stderr=subprocess.PIPE,
+         )
+
+         # Check if repository already exists
+         if os.path.exists(local_path) and os.listdir(local_path):
+             # Directory exists and is not empty
+             logger.warning(f"Repository already exists at {local_path}. Using existing repository.")
+             return f"Using existing repository at {local_path}"
+
+         # Ensure the local path exists
+         os.makedirs(local_path, exist_ok=True)
+
+         # Prepare the clone URL with access token if provided
+         clone_url = repo_url
+         if access_token:
+             # Determine the repository type and format the URL accordingly
+             if "github.com" in repo_url:
+                 # Format: https://{token}@github.com/owner/repo.git
+                 clone_url = repo_url.replace("https://", f"https://{access_token}@")
+             elif "gitlab.com" in repo_url:
+                 # Format: https://oauth2:{token}@gitlab.com/owner/repo.git
+                 clone_url = repo_url.replace("https://", f"https://oauth2:{access_token}@")
+
+             logger.info("Using access token for authentication")
+
+         # Clone the repository
+         logger.info(f"Cloning repository from {repo_url} to {local_path}")
+         # We use repo_url in the log to avoid exposing the token in logs
+         result = subprocess.run(
+             ["git", "clone", "--recurse-submodules", clone_url, local_path],
+             check=True,
+             stdout=subprocess.PIPE,
+             stderr=subprocess.PIPE,
+         )
+
+         logger.info("Repository cloned successfully")
+         return result.stdout.decode("utf-8")
+
+     except subprocess.CalledProcessError as e:
+         error_msg = e.stderr.decode('utf-8')
+         # Sanitize error message to remove any tokens
+         if access_token and access_token in error_msg:
+             error_msg = error_msg.replace(access_token, "***TOKEN***")
+         raise ValueError(f"Error during cloning: {error_msg}")
+     except Exception as e:
+         raise ValueError(f"An unexpected error occurred: {str(e)}")
+
+ # Alias for backward compatibility
+ download_github_repo = download_repo
+
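A minimal sketch of the clone helper (hypothetical URL and destination; the token, when given, is spliced into the clone URL but kept out of logs):

path = "/tmp/repos/bioguider_demo"   # hypothetical destination
msg = download_repo("https://github.com/owner/repo.git", path)
print(msg)                            # git output on first run
# Once the directory is populated, a second call is a no-op:
assert "existing repository" in download_repo("https://github.com/owner/repo.git", path)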
+ # File extensions to look for, prioritizing code files
+ code_extensions = [".py", ".js", ".ts", ".java", ".cpp", ".c", ".go", ".rs",
+                    ".jsx", ".tsx", ".html", ".css", ".scss", ".php", ".swift", ".cs"]
+ doc_extensions = [".md", ".txt", ".rst", ".json", ".yaml", ".yml"]
+
+ def get_all_valid_doc_and_code_files(dir_path: str, all_valid_files: List[str] | None = None) -> Tuple[List[str], List[str]]:
+     all_valid_code_files = []
+     all_valid_doc_files = []
+     if all_valid_files is None:
+         for ext in code_extensions:
+             files = glob.glob(f"{dir_path}/**/*{ext}", recursive=True)
+             all_valid_code_files.extend(files)
+         for ext in doc_extensions:
+             files = glob.glob(f"{dir_path}/**/*{ext}", recursive=True)
+             all_valid_doc_files.extend(files)
+         return all_valid_doc_files, all_valid_code_files
+
+     for f in all_valid_files:
+         _, ext = os.path.splitext(f)
+         f = os.path.join(dir_path, f)
+         if ext in code_extensions:
+             all_valid_code_files.append(f)
+         elif ext in doc_extensions:
+             all_valid_doc_files.append(f)
+         elif not is_binary(f):
+             all_valid_doc_files.append(f)
+
+     return all_valid_doc_files, all_valid_code_files
+
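Both calling modes in brief (hypothetical paths; note the doc list comes first in the returned tuple):

# Glob mode: scan the whole tree by extension
docs, code = get_all_valid_doc_and_code_files("/tmp/repos/bioguider_demo")

# Pre-filtered mode: classify paths that already passed the .gitignore check
docs, code = get_all_valid_doc_and_code_files(
    "/tmp/repos/bioguider_demo",
    all_valid_files=["README.md", "bioguider/settings.py", "Makefile"],
)
# "Makefile" matches no extension, so it lands in docs if it is not binary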
+ def read_all_documents(path: str) -> tuple[list[Document], list[Document]]:
+     """
+     Recursively reads all documents in a directory and its subdirectories.
+
+     Args:
+         path (str): The root directory path.
+
+     Returns:
+         tuple: a tuple of two lists of Document objects with metadata.
+     """
+     doc_documents = []
+     code_documents = []
+
+     # Get excluded files and directories from config
+     excluded_dirs = configs.get("file_filters", {}).get("excluded_dirs", [".venv", "node_modules"])
+     excluded_files = configs.get("file_filters", {}).get("excluded_files", ["package-lock.json"])
+
+     logger.info(f"Reading documents from {path}")
+
+     all_valid_files: List[str] | None = None
+     if os.path.exists(os.path.join(path, ".gitignore")):
+         # Use GitignoreChecker to get excluded patterns
+         gitignore_checker = GitignoreChecker(
+             directory=path,
+             gitignore_path=os.path.join(path, ".gitignore"),
+             exclude_dir_patterns=configs["file_filters"]["excluded_dirs"],
+             exclude_file_patterns=configs["file_filters"]["excluded_files"],
+         )
+         all_valid_files = gitignore_checker.check_files_and_folders()
+     doc_files, code_files = get_all_valid_doc_and_code_files(path, all_valid_files)
+
+     # Process code files first
+     for file_path in code_files:
+         # Skip excluded directories and files
+         is_excluded = False
+         if any(excluded in file_path for excluded in excluded_dirs):
+             is_excluded = True
+         if not is_excluded and any(os.path.basename(file_path) == excluded for excluded in excluded_files):
+             is_excluded = True
+         if is_excluded:
+             continue
+
+         try:
+             with open(file_path, "r", encoding="utf-8") as f:
+                 content = f.read()
+             relative_path = os.path.relpath(file_path, path)
+             _, ext = os.path.splitext(relative_path)
+
+             # Determine if this is an implementation file
+             is_implementation = (
+                 not relative_path.startswith("test_")
+                 and not relative_path.startswith("app_")
+                 and "test" not in relative_path.lower()
+             )
+
+             # Check token count
+             token_count = count_tokens(content)
+             if token_count > MAX_EMBEDDING_TOKENS:
+                 logger.warning(f"Skipping large file {relative_path}: Token count ({token_count}) exceeds limit")
+                 continue
+
+             doc = Document(
+                 text=content,
+                 meta_data={
+                     "file_path": relative_path,
+                     "type": ext[1:] if len(ext) > 1 else "unknown",
+                     "is_code": True,
+                     "is_implementation": is_implementation,
+                     "title": relative_path,
+                     "token_count": token_count,
+                 },
+             )
+             code_documents.append(doc)
+         except Exception as e:
+             logger.error(f"Error reading {file_path}: {e}")
+
+     # Then process documentation files
+     for file_path in doc_files:
+         # Skip excluded directories and files
+         is_excluded = False
+         if any(excluded in file_path for excluded in excluded_dirs):
+             is_excluded = True
+         if not is_excluded and any(os.path.basename(file_path) == excluded for excluded in excluded_files):
+             is_excluded = True
+         if is_excluded:
+             continue
+
+         try:
+             with open(file_path, "r", encoding="utf-8") as f:
+                 content = f.read()
+             relative_path = os.path.relpath(file_path, path)
+             _, ext = os.path.splitext(relative_path)
+
+             # Check token count
+             token_count = count_tokens(content)
+             if token_count > MAX_EMBEDDING_TOKENS:
+                 logger.warning(f"Skipping large file {relative_path}: Token count ({token_count}) exceeds limit")
+                 continue
+
+             doc = Document(
+                 text=content,
+                 meta_data={
+                     "file_path": relative_path,
+                     "type": ext[1:] if len(ext) > 1 else "unknown",
+                     "is_code": False,
+                     "is_implementation": False,
+                     "title": relative_path,
+                     "token_count": token_count,
+                 },
+             )
+             doc_documents.append(doc)
+         except Exception as e:
+             logger.error(f"Error reading {file_path}: {e}")
+
+     logger.info(f"Found {len(doc_documents)} doc documents")
+     logger.info(f"Found {len(code_documents)} code documents")
+     return doc_documents, code_documents
+
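For orientation, a sketch of the reader in use (hypothetical checkout path; files over MAX_EMBEDDING_TOKENS are skipped with a warning, not truncated):

doc_docs, code_docs = read_all_documents("/tmp/repos/bioguider_demo")
for d in code_docs[:3]:
    print(d.meta_data["file_path"], d.meta_data["token_count"], d.meta_data["is_implementation"])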
+ def prepare_data_pipeline():
+     """Creates and returns the data transformation pipeline."""
+     splitter = TextSplitter(**configs["text_splitter"])
+     embedder = adal.Embedder(
+         model_client=create_model_client(),
+         model_kwargs=create_model_kwargs(),
+     )
+     embedder_transformer = ToEmbeddings(
+         embedder=embedder, batch_size=configs["embedder"]["batch_size"]
+     )
+     data_transformer = adal.Sequential(
+         splitter, embedder_transformer
+     )  # sequential will chain together splitter and embedder
+     return data_transformer
+
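The splitter and batch size are driven entirely by configs; a plausible shape of those keys, assumed for illustration since config.py is not shown in this hunk:

# Assumed example of the config keys this pipeline reads; the real values
# live in bioguider/rag/config.py, which is not part of this hunk.
configs_example = {
    "text_splitter": {"split_by": "word", "chunk_size": 350, "chunk_overlap": 100},
    "embedder": {"batch_size": 500},
}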
+ def transform_documents_and_save_to_db(
+     documents: List[Document], db_path: str
+ ) -> LocalDB:
+     """
+     Transforms a list of documents and saves them to a local database.
+
+     Args:
+         documents (list): A list of `Document` objects.
+         db_path (str): The path to the local database file.
+
+     Returns:
+         LocalDB: The database populated with the transformed documents.
+     """
+     # Get the data transformer
+     data_transformer = prepare_data_pipeline()
+
+     # Save the documents to a local database
+     db = LocalDB()
+     db.register_transformer(transformer=data_transformer, key="split_and_embed")
+     db.load(documents)
+     db.transform(key="split_and_embed")
+     os.makedirs(os.path.dirname(db_path), exist_ok=True)
+     db.save_state(filepath=db_path)
+     return db
+
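A sketch of the ingest path end to end (hypothetical paths; the "split_and_embed" key must match when reading transformed data back):

doc_docs, code_docs = read_all_documents("/tmp/repos/bioguider_demo")
db = transform_documents_and_save_to_db(doc_docs, "/tmp/databases/demo_doc.pkl")
chunks = db.get_transformed_data(key="split_and_embed")
print(f"{len(doc_docs)} documents -> {len(chunks)} embedded chunks")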
+ def get_github_file_content(repo_url: str, file_path: str, access_token: str = None) -> str:
+     """
+     Retrieves the content of a file from a GitHub repository using the GitHub API.
+
+     Args:
+         repo_url (str): The URL of the GitHub repository (e.g., "https://github.com/username/repo")
+         file_path (str): The path to the file within the repository (e.g., "src/main.py")
+         access_token (str, optional): GitHub personal access token for private repositories
+
+     Returns:
+         str: The content of the file as a string
+
+     Raises:
+         ValueError: If the file cannot be fetched or if the URL is not a valid GitHub URL
+     """
+     try:
+         # Extract owner and repo name from GitHub URL
+         if not (repo_url.startswith("https://github.com/") or repo_url.startswith("http://github.com/")):
+             raise ValueError("Not a valid GitHub repository URL")
+
+         parts = repo_url.rstrip('/').split('/')
+         if len(parts) < 5:
+             raise ValueError("Invalid GitHub URL format")
+
+         owner = parts[-2]
+         repo = parts[-1].replace(".git", "")
+
+         # Use GitHub API to get file content
+         # The API endpoint for getting file content is: /repos/{owner}/{repo}/contents/{path}
+         api_url = f"https://api.github.com/repos/{owner}/{repo}/contents/{file_path}"
+
+         # Prepare curl command with authentication if token is provided
+         curl_cmd = ["curl", "-s"]
+         if access_token:
+             curl_cmd.extend(["-H", f"Authorization: token {access_token}"])
+         curl_cmd.append(api_url)
+
+         logger.info(f"Fetching file content from GitHub API: {api_url}")
+         result = subprocess.run(
+             curl_cmd,
+             check=True,
+             stdout=subprocess.PIPE,
+             stderr=subprocess.PIPE,
+         )
+
+         content_data = json.loads(result.stdout.decode("utf-8"))
+
+         # Check if we got an error response
+         if "message" in content_data and "documentation_url" in content_data:
+             raise ValueError(f"GitHub API error: {content_data['message']}")
+
+         # GitHub API returns file content as a base64 encoded string
+         if "content" in content_data and "encoding" in content_data:
+             if content_data["encoding"] == "base64":
+                 # The content might be split into lines, so join them first
+                 content_base64 = content_data["content"].replace("\n", "")
+                 content = base64.b64decode(content_base64).decode("utf-8")
+                 return content
+             else:
+                 raise ValueError(f"Unexpected encoding: {content_data['encoding']}")
+         else:
+             raise ValueError("File content not found in GitHub API response")
+
+     except subprocess.CalledProcessError as e:
+         error_msg = e.stderr.decode('utf-8')
+         # Sanitize error message to remove any tokens
+         if access_token and access_token in error_msg:
+             error_msg = error_msg.replace(access_token, "***TOKEN***")
+         raise ValueError(f"Error fetching file content: {error_msg}")
+     except json.JSONDecodeError:
+         raise ValueError("Invalid response from GitHub API")
+     except Exception as e:
+         raise ValueError(f"Failed to get file content: {str(e)}")
+
+ def get_gitlab_file_content(repo_url: str, file_path: str, access_token: str = None) -> str:
+     """
+     Retrieves the content of a file from a GitLab repository using the GitLab API.
+
+     Args:
+         repo_url (str): The URL of the GitLab repository (e.g., "https://gitlab.com/username/repo")
+         file_path (str): The path to the file within the repository (e.g., "src/main.py")
+         access_token (str, optional): GitLab personal access token for private repositories
+
+     Returns:
+         str: The content of the file as a string
+
+     Raises:
+         ValueError: If the file cannot be fetched or if the URL is not a valid GitLab URL
+     """
+     try:
+         # Extract owner and repo name from GitLab URL
+         if not (repo_url.startswith("https://gitlab.com/") or repo_url.startswith("http://gitlab.com/")):
+             raise ValueError("Not a valid GitLab repository URL")
+
+         parts = repo_url.rstrip('/').split('/')
+         if len(parts) < 5:
+             raise ValueError("Invalid GitLab URL format")
+
+         # For GitLab, the URL format can be:
+         #   - https://gitlab.com/username/repo
+         #   - https://gitlab.com/group/subgroup/repo
+         # so we need to extract the full project path with namespace.
+
+         # Remove the domain part
+         path_parts = parts[3:]
+         # Join the remaining parts to get the project path with namespace
+         project_path = '/'.join(path_parts).replace(".git", "")
+         # URL-encode the paths for API use
+         encoded_project_path = project_path.replace('/', '%2F')
+         encoded_file_path = file_path.replace('/', '%2F')
+
+         # Use GitLab API to get file content
+         # The API endpoint is: /api/v4/projects/{encoded_project_path}/repository/files/{encoded_file_path}/raw
+         api_url = f"https://gitlab.com/api/v4/projects/{encoded_project_path}/repository/files/{encoded_file_path}/raw?ref=main"
+
+         # Prepare curl command with authentication if token is provided
+         curl_cmd = ["curl", "-s"]
+         if access_token:
+             curl_cmd.extend(["-H", f"PRIVATE-TOKEN: {access_token}"])
+         curl_cmd.append(api_url)
+
+         logger.info(f"Fetching file content from GitLab API: {api_url}")
+         result = subprocess.run(
+             curl_cmd,
+             check=True,
+             stdout=subprocess.PIPE,
+             stderr=subprocess.PIPE,
+         )
+
+         # GitLab API returns the raw file content directly
+         content = result.stdout.decode("utf-8")
+
+         # Check if we got an error response (GitLab returns JSON for errors)
+         if content.startswith('{') and '"message":' in content:
+             try:
+                 error_data = json.loads(content)
+                 if "message" in error_data:
+                     # Try with 'master' branch if 'main' failed
+                     api_url = f"https://gitlab.com/api/v4/projects/{encoded_project_path}/repository/files/{encoded_file_path}/raw?ref=master"
+                     logger.info(f"Retrying with master branch: {api_url}")
+
+                     # Prepare curl command for retry
+                     curl_cmd = ["curl", "-s"]
+                     if access_token:
+                         curl_cmd.extend(["-H", f"PRIVATE-TOKEN: {access_token}"])
+                     curl_cmd.append(api_url)
+
+                     result = subprocess.run(
+                         curl_cmd,
+                         check=True,
+                         stdout=subprocess.PIPE,
+                         stderr=subprocess.PIPE,
+                     )
+                     content = result.stdout.decode("utf-8")
+
+                     # Check again for error
+                     if content.startswith('{') and '"message":' in content:
+                         error_data = json.loads(content)
+                         if "message" in error_data:
+                             raise ValueError(f"GitLab API error: {error_data['message']}")
+             except json.JSONDecodeError:
+                 # If it's not valid JSON, it's probably the file content
+                 pass
+
+         return content
+
+     except subprocess.CalledProcessError as e:
+         error_msg = e.stderr.decode('utf-8')
+         # Sanitize error message to remove any tokens
+         if access_token and access_token in error_msg:
+             error_msg = error_msg.replace(access_token, "***TOKEN***")
+         raise ValueError(f"Error fetching file content: {error_msg}")
+     except Exception as e:
+         raise ValueError(f"Failed to get file content: {str(e)}")
+
+ def get_file_content(repo_url: str, file_path: str, access_token: str = None) -> str:
+     """
+     Retrieves the content of a file from a Git repository (GitHub or GitLab).
+
+     Args:
+         repo_url (str): The URL of the repository
+         file_path (str): The path to the file within the repository
+         access_token (str, optional): Access token for private repositories
+
+     Returns:
+         str: The content of the file as a string
+
+     Raises:
+         ValueError: If the file cannot be fetched or if the URL is not valid
+     """
+     if "github.com" in repo_url:
+         return get_github_file_content(repo_url, file_path, access_token)
+     elif "gitlab.com" in repo_url:
+         return get_gitlab_file_content(repo_url, file_path, access_token)
+     else:
+         raise ValueError("Unsupported repository URL. Only GitHub and GitLab are supported.")
+
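The dispatcher keeps call sites host-agnostic; a brief sketch with hypothetical repositories:

readme = get_file_content("https://github.com/owner/repo", "README.md")
module = get_file_content(
    "https://gitlab.com/group/subgroup/repo",
    "src/main.py",
    access_token="glpat-...",   # hypothetical token; error messages are sanitized before raising
)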
+ class DatabaseManager:
+     """
+     Manages the creation, loading, transformation, and persistence of LocalDB instances.
+     """
+
+     def __init__(self):
+         self.doc_db = None
+         self.code_db = None
+         self.repo_url_or_path = None
+         self.repo_paths = None
+
+     def reset_database_and_create_repo(self, repo_url_or_path: str, access_token: str = None):
+         self._reset_database()
+         self._create_repo(repo_url_or_path, access_token)
+
+     def prepare_database(self) -> Tuple[List[Document], List[Document]]:
+         """
+         Create a new database from the repository prepared by
+         reset_database_and_create_repo().
+
+         Returns:
+             Tuple[List[Document], List[Document]]: Tuple of two lists of Document objects
+         """
+         return self._prepare_db_index()
+
+     def _extract_repo_name_from_url(self, repo_url_or_path: str, repo_type: str) -> str:
+         # Extract owner and repo name to create a unique identifier
+         url_parts = repo_url_or_path.rstrip('/').split('/')
+
+         if repo_type in ["github", "gitlab", "bitbucket"] and len(url_parts) >= 5:
+             # GitHub URL format: https://github.com/owner/repo
+             # GitLab URL format: https://gitlab.com/owner/repo or https://gitlab.com/group/subgroup/repo
+             # Bitbucket URL format: https://bitbucket.org/owner/repo
+             owner = url_parts[-2]
+             repo = url_parts[-1].replace(".git", "")
+             repo_name = f"{owner}_{repo}"
+         else:
+             repo_name = url_parts[-1].replace(".git", "")
+         return repo_name
+
+     def _reset_database(self):
+         """
+         Reset the database to its initial state.
+         """
+         self.doc_db = None
+         self.code_db = None
+         self.repo_url_or_path = None
+         self.repo_paths = None
+
+     def _create_repo(self, repo_url_or_path: str, access_token: str = None) -> None:
+         """
+         Download the repository if needed and prepare all paths.
+         Paths:
+             ~/.adalflow/repos/{repo_name} (for a URL; a local path is used as-is)
+             ~/.adalflow/databases/{repo_name}_doc.pkl and {repo_name}_code.pkl
+
+         Args:
+             repo_url_or_path (str): The URL or local path of the repository
+             access_token (str, optional): Access token for private repositories
+         """
+         logger.info(f"Preparing repo storage for {repo_url_or_path}...")
+
+         try:
+             root_path = retrieve_data_root_path()
+
+             os.makedirs(root_path, exist_ok=True)
+             repo_type = "unknown"
+             # url
+             if repo_url_or_path.startswith("https://") or repo_url_or_path.startswith("http://"):
+                 # Extract repo name based on the URL format
+                 if "github.com" in repo_url_or_path:
+                     # GitHub URL format: https://github.com/owner/repo
+                     repo_type = "github"
+                 elif "gitlab.com" in repo_url_or_path:
+                     # GitLab URL format: https://gitlab.com/owner/repo or https://gitlab.com/group/subgroup/repo
+                     repo_type = "gitlab"
+                 repo_name = self._extract_repo_name_from_url(repo_url_or_path, repo_type)
+
+                 save_repo_dir = os.path.join(root_path, "repos", repo_name)
+
+                 # Check if the repository directory already exists and is not empty
+                 if not (os.path.exists(save_repo_dir) and os.listdir(save_repo_dir)):
+                     # Only download if the repository doesn't exist or is empty
+                     download_repo(repo_url_or_path, save_repo_dir, access_token)
+                 else:
+                     logger.info(f"Repository already exists at {save_repo_dir}. Using existing repository.")
+             else:  # local path
+                 repo_name = os.path.basename(repo_url_or_path)
+                 save_repo_dir = repo_url_or_path
+
+             save_doc_db_file = os.path.join(root_path, "databases", f"{repo_name}_doc.pkl")
+             save_code_db_file = os.path.join(root_path, "databases", f"{repo_name}_code.pkl")
+             os.makedirs(save_repo_dir, exist_ok=True)
+             os.makedirs(os.path.dirname(save_doc_db_file), exist_ok=True)
+
+             self.repo_paths = {
+                 "save_repo_dir": save_repo_dir,
+                 "save_doc_db_file": save_doc_db_file,
+                 "save_code_db_file": save_code_db_file,
+             }
+             self.repo_url_or_path = repo_url_or_path
+             logger.info(f"Repo paths: {self.repo_paths}")
+
+         except Exception as e:
+             logger.error(f"Failed to create repository structure: {e}")
+             raise
+
+     @property
+     def repo_dir(self):
+         if self.repo_paths and "save_repo_dir" in self.repo_paths:
+             return self.repo_paths["save_repo_dir"]
+         return None
+
+     def _prepare_db_index(self) -> Tuple[List[Document], List[Document]]:
+         """
+         Prepare the indexed database for the repository.
+         :return: Tuple of two lists of Document objects
+         """
+         # check the database
+         if self.repo_paths and os.path.exists(self.repo_paths["save_doc_db_file"]) \
+                 and os.path.exists(self.repo_paths["save_code_db_file"]):
+             logger.info("Loading existing database...")
+             try:
+                 self.doc_db = LocalDB.load_state(self.repo_paths["save_doc_db_file"])
+                 self.code_db = LocalDB.load_state(self.repo_paths["save_code_db_file"])
+                 doc_documents = self.doc_db.get_transformed_data(key="split_and_embed")
+                 code_documents = self.code_db.get_transformed_data(key="split_and_embed")
+                 if doc_documents and code_documents:
+                     logger.info(f"Loaded {len(doc_documents)} doc documents from existing database")
+                     logger.info(f"Loaded {len(code_documents)} code documents from existing database")
+                     return doc_documents, code_documents
+             except Exception as e:
+                 logger.error(f"Error loading existing database: {e}")
+                 # Continue to create a new database
+
+         # prepare the database
+         logger.info("Creating new database...")
+         doc_documents, code_documents = read_all_documents(self.repo_paths["save_repo_dir"])
+         self.doc_db = transform_documents_and_save_to_db(
+             doc_documents, self.repo_paths["save_doc_db_file"]
+         )
+         self.code_db = transform_documents_and_save_to_db(
+             code_documents, self.repo_paths["save_code_db_file"]
+         )
+         logger.info(f"Total doc documents: {len(doc_documents)}")
+         logger.info(f"Total code documents: {len(code_documents)}")
+         transformed_doc_documents = self.doc_db.get_transformed_data(key="split_and_embed")
+         transformed_code_documents = self.code_db.get_transformed_data(key="split_and_embed")
+         logger.info(f"Total transformed doc documents: {len(transformed_doc_documents)}")
+         logger.info(f"Total transformed code documents: {len(transformed_code_documents)}")
+         return transformed_doc_documents, transformed_code_documents
+
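Taken together, the class is meant to be driven in two calls; a sketch with a hypothetical URL:

manager = DatabaseManager()
manager.reset_database_and_create_repo("https://github.com/owner/repo")
doc_chunks, code_chunks = manager.prepare_database()   # clones, reads, splits, embeds
print(manager.repo_dir, len(doc_chunks), len(code_chunks))

# A second run finds owner_repo_doc.pkl and owner_repo_code.pkl on disk
# under the data root and skips re-embedding.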
bioguider/rag/embedder.py
@@ -0,0 +1,24 @@
+
+ import adalflow as adal
+ from .config import configs
+
+ def get_embedder():
+     """
+     Returns an embedder.
+
+     Returns:
+         Embedder: An instance of the Embedder class
+     """
+     embedder_config = configs["embedder"]
+
+     # --- Initialize Embedder ---
+     model_client_class = embedder_config["model_client"]
+     if "initialize_kwargs" in embedder_config:
+         model_client = model_client_class(**embedder_config["initialize_kwargs"])
+     else:
+         model_client = model_client_class()
+     embedder = adal.Embedder(
+         model_client=model_client,
+         model_kwargs=embedder_config["model_kwargs"],
+     )
+     return embedder
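A sketch of get_embedder in use; the config shape shown is an assumption, since config.py is not part of this hunk:

# Assumed config shape, for illustration only:
# configs["embedder"] = {
#     "model_client": OpenAIClient,                        # a class, not an instance
#     "initialize_kwargs": {"api_key": "..."},             # optional
#     "model_kwargs": {"model": "text-embedding-3-small"},
# }
embedder = get_embedder()
output = embedder(input=["def main():", "# a docstring"])  # adalflow Embedder accepts a string or list of strings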