bioguider 0.2.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of bioguider might be problematic. Click here for more details.

Files changed (47) hide show
  1. bioguider/__init__.py +0 -0
  2. bioguider/agents/__init__.py +0 -0
  3. bioguider/agents/agent_task.py +88 -0
  4. bioguider/agents/agent_tools.py +147 -0
  5. bioguider/agents/agent_utils.py +357 -0
  6. bioguider/agents/collection_execute_step.py +180 -0
  7. bioguider/agents/collection_observe_step.py +113 -0
  8. bioguider/agents/collection_plan_step.py +154 -0
  9. bioguider/agents/collection_task.py +179 -0
  10. bioguider/agents/collection_task_utils.py +109 -0
  11. bioguider/agents/common_agent.py +159 -0
  12. bioguider/agents/common_agent_2step.py +126 -0
  13. bioguider/agents/common_step.py +85 -0
  14. bioguider/agents/dockergeneration_execute_step.py +186 -0
  15. bioguider/agents/dockergeneration_observe_step.py +153 -0
  16. bioguider/agents/dockergeneration_plan_step.py +158 -0
  17. bioguider/agents/dockergeneration_task.py +158 -0
  18. bioguider/agents/dockergeneration_task_utils.py +220 -0
  19. bioguider/agents/evaluation_task.py +269 -0
  20. bioguider/agents/identification_execute_step.py +179 -0
  21. bioguider/agents/identification_observe_step.py +92 -0
  22. bioguider/agents/identification_plan_step.py +135 -0
  23. bioguider/agents/identification_task.py +220 -0
  24. bioguider/agents/identification_task_utils.py +18 -0
  25. bioguider/agents/peo_common_step.py +64 -0
  26. bioguider/agents/prompt_utils.py +190 -0
  27. bioguider/agents/python_ast_repl_tool.py +69 -0
  28. bioguider/agents/rag_collection_task.py +130 -0
  29. bioguider/conversation.py +67 -0
  30. bioguider/database/summarized_file_db.py +140 -0
  31. bioguider/managers/evaluation_manager.py +108 -0
  32. bioguider/rag/__init__.py +0 -0
  33. bioguider/rag/config.py +117 -0
  34. bioguider/rag/data_pipeline.py +648 -0
  35. bioguider/rag/embedder.py +24 -0
  36. bioguider/rag/rag.py +134 -0
  37. bioguider/settings.py +103 -0
  38. bioguider/utils/constants.py +40 -0
  39. bioguider/utils/default.gitignore +140 -0
  40. bioguider/utils/file_utils.py +126 -0
  41. bioguider/utils/gitignore_checker.py +175 -0
  42. bioguider/utils/pyphen_utils.py +73 -0
  43. bioguider/utils/utils.py +27 -0
  44. bioguider-0.2.3.dist-info/LICENSE +21 -0
  45. bioguider-0.2.3.dist-info/METADATA +44 -0
  46. bioguider-0.2.3.dist-info/RECORD +47 -0
  47. bioguider-0.2.3.dist-info/WHEEL +4 -0
bioguider/rag/rag.py ADDED
@@ -0,0 +1,134 @@
1
+ import os
2
+ from typing import Any, List, Tuple, Optional, Dict
3
+ from uuid import uuid4
4
+ import logging
5
+ import re
6
+ import adalflow as adal
7
+ from adalflow.core.types import (
8
+ Conversation,
9
+ DialogTurn,
10
+ UserQuery,
11
+ AssistantResponse,
12
+ )
13
+ from adalflow.components.retriever.faiss_retriever import FAISSRetriever
14
+ from adalflow.components.model_client.openai_client import OpenAIClient
15
+ from adalflow.components.model_client.azureai_client import AzureAIClient
16
+ from .config import configs, create_model_client, create_model_kwargs
17
+ from .data_pipeline import DatabaseManager
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
+ # Maximum token limit for embedding models
22
+ MAX_INPUT_TOKENS = 7500 # Safe threshold below 8192 token limit
23
+
24
class RAG(adal.Component):
    """RAG over a single repository.

    To load a new repository, call ``prepare_retriever(repo_url_or_path)``
    first; the query methods operate on whatever repo was prepared last.
    """

    def __init__(self, use_s3: bool = False):
        """
        Initialize the RAG component.

        Args:
            use_s3: Whether to use S3 for database storage (default: False).
                NOTE(review): not referenced anywhere in this class — confirm
                whether it is still needed.
        """
        super().__init__()

        # One embedder shared by both the doc and the code retrievers.
        self.embedder = adal.Embedder(
            model_client=create_model_client(),
            model_kwargs=create_model_kwargs(),
        )

        self.initialize_db_manager()

    @property
    def repo_dir(self):
        """Directory of the loaded repository, or ``None`` if none is loaded."""
        if self.db_manager:
            return self.db_manager.repo_dir
        return None

    def initialize_db_manager(self):
        """Initialize the database manager with local storage and clear caches."""
        self.db_manager = DatabaseManager()
        self.transformed_doc_documents = []
        self.transformed_code_documents = []

    def _build_retriever(self, documents) -> FAISSRetriever:
        """Build a FAISS retriever over *documents* with the shared embedder/config."""
        return FAISSRetriever(
            **configs["retriever"],
            embedder=self.embedder,
            documents=documents,
            document_map_func=lambda doc: doc.vector,
            dimensions=256,
        )

    def prepare_retriever(self, repo_url_or_path: str, access_token: Optional[str] = None):
        """
        Prepare the retrievers for a repository.
        Will load database from local storage if available.

        Args:
            repo_url_or_path: URL or local path to the repository
            access_token: Optional access token for private repositories
        """
        self.initialize_db_manager()
        self.repo_url_or_path = repo_url_or_path
        self.transformed_doc_documents, self.transformed_code_documents \
            = self.db_manager.prepare_database(repo_url_or_path, access_token)
        logger.info(f"Loaded {len(self.transformed_doc_documents)} doc documents for retrieval")
        logger.info(f"Loaded {len(self.transformed_code_documents)} code documents for retrieval")
        # Doc and code retrievers are configured identically; only the corpus differs.
        self.doc_retriever = self._build_retriever(self.transformed_doc_documents)
        self.code_retriever = self._build_retriever(self.transformed_code_documents)

    def query_doc(self, query: str) -> List:
        """
        Process a query using RAG.

        Args:
            query: The user's query

        Returns:
            retrieved_documents: List of retriever outputs; the first entry's
            ``documents`` field is populated with the matching doc documents.
        """
        retrieved_documents = self.doc_retriever(query)
        # The retriever returns indices only; resolve them to actual documents.
        retrieved_documents[0].documents = [
            self.transformed_doc_documents[doc_index]
            for doc_index in retrieved_documents[0].doc_indices
        ]
        return retrieved_documents

    def query_code(self, query: str) -> List:
        """
        Process a code query using RAG.

        Args:
            query: The user's code query

        Returns:
            retrieved_documents: List of retriever outputs; the first entry's
            ``documents`` field is populated with the matching code documents.

        Raises:
            Exception: Re-raises any retrieval failure after logging it.
        """
        try:
            retrieved_documents = self.code_retriever(query)
            # Resolve returned indices to the actual code documents.
            retrieved_documents[0].documents = [
                self.transformed_code_documents[doc_index]
                for doc_index in retrieved_documents[0].doc_indices
            ]
        except Exception:
            # logger.exception records the traceback; bare `raise` preserves it.
            logger.exception("code retrieval failed for query: %r", query)
            raise
        return retrieved_documents

    @property
    def save_repo_dir(self) -> str:
        """
        Get the directory where the repository is saved.

        Returns:
            str: The path to the repository directory
        """
        return self.db_manager.repo_paths["save_repo_dir"]
bioguider/settings.py ADDED
@@ -0,0 +1,103 @@
1
+ from enum import StrEnum
2
+ from typing import Optional
3
+
4
+ from iso639 import Language, LanguageNotFoundError
5
+ from pydantic import (
6
+ DirectoryPath,
7
+ Field,
8
+ HttpUrl,
9
+ PositiveFloat,
10
+ PositiveInt,
11
+ SecretStr,
12
+ field_validator,
13
+ )
14
+ from pydantic_settings import BaseSettings
15
+ from pathlib import Path
16
+
17
class ProjectSettings(BaseSettings):
    """Settings describing the target repository and its documentation layout."""

    target_repo: DirectoryPath = ""  # type: ignore
    hierarchy_name: str = ".project_doc_record"
    markdown_docs_name: str = "markdown_docs"
    ignore_list: list[str] = []
    language: str = "English"
    max_thread_count: PositiveInt = 4

    @field_validator("language")
    @classmethod
    def validate_language_code(cls, v: str) -> str:
        """Resolve *v* (an ISO 639 code or language name) to a canonical name.

        Raises:
            ValueError: If *v* is not a recognized ISO 639 code or language name.
        """
        try:
            language_name = Language.match(v).name
            return language_name  # Returning the resolved language name
        except LanguageNotFoundError:
            raise ValueError(
                "Invalid language input. Please enter a valid ISO 639 code or language name."
            )
35
+
36
class ChatCompletionSettings(BaseSettings):
    """Settings for the OpenAI-compatible chat-completion client."""

    model: str = "gpt-4o-mini"  # NOTE: No model restrictions for user flexibility, but it's recommended to use models with a larger context window.
    temperature: PositiveFloat = 0.2
    request_timeout: PositiveInt = 60
    openai_base_url: str = "https://api.openai.com/v1"
    # Excluded from serialization so the secret never leaks into dumps.
    openai_api_key: SecretStr = Field("", exclude=True)

    @field_validator("openai_base_url", mode="before")
    @classmethod
    def convert_base_url_to_str(cls, openai_base_url: HttpUrl) -> str:
        """Coerce the configured base URL to a plain string before validation."""
        return str(openai_base_url)
47
+
48
+
49
class Setting(BaseSettings):
    """Aggregate of all application settings."""

    # Empty dicts let pydantic build each sub-model from its own field defaults.
    project: ProjectSettings = {}  # type: ignore
    chat_completion: ChatCompletionSettings = {}  # type: ignore
52
+
53
+
54
class SettingsManager:
    """Process-wide holder for the lazily constructed ``Setting`` singleton."""

    # Private class attribute, initially None.
    _setting_instance: Optional[Setting] = None

    @classmethod
    def get_setting(cls):
        """Return the shared ``Setting``, creating it on first access."""
        if cls._setting_instance is None:
            cls._setting_instance = Setting()
        return cls._setting_instance

    @classmethod
    def initialize_with_params(
        cls,
        target_repo: Path,
        markdown_docs_name: str,
        hierarchy_name: str,
        ignore_list: list[str],
        language: str,
        max_thread_count: int,
        model: str,
        temperature: float,
        request_timeout: int,
        openai_base_url: str,
    ):
        """Replace the singleton with a ``Setting`` built from explicit values."""
        cls._setting_instance = Setting(
            project=ProjectSettings(
                target_repo=target_repo,
                hierarchy_name=hierarchy_name,
                markdown_docs_name=markdown_docs_name,
                ignore_list=ignore_list,
                language=language,
                max_thread_count=max_thread_count,
            ),
            chat_completion=ChatCompletionSettings(
                model=model,
                temperature=temperature,
                request_timeout=request_timeout,
                openai_base_url=openai_base_url,
            ),
        )
99
+
100
+
101
if __name__ == "__main__":
    # Manual smoke test: build the default settings and print them as a dict.
    setting = SettingsManager.get_setting()
    print(setting.model_dump())
@@ -0,0 +1,40 @@
1
+
2
+ from enum import Enum
3
+ from typing import Optional
4
+
5
# Zeroed token-usage record; copy it before mutating to avoid shared state.
DEFAULT_TOKEN_USAGE = {
    "total_tokens": 0,
    "completion_tokens": 0,
    "prompt_tokens": 0,
}

class ProjectTypeEnum(Enum):
    """High-level category of a project."""
    application = "application"
    package = "package"
    pipeline = "pipeline"
    unknown = "unknown type"

class PrimaryLanguageEnum(Enum):
    """Primary implementation language of a project."""
    python = "python"
    R = "R"
    unknown = "unknown type"

class ProjectMetadata:
    """Lightweight container describing a source repository.

    Attributes:
        url: Repository URL.
        project_type: Category of the project.
        primary_language: Main implementation language.
        repo_name: Repository name, if known.
        owner: Owning user or organization, if known.
        description: Short free-text description, if known.
        license: License identifier or text, if known.
    """

    def __init__(
        self,
        url: str,
        project_type: ProjectTypeEnum,
        primary_language: PrimaryLanguageEnum,
        # repo_name was annotated `str` with a None default; Optional is correct.
        repo_name: Optional[str] = None,
        owner: Optional[str] = None,
        description: Optional[str] = None,
        license: Optional[str] = None,
    ):
        self.url = url
        self.project_type = project_type
        self.primary_language = primary_language
        self.repo_name = repo_name
        self.owner = owner
        self.description = description
        self.license = license
40
+
@@ -0,0 +1,140 @@
1
+ # History files
2
+ .Rhistory
3
+ .Rapp.history
4
+
5
+ # R session data
6
+ .RData
7
+ .RDataTmp
8
+
9
+ # User-specific files
10
+ .Rproj.user/
11
+ .Ruserdata/
12
+
13
+ # RStudio files
14
+ .rsituational.user
15
+ .Renviron
16
+ *proj.user
17
+
18
+ # Example and temporary files
19
+ *.Rhistory
20
+ *.RData
21
+ *.RData~
22
+ *.Rproj.user
23
+ *~
24
+ # macOS specific
+ .DS_Store
25
+ # Windows specific
+ Thumbs.db
26
+
27
+ # Knitr and R Markdown cache and intermediate files
28
+ *_cache/
29
+ /cache/
30
+ *.utf8.md
31
+ *.knit.md
32
+ *.log
33
+ *.ind
34
+ *.aux
35
+ *.synctex.gz
36
+ *.toc
37
+ *.out
38
+ *.nav
39
+ *.snm
40
+
41
+ # Package and dependency management
42
+ # renv, packrat, etc.
43
+ renv/
44
+ packrat/
45
+ .renv/
46
+ .Rprofile
47
+
48
+ # Compiled code from packages like Rcpp
49
+ /src/*.so
50
+ /src/*.o
51
+ /src/*.dll
52
+
53
+ # Shiny app logs
54
+ *.log
55
+ *.log.*
56
+
57
+
58
+ # Byte-compiled / optimized / C files
59
+ __pycache__/
60
+ *.py[cod]
61
+ *$py.class
62
+
63
+ # C extensions
64
+ *.so
65
+
66
+ # Distribution / packaging
67
+ .Python
68
+ build/
69
+ develop-eggs/
70
+ dist/
71
+ downloads/
72
+ eggs/
73
+ .eggs/
74
+ lib/
75
+ lib64/
76
+ parts/
77
+ sdist/
78
+ var/
79
+ wheels/
80
+ *.egg-info/
81
+ .installed.cfg
82
+ *.egg
83
+ MANIFEST
84
+
85
+ # PyInstaller
86
+ # Usually these files are in 'dist' folder, but they may appear in the root directory
87
+ *.manifest
88
+ *.spec
89
+
90
+ # Installer logs
91
+ pip-log.txt
92
+ pip-delete-this-directory.txt
93
+
94
+ # Virtual Environments
95
+ .env
96
+ .venv
97
+ env/
98
+ venv/
99
+ ENV/
100
+ env.bak/
101
+ venv.bak/
102
+
103
+ # IDE and editor configuration files
104
+ # VSCode
105
+ .vscode/
106
+ # PyCharm
107
+ .idea/
108
+ # Sublime Text
109
+ *.sublime-project
110
+ *.sublime-workspace
111
+ # Atom
112
+ .atom/
113
+ # Jupyter Notebook
114
+ .ipynb_checkpoints
115
+
116
+ # Testing
117
+ .pytest_cache/
118
+ .coverage
119
+ .coverage.*
120
+ htmlcov/
121
+ nosetests.xml
122
+ coverage.xml
123
+ *.cover
124
+ .hypothesis/
125
+ .tox/
126
+
127
+ # Scrapy stuff
128
+ .scrapy
129
+
130
+ # Sphinx documentation
131
+ docs/_build/
132
+
133
+ # OS-generated files
134
+ .DS_Store
135
+ .DS_Store?
136
+ ._*
137
+ .Spotlight-V100
138
+ .Trashes
139
+ ehthumbs.db
140
+ Thumbs.db
@@ -0,0 +1,126 @@
1
+ import os
2
+ from enum import Enum
3
+ import json
4
+ # from adalflow.utils import get_adalflow_default_root_path
5
+ from pathlib import Path
6
+
7
+ class FileType(Enum):
8
+ unknown = "u"
9
+ file = "f"
10
+ directory = "d"
11
+ symlink = "l"
12
+ broken_symlink = "broken symlink"
13
+
14
+ def get_file_type(file_path: str) -> FileType:
15
+ """
16
+ Get the file type of a given file path.
17
+
18
+ Args:
19
+ file_path (str): The path to the file or directory.
20
+
21
+ Returns:
22
+ FileType: The type of the file (file, directory, or symlink).
23
+ """
24
+ if os.path.isfile(file_path):
25
+ return FileType.file
26
+ elif os.path.isdir(file_path):
27
+ return FileType.directory
28
+ elif os.path.islink(file_path):
29
+ try:
30
+ os.stat(file_path)
31
+ return FileType.symlink
32
+ except FileNotFoundError:
33
+ return FileType.broken_symlink
34
+ except Exception:
35
+ return FileType.unknown
36
+ else:
37
+ # raise ValueError(f"Unknown file type for path: {file_path}")
38
+ return FileType.unknown
39
+
40
def remove_output_cells(notebook_path: str) -> str:
    """
    Strip a Jupyter notebook for indexing: drop all markdown cells and clear
    outputs / execution counts of code cells to reduce its size.

    Args:
        notebook_path (str): Path to the input Jupyter notebook file.

    Returns:
        str: The stripped notebook serialized as a JSON string.
    """
    with open(notebook_path, 'r', encoding='utf-8') as nb_file:
        notebook = json.load(nb_file)

    # Markdown cells carry no executable content; drop them entirely.
    notebook['cells'] = [
        cell for cell in notebook.get('cells', [])
        if cell.get('cell_type') != 'markdown'
    ]
    # Outputs and execution counts are the bulk of a saved notebook's size.
    for cell in notebook['cells']:
        if cell.get('cell_type') == 'code':
            cell['outputs'] = []
            cell['execution_count'] = None

    return json.dumps(notebook)
62
+
63
def extract_code_from_notebook(notebook_path: str) -> str:
    """
    Extract all code from a Jupyter notebook.

    Args:
        notebook_path (str): Path to the input Jupyter notebook file.

    Returns:
        str: All code cells joined by blank lines, with double newlines
        inside each cell collapsed to single newlines.
    """
    with open(notebook_path, 'r', encoding='utf-8') as nb_file:
        notebook = json.load(nb_file)

    snippets = []
    for cell in notebook.get('cells', []):
        if cell.get('cell_type') != 'code':
            continue
        # Join the cell's source lines, then collapse doubled newlines
        # (sources often carry their own trailing '\n').
        text = '\n'.join(cell['source'])
        snippets.append(text.replace("\n\n", "\n"))

    # One blank line between cells.
    return '\n\n'.join(snippets)
87
+
88
+ def parse_repo_url(url: str) -> tuple[str | None, str | None]:
89
+ """
90
+ Parses a git repository URL to extract the author/organization and repository name.
91
+
92
+ Args:
93
+ url: The repository URL (e.g., HTTPS or SSH).
94
+
95
+ Returns:
96
+ A tuple containing (author_or_org, repo_name), or (None, None) if parsing fails.
97
+ """
98
+ try:
99
+ # Handle SSH format first (e.g., git@github.com:user/repo.git)
100
+ if '@' in url and ':' in url:
101
+ path_part = url.split(':')[-1]
102
+ # Handle HTTPS format (e.g., https://github.com/user/repo.git)
103
+ else:
104
+ path_part = url.split('://')[-1].split('/', 1)[-1]
105
+
106
+ # Clean up the path
107
+ if path_part.endswith('.git'):
108
+ path_part = path_part[:-4]
109
+
110
+ parts = path_part.split('/')
111
+ if len(parts) >= 2:
112
+ author = parts[-2]
113
+ repo_name = parts[-1]
114
+ return author, repo_name
115
+ else:
116
+ return None, None
117
+ except Exception:
118
+ return None, None
119
+
120
def retrieve_data_root_path():
    """Return the data root folder: $DATA_FOLDER if set, else './data'."""
    return os.environ.get("DATA_FOLDER", "./data")
123
+
124
+
125
+
126
+