bioguider 0.2.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of bioguider might be problematic. Click here for more details.
- bioguider/__init__.py +0 -0
- bioguider/agents/__init__.py +0 -0
- bioguider/agents/agent_task.py +88 -0
- bioguider/agents/agent_tools.py +147 -0
- bioguider/agents/agent_utils.py +357 -0
- bioguider/agents/collection_execute_step.py +180 -0
- bioguider/agents/collection_observe_step.py +113 -0
- bioguider/agents/collection_plan_step.py +154 -0
- bioguider/agents/collection_task.py +179 -0
- bioguider/agents/collection_task_utils.py +109 -0
- bioguider/agents/common_agent.py +159 -0
- bioguider/agents/common_agent_2step.py +126 -0
- bioguider/agents/common_step.py +85 -0
- bioguider/agents/dockergeneration_execute_step.py +186 -0
- bioguider/agents/dockergeneration_observe_step.py +153 -0
- bioguider/agents/dockergeneration_plan_step.py +158 -0
- bioguider/agents/dockergeneration_task.py +158 -0
- bioguider/agents/dockergeneration_task_utils.py +220 -0
- bioguider/agents/evaluation_task.py +269 -0
- bioguider/agents/identification_execute_step.py +179 -0
- bioguider/agents/identification_observe_step.py +92 -0
- bioguider/agents/identification_plan_step.py +135 -0
- bioguider/agents/identification_task.py +220 -0
- bioguider/agents/identification_task_utils.py +18 -0
- bioguider/agents/peo_common_step.py +64 -0
- bioguider/agents/prompt_utils.py +190 -0
- bioguider/agents/python_ast_repl_tool.py +69 -0
- bioguider/agents/rag_collection_task.py +130 -0
- bioguider/conversation.py +67 -0
- bioguider/database/summarized_file_db.py +140 -0
- bioguider/managers/evaluation_manager.py +108 -0
- bioguider/rag/__init__.py +0 -0
- bioguider/rag/config.py +117 -0
- bioguider/rag/data_pipeline.py +648 -0
- bioguider/rag/embedder.py +24 -0
- bioguider/rag/rag.py +134 -0
- bioguider/settings.py +103 -0
- bioguider/utils/constants.py +40 -0
- bioguider/utils/default.gitignore +140 -0
- bioguider/utils/file_utils.py +126 -0
- bioguider/utils/gitignore_checker.py +175 -0
- bioguider/utils/pyphen_utils.py +73 -0
- bioguider/utils/utils.py +27 -0
- bioguider-0.2.3.dist-info/LICENSE +21 -0
- bioguider-0.2.3.dist-info/METADATA +44 -0
- bioguider-0.2.3.dist-info/RECORD +47 -0
- bioguider-0.2.3.dist-info/WHEEL +4 -0
bioguider/rag/rag.py
ADDED
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from typing import Any, List, Tuple, Optional, Dict
|
|
3
|
+
from uuid import uuid4
|
|
4
|
+
import logging
|
|
5
|
+
import re
|
|
6
|
+
import adalflow as adal
|
|
7
|
+
from adalflow.core.types import (
|
|
8
|
+
Conversation,
|
|
9
|
+
DialogTurn,
|
|
10
|
+
UserQuery,
|
|
11
|
+
AssistantResponse,
|
|
12
|
+
)
|
|
13
|
+
from adalflow.components.retriever.faiss_retriever import FAISSRetriever
|
|
14
|
+
from adalflow.components.model_client.openai_client import OpenAIClient
|
|
15
|
+
from adalflow.components.model_client.azureai_client import AzureAIClient
|
|
16
|
+
from .config import configs, create_model_client, create_model_kwargs
|
|
17
|
+
from .data_pipeline import DatabaseManager
|
|
18
|
+
|
|
19
|
+
logger = logging.getLogger(__name__)
|
|
20
|
+
|
|
21
|
+
# Maximum token limit for embedding models
# (presumably chosen for the 8192-token cap of common embedding APIs — confirm)
MAX_INPUT_TOKENS = 7500  # Safe threshold below 8192 token limit
|
|
23
|
+
|
|
24
|
+
class RAG(adal.Component):
    """RAG over a single repository.

    To index a different repository, call ``prepare_retriever(repo_url_or_path)``
    first; queries are answered from the documents loaded by that call.
    """

    def __init__(self, use_s3: bool = False):
        """
        Initialize the RAG component.

        Args:
            use_s3: Whether to use S3 for database storage (default: False).
                NOTE(review): not referenced anywhere in this class — confirm
                whether it is still needed.
        """
        super().__init__()

        self.embedder = adal.Embedder(
            model_client=create_model_client(),
            model_kwargs=create_model_kwargs(),
        )

        self.initialize_db_manager()

    @property
    def repo_dir(self):
        """Local directory of the loaded repository, or None if no db manager."""
        if self.db_manager:
            return self.db_manager.repo_dir
        return None

    def initialize_db_manager(self):
        """Initialize the database manager with local storage and clear caches."""
        self.db_manager = DatabaseManager()
        self.transformed_doc_documents = []
        self.transformed_code_documents = []

    def _build_retriever(self, documents):
        """Build a FAISS retriever over *documents*.

        The documents are already embedded, so the map function simply exposes
        the stored vector.
        """
        return FAISSRetriever(
            **configs["retriever"],
            embedder=self.embedder,
            documents=documents,
            document_map_func=lambda doc: doc.vector,
            dimensions=256,
        )

    def prepare_retriever(self, repo_url_or_path: str, access_token: str = None):
        """
        Prepare the retriever for a repository.
        Will load database from local storage if available.

        Args:
            repo_url_or_path: URL or local path to the repository
            access_token: Optional access token for private repositories
        """
        self.initialize_db_manager()
        self.repo_url_or_path = repo_url_or_path
        self.transformed_doc_documents, self.transformed_code_documents = \
            self.db_manager.prepare_database(repo_url_or_path, access_token)
        logger.info(f"Loaded {len(self.transformed_doc_documents)} doc documents for retrieval")
        logger.info(f"Loaded {len(self.transformed_code_documents)} code documents for retrieval")
        # Both retrievers share identical configuration; build them via a helper.
        self.doc_retriever = self._build_retriever(self.transformed_doc_documents)
        self.code_retriever = self._build_retriever(self.transformed_code_documents)

    def _retrieve(self, retriever, documents, query: str) -> List:
        """Run *retriever* on *query* and attach the matched source documents.

        Raises:
            Exception: re-raised (with original traceback) after logging.
        """
        try:
            retrieved_documents = retriever(query)
            # The retriever returns indices only; resolve them to documents.
            retrieved_documents[0].documents = [
                documents[doc_index]
                for doc_index in retrieved_documents[0].doc_indices
            ]
        except Exception as e:
            logger.error(e)
            raise
        return retrieved_documents

    def query_doc(self, query: str) -> List:
        """
        Process a query using RAG.

        Args:
            query: The user's query

        Returns:
            retrieved_documents: List of documents retrieved based on the query
        """
        return self._retrieve(self.doc_retriever, self.transformed_doc_documents, query)

    def query_code(self, query: str) -> List:
        """
        Process a code query using RAG.

        Args:
            query: The user's code query

        Returns:
            retrieved_documents: List of code documents retrieved based on the query
        """
        return self._retrieve(self.code_retriever, self.transformed_code_documents, query)

    @property
    def save_repo_dir(self) -> str:
        """
        Get the directory where the repository is saved.

        Returns:
            str: The path to the repository directory
        """
        return self.db_manager.repo_paths["save_repo_dir"]
|
bioguider/settings.py
ADDED
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
from enum import StrEnum
|
|
2
|
+
from typing import Optional
|
|
3
|
+
|
|
4
|
+
from iso639 import Language, LanguageNotFoundError
|
|
5
|
+
from pydantic import (
|
|
6
|
+
DirectoryPath,
|
|
7
|
+
Field,
|
|
8
|
+
HttpUrl,
|
|
9
|
+
PositiveFloat,
|
|
10
|
+
PositiveInt,
|
|
11
|
+
SecretStr,
|
|
12
|
+
field_validator,
|
|
13
|
+
)
|
|
14
|
+
from pydantic_settings import BaseSettings
|
|
15
|
+
from pathlib import Path
|
|
16
|
+
|
|
17
|
+
class ProjectSettings(BaseSettings):
    """Project-level configuration for the documentation pipeline."""

    target_repo: DirectoryPath = ""  # type: ignore
    hierarchy_name: str = ".project_doc_record"
    markdown_docs_name: str = "markdown_docs"
    ignore_list: list[str] = []
    language: str = "English"
    max_thread_count: PositiveInt = 4

    @field_validator("language")
    @classmethod
    def validate_language_code(cls, v: str) -> str:
        """Resolve *v* (an ISO 639 code or language name) to its canonical name."""
        try:
            # Language.match accepts both codes ("en") and names ("English").
            return Language.match(v).name
        except LanguageNotFoundError:
            raise ValueError(
                "Invalid language input. Please enter a valid ISO 639 code or language name."
            )
|
|
35
|
+
|
|
36
|
+
class ChatCompletionSettings(BaseSettings):
    """Settings for the chat-completion backend."""

    # NOTE: No model restrictions for user flexibility, but it's recommended to
    # use models with a larger context window.
    model: str = "gpt-4o-mini"
    temperature: PositiveFloat = 0.2
    request_timeout: PositiveInt = 60
    openai_base_url: str = "https://api.openai.com/v1"
    openai_api_key: SecretStr = Field("", exclude=True)

    @field_validator("openai_base_url", mode="before")
    @classmethod
    def convert_base_url_to_str(cls, openai_base_url: HttpUrl) -> str:
        """Coerce the incoming base-URL value to a plain string before validation."""
        return str(openai_base_url)
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
class Setting(BaseSettings):
    """Aggregate settings object combining all configuration sections."""
    # NOTE(review): the plain-dict defaults rely on pydantic coercing {} into
    # the section models; defaults are not validated unless validate_default
    # is enabled — confirm this is the intended behavior.
    project: ProjectSettings = {}  # type: ignore
    chat_completion: ChatCompletionSettings = {}  # type: ignore
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
class SettingsManager:
    """Process-wide holder for a singleton :class:`Setting` instance."""

    # Private class attribute; populated lazily on first access.
    _setting_instance: Optional[Setting] = None

    @classmethod
    def get_setting(cls):
        """Return the cached Setting, creating a default one on first call."""
        if cls._setting_instance is None:
            cls._setting_instance = Setting()
        return cls._setting_instance

    @classmethod
    def initialize_with_params(
        cls,
        target_repo: Path,
        markdown_docs_name: str,
        hierarchy_name: str,
        ignore_list: list[str],
        language: str,
        max_thread_count: int,
        model: str,
        temperature: float,
        request_timeout: int,
        openai_base_url: str,
    ):
        """Replace the cached Setting with one built from explicit parameters."""
        cls._setting_instance = Setting(
            project=ProjectSettings(
                target_repo=target_repo,
                hierarchy_name=hierarchy_name,
                markdown_docs_name=markdown_docs_name,
                ignore_list=ignore_list,
                language=language,
                max_thread_count=max_thread_count,
            ),
            chat_completion=ChatCompletionSettings(
                model=model,
                temperature=temperature,
                request_timeout=request_timeout,
                openai_base_url=openai_base_url,
            ),
        )
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
if __name__ == "__main__":
    # Smoke-check: build the default settings from the environment and dump them.
    print(SettingsManager.get_setting().model_dump())
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
|
|
2
|
+
from enum import Enum
|
|
3
|
+
from typing import Optional
|
|
4
|
+
|
|
5
|
+
# Zeroed LLM token-usage record; callers presumably copy this template when
# accumulating usage (TODO confirm it is never mutated in place).
DEFAULT_TOKEN_USAGE = {
    "total_tokens": 0,
    "completion_tokens": 0,
    "prompt_tokens": 0,
}
|
|
10
|
+
|
|
11
|
+
class ProjectTypeEnum(Enum):
    """Kinds of repository the tool can classify."""
    application = "application"
    package = "package"
    pipeline = "pipeline"
    unknown = "unknown type"
|
|
16
|
+
|
|
17
|
+
class PrimaryLanguageEnum(Enum):
    """Primary implementation languages recognized for a repository."""
    python = "python"
    R = "R"
    unknown = "unknown type"
|
|
21
|
+
|
|
22
|
+
class ProjectMetadata:
    """Static metadata describing a source repository.

    Attributes:
        url: Repository URL.
        project_type: Classification of the project (application/package/...).
        primary_language: Main implementation language.
        repo_name: Repository name, if known.
        owner: Owning user or organization, if known.
        description: Free-text description, if known.
        license: License identifier, if known.
    """

    def __init__(
        self,
        url: str,
        project_type: ProjectTypeEnum,
        primary_language: PrimaryLanguageEnum,
        # Fixed annotation: default of None requires Optional[str], not str.
        repo_name: Optional[str] = None,
        owner: Optional[str] = None,
        description: Optional[str] = None,
        license: Optional[str] = None,
    ):
        self.url = url
        self.project_type = project_type
        self.primary_language = primary_language
        self.repo_name = repo_name
        self.owner = owner
        self.description = description
        self.license = license

    def __repr__(self) -> str:
        # Added for debuggability; shows the identifying fields only.
        return (
            f"{type(self).__name__}(url={self.url!r}, "
            f"project_type={self.project_type!r}, "
            f"primary_language={self.primary_language!r}, "
            f"repo_name={self.repo_name!r})"
        )
|
|
40
|
+
|
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
# History files
|
|
2
|
+
.Rhistory
|
|
3
|
+
.Rapp.history
|
|
4
|
+
|
|
5
|
+
# R session data
|
|
6
|
+
.RData
|
|
7
|
+
.RDataTmp
|
|
8
|
+
|
|
9
|
+
# User-specific files
|
|
10
|
+
.Rproj.user/
|
|
11
|
+
.Ruserdata/
|
|
12
|
+
|
|
13
|
+
# RStudio files
|
|
14
|
+
.rsituational.user
|
|
15
|
+
.Renviron
|
|
16
|
+
*proj.user
|
|
17
|
+
|
|
18
|
+
# Example and temporary files
|
|
19
|
+
*.Rhistory
|
|
20
|
+
*.RData
|
|
21
|
+
*.RData~
|
|
22
|
+
*.Rproj.user
|
|
23
|
+
*~
|
|
24
|
+
.DS_Store # macOS specific
|
|
25
|
+
Thumbs.db # Windows specific
|
|
26
|
+
|
|
27
|
+
# Knitr and R Markdown cache and intermediate files
|
|
28
|
+
*_cache/
|
|
29
|
+
/cache/
|
|
30
|
+
*.utf8.md
|
|
31
|
+
*.knit.md
|
|
32
|
+
*.log
|
|
33
|
+
*.ind
|
|
34
|
+
*.aux
|
|
35
|
+
*.synctex.gz
|
|
36
|
+
*.toc
|
|
37
|
+
*.out
|
|
38
|
+
*.nav
|
|
39
|
+
*.snm
|
|
40
|
+
|
|
41
|
+
# Package and dependency management
|
|
42
|
+
# renv, packrat, etc.
|
|
43
|
+
renv/
|
|
44
|
+
packrat/
|
|
45
|
+
.renv/
|
|
46
|
+
.Rprofile
|
|
47
|
+
|
|
48
|
+
# Compiled code from packages like Rcpp
|
|
49
|
+
/src/*.so
|
|
50
|
+
/src/*.o
|
|
51
|
+
/src/*.dll
|
|
52
|
+
|
|
53
|
+
# Shiny app logs
|
|
54
|
+
*.log
|
|
55
|
+
*.log.*
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
# Byte-compiled / optimized / C files
|
|
59
|
+
__pycache__/
|
|
60
|
+
*.py[cod]
|
|
61
|
+
*$py.class
|
|
62
|
+
|
|
63
|
+
# C extensions
|
|
64
|
+
*.so
|
|
65
|
+
|
|
66
|
+
# Distribution / packaging
|
|
67
|
+
.Python
|
|
68
|
+
build/
|
|
69
|
+
develop-eggs/
|
|
70
|
+
dist/
|
|
71
|
+
downloads/
|
|
72
|
+
eggs/
|
|
73
|
+
.eggs/
|
|
74
|
+
lib/
|
|
75
|
+
lib64/
|
|
76
|
+
parts/
|
|
77
|
+
sdist/
|
|
78
|
+
var/
|
|
79
|
+
wheels/
|
|
80
|
+
*.egg-info/
|
|
81
|
+
.installed.cfg
|
|
82
|
+
*.egg
|
|
83
|
+
MANIFEST
|
|
84
|
+
|
|
85
|
+
# PyInstaller
|
|
86
|
+
# Usually these files are in 'dist' folder, but they may appear in the root directory
|
|
87
|
+
*.manifest
|
|
88
|
+
*.spec
|
|
89
|
+
|
|
90
|
+
# Installer logs
|
|
91
|
+
pip-log.txt
|
|
92
|
+
pip-delete-this-directory.txt
|
|
93
|
+
|
|
94
|
+
# Virtual Environments
|
|
95
|
+
.env
|
|
96
|
+
.venv
|
|
97
|
+
env/
|
|
98
|
+
venv/
|
|
99
|
+
ENV/
|
|
100
|
+
env.bak/
|
|
101
|
+
venv.bak/
|
|
102
|
+
|
|
103
|
+
# IDE and editor configuration files
|
|
104
|
+
# VSCode
|
|
105
|
+
.vscode/
|
|
106
|
+
# PyCharm
|
|
107
|
+
.idea/
|
|
108
|
+
# Sublime Text
|
|
109
|
+
*.sublime-project
|
|
110
|
+
*.sublime-workspace
|
|
111
|
+
# Atom
|
|
112
|
+
.atom/
|
|
113
|
+
# Jupyter Notebook
|
|
114
|
+
.ipynb_checkpoints
|
|
115
|
+
|
|
116
|
+
# Testing
|
|
117
|
+
.pytest_cache/
|
|
118
|
+
.coverage
|
|
119
|
+
.coverage.*
|
|
120
|
+
htmlcov/
|
|
121
|
+
nosetests.xml
|
|
122
|
+
coverage.xml
|
|
123
|
+
*.cover
|
|
124
|
+
.hypothesis/
|
|
125
|
+
.tox/
|
|
126
|
+
|
|
127
|
+
# Scrapy stuff
|
|
128
|
+
.scrapy
|
|
129
|
+
|
|
130
|
+
# Sphinx documentation
|
|
131
|
+
docs/_build/
|
|
132
|
+
|
|
133
|
+
# OS-generated files
|
|
134
|
+
.DS_Store
|
|
135
|
+
.DS_Store?
|
|
136
|
+
._*
|
|
137
|
+
.Spotlight-V100
|
|
138
|
+
.Trashes
|
|
139
|
+
ehthumbs.db
|
|
140
|
+
Thumbs.db
|
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from enum import Enum
|
|
3
|
+
import json
|
|
4
|
+
# from adalflow.utils import get_adalflow_default_root_path
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
class FileType(Enum):
    """Classification of a filesystem path."""
    unknown = "u"
    file = "f"
    directory = "d"
    symlink = "l"
    broken_symlink = "broken symlink"


def get_file_type(file_path: str) -> FileType:
    """
    Get the file type of a given file path.

    Symlinks are tested first: ``os.path.isfile``/``isdir`` follow links, so
    checking them before ``islink`` made ``FileType.symlink`` unreachable
    (a link to an existing file was reported as ``file``).

    Args:
        file_path (str): The path to the file or directory.

    Returns:
        FileType: file, directory, symlink, broken_symlink, or unknown.
    """
    if os.path.islink(file_path):
        try:
            os.stat(file_path)  # follows the link; raises if the target is gone
            return FileType.symlink
        except FileNotFoundError:
            return FileType.broken_symlink
        except Exception:
            return FileType.unknown
    if os.path.isfile(file_path):
        return FileType.file
    if os.path.isdir(file_path):
        return FileType.directory
    return FileType.unknown
|
|
39
|
+
|
|
40
|
+
def remove_output_cells(notebook_path: str) -> str:
    """
    Strip a Jupyter notebook down to its code to reduce its size.

    Markdown cells are removed entirely; code cells keep their source but have
    their outputs and execution counts cleared.

    Args:
        notebook_path (str): Path to the input Jupyter notebook file.

    Returns:
        str: The reduced notebook serialized as a JSON string.
    """
    with open(notebook_path, 'r', encoding='utf-8') as nb_file:
        notebook = json.load(nb_file)

    # Drop markdown cells; keep code (and any raw) cells.
    notebook['cells'] = [
        cell for cell in notebook.get('cells', [])
        if cell.get('cell_type') != 'markdown'
    ]
    for cell in notebook['cells']:
        if cell.get('cell_type') == 'code':
            cell['outputs'] = []
            cell['execution_count'] = None

    return json.dumps(notebook)
|
|
62
|
+
|
|
63
|
+
def extract_code_from_notebook(notebook_path: str) -> str:
    """
    Extract all code from a Jupyter notebook.

    Args:
        notebook_path (str): Path to the input Jupyter notebook file.

    Returns:
        str: A concatenated string of all code cells.
    """
    with open(notebook_path, 'r', encoding='utf-8') as nb_file:
        nb = json.load(nb_file)

    snippets = []
    for cell in nb.get('cells', []):
        if cell.get('cell_type') != 'code':
            continue
        # Collapse blank lines left behind by sources that embed newlines.
        snippets.append('\n'.join(cell['source']).replace("\n\n", "\n"))

    # Separate cells from each other with a blank line.
    return '\n\n'.join(snippets)
|
|
87
|
+
|
|
88
|
+
def parse_repo_url(url: str) -> tuple[str | None, str | None]:
|
|
89
|
+
"""
|
|
90
|
+
Parses a git repository URL to extract the author/organization and repository name.
|
|
91
|
+
|
|
92
|
+
Args:
|
|
93
|
+
url: The repository URL (e.g., HTTPS or SSH).
|
|
94
|
+
|
|
95
|
+
Returns:
|
|
96
|
+
A tuple containing (author_or_org, repo_name), or (None, None) if parsing fails.
|
|
97
|
+
"""
|
|
98
|
+
try:
|
|
99
|
+
# Handle SSH format first (e.g., git@github.com:user/repo.git)
|
|
100
|
+
if '@' in url and ':' in url:
|
|
101
|
+
path_part = url.split(':')[-1]
|
|
102
|
+
# Handle HTTPS format (e.g., https://github.com/user/repo.git)
|
|
103
|
+
else:
|
|
104
|
+
path_part = url.split('://')[-1].split('/', 1)[-1]
|
|
105
|
+
|
|
106
|
+
# Clean up the path
|
|
107
|
+
if path_part.endswith('.git'):
|
|
108
|
+
path_part = path_part[:-4]
|
|
109
|
+
|
|
110
|
+
parts = path_part.split('/')
|
|
111
|
+
if len(parts) >= 2:
|
|
112
|
+
author = parts[-2]
|
|
113
|
+
repo_name = parts[-1]
|
|
114
|
+
return author, repo_name
|
|
115
|
+
else:
|
|
116
|
+
return None, None
|
|
117
|
+
except Exception:
|
|
118
|
+
return None, None
|
|
119
|
+
|
|
120
|
+
def retrieve_data_root_path():
    """Return the root data directory, honoring the DATA_FOLDER env var."""
    return os.environ.get("DATA_FOLDER", "./data")
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
|