bioguider-0.2.52-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bioguider/__init__.py +0 -0
- bioguider/agents/__init__.py +0 -0
- bioguider/agents/agent_task.py +92 -0
- bioguider/agents/agent_tools.py +176 -0
- bioguider/agents/agent_utils.py +504 -0
- bioguider/agents/collection_execute_step.py +182 -0
- bioguider/agents/collection_observe_step.py +125 -0
- bioguider/agents/collection_plan_step.py +156 -0
- bioguider/agents/collection_task.py +184 -0
- bioguider/agents/collection_task_utils.py +142 -0
- bioguider/agents/common_agent.py +137 -0
- bioguider/agents/common_agent_2step.py +215 -0
- bioguider/agents/common_conversation.py +61 -0
- bioguider/agents/common_step.py +85 -0
- bioguider/agents/consistency_collection_step.py +102 -0
- bioguider/agents/consistency_evaluation_task.py +57 -0
- bioguider/agents/consistency_evaluation_task_utils.py +14 -0
- bioguider/agents/consistency_observe_step.py +110 -0
- bioguider/agents/consistency_query_step.py +77 -0
- bioguider/agents/dockergeneration_execute_step.py +186 -0
- bioguider/agents/dockergeneration_observe_step.py +154 -0
- bioguider/agents/dockergeneration_plan_step.py +158 -0
- bioguider/agents/dockergeneration_task.py +158 -0
- bioguider/agents/dockergeneration_task_utils.py +220 -0
- bioguider/agents/evaluation_installation_task.py +270 -0
- bioguider/agents/evaluation_readme_task.py +767 -0
- bioguider/agents/evaluation_submission_requirements_task.py +172 -0
- bioguider/agents/evaluation_task.py +206 -0
- bioguider/agents/evaluation_tutorial_task.py +169 -0
- bioguider/agents/evaluation_tutorial_task_prompts.py +187 -0
- bioguider/agents/evaluation_userguide_prompts.py +179 -0
- bioguider/agents/evaluation_userguide_task.py +154 -0
- bioguider/agents/evaluation_utils.py +127 -0
- bioguider/agents/identification_execute_step.py +181 -0
- bioguider/agents/identification_observe_step.py +104 -0
- bioguider/agents/identification_plan_step.py +140 -0
- bioguider/agents/identification_task.py +270 -0
- bioguider/agents/identification_task_utils.py +22 -0
- bioguider/agents/peo_common_step.py +64 -0
- bioguider/agents/prompt_utils.py +253 -0
- bioguider/agents/python_ast_repl_tool.py +69 -0
- bioguider/agents/rag_collection_task.py +130 -0
- bioguider/conversation.py +67 -0
- bioguider/database/code_structure_db.py +500 -0
- bioguider/database/summarized_file_db.py +146 -0
- bioguider/generation/__init__.py +39 -0
- bioguider/generation/benchmark_metrics.py +610 -0
- bioguider/generation/change_planner.py +189 -0
- bioguider/generation/document_renderer.py +157 -0
- bioguider/generation/llm_cleaner.py +67 -0
- bioguider/generation/llm_content_generator.py +1128 -0
- bioguider/generation/llm_injector.py +809 -0
- bioguider/generation/models.py +85 -0
- bioguider/generation/output_manager.py +74 -0
- bioguider/generation/repo_reader.py +37 -0
- bioguider/generation/report_loader.py +166 -0
- bioguider/generation/style_analyzer.py +36 -0
- bioguider/generation/suggestion_extractor.py +436 -0
- bioguider/generation/test_metrics.py +189 -0
- bioguider/managers/benchmark_manager.py +785 -0
- bioguider/managers/evaluation_manager.py +215 -0
- bioguider/managers/generation_manager.py +686 -0
- bioguider/managers/generation_test_manager.py +107 -0
- bioguider/managers/generation_test_manager_v2.py +525 -0
- bioguider/rag/__init__.py +0 -0
- bioguider/rag/config.py +117 -0
- bioguider/rag/data_pipeline.py +651 -0
- bioguider/rag/embedder.py +24 -0
- bioguider/rag/rag.py +138 -0
- bioguider/settings.py +103 -0
- bioguider/utils/code_structure_builder.py +59 -0
- bioguider/utils/constants.py +135 -0
- bioguider/utils/default.gitignore +140 -0
- bioguider/utils/file_utils.py +215 -0
- bioguider/utils/gitignore_checker.py +175 -0
- bioguider/utils/notebook_utils.py +117 -0
- bioguider/utils/pyphen_utils.py +73 -0
- bioguider/utils/python_file_handler.py +65 -0
- bioguider/utils/r_file_handler.py +551 -0
- bioguider/utils/utils.py +163 -0
- bioguider-0.2.52.dist-info/LICENSE +21 -0
- bioguider-0.2.52.dist-info/METADATA +51 -0
- bioguider-0.2.52.dist-info/RECORD +84 -0
- bioguider-0.2.52.dist-info/WHEEL +4 -0
--- /dev/null
+++ bioguider/utils/file_utils.py
@@ -0,0 +1,215 @@
+import os
+from enum import Enum
+import json
+# from adalflow.utils import get_adalflow_default_root_path
+from pathlib import Path
+from typing import Union, List, Optional, Tuple
+
+import os
+import string
+
+try:
+    import magic  # optional: pip install python-magic
+    HAS_MAGIC = True
+except ImportError:
+    HAS_MAGIC = False
+
+class FileType(Enum):
+    unknown = "u"
+    file = "f"
+    directory = "d"
+    symlink = "l"
+    broken_symlink = "broken symlink"
+
+def get_file_type(file_path: str) -> FileType:
+    """
+    Get the file type of a given file path.
+
+    Args:
+        file_path (str): The path to the file or directory.
+
+    Returns:
+        FileType: The type of the file (file, directory, or symlink).
+    """
+    if os.path.isfile(file_path):
+        return FileType.file
+    elif os.path.isdir(file_path):
+        return FileType.directory
+    elif os.path.islink(file_path):
+        try:
+            os.stat(file_path)
+            return FileType.symlink
+        except FileNotFoundError:
+            return FileType.broken_symlink
+        except Exception:
+            return FileType.unknown
+    else:
+        # raise ValueError(f"Unknown file type for path: {file_path}")
+        return FileType.unknown
+
+def remove_output_cells(notebook_path: str) -> str:
+    """
+    Remove markdown cells and code-cell outputs from a Jupyter notebook to reduce its size.
+    Args:
+        notebook_path (str): Path to the input Jupyter notebook file.
+    Returns:
+        str: The modified notebook serialized as a JSON string.
+    """
+    with open(notebook_path, 'r', encoding='utf-8') as nb_file:
+        notebook = json.load(nb_file)
+
+    notebook['cells'] = [
+        cell for cell in notebook.get('cells', [])
+        if cell.get('cell_type') != 'markdown'
+    ]
+    for cell in notebook.get('cells'):
+        if cell.get('cell_type') == 'code':
+            cell['outputs'] = []
+            cell['execution_count'] = None
+
+
+    return json.dumps(notebook)
+
+def extract_code_from_notebook(notebook_path: str) -> str:
+    """
+    Extract all code from a Jupyter notebook.
+
+    Args:
+        notebook_path (str): Path to the input Jupyter notebook file.
+
+    Returns:
+        str: A concatenated string of all code cells.
+    """
+    with open(notebook_path, 'r', encoding='utf-8') as nb_file:
+        notebook = json.load(nb_file)
+
+    # Extract code from cells of type 'code'
+    code_cells = [
+        '\n'.join(cell['source']) for cell in notebook.get('cells', [])
+        if cell.get('cell_type') == 'code'
+    ]
+    code_cells = [
+        cell.replace("\n\n", "\n") for cell in code_cells
+    ]
+
+    # Combine all code cells into a single string
+    return '\n\n'.join(code_cells)
+
+def parse_repo_url(url: str) -> Tuple[Optional[str], Optional[str]]:
+    """
+    Parses a git repository URL to extract the author/organization and repository name.
+
+    Args:
+        url: The repository URL (e.g., HTTPS or SSH).
+
+    Returns:
+        A tuple containing (author_or_org, repo_name), or (None, None) if parsing fails.
+    """
+    try:
+        # Handle SSH format first (e.g., git@github.com:user/repo.git)
+        if '@' in url and ':' in url:
+            path_part = url.split(':')[-1]
+        # Handle HTTPS format (e.g., https://github.com/user/repo.git)
+        else:
+            path_part = url.split('://')[-1].split('/', 1)[-1]
+
+        # Clean up the path
+        if path_part.endswith('.git'):
+            path_part = path_part[:-4]
+
+        parts = path_part.split('/')
+        if len(parts) >= 2:
+            author = parts[-2]
+            repo_name = parts[-1]
+            return author, repo_name
+        else:
+            return None, None
+    except Exception:
+        return None, None
+
+def parse_refined_repo_path(refined_repo_path: str) -> Tuple[Optional[str], Optional[str]]:
+    repo_path = refined_repo_path.split("/")[-1]
+    arr = repo_path.split("_")
+    repo_name = arr[-1] if len(arr) > 1 else repo_path
+    author = arr[0] if len(arr) > 1 else ""
+    return author, repo_name
+
+def retrieve_data_root_path() -> Path:
+    data_folder = os.environ.get("DATA_FOLDER", "./data")
+    root_folder = Path(data_folder, ".adalflow")
+    return root_folder.absolute()
+
+def detect_file_type(filepath, blocksize=2048, use_magic=True):
+    """
+    Detect if a file is text or binary.
+
+    Args:
+        filepath (str): Path to file.
+        blocksize (int): Number of bytes to read for inspection.
+        use_magic (bool): Use python-magic if available.
+
+    Returns:
+        str: "text" or "binary"
+    """
+    # Option 1: Use python-magic if available and requested
+    if use_magic and HAS_MAGIC:
+        try:
+            mime = magic.from_file(filepath, mime=True)
+            if mime and mime.startswith("text/"):
+                return "text"
+            return "binary"
+        except Exception:
+            pass  # fall back to the heuristic
+
+    # Option 2: Heuristic detection
+    with open(filepath, "rb") as f:
+        chunk = f.read(blocksize)
+        if not chunk:  # empty file → treat as text
+            return "text"
+
+        # Null byte check
+        if b"\0" in chunk:
+            return "binary"
+
+        # Check ratio of non-printable characters
+        text_chars = bytearray(string.printable, "ascii")
+        nontext = chunk.translate(None, text_chars)
+        if float(len(nontext)) / len(chunk) > 0.30:
+            return "binary"
+
+    return "text"
+
+def flatten_files(repo_path: Union[str, Path], files: Optional[List[str]]) -> List[str]:
+    """
+    Flatten directories into individual files.
+
+    Args:
+        repo_path (Union[str, Path]): The root path of the repository
+        files (Optional[List[str]]): List of file/directory paths to flatten
+
+    Returns:
+        List[str]: List of individual file paths (directories are expanded to their contents)
+    """
+    if files is None:
+        return []
+
+    flattened = []
+    repo_path = Path(repo_path)
+
+    for file_path in files:
+        full_path = repo_path / file_path
+
+        if full_path.is_dir():
+            # If it's a directory, recursively get all files in it
+            for item in full_path.rglob("*"):
+                if item.is_file():
+                    # Get relative path from repo_path
+                    rel_path = item.relative_to(repo_path)
+                    flattened.append(str(rel_path))
+        elif full_path.is_file():
+            # If it's already a file, just add it
+            flattened.append(file_path)
+        # Skip if path doesn't exist
+
+    return flattened
+
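A quick usage sketch for the helpers above (illustrative only, not part of the package; file and repository paths are hypothetical):

    # Illustrative sketch: exercises the file_utils helpers.
    from bioguider.utils.file_utils import (
        detect_file_type, flatten_files, parse_repo_url,
    )

    # Works for both SSH and HTTPS remotes.
    assert parse_repo_url("git@github.com:user/repo.git") == ("user", "repo")
    assert parse_repo_url("https://github.com/user/repo.git") == ("user", "repo")

    # Uses the null-byte / printable-ratio heuristic when python-magic is
    # unavailable or use_magic=False.
    kind = detect_file_type("README.md", use_magic=False)  # "text" or "binary"

    # Directories in the list are expanded to the files they contain,
    # returned relative to the repository root.
    files = flatten_files("/path/to/repo", ["src", "README.md"])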
--- /dev/null
+++ bioguider/utils/gitignore_checker.py
@@ -0,0 +1,175 @@
+import fnmatch
+import os
+from pathlib import Path
+from typing import Callable
+
+class GitignoreChecker:
+    def __init__(
+        self,
+        directory: str,
+        gitignore_path: str,
+        exclude_dir_patterns: list[str] | None = None,
+        exclude_file_patterns: list[str] | None = None
+    ):
+        """
+        Initialize the GitignoreChecker with a specific directory and the path to a .gitignore file.
+
+        Args:
+            directory (str): The directory to be checked.
+            gitignore_path (str): The path to the .gitignore file.
+        """
+        self.directory = directory
+        self.gitignore_path = gitignore_path
+        self.folder_patterns, self.file_patterns = self._load_gitignore_patterns()
+        self.exclude_dir_patterns = exclude_dir_patterns
+        self.exclude_file_patterns = exclude_file_patterns
+
+    def _load_gitignore_patterns(self) -> tuple:
+        """
+        Load and parse the .gitignore file, then split the patterns into folder and file patterns.
+
+        If the specified .gitignore file is not found, fall back to the default path.
+
+        Returns:
+            tuple: A tuple containing two lists - one for folder patterns and one for file patterns.
+        """
+        try:
+            with open(self.gitignore_path, "r", encoding="utf-8") as file:
+                gitignore_content = file.read()
+        except FileNotFoundError:
+            # Fallback to the default .gitignore path if the specified file is not found
+            default_path = os.path.join(
+                os.path.dirname(__file__), "default.gitignore"
+            )
+            with open(default_path, "r", encoding="utf-8") as file:
+                gitignore_content = file.read()
+
+        patterns = self._parse_gitignore(gitignore_content)
+        return self._split_gitignore_patterns(patterns)
+
+    @staticmethod
+    def _parse_gitignore(gitignore_content: str) -> list:
+        """
+        Parse the .gitignore content and return patterns as a list.
+
+        Args:
+            gitignore_content (str): The content of the .gitignore file.
+
+        Returns:
+            list: A list of patterns extracted from the .gitignore content.
+        """
+        patterns = []
+        for line in gitignore_content.splitlines():
+            line = line.strip()
+            if line and not line.startswith("#"):
+                patterns.append(line)
+        return patterns
+
+    @staticmethod
+    def _split_gitignore_patterns(gitignore_patterns: list) -> tuple:
+        """
+        Split the .gitignore patterns into folder patterns and file patterns.
+
+        Args:
+            gitignore_patterns (list): A list of patterns from the .gitignore file.
+
+        Returns:
+            tuple: Two lists, one for folder patterns and one for file patterns.
+        """
+        folder_patterns = []
+        file_patterns = []
+        for pattern in gitignore_patterns:
+            if pattern.endswith("/"):
+                folder_patterns.append(pattern.rstrip("/"))
+            else:
+                file_patterns.append(pattern)
+        return folder_patterns, file_patterns
+
+    @staticmethod
+    def _is_ignored(path: str, patterns: list, is_dir: bool = False) -> bool:
+        """
+        Check if the given path matches any of the patterns.
+
+        Args:
+            path (str): The path to check.
+            patterns (list): A list of patterns to check against.
+            is_dir (bool): True if the path is a directory, False otherwise.
+
+        Returns:
+            bool: True if the path matches any pattern, False otherwise.
+        """
+        for pattern in patterns:
+            if fnmatch.fnmatch(path, pattern):
+                return True
+            if is_dir and pattern.endswith("/") and fnmatch.fnmatch(path, pattern[:-1]):
+                return True
+        return False
+
+    @staticmethod
+    def _is_ignored_by_default(path: str, is_dir: bool = False) -> bool:
+        return is_dir and path.startswith(".")  # e.g. ".git"
+
+
+    def _is_ignored_by_exclude_file_patterns(self, f: str):
+        if self.exclude_file_patterns is None:
+            return False
+        return self._is_ignored(f, self.exclude_file_patterns)
+
+
+    def check_files_and_folders(
+        self,
+        level=-1,
+        check_file_cb: Callable[[str, str], bool] | None = None
+    ) -> list:
+        """
+        Check all files and folders in the given directory against the split gitignore patterns.
+        Return a list of files that are not ignored.
+        The returned file paths are relative to self.directory.
+
+        Returns:
+            list: A list of paths to files that are not ignored.
+        """
+        not_ignored_files = []
+        root_path = Path(self.directory)
+        for root, dirs, files in os.walk(self.directory):
+            current_root_path = Path(root)
+            current_levels = len(current_root_path.relative_to(root_path).parts)
+            if level >= 0 and current_levels > level:
+                continue
+            dirs[:] = [
+                d
+                for d in dirs
+                if not self._is_ignored(d, self.folder_patterns, is_dir=True)
+                and not self._is_ignored_by_default(d, True)
+            ]
+            if self.exclude_dir_patterns:
+                dirs[:] = [
+                    d
+                    for d in dirs
+                    if not self._is_ignored(d, self.exclude_dir_patterns, is_dir=True)
+                ]
+
+            for file in files:
+                file_path = os.path.join(root, file)
+                relative_path = os.path.relpath(file_path, self.directory)
+                if not self._is_ignored(
+                    file, self.file_patterns
+                ) and not self._is_ignored_by_exclude_file_patterns(file):
+                    if check_file_cb is None:
+                        not_ignored_files.append(relative_path)
+                    else:
+                        if check_file_cb(self.directory, relative_path):
+                            not_ignored_files.append(relative_path)
+
+            if level >= 0 and current_levels == level:
+                not_ignored_files = \
+                    not_ignored_files + \
+                    [os.path.relpath(os.path.join(root, d), self.directory) for d in dirs]
+
+        return not_ignored_files
+
+
+# Example usage:
+# gitignore_checker = GitignoreChecker('path_to_directory', 'path_to_gitignore_file')
+# not_ignored_files = gitignore_checker.check_files_and_folders()
+# print(not_ignored_files)
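The commented example at the end of the file covers the basic call; the sketch below (illustrative only, hypothetical paths) also exercises the check_file_cb hook, which receives (directory, relative_path) and returns whether to keep the file:

    # Illustrative sketch: keep only Python files that survive the ignore rules.
    from bioguider.utils.gitignore_checker import GitignoreChecker

    def keep_python_only(directory: str, relative_path: str) -> bool:
        return relative_path.endswith(".py")

    checker = GitignoreChecker(
        directory="/path/to/repo",
        gitignore_path="/path/to/repo/.gitignore",  # falls back to default.gitignore if missing
        exclude_dir_patterns=["node_modules"],
    )
    py_files = checker.check_files_and_folders(check_file_cb=keep_python_only)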
--- /dev/null
+++ bioguider/utils/notebook_utils.py
@@ -0,0 +1,117 @@
+from __future__ import annotations
+from pathlib import Path
+from typing import Union, Dict, Any, List
+import json
+
+def extract_markdown_from_notebook(
+    ipynb_path: Union[str, Path],
+    out_path: Union[str, Path, None] = None,
+) -> str:
+    """
+    Extract the markdown cells of a Jupyter notebook as a single string, optionally writing it to out_path.
+    """
+    ipynb_path = Path(ipynb_path)
+    if not ipynb_path.exists():
+        raise FileNotFoundError(f"File {ipynb_path} does not exist")
+    try:
+        with ipynb_path.open("r", encoding="utf-8") as f:
+            nb = json.load(f)
+    except json.JSONDecodeError:
+        raise ValueError(f"File {ipynb_path} is not a valid JSON file")
+
+    markdown_txts = [
+        "\n".join(cell.get("source")) if isinstance(cell.get("source"), list) else cell.get("source") for cell in nb.get("cells", [])
+        if cell.get("cell_type") == "markdown"
+    ]
+    text = "\n".join(markdown_txts)
+    if out_path is not None:
+        with open(out_path, "w", encoding="utf-8") as f:
+            f.write(text)
+    return text
+
+def strip_notebook_to_code_and_markdown(
+    ipynb_path: Union[str, Path],
+    out_path: Union[str, Path, None] = None,
+    keep_top_metadata: bool = True,
+) -> Dict[str, Any]:
+    """
+    Load a .ipynb and return a new notebook that:
+    - keeps ONLY 'code' and 'markdown' cells
+    - empties outputs and execution_count for code cells
+    - drops all other cell types (e.g., 'raw')
+    - preserves attachments on markdown cells
+    - optionally preserves top-level metadata (kernelspec, language_info, etc.)
+
+    Parameters
+    ----------
+    ipynb_path : str | Path
+        Path to the input .ipynb file.
+    out_path : str | Path | None, default None
+        If provided, write the cleaned notebook to this path.
+    keep_top_metadata : bool, default True
+        If True, copy top-level metadata as-is (useful for re-running).
+        If False, keep only minimal metadata.
+
+    Returns
+    -------
+    dict
+        The cleaned notebook (nbformat v4-style dict).
+    """
+    ipynb_path = Path(ipynb_path)
+    if not ipynb_path.exists():
+        raise FileNotFoundError(f"File {ipynb_path} does not exist")
+    try:
+        with ipynb_path.open("r", encoding="utf-8") as f:
+            nb = json.load(f)
+    except json.JSONDecodeError:
+        raise ValueError(f"File {ipynb_path} is not a valid JSON file")
+
+    nbformat = nb.get("nbformat", 4)
+    nbformat_minor = nb.get("nbformat_minor", 5)
+
+    def _to_text(src) -> str:
+        # nbformat allows str or list of lines
+        if isinstance(src, list):
+            return "".join(src)
+        return src or ""
+
+    new_cells: List[Dict[str, Any]] = []
+    for cell in nb.get("cells", []):
+        ctype = cell.get("cell_type")
+        if ctype == "markdown":
+            new_cell = {
+                "cell_type": "markdown",
+                "metadata": cell.get("metadata", {}),
+                "source": _to_text(cell.get("source", "")),
+            }
+            if "attachments" in cell:
+                new_cell["attachments"] = cell["attachments"]
+            new_cells.append(new_cell)
+
+        elif ctype == "code":
+            new_cells.append({
+                "cell_type": "code",
+                "metadata": cell.get("metadata", {}),
+                "source": _to_text(cell.get("source", "")),
+                "execution_count": None,  # clear execution count
+                "outputs": [],  # strip ALL outputs
+            })
+
+        # else: drop 'raw' and any other unknown cell types
+
+    # Build new notebook object
+    new_nb: Dict[str, Any] = {
+        "nbformat": nbformat,
+        "nbformat_minor": nbformat_minor,
+        "metadata": nb.get("metadata", {}) if keep_top_metadata else {},
+        "cells": new_cells,
+    }
+
+    if out_path is not None:
+        out_path = Path(out_path)
+        out_path.parent.mkdir(parents=True, exist_ok=True)
+        with out_path.open("w", encoding="utf-8") as f:
+            json.dump(new_nb, f, ensure_ascii=False, indent=1)
+
+    return new_nb
+
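A short usage sketch for the two notebook helpers (illustrative only; notebook paths are hypothetical):

    # Illustrative sketch: clean a notebook and pull out its markdown.
    from bioguider.utils.notebook_utils import (
        extract_markdown_from_notebook, strip_notebook_to_code_and_markdown,
    )

    # Drops raw cells, clears outputs/execution counts, writes the cleaned copy.
    clean_nb = strip_notebook_to_code_and_markdown(
        "analysis.ipynb", out_path="out/analysis.clean.ipynb"
    )
    print(f"kept {len(clean_nb['cells'])} code/markdown cells")

    # Concatenates all markdown cells into one string.
    md_text = extract_markdown_from_notebook("analysis.ipynb")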
--- /dev/null
+++ bioguider/utils/pyphen_utils.py
@@ -0,0 +1,73 @@
+
+import os
+import re
+import pyphen
+import math
+
+class PyphenReadability:
+    def __init__(self, lang='en'):
+        self.dic = pyphen.Pyphen(lang=lang)
+
+    def count_syllables(self, word):
+        return self.dic.inserted(word).count('-') + 1 if word.isalpha() else 0
+
+    def extract_urls(self, text):
+        """Find all URLs in the text."""
+        url_pattern = r'https?://\S+|www\.\S+'
+        return re.findall(url_pattern, text)
+
+    def remove_urls(self, text):
+        """Remove URLs from text for clean sentence splitting."""
+        url_pattern = r'https?://\S+|www\.\S+'
+        return re.sub(url_pattern, '', text)
+
+    def split_sentences(self, text):
+        """Split into sentences using punctuation."""
+        return re.split(r'[.!?]+', text)
+
+    def split_words(self, text):
+        """Extract words."""
+        return re.findall(r'\b\w+\b', text)
+
+    def is_polysyllabic(self, word):
+        return self.count_syllables(word) >= 3
+
+    def is_complex(self, word):
+        return self.is_polysyllabic(word)
+
+    def readability_metrics(self, text):
+        # Extract and remove URLs
+        urls = self.extract_urls(text)
+        url_count = len(urls)
+        text_without_urls = self.remove_urls(text)
+
+        # Split and count
+        sentences = [s for s in self.split_sentences(text_without_urls) if s.strip()]
+        sentence_count = len(sentences) + url_count
+
+        words = self.split_words(text)  # split_words(text_without_urls)
+        word_count = len(words)
+
+        syllable_count = sum(self.count_syllables(w) for w in words)
+        polysyllables = sum(1 for w in words if self.is_polysyllabic(w))
+        complex_words = sum(1 for w in words if self.is_complex(w))
+
+        # Avoid division by zero
+        words_per_sentence = word_count / sentence_count if sentence_count > 0 else 0
+        syllables_per_word = syllable_count / word_count if word_count > 0 else 0
+        complex_per_word = complex_words / word_count if word_count > 0 else 0
+
+        # Readability formulas
+        flesch_reading_ease = 206.835 - 1.015 * words_per_sentence - 84.6 * syllables_per_word
+        flesch_kincaid_grade = 0.39 * words_per_sentence + 11.8 * syllables_per_word - 15.59
+        gunning_fog_index = 0.4 * (words_per_sentence + 100 * complex_per_word)
+        smog_index = (
+            1.043 * math.sqrt(polysyllables * (30 / sentence_count)) + 3.1291
+            if sentence_count >= 1 else 0
+        )
+
+        return flesch_reading_ease, flesch_kincaid_grade, gunning_fog_index, smog_index,\
+            sentence_count, word_count, syllable_count, polysyllables, complex_words
+
+
+
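readability_metrics returns a 9-tuple: the four scores (Flesch Reading Ease, Flesch-Kincaid grade, Gunning fog, SMOG) followed by five raw counts; note that each URL is counted as its own sentence. A sketch of a call (illustrative only):

    # Illustrative sketch: score a short README-style passage.
    from bioguider.utils.pyphen_utils import PyphenReadability

    readability = PyphenReadability()
    (flesch_reading_ease, flesch_kincaid_grade, gunning_fog_index, smog_index,
     sentence_count, word_count, syllable_count, polysyllables, complex_words) = \
        readability.readability_metrics(
            "BioGuider evaluates repository documentation. "
            "See https://example.org for details."
        )
    print(f"Flesch Reading Ease: {flesch_reading_ease:.1f}, "
          f"sentences: {sentence_count}, words: {word_count}")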
--- /dev/null
+++ bioguider/utils/python_file_handler.py
@@ -0,0 +1,65 @@
+import ast
+import os
+
+class PythonFileHandler:
+    def __init__(self, file_path: str):
+        self.file_path = file_path
+
+    def get_functions_and_classes(self) -> list[tuple]:
+        """
+        Get the functions and classes in a given file.
+        Returns a list of tuples, each containing:
+        1. the function or class name,
+        2. parent name,
+        3. start line number,
+        4. end line number,
+        5. doc string,
+        6. params.
+        """
+        with open(self.file_path, 'r') as f:
+            tree = ast.parse(f.read())
+        functions_and_classes = []
+        for node in tree.body:
+            if isinstance(node, ast.FunctionDef) or isinstance(node, ast.ClassDef):
+                start_lineno = node.lineno
+                end_lineno = self.get_end_lineno(node)
+                doc_string = ast.get_docstring(node)
+                params = (
+                    [arg.arg for arg in node.args.args] if "args" in dir(node) else []
+                )
+                parent = None
+                functions_and_classes.append((node.name, parent, start_lineno, end_lineno, doc_string, params))
+                for child in node.body:
+                    if isinstance(child, ast.FunctionDef):
+                        start_lineno = child.lineno
+                        end_lineno = self.get_end_lineno(child)
+                        doc_string = ast.get_docstring(child)
+                        params = (
+                            [arg.arg for arg in child.args.args] if "args" in dir(child) else []
+                        )
+                        parent = node.name
+                        functions_and_classes.append((child.name, parent, start_lineno, end_lineno, doc_string, params))
+        return functions_and_classes
+
+    def get_imports(self) -> list[str]:
+        pass
+
+    def get_end_lineno(self, node):
+        """
+        Get the end line number of a given node.
+
+        Args:
+            node: The node for which to find the end line number.
+
+        Returns:
+            int: The end line number of the node. Returns -1 if the node does not have a line number.
+        """
+        if not hasattr(node, "lineno"):
+            return -1  # this node has no line number
+
+        end_lineno = node.lineno
+        for child in ast.iter_child_nodes(node):
+            child_end = getattr(child, "end_lineno", None) or self.get_end_lineno(child)
+            if child_end > -1:  # only update when the child has a valid line number
+                end_lineno = max(end_lineno, child_end)
+        return end_lineno