bioguider-0.2.52-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84)
  1. bioguider/__init__.py +0 -0
  2. bioguider/agents/__init__.py +0 -0
  3. bioguider/agents/agent_task.py +92 -0
  4. bioguider/agents/agent_tools.py +176 -0
  5. bioguider/agents/agent_utils.py +504 -0
  6. bioguider/agents/collection_execute_step.py +182 -0
  7. bioguider/agents/collection_observe_step.py +125 -0
  8. bioguider/agents/collection_plan_step.py +156 -0
  9. bioguider/agents/collection_task.py +184 -0
  10. bioguider/agents/collection_task_utils.py +142 -0
  11. bioguider/agents/common_agent.py +137 -0
  12. bioguider/agents/common_agent_2step.py +215 -0
  13. bioguider/agents/common_conversation.py +61 -0
  14. bioguider/agents/common_step.py +85 -0
  15. bioguider/agents/consistency_collection_step.py +102 -0
  16. bioguider/agents/consistency_evaluation_task.py +57 -0
  17. bioguider/agents/consistency_evaluation_task_utils.py +14 -0
  18. bioguider/agents/consistency_observe_step.py +110 -0
  19. bioguider/agents/consistency_query_step.py +77 -0
  20. bioguider/agents/dockergeneration_execute_step.py +186 -0
  21. bioguider/agents/dockergeneration_observe_step.py +154 -0
  22. bioguider/agents/dockergeneration_plan_step.py +158 -0
  23. bioguider/agents/dockergeneration_task.py +158 -0
  24. bioguider/agents/dockergeneration_task_utils.py +220 -0
  25. bioguider/agents/evaluation_installation_task.py +270 -0
  26. bioguider/agents/evaluation_readme_task.py +767 -0
  27. bioguider/agents/evaluation_submission_requirements_task.py +172 -0
  28. bioguider/agents/evaluation_task.py +206 -0
  29. bioguider/agents/evaluation_tutorial_task.py +169 -0
  30. bioguider/agents/evaluation_tutorial_task_prompts.py +187 -0
  31. bioguider/agents/evaluation_userguide_prompts.py +179 -0
  32. bioguider/agents/evaluation_userguide_task.py +154 -0
  33. bioguider/agents/evaluation_utils.py +127 -0
  34. bioguider/agents/identification_execute_step.py +181 -0
  35. bioguider/agents/identification_observe_step.py +104 -0
  36. bioguider/agents/identification_plan_step.py +140 -0
  37. bioguider/agents/identification_task.py +270 -0
  38. bioguider/agents/identification_task_utils.py +22 -0
  39. bioguider/agents/peo_common_step.py +64 -0
  40. bioguider/agents/prompt_utils.py +253 -0
  41. bioguider/agents/python_ast_repl_tool.py +69 -0
  42. bioguider/agents/rag_collection_task.py +130 -0
  43. bioguider/conversation.py +67 -0
  44. bioguider/database/code_structure_db.py +500 -0
  45. bioguider/database/summarized_file_db.py +146 -0
  46. bioguider/generation/__init__.py +39 -0
  47. bioguider/generation/benchmark_metrics.py +610 -0
  48. bioguider/generation/change_planner.py +189 -0
  49. bioguider/generation/document_renderer.py +157 -0
  50. bioguider/generation/llm_cleaner.py +67 -0
  51. bioguider/generation/llm_content_generator.py +1128 -0
  52. bioguider/generation/llm_injector.py +809 -0
  53. bioguider/generation/models.py +85 -0
  54. bioguider/generation/output_manager.py +74 -0
  55. bioguider/generation/repo_reader.py +37 -0
  56. bioguider/generation/report_loader.py +166 -0
  57. bioguider/generation/style_analyzer.py +36 -0
  58. bioguider/generation/suggestion_extractor.py +436 -0
  59. bioguider/generation/test_metrics.py +189 -0
  60. bioguider/managers/benchmark_manager.py +785 -0
  61. bioguider/managers/evaluation_manager.py +215 -0
  62. bioguider/managers/generation_manager.py +686 -0
  63. bioguider/managers/generation_test_manager.py +107 -0
  64. bioguider/managers/generation_test_manager_v2.py +525 -0
  65. bioguider/rag/__init__.py +0 -0
  66. bioguider/rag/config.py +117 -0
  67. bioguider/rag/data_pipeline.py +651 -0
  68. bioguider/rag/embedder.py +24 -0
  69. bioguider/rag/rag.py +138 -0
  70. bioguider/settings.py +103 -0
  71. bioguider/utils/code_structure_builder.py +59 -0
  72. bioguider/utils/constants.py +135 -0
  73. bioguider/utils/default.gitignore +140 -0
  74. bioguider/utils/file_utils.py +215 -0
  75. bioguider/utils/gitignore_checker.py +175 -0
  76. bioguider/utils/notebook_utils.py +117 -0
  77. bioguider/utils/pyphen_utils.py +73 -0
  78. bioguider/utils/python_file_handler.py +65 -0
  79. bioguider/utils/r_file_handler.py +551 -0
  80. bioguider/utils/utils.py +163 -0
  81. bioguider-0.2.52.dist-info/LICENSE +21 -0
  82. bioguider-0.2.52.dist-info/METADATA +51 -0
  83. bioguider-0.2.52.dist-info/RECORD +84 -0
  84. bioguider-0.2.52.dist-info/WHEEL +4 -0
@@ -0,0 +1,215 @@
+ import os
+ from enum import Enum
+ import json
+ # from adalflow.utils import get_adalflow_default_root_path
+ from pathlib import Path
+ from typing import Union, List, Optional, Tuple
+ 
+ import string
+ 
+ try:
+     import magic  # optional: pip install python-magic
+     HAS_MAGIC = True
+ except ImportError:
+     HAS_MAGIC = False
+ 
+ class FileType(Enum):
+     unknown = "u"
+     file = "f"
+     directory = "d"
+     symlink = "l"
+     broken_symlink = "broken symlink"
+ 
+ def get_file_type(file_path: str) -> FileType:
+     """
+     Get the file type of a given file path.
+ 
+     Args:
+         file_path (str): The path to the file or directory.
+ 
+     Returns:
+         FileType: The type of the file (file, directory, symlink, broken symlink, or unknown).
+     """
+     if os.path.isfile(file_path):
+         return FileType.file
+     elif os.path.isdir(file_path):
+         return FileType.directory
+     elif os.path.islink(file_path):
+         try:
+             os.stat(file_path)
+             return FileType.symlink
+         except FileNotFoundError:
+             return FileType.broken_symlink
+         except Exception:
+             return FileType.unknown
+     else:
+         # raise ValueError(f"Unknown file type for path: {file_path}")
+         return FileType.unknown
+ 
+ def remove_output_cells(notebook_path: str) -> str:
+     """
+     Strip a Jupyter notebook to reduce its size: drop markdown cells and clear
+     the outputs and execution counts of code cells.
+ 
+     Args:
+         notebook_path (str): Path to the input Jupyter notebook file.
+ 
+     Returns:
+         str: The stripped notebook serialized as a JSON string.
+     """
+     with open(notebook_path, 'r', encoding='utf-8') as nb_file:
+         notebook = json.load(nb_file)
+ 
+     notebook['cells'] = [
+         cell for cell in notebook.get('cells', [])
+         if cell.get('cell_type') != 'markdown'
+     ]
+     for cell in notebook['cells']:
+         if cell.get('cell_type') == 'code':
+             cell['outputs'] = []
+             cell['execution_count'] = None
+ 
+     return json.dumps(notebook)
+ 
+ def extract_code_from_notebook(notebook_path: str) -> str:
+     """
+     Extract all code from a Jupyter notebook.
+ 
+     Args:
+         notebook_path (str): Path to the input Jupyter notebook file.
+ 
+     Returns:
+         str: A concatenated string of all code cells.
+     """
+     with open(notebook_path, 'r', encoding='utf-8') as nb_file:
+         notebook = json.load(nb_file)
+ 
+     # Extract code from cells of type 'code'
+     code_cells = [
+         '\n'.join(cell['source']) for cell in notebook.get('cells', [])
+         if cell.get('cell_type') == 'code'
+     ]
+     code_cells = [
+         cell.replace("\n\n", "\n") for cell in code_cells
+     ]
+ 
+     # Combine all code cells into a single string
+     return '\n\n'.join(code_cells)
+ 
+ def parse_repo_url(url: str) -> Tuple[Optional[str], Optional[str]]:
+     """
+     Parses a git repository URL to extract the author/organization and repository name.
+ 
+     Args:
+         url: The repository URL (e.g., HTTPS or SSH).
+ 
+     Returns:
+         A tuple containing (author_or_org, repo_name), or (None, None) if parsing fails.
+     """
+     try:
+         # Handle SSH format first (e.g., git@github.com:user/repo.git)
+         if '@' in url and ':' in url:
+             path_part = url.split(':')[-1]
+         # Handle HTTPS format (e.g., https://github.com/user/repo.git)
+         else:
+             path_part = url.split('://')[-1].split('/', 1)[-1]
+ 
+         # Clean up the path
+         if path_part.endswith('.git'):
+             path_part = path_part[:-4]
+ 
+         parts = path_part.split('/')
+         if len(parts) >= 2:
+             author = parts[-2]
+             repo_name = parts[-1]
+             return author, repo_name
+         else:
+             return None, None
+     except Exception:
+         return None, None
+ 
+ def parse_refined_repo_path(refined_repo_path: str) -> Tuple[Optional[str], Optional[str]]:
+     repo_path = refined_repo_path.split("/")[-1]
+     arr = repo_path.split("_")
+     repo_name = arr[-1] if len(arr) > 1 else repo_path
+     author = arr[0] if len(arr) > 1 else ""
+     return author, repo_name
+ 
+ def retrieve_data_root_path() -> Path:
+     data_folder = os.environ.get("DATA_FOLDER", "./data")
+     root_folder = Path(data_folder, ".adalflow")
+     return root_folder.absolute()
+ 
+ def detect_file_type(filepath, blocksize=2048, use_magic=True):
+     """
+     Detect if a file is text or binary.
+ 
+     Args:
+         filepath (str): Path to file.
+         blocksize (int): Number of bytes to read for inspection.
+         use_magic (bool): Use python-magic if available.
+ 
+     Returns:
+         str: "text" or "binary"
+     """
+     # Option 1: Use python-magic if available and requested
+     if use_magic and HAS_MAGIC:
+         try:
+             mime = magic.from_file(filepath, mime=True)
+             if mime and mime.startswith("text/"):
+                 return "text"
+             return "binary"
+         except Exception:
+             pass  # fall back to the heuristic
+ 
+     # Option 2: Heuristic detection
+     with open(filepath, "rb") as f:
+         chunk = f.read(blocksize)
+     if not chunk:  # empty file → treat as text
+         return "text"
+ 
+     # Null byte check
+     if b"\0" in chunk:
+         return "binary"
+ 
+     # Check ratio of non-printable characters
+     text_chars = bytearray(string.printable, "ascii")
+     nontext = chunk.translate(None, text_chars)
+     if float(len(nontext)) / len(chunk) > 0.30:
+         return "binary"
+ 
+     return "text"
+ 
+ def flatten_files(repo_path: Union[str, Path], files: Optional[List[str]]) -> List[str]:
+     """
+     Flatten directories into individual files.
+ 
+     Args:
+         repo_path (Union[str, Path]): The root path of the repository
+         files (Optional[List[str]]): List of file/directory paths to flatten
+ 
+     Returns:
+         List[str]: List of individual file paths (directories are expanded to their contents)
+     """
+     if files is None:
+         return []
+ 
+     flattened = []
+     repo_path = Path(repo_path)
+ 
+     for file_path in files:
+         full_path = repo_path / file_path
+ 
+         if full_path.is_dir():
+             # If it's a directory, recursively get all files in it
+             for item in full_path.rglob("*"):
+                 if item.is_file():
+                     # Get relative path from repo_path
+                     rel_path = item.relative_to(repo_path)
+                     flattened.append(str(rel_path))
+         elif full_path.is_file():
+             # If it's already a file, just add it
+             flattened.append(file_path)
+         # Skip if path doesn't exist
+ 
+     return flattened
+ 
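As orientation for readers of this diff, here is a brief hypothetical usage sketch of the helpers above. The import path (bioguider.utils.file_utils) and the repository URL are assumptions for illustration only; they are not confirmed by the diff itself.

# Hypothetical usage sketch; import path and URL are placeholders.
from pathlib import Path
from bioguider.utils.file_utils import (  # assumed module path
    FileType, get_file_type, detect_file_type, parse_repo_url, flatten_files,
)

author, repo = parse_repo_url("https://github.com/example-org/example-repo.git")
print(author, repo)  # -> example-org example-repo

repo_root = Path(".")
# Expand a mixed list of files and directories, then keep only readable text files.
for rel in flatten_files(repo_root, ["README.md", "docs"]):
    full = str(repo_root / rel)
    if get_file_type(full) is FileType.file and detect_file_type(full) == "text":
        print("text file:", rel)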
@@ -0,0 +1,175 @@
+ import fnmatch
+ import os
+ from pathlib import Path
+ from typing import Callable
+ 
+ class GitignoreChecker:
+     def __init__(
+         self,
+         directory: str,
+         gitignore_path: str,
+         exclude_dir_patterns: list[str] | None = None,
+         exclude_file_patterns: list[str] | None = None
+     ):
+         """
+         Initialize the GitignoreChecker with a specific directory and the path to a .gitignore file.
+ 
+         Args:
+             directory (str): The directory to be checked.
+             gitignore_path (str): The path to the .gitignore file.
+             exclude_dir_patterns (list[str] | None): Extra directory patterns to exclude in addition to the .gitignore rules.
+             exclude_file_patterns (list[str] | None): Extra file patterns to exclude in addition to the .gitignore rules.
+         """
+         self.directory = directory
+         self.gitignore_path = gitignore_path
+         self.folder_patterns, self.file_patterns = self._load_gitignore_patterns()
+         self.exclude_dir_patterns = exclude_dir_patterns
+         self.exclude_file_patterns = exclude_file_patterns
+ 
+     def _load_gitignore_patterns(self) -> tuple:
+         """
+         Load and parse the .gitignore file, then split the patterns into folder and file patterns.
+ 
+         If the specified .gitignore file is not found, fall back to the default path.
+ 
+         Returns:
+             tuple: A tuple containing two lists - one for folder patterns and one for file patterns.
+         """
+         try:
+             with open(self.gitignore_path, "r", encoding="utf-8") as file:
+                 gitignore_content = file.read()
+         except FileNotFoundError:
+             # Fall back to the bundled default .gitignore if the specified file is not found
+             default_path = os.path.join(
+                 os.path.dirname(__file__), "default.gitignore"
+             )
+             with open(default_path, "r", encoding="utf-8") as file:
+                 gitignore_content = file.read()
+ 
+         patterns = self._parse_gitignore(gitignore_content)
+         return self._split_gitignore_patterns(patterns)
+ 
+     @staticmethod
+     def _parse_gitignore(gitignore_content: str) -> list:
+         """
+         Parse the .gitignore content and return patterns as a list.
+ 
+         Args:
+             gitignore_content (str): The content of the .gitignore file.
+ 
+         Returns:
+             list: A list of patterns extracted from the .gitignore content.
+         """
+         patterns = []
+         for line in gitignore_content.splitlines():
+             line = line.strip()
+             if line and not line.startswith("#"):
+                 patterns.append(line)
+         return patterns
+ 
+     @staticmethod
+     def _split_gitignore_patterns(gitignore_patterns: list) -> tuple:
+         """
+         Split the .gitignore patterns into folder patterns and file patterns.
+ 
+         Args:
+             gitignore_patterns (list): A list of patterns from the .gitignore file.
+ 
+         Returns:
+             tuple: Two lists, one for folder patterns and one for file patterns.
+         """
+         folder_patterns = []
+         file_patterns = []
+         for pattern in gitignore_patterns:
+             if pattern.endswith("/"):
+                 folder_patterns.append(pattern.rstrip("/"))
+             else:
+                 file_patterns.append(pattern)
+         return folder_patterns, file_patterns
+ 
+     @staticmethod
+     def _is_ignored(path: str, patterns: list, is_dir: bool = False) -> bool:
+         """
+         Check if the given path matches any of the patterns.
+ 
+         Args:
+             path (str): The path to check.
+             patterns (list): A list of patterns to check against.
+             is_dir (bool): True if the path is a directory, False otherwise.
+ 
+         Returns:
+             bool: True if the path matches any pattern, False otherwise.
+         """
+         for pattern in patterns:
+             if fnmatch.fnmatch(path, pattern):
+                 return True
+             if is_dir and pattern.endswith("/") and fnmatch.fnmatch(path, pattern[:-1]):
+                 return True
+         return False
+ 
+     @staticmethod
+     def _is_ignored_by_default(path: str, is_dir: bool = False) -> bool:
+         # Hidden directories (e.g. ".git") are ignored by default.
+         return is_dir and path.startswith(".")
+ 
+     def _is_ignored_by_exclude_file_patterns(self, f: str) -> bool:
+         if self.exclude_file_patterns is None:
+             return False
+         return self._is_ignored(f, self.exclude_file_patterns)
+ 
+     def check_files_and_folders(
+         self,
+         level=-1,
+         check_file_cb: Callable[[str, str], bool] | None = None
+     ) -> list:
+         """
+         Check all files and folders in the given directory against the split gitignore patterns.
+         Return a list of files that are not ignored.
+         The returned file paths are relative to self.directory.
+ 
+         Args:
+             level (int): Maximum directory depth to descend; -1 (default) means unlimited.
+             check_file_cb (Callable[[str, str], bool] | None): Optional callback
+                 (directory, relative_path) -> bool deciding whether to keep a file.
+ 
+         Returns:
+             list: A list of paths to files that are not ignored.
+         """
+         not_ignored_files = []
+         root_path = Path(self.directory)
+         for root, dirs, files in os.walk(self.directory):
+             current_root_path = Path(root)
+             current_levels = len(current_root_path.relative_to(root_path).parts)
+             if level >= 0 and current_levels > level:
+                 continue
+             dirs[:] = [
+                 d
+                 for d in dirs
+                 if not self._is_ignored(d, self.folder_patterns, is_dir=True)
+                 and not self._is_ignored_by_default(d, True)
+             ]
+             if self.exclude_dir_patterns:
+                 dirs[:] = [
+                     d
+                     for d in dirs
+                     if not self._is_ignored(d, self.exclude_dir_patterns, is_dir=True)
+                 ]
+ 
+             for file in files:
+                 file_path = os.path.join(root, file)
+                 relative_path = os.path.relpath(file_path, self.directory)
+                 if not self._is_ignored(
+                     file, self.file_patterns
+                 ) and not self._is_ignored_by_exclude_file_patterns(file):
+                     if check_file_cb is None:
+                         not_ignored_files.append(relative_path)
+                     else:
+                         if check_file_cb(self.directory, relative_path):
+                             not_ignored_files.append(relative_path)
+ 
+             if level >= 0 and current_levels == level:
+                 not_ignored_files = \
+                     not_ignored_files + \
+                     [os.path.relpath(os.path.join(root, d), self.directory) for d in dirs]
+ 
+         return not_ignored_files
+ 
+ 
+ # Example usage:
+ # gitignore_checker = GitignoreChecker('path_to_directory', 'path_to_gitignore_file')
+ # not_ignored_files = gitignore_checker.check_files_and_folders()
+ # print(not_ignored_files)
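Building on the commented example above, a hedged sketch showing the optional exclude patterns, the depth limit, and the file callback. The import path, directory, and patterns are placeholders, not values taken from this package.

# Illustrative only; paths and patterns are placeholders.
import os
from bioguider.utils.gitignore_checker import GitignoreChecker  # assumed module path

def is_small_file(directory: str, relative_path: str) -> bool:
    # The callback receives (directory, relative_path); keep files under 1 MB.
    return os.path.getsize(os.path.join(directory, relative_path)) < 1_000_000

checker = GitignoreChecker(
    directory="path_to_directory",
    gitignore_path="path_to_directory/.gitignore",
    exclude_dir_patterns=["node_modules", "dist"],
    exclude_file_patterns=["*.lock"],
)
# Walk at most two directory levels; directories at the cut-off level are also listed.
paths = checker.check_files_and_folders(level=2, check_file_cb=is_small_file)
print(paths)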
@@ -0,0 +1,117 @@
+ from __future__ import annotations
+ from pathlib import Path
+ from typing import Union, Dict, Any, List
+ import json
+ 
+ def extract_markdown_from_notebook(
+     ipynb_path: Union[str, Path],
+     out_path: Union[str, Path, None] = None,
+ ) -> str:
+     """
+     Extract the markdown cells from a Jupyter notebook and return them as a
+     single string, optionally writing the result to out_path.
+     """
+     ipynb_path = Path(ipynb_path)
+     if not ipynb_path.exists():
+         raise FileNotFoundError(f"File {ipynb_path} does not exist")
+     try:
+         with ipynb_path.open("r", encoding="utf-8") as f:
+             nb = json.load(f)
+     except json.JSONDecodeError:
+         raise ValueError(f"File {ipynb_path} is not a valid JSON file")
+ 
+     markdown_txts = [
+         "\n".join(cell.get("source")) if isinstance(cell.get("source"), list) else cell.get("source")
+         for cell in nb.get("cells", [])
+         if cell.get("cell_type") == "markdown"
+     ]
+     text = "\n".join(markdown_txts)
+     if out_path is not None:
+         with open(out_path, "w", encoding="utf-8") as f:
+             f.write(text)
+     return text
+ 
+ def strip_notebook_to_code_and_markdown(
+     ipynb_path: Union[str, Path],
+     out_path: Union[str, Path, None] = None,
+     keep_top_metadata: bool = True,
+ ) -> Dict[str, Any]:
+     """
+     Load a .ipynb and return a new notebook that:
+       - keeps ONLY 'code' and 'markdown' cells
+       - empties outputs and execution_count for code cells
+       - drops all other cell types (e.g., 'raw')
+       - preserves attachments on markdown cells
+       - optionally preserves top-level metadata (kernelspec, language_info, etc.)
+ 
+     Parameters
+     ----------
+     ipynb_path : str | Path
+         Path to the input .ipynb file.
+     out_path : str | Path | None, default None
+         If provided, write the cleaned notebook to this path.
+     keep_top_metadata : bool, default True
+         If True, copy top-level metadata as-is (useful for re-running).
+         If False, keep only minimal metadata.
+ 
+     Returns
+     -------
+     dict
+         The cleaned notebook (nbformat v4-style dict).
+     """
+     ipynb_path = Path(ipynb_path)
+     if not ipynb_path.exists():
+         raise FileNotFoundError(f"File {ipynb_path} does not exist")
+     try:
+         with ipynb_path.open("r", encoding="utf-8") as f:
+             nb = json.load(f)
+     except json.JSONDecodeError:
+         raise ValueError(f"File {ipynb_path} is not a valid JSON file")
+ 
+     nbformat = nb.get("nbformat", 4)
+     nbformat_minor = nb.get("nbformat_minor", 5)
+ 
+     def _to_text(src) -> str:
+         # nbformat allows str or list of lines
+         if isinstance(src, list):
+             return "".join(src)
+         return src or ""
+ 
+     new_cells: List[Dict[str, Any]] = []
+     for cell in nb.get("cells", []):
+         ctype = cell.get("cell_type")
+         if ctype == "markdown":
+             new_cell = {
+                 "cell_type": "markdown",
+                 "metadata": cell.get("metadata", {}),
+                 "source": _to_text(cell.get("source", "")),
+             }
+             if "attachments" in cell:
+                 new_cell["attachments"] = cell["attachments"]
+             new_cells.append(new_cell)
+ 
+         elif ctype == "code":
+             new_cells.append({
+                 "cell_type": "code",
+                 "metadata": cell.get("metadata", {}),
+                 "source": _to_text(cell.get("source", "")),
+                 "execution_count": None,  # clear execution count
+                 "outputs": [],            # strip ALL outputs
+             })
+ 
+         # else: drop 'raw' and any other unknown cell types
+ 
+     # Build new notebook object
+     new_nb: Dict[str, Any] = {
+         "nbformat": nbformat,
+         "nbformat_minor": nbformat_minor,
+         "metadata": nb.get("metadata", {}) if keep_top_metadata else {},
+         "cells": new_cells,
+     }
+ 
+     if out_path is not None:
+         out_path = Path(out_path)
+         out_path.parent.mkdir(parents=True, exist_ok=True)
+         with out_path.open("w", encoding="utf-8") as f:
+             json.dump(new_nb, f, ensure_ascii=False, indent=1)
+ 
+     return new_nb
+ 
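A short hedged usage sketch of the two notebook helpers above; the notebook file names and the import path are placeholders assumed for illustration.

# Hedged usage sketch; file names and import path are placeholders.
from bioguider.utils.notebook_utils import (  # assumed module path
    extract_markdown_from_notebook,
    strip_notebook_to_code_and_markdown,
)

# Write the markdown narrative to a side file and also get it back as one string.
md_text = extract_markdown_from_notebook("tutorial.ipynb", out_path="tutorial.md")

# Produce a lean copy of the notebook: code and markdown only, outputs stripped.
clean_nb = strip_notebook_to_code_and_markdown(
    "tutorial.ipynb", out_path="cleaned/tutorial.ipynb", keep_top_metadata=True
)
print(len(clean_nb["cells"]), "cells kept")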
@@ -0,0 +1,73 @@
+ import os
+ import re
+ import pyphen
+ import math
+ 
+ class PyphenReadability:
+     def __init__(self, lang='en'):
+         self.dic = pyphen.Pyphen(lang=lang)
+ 
+     def count_syllables(self, word):
+         return self.dic.inserted(word).count('-') + 1 if word.isalpha() else 0
+ 
+     def extract_urls(self, text):
+         """Find all URLs in the text."""
+         url_pattern = r'https?://\S+|www\.\S+'
+         return re.findall(url_pattern, text)
+ 
+     def remove_urls(self, text):
+         """Remove URLs from text for clean sentence splitting."""
+         url_pattern = r'https?://\S+|www\.\S+'
+         return re.sub(url_pattern, '', text)
+ 
+     def split_sentences(self, text):
+         """Split into sentences using punctuation."""
+         return re.split(r'[.!?]+', text)
+ 
+     def split_words(self, text):
+         """Extract words."""
+         return re.findall(r'\b\w+\b', text)
+ 
+     def is_polysyllabic(self, word):
+         return self.count_syllables(word) >= 3
+ 
+     def is_complex(self, word):
+         return self.is_polysyllabic(word)
+ 
+     def readability_metrics(self, text):
+         # Extract and remove URLs
+         urls = self.extract_urls(text)
+         url_count = len(urls)
+         text_without_urls = self.remove_urls(text)
+ 
+         # Split and count
+         sentences = [s for s in self.split_sentences(text_without_urls) if s.strip()]
+         sentence_count = len(sentences) + url_count
+ 
+         words = self.split_words(text)  # split_words(text_without_urls)
+         word_count = len(words)
+ 
+         syllable_count = sum(self.count_syllables(w) for w in words)
+         polysyllables = sum(1 for w in words if self.is_polysyllabic(w))
+         complex_words = sum(1 for w in words if self.is_complex(w))
+ 
+         # Avoid division by zero
+         words_per_sentence = word_count / sentence_count if sentence_count > 0 else 0
+         syllables_per_word = syllable_count / word_count if word_count > 0 else 0
+         complex_per_word = complex_words / word_count if word_count > 0 else 0
+ 
+         # Readability formulas
+         flesch_reading_ease = 206.835 - 1.015 * words_per_sentence - 84.6 * syllables_per_word
+         flesch_kincaid_grade = 0.39 * words_per_sentence + 11.8 * syllables_per_word - 15.59
+         gunning_fog_index = 0.4 * (words_per_sentence + 100 * complex_per_word)
+         smog_index = (
+             1.043 * math.sqrt(polysyllables * (30 / sentence_count)) + 3.1291
+             if sentence_count >= 1 else 0
+         )
+ 
+         return flesch_reading_ease, flesch_kincaid_grade, gunning_fog_index, smog_index, \
+             sentence_count, word_count, syllable_count, polysyllables, complex_words
+ 
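A hedged sketch of how the readability metrics above can be consumed; the sample text is invented and the import path is assumed (the optional pyphen dependency must be installed).

# Hedged usage sketch; sample text and import path are placeholders.
from bioguider.utils.pyphen_utils import PyphenReadability  # assumed module path

readability = PyphenReadability()
sample = "Install the package with pip. Then run the quick-start tutorial to verify the setup."
(flesch_reading_ease, flesch_kincaid_grade, gunning_fog_index, smog_index,
 sentence_count, word_count, syllable_count, polysyllables, complex_words) = \
    readability.readability_metrics(sample)
print(f"Flesch reading ease: {flesch_reading_ease:.1f}, sentences: {sentence_count}")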
@@ -0,0 +1,65 @@
+ import ast
+ import os
+ 
+ class PythonFileHandler:
+     def __init__(self, file_path: str):
+         self.file_path = file_path
+ 
+     def get_functions_and_classes(self) -> list[tuple]:
+         """
+         Get the functions and classes in a given file.
+         Returns a list of tuples, each containing:
+           1. the function or class name,
+           2. parent name,
+           3. start line number,
+           4. end line number,
+           5. doc string,
+           6. params.
+         """
+         with open(self.file_path, 'r') as f:
+             tree = ast.parse(f.read())
+         functions_and_classes = []
+         for node in tree.body:
+             if isinstance(node, ast.FunctionDef) or isinstance(node, ast.ClassDef):
+                 start_lineno = node.lineno
+                 end_lineno = self.get_end_lineno(node)
+                 doc_string = ast.get_docstring(node)
+                 params = (
+                     [arg.arg for arg in node.args.args] if "args" in dir(node) else []
+                 )
+                 parent = None
+                 functions_and_classes.append((node.name, parent, start_lineno, end_lineno, doc_string, params))
+                 for child in node.body:
+                     if isinstance(child, ast.FunctionDef):
+                         start_lineno = child.lineno
+                         end_lineno = self.get_end_lineno(child)
+                         doc_string = ast.get_docstring(child)
+                         params = (
+                             [arg.arg for arg in child.args.args] if "args" in dir(child) else []
+                         )
+                         parent = node.name
+                         functions_and_classes.append((child.name, parent, start_lineno, end_lineno, doc_string, params))
+         return functions_and_classes
+ 
+     def get_imports(self) -> list[str]:
+         pass
+ 
+     def get_end_lineno(self, node):
+         """
+         Get the end line number of a given node.
+ 
+         Args:
+             node: The node for which to find the end line number.
+ 
+         Returns:
+             int: The end line number of the node. Returns -1 if the node does not have a line number.
+         """
+         if not hasattr(node, "lineno"):
+             return -1  # -1 means this node has no line number
+ 
+         end_lineno = node.lineno
+         for child in ast.iter_child_nodes(node):
+             child_end = getattr(child, "end_lineno", None) or self.get_end_lineno(child)
+             if child_end > -1:  # only update when the child node has a valid line number
+                 end_lineno = max(end_lineno, child_end)
+         return end_lineno
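To close, a hedged sketch of how the handler above can be used to enumerate top-level definitions and class methods; the target file path and the import path are placeholders for illustration.

# Hedged usage sketch; file path and import path are placeholders.
from bioguider.utils.python_file_handler import PythonFileHandler  # assumed module path

handler = PythonFileHandler("bioguider/utils/utils.py")  # any Python source file
for name, parent, start, end, doc, params in handler.get_functions_and_classes():
    owner = f"{parent}." if parent else ""
    print(f"{owner}{name} (lines {start}-{end}) params={params}")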