bioguider 0.2.19__py3-none-any.whl → 0.2.21__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of bioguider might be problematic. Click here for more details.
- bioguider/agents/agent_utils.py +18 -10
- bioguider/agents/collection_execute_step.py +1 -1
- bioguider/agents/collection_observe_step.py +7 -2
- bioguider/agents/collection_task_utils.py +1 -0
- bioguider/agents/common_conversation.py +20 -2
- bioguider/agents/consistency_collection_step.py +100 -0
- bioguider/agents/consistency_evaluation_task.py +56 -0
- bioguider/agents/consistency_evaluation_task_utils.py +13 -0
- bioguider/agents/consistency_observe_step.py +107 -0
- bioguider/agents/consistency_query_step.py +74 -0
- bioguider/agents/evaluation_task.py +2 -2
- bioguider/agents/evaluation_userguide_prompts.py +162 -0
- bioguider/agents/evaluation_userguide_task.py +131 -0
- bioguider/agents/prompt_utils.py +15 -8
- bioguider/database/code_structure_db.py +489 -0
- bioguider/generation/__init__.py +39 -0
- bioguider/generation/change_planner.py +140 -0
- bioguider/generation/document_renderer.py +47 -0
- bioguider/generation/llm_cleaner.py +43 -0
- bioguider/generation/llm_content_generator.py +69 -0
- bioguider/generation/llm_injector.py +270 -0
- bioguider/generation/models.py +77 -0
- bioguider/generation/output_manager.py +54 -0
- bioguider/generation/repo_reader.py +37 -0
- bioguider/generation/report_loader.py +151 -0
- bioguider/generation/style_analyzer.py +36 -0
- bioguider/generation/suggestion_extractor.py +136 -0
- bioguider/generation/test_metrics.py +104 -0
- bioguider/managers/evaluation_manager.py +24 -0
- bioguider/managers/generation_manager.py +160 -0
- bioguider/managers/generation_test_manager.py +74 -0
- bioguider/utils/code_structure_builder.py +47 -0
- bioguider/utils/constants.py +12 -12
- bioguider/utils/python_file_handler.py +65 -0
- bioguider/utils/r_file_handler.py +368 -0
- bioguider/utils/utils.py +34 -1
- {bioguider-0.2.19.dist-info → bioguider-0.2.21.dist-info}/METADATA +1 -1
- bioguider-0.2.21.dist-info/RECORD +77 -0
- bioguider-0.2.19.dist-info/RECORD +0 -51
- {bioguider-0.2.19.dist-info → bioguider-0.2.21.dist-info}/LICENSE +0 -0
- {bioguider-0.2.19.dist-info → bioguider-0.2.21.dist-info}/WHEEL +0 -0
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
import json
|
|
5
|
+
from typing import Tuple
|
|
6
|
+
|
|
7
|
+
from bioguider.generation.llm_injector import LLMErrorInjector
|
|
8
|
+
from bioguider.generation.test_metrics import evaluate_fixes
|
|
9
|
+
from bioguider.managers.generation_manager import DocumentationGenerationManager
|
|
10
|
+
from bioguider.agents.agent_utils import read_file, write_file
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class GenerationTestManager:
|
|
14
|
+
def __init__(self, llm, step_callback):
|
|
15
|
+
self.llm = llm
|
|
16
|
+
self.step_output = step_callback
|
|
17
|
+
|
|
18
|
+
def print_step(self, name: str, out: str | None = None):
|
|
19
|
+
if self.step_output:
|
|
20
|
+
self.step_output(step_name=name, step_output=out)
|
|
21
|
+
|
|
22
|
+
def run_quant_test(self, report_path: str, baseline_repo_path: str, tmp_repo_path: str) -> str:
|
|
23
|
+
self.print_step("QuantTest:LoadBaseline", baseline_repo_path)
|
|
24
|
+
baseline_readme_path = os.path.join(baseline_repo_path, "README.md")
|
|
25
|
+
baseline = read_file(baseline_readme_path) or ""
|
|
26
|
+
|
|
27
|
+
self.print_step("QuantTest:Inject")
|
|
28
|
+
injector = LLMErrorInjector(self.llm)
|
|
29
|
+
corrupted, inj_manifest = injector.inject(baseline, min_per_category=3)
|
|
30
|
+
|
|
31
|
+
# write corrupted into tmp repo path
|
|
32
|
+
os.makedirs(tmp_repo_path, exist_ok=True)
|
|
33
|
+
corrupted_readme_path = os.path.join(tmp_repo_path, "README.md")
|
|
34
|
+
write_file(corrupted_readme_path, corrupted)
|
|
35
|
+
inj_path = os.path.join(tmp_repo_path, "INJECTION_MANIFEST.json")
|
|
36
|
+
with open(inj_path, "w", encoding="utf-8") as fobj:
|
|
37
|
+
json.dump(inj_manifest, fobj, indent=2)
|
|
38
|
+
|
|
39
|
+
self.print_step("QuantTest:Generate")
|
|
40
|
+
gen = DocumentationGenerationManager(self.llm, self.step_output)
|
|
41
|
+
out_dir = gen.run(report_path=report_path, repo_path=tmp_repo_path)
|
|
42
|
+
|
|
43
|
+
# read revised
|
|
44
|
+
revised_readme_path = os.path.join(out_dir, "README.md")
|
|
45
|
+
revised = read_file(revised_readme_path) or ""
|
|
46
|
+
|
|
47
|
+
self.print_step("QuantTest:Evaluate")
|
|
48
|
+
results = evaluate_fixes(baseline, corrupted, revised, inj_manifest)
|
|
49
|
+
# write results
|
|
50
|
+
with open(os.path.join(out_dir, "GEN_TEST_RESULTS.json"), "w", encoding="utf-8") as fobj:
|
|
51
|
+
json.dump(results, fobj, indent=2)
|
|
52
|
+
# simple md report
|
|
53
|
+
lines = ["# Quantifiable Generation Test Report\n"]
|
|
54
|
+
lines.append("## Metrics by Category\n")
|
|
55
|
+
for cat, m in results["per_category"].items():
|
|
56
|
+
lines.append(f"- {cat}: {m}")
|
|
57
|
+
lines.append("\n## Notes\n")
|
|
58
|
+
lines.append("- Three versions saved in this directory: README.original.md, README.corrupted.md, README.md (fixed).")
|
|
59
|
+
with open(os.path.join(out_dir, "GEN_TEST_REPORT.md"), "w", encoding="utf-8") as fobj:
|
|
60
|
+
fobj.write("\n".join(lines))
|
|
61
|
+
# Save versioned files into output dir
|
|
62
|
+
write_file(os.path.join(out_dir, "README.original.md"), baseline)
|
|
63
|
+
write_file(os.path.join(out_dir, "README.corrupted.md"), corrupted)
|
|
64
|
+
# Copy injection manifest
|
|
65
|
+
try:
|
|
66
|
+
with open(inj_path, "r", encoding="utf-8") as fin:
|
|
67
|
+
with open(os.path.join(out_dir, "INJECTION_MANIFEST.json"), "w", encoding="utf-8") as fout:
|
|
68
|
+
fout.write(fin.read())
|
|
69
|
+
except Exception:
|
|
70
|
+
pass
|
|
71
|
+
self.print_step("QuantTest:Done", out_dir)
|
|
72
|
+
return out_dir
|
|
73
|
+
|
|
74
|
+
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
import logging
|
|
3
|
+
|
|
4
|
+
from bioguider.utils.r_file_handler import RFileHandler
|
|
5
|
+
|
|
6
|
+
from .gitignore_checker import GitignoreChecker
|
|
7
|
+
from .python_file_handler import PythonFileHandler
|
|
8
|
+
from ..database.code_structure_db import CodeStructureDb
|
|
9
|
+
|
|
10
|
+
logger = logging.getLogger(__name__)
|
|
11
|
+
|
|
12
|
+
class CodeStructureBuilder:
    """Populate a ``CodeStructureDb`` with the definitions found in a repository.

    Only files ending in ``.py`` or ``.R`` (as reported by the gitignore
    checker) are parsed; everything else is ignored.
    """

    def __init__(
        self,
        repo_path: str,
        gitignore_path: str,
        code_structure_db: CodeStructureDb,
    ):
        """Remember the repository root, the file filter and the target database."""
        self.repo_path = repo_path
        self.gitignore_checker = GitignoreChecker(repo_path, gitignore_path)
        # NOTE(review): this handler is built from the repo root and is not
        # used by build_code_structure, which creates one handler per file.
        self.file_handler = PythonFileHandler(repo_path)
        self.code_structure_db = code_structure_db

    def build_code_structure(self):
        """Parse every ``.py`` / ``.R`` file and insert one row per definition."""
        repo_root = Path(self.repo_path)
        for file in self.gitignore_checker.check_files_and_folders():
            is_python = file.endswith(".py")
            if not (is_python or file.endswith(".R")):
                continue

            logger.info(f"Building code structure for {file}")
            handler_cls = PythonFileHandler if is_python else RFileHandler
            entries = handler_cls(repo_root / file).get_functions_and_classes()

            # fixme: currently, we don't extract reference graph for each function or class
            for name, parent, start_line, end_line, doc_string, params in entries:
                self.code_structure_db.insert_code_structure(
                    name,
                    file,
                    start_line,
                    end_line,
                    parent,
                    doc_string,
                    params,
                )
|
|
46
|
+
|
|
47
|
+
|
bioguider/utils/constants.py
CHANGED
|
@@ -119,15 +119,15 @@ class DemoInstructionsResult(BaseModel):
|
|
|
119
119
|
expected_output_description: Optional[bool] = Field(description="A boolean value. Does it provide the description of expected output?")
|
|
120
120
|
|
|
121
121
|
class EvaluationSubmissionRequirementsResult(BaseModel):
|
|
122
|
-
compiled_standalone_software: bool
|
|
123
|
-
source_code: bool
|
|
124
|
-
demo_dataset: bool
|
|
125
|
-
run_on_data_instruction: bool
|
|
126
|
-
run_on_custom_instruction: bool
|
|
127
|
-
expected_output_description: bool
|
|
128
|
-
complete_readme: bool
|
|
129
|
-
software_dependency: bool
|
|
130
|
-
install_tutorial: bool
|
|
131
|
-
license: bool
|
|
132
|
-
hardware_requirements: bool
|
|
133
|
-
compatible_os: bool
|
|
122
|
+
compiled_standalone_software: bool | None
|
|
123
|
+
source_code: bool | None
|
|
124
|
+
demo_dataset: bool | None
|
|
125
|
+
run_on_data_instruction: bool | None
|
|
126
|
+
run_on_custom_instruction: bool | None
|
|
127
|
+
expected_output_description: bool | None
|
|
128
|
+
complete_readme: bool | None
|
|
129
|
+
software_dependency: bool | None
|
|
130
|
+
install_tutorial: bool | None
|
|
131
|
+
license: bool | None
|
|
132
|
+
hardware_requirements: bool | None
|
|
133
|
+
compatible_os: bool | None
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
import ast
|
|
2
|
+
import os
|
|
3
|
+
|
|
4
|
+
class PythonFileHandler:
    """Extract functions, classes and their direct child functions from a Python file."""

    # Both sync and async definitions are reported as functions, so
    # coroutines are not silently dropped from the result.
    _FUNCTION_NODES = (ast.FunctionDef, ast.AsyncFunctionDef)

    def __init__(self, file_path: str):
        self.file_path = file_path

    def get_functions_and_classes(self) -> list[tuple]:
        """
        Get the functions and classes in a given file.

        Top-level functions (sync or async) and classes are reported, followed
        by the functions defined directly inside each of them.

        Returns a list of tuples, each containing:
        1. the function or class name,
        2. parent name (None for top-level definitions),
        3. start line number,
        4. end line number,
        5. doc string,
        6. params (positional parameter names; empty for classes).
        """
        # Read bytes rather than text so the parser itself honours a BOM or a
        # PEP 263 coding cookie instead of relying on the platform default
        # encoding.
        with open(self.file_path, "rb") as f:
            tree = ast.parse(f.read())

        functions_and_classes = []
        for node in tree.body:
            if not isinstance(node, self._FUNCTION_NODES + (ast.ClassDef,)):
                continue
            functions_and_classes.append(self._describe(node, None))
            for child in node.body:
                if isinstance(child, self._FUNCTION_NODES):
                    functions_and_classes.append(self._describe(child, node.name))
        return functions_and_classes

    def _describe(self, node, parent):
        """Build the 6-tuple record for one function or class node."""
        if isinstance(node, self._FUNCTION_NODES):
            params = [arg.arg for arg in node.args.args]
        else:
            # Classes carry no parameter list.
            params = []
        return (
            node.name,
            parent,
            node.lineno,
            self.get_end_lineno(node),
            ast.get_docstring(node),
            params,
        )

    def get_imports(self) -> list[str]:
        """Not implemented yet; currently returns None."""
        pass

    def get_end_lineno(self, node):
        """
        Get the end line number of a given node.

        Args:
            node: The node for which to find the end line number.

        Returns:
            int: The end line number of the node. Returns -1 if the node does not have a line number.
        """
        if not hasattr(node, "lineno"):
            return -1  # -1 signals that this node carries no line number

        end_lineno = node.lineno
        for child in ast.iter_child_nodes(node):
            # Use the parser-provided end line when present, else recurse.
            child_end = getattr(child, "end_lineno", None) or self.get_end_lineno(child)
            if child_end > -1:  # only update when the child has a valid line number
                end_lineno = max(end_lineno, child_end)
        return end_lineno
|
|
@@ -0,0 +1,368 @@
|
|
|
1
|
+
import re
|
|
2
|
+
import os
|
|
3
|
+
from typing import List, Tuple, Optional
|
|
4
|
+
|
|
5
|
+
class RFileHandler:
    """Line-based extractor for functions, S4 classes and imports in an R file.

    No R parser is used: definitions are recognised with regular expressions
    and their extent is found by counting braces or parentheses.
    """

    def __init__(self, file_path: str):
        self.file_path = file_path

    def get_functions_and_classes(self) -> List[Tuple[str, Optional[str], int, int, Optional[str], List[str]]]:
        """
        Get the functions and S4 classes in a given R file.

        Returns a list of tuples, each containing:
        1. the function or class name,
        2. parent name (None for R, as R doesn't have nested functions in the same way),
        3. start line number (1-based; the first roxygen line when the scan
           reaches the documentation block before the definition),
        4. end line number (1-based),
        5. doc string (roxygen comments),
        6. params (function parameters).
        """
        with open(self.file_path, 'r', encoding='utf-8') as f:
            lines = f.readlines()

        functions_and_classes = []
        i = 0

        while i < len(lines):
            line = lines[i].strip()

            # Skip empty lines and comments (except roxygen)
            if not line or (line.startswith('#') and not self._is_roxygen(line)):
                i += 1
                continue

            # Check for function definitions
            func_match = self._match_function(lines, i)
            if func_match:
                name, start_line, end_line, doc_string, params = func_match
                functions_and_classes.append((name, None, start_line + 1, end_line + 1, doc_string, params))
                i = end_line + 1
                continue

            # Check for S4 class definitions
            class_match = self._match_s4_class(lines, i)
            if class_match:
                name, start_line, end_line, doc_string = class_match
                functions_and_classes.append((name, None, start_line + 1, end_line + 1, doc_string, []))
                i = end_line + 1
                continue

            # Check for S3 class methods (functions with class-specific naming).
            # The general function pattern above also accepts dotted names,
            # so this branch only acts as a fallback.
            s3_match = self._match_s3_method(lines, i)
            if s3_match:
                name, start_line, end_line, doc_string, params = s3_match
                functions_and_classes.append((name, None, start_line + 1, end_line + 1, doc_string, params))
                i = end_line + 1
                continue

            i += 1

        return functions_and_classes

    @staticmethod
    def _is_roxygen(line: str) -> bool:
        """Return True when the line (ignoring indentation) starts with ``#'`` or ``#@``."""
        return line.strip().startswith(("#'", "#@"))

    def _skip_doc_lines(self, lines: List[str], idx: int) -> int:
        """Advance past roxygen and blank lines; return the first other index (or len(lines))."""
        while idx < len(lines) and (self._is_roxygen(lines[idx]) or not lines[idx].strip()):
            idx += 1
        return idx

    def _match_function(self, lines: List[str], start_idx: int) -> Optional[Tuple[str, int, int, Optional[str], List[str]]]:
        """Match function definitions in R code.

        Returns ``(name, start_idx, end_idx, doc_string, params)`` with
        0-based indices, or None when the first non-doc line at or after
        ``start_idx`` is not a ``name <- function(`` / ``name = function(``
        definition.
        """
        # Collect roxygen documentation before function
        doc_string = self._extract_roxygen_doc(lines, start_idx)
        doc_start_idx = start_idx

        # Skip roxygen comments to find function definition
        start_idx = self._skip_doc_lines(lines, start_idx)
        if start_idx >= len(lines):
            return None

        # Pattern for function definition: name <- function(params) or name = function(params)
        func_pattern = r'^(\s*)([a-zA-Z_][a-zA-Z0-9_.\$]*)\s*(<-|=)\s*function\s*\('

        match = re.match(func_pattern, lines[start_idx])
        if not match:
            return None

        func_name = match.group(2)
        indent_level = len(match.group(1))

        # Extract parameters
        params = self._extract_function_params(lines, start_idx)

        # Find the end of the function by tracking braces
        end_idx = self._find_function_end(lines, start_idx, indent_level)

        return func_name, doc_start_idx, end_idx, doc_string, params

    def _match_s4_class(self, lines: List[str], start_idx: int) -> Optional[Tuple[str, int, int, Optional[str]]]:
        """Match S4 class definitions.

        Returns ``(class_name, start_idx, end_idx, doc_string)`` with 0-based
        indices, or None when no ``setClass("Name"`` call is found on the
        first non-doc line.
        """
        doc_string = self._extract_roxygen_doc(lines, start_idx)
        doc_start_idx = start_idx

        # Skip documentation to find class definition
        start_idx = self._skip_doc_lines(lines, start_idx)
        if start_idx >= len(lines):
            return None

        # Pattern for S4 class: setClass("ClassName", ...)
        class_pattern = r'setClass\s*\(\s*["\']([^"\']+)["\']'

        match = re.search(class_pattern, lines[start_idx])
        if not match:
            return None

        class_name = match.group(1)

        # Find the end by tracking parentheses
        end_idx = self._find_parentheses_end(lines, start_idx)

        return class_name, doc_start_idx, end_idx, doc_string

    def _match_s3_method(self, lines: List[str], start_idx: int) -> Optional[Tuple[str, int, int, Optional[str], List[str]]]:
        """Match S3 method definitions (method.class pattern).

        Same return shape as ``_match_function``.
        """
        doc_string = self._extract_roxygen_doc(lines, start_idx)
        doc_start_idx = start_idx

        # Skip documentation
        start_idx = self._skip_doc_lines(lines, start_idx)
        if start_idx >= len(lines):
            return None

        # Pattern for S3 method: method.class <- function(params)
        s3_pattern = r'^(\s*)([a-zA-Z_][a-zA-Z0-9_]*\.[a-zA-Z_][a-zA-Z0-9_]*)\s*(<-|=)\s*function\s*\('

        match = re.match(s3_pattern, lines[start_idx])
        if not match:
            return None

        method_name = match.group(2)
        indent_level = len(match.group(1))

        # Extract parameters
        params = self._extract_function_params(lines, start_idx)

        # Find the end of the function
        end_idx = self._find_function_end(lines, start_idx, indent_level)

        return method_name, doc_start_idx, end_idx, doc_string, params

    def _extract_roxygen_doc(self, lines: List[str], start_idx: int) -> Optional[str]:
        """Extract roxygen2 documentation comments.

        Walks back from ``start_idx`` over roxygen and blank lines, then
        collects every roxygen line forward until the first line that is
        neither roxygen nor blank. Returns the lines joined with newlines and
        the ``#'`` / ``#@`` prefix removed, or None when nothing was found.
        """
        doc_lines = []
        i = start_idx

        # Go backwards to find the start of roxygen comments
        while i > 0 and (self._is_roxygen(lines[i - 1]) or not lines[i - 1].strip()):
            i -= 1

        # Collect roxygen comments
        while i < len(lines):
            line = lines[i].strip()
            if self._is_roxygen(line):
                # Remove the roxygen prefix
                doc_lines.append(re.sub(r'^#[\'@]\s?', '', line))
            elif line:
                # First real code/comment line ends the documentation block.
                break
            i += 1

        return '\n'.join(doc_lines) if doc_lines else None

    def _extract_function_params(self, lines: List[str], start_idx: int) -> List[str]:
        """Extract function parameters from function definition.

        Returns the parameter names without default values; ``...`` is
        omitted. Defaults that contain calls, such as ``y = c(1, 2)``, do not
        cut the list short.
        """
        # Join lines until the parentheses opened on the definition balance,
        # so a multi-line parameter list ends up in a single string.
        func_line_complete = ""
        i = start_idx
        paren_count = 0
        found_opening = False

        while i < len(lines):
            line = lines[i]
            func_line_complete += line

            # Count parentheses to find the complete parameter list
            for char in line:
                if char == '(':
                    paren_count += 1
                    found_opening = True
                elif char == ')':
                    paren_count -= 1

            if found_opening and paren_count == 0:
                break
            i += 1

        header = re.search(r'function\s*\(', func_line_complete)
        if not header:
            return []

        # Take everything up to the matching ')' rather than the first ')',
        # which would truncate the list at the first default containing a call.
        param_str = self._balanced_paren_content(func_line_complete, header.end())
        if param_str is None:
            return []
        param_str = param_str.strip()
        if not param_str:
            return []

        # Split by comma, but be careful with nested parentheses and quotes
        params = self._smart_split_params(param_str)
        # Clean up parameter names (remove default values, whitespace)
        params = [re.split(r'\s*=\s*', param.strip())[0].strip() for param in params]
        return [param for param in params if param and param != '...']

    @staticmethod
    def _balanced_paren_content(text: str, start: int) -> Optional[str]:
        """Return the text from ``start`` up to the ')' closing an already-open '('.

        ``start`` is the index just after the opening parenthesis.
        Parentheses inside single- or double-quoted strings are ignored.
        Returns None when no matching ')' exists.
        """
        depth = 1
        quote_char = None
        escaped = False
        for pos in range(start, len(text)):
            char = text[pos]
            if quote_char:
                if escaped:
                    escaped = False
                elif char == '\\':
                    escaped = True
                elif char == quote_char:
                    quote_char = None
            elif char in ('"', "'"):
                quote_char = char
            elif char == '(':
                depth += 1
            elif char == ')':
                depth -= 1
                if depth == 0:
                    return text[start:pos]
        return None

    def _smart_split_params(self, param_str: str) -> List[str]:
        """Split parameters by comma, handling nested structures.

        Commas inside parentheses or quoted strings do not split.
        """
        params = []
        current_param = ""
        paren_count = 0
        quote_char = None

        for char in param_str:
            if quote_char:
                current_param += char
                # A quote closes the string unless preceded by a backslash.
                if char == quote_char and (len(current_param) == 1 or current_param[-2] != '\\'):
                    quote_char = None
            elif char in ['"', "'"]:
                quote_char = char
                current_param += char
            elif char == '(':
                paren_count += 1
                current_param += char
            elif char == ')':
                paren_count -= 1
                current_param += char
            elif char == ',' and paren_count == 0:
                params.append(current_param.strip())
                current_param = ""
            else:
                current_param += char

        if current_param.strip():
            params.append(current_param.strip())

        return params

    def _find_function_end(self, lines: List[str], start_idx: int, indent_level: int) -> int:
        """Find the end of a function by tracking braces and indentation.

        Returns the 0-based index of the line on which the brace count
        returns to zero. For a brace-less body, returns the line before the
        next assignment whose indentation is at most ``indent_level``. Falls
        back to the last line of the file.
        """
        brace_count = 0
        in_function = False
        i = start_idx

        while i < len(lines):
            line = lines[i]

            # Count braces
            for char in line:
                if char == '{':
                    brace_count += 1
                    in_function = True
                elif char == '}':
                    brace_count -= 1

            # If we've closed all braces, we're at the end
            if in_function and brace_count == 0:
                return i

            # If no braces are used, look for next function or end of file
            if not in_function and i > start_idx:
                stripped = line.strip()
                if stripped and not stripped.startswith('#'):
                    # Check if this looks like a new function or assignment at same/higher level
                    if re.match(r'^(\s*)[a-zA-Z_][a-zA-Z0-9_.\$]*\s*(<-|=)', line):
                        current_indent = len(re.match(r'^(\s*)', line).group(1))
                        if current_indent <= indent_level:
                            return i - 1

            i += 1

        return len(lines) - 1

    def _find_parentheses_end(self, lines: List[str], start_idx: int) -> int:
        """Find the end of a parenthetical expression.

        Returns the index of the first line, starting at ``start_idx``, after
        which the running parenthesis count is zero, or the last line index.
        """
        paren_count = 0
        i = start_idx

        while i < len(lines):
            line = lines[i]
            for char in line:
                if char == '(':
                    paren_count += 1
                elif char == ')':
                    paren_count -= 1
            if paren_count == 0:
                return i
            i += 1

        return len(lines) - 1

    def get_imports(self) -> List[str]:
        """
        Get library imports and source statements in R code.
        Returns a list of library names and sourced files, formatted as
        ``library(pkg)``, ``require(pkg)``, ``source(path)`` and ``pkg::``
        (the latter de-duplicated).
        """
        imports = []

        with open(self.file_path, 'r', encoding='utf-8') as f:
            lines = f.readlines()

        for line in lines:
            line = line.strip()

            # Match library() calls
            lib_match = re.search(r'library\s*\(\s*["\']?([^"\')\s]+)["\']?\s*\)', line)
            if lib_match:
                imports.append(f"library({lib_match.group(1)})")

            # Match require() calls
            req_match = re.search(r'require\s*\(\s*["\']?([^"\')\s]+)["\']?\s*\)', line)
            if req_match:
                imports.append(f"require({req_match.group(1)})")

            # Match source() calls
            src_match = re.search(r'source\s*\(\s*["\']([^"\']+)["\']\s*\)', line)
            if src_match:
                imports.append(f"source({src_match.group(1)})")

            # Match :: namespace calls (just collect unique packages)
            ns_matches = re.findall(r'([a-zA-Z_][a-zA-Z0-9_.]*)::', line)
            for ns in ns_matches:
                ns_import = f"{ns}::"
                if ns_import not in imports:
                    imports.append(ns_import)

        return imports
|
|
350
|
+
|
|
351
|
+
|
|
352
|
+
# Example usage:
if __name__ == "__main__":
    # Analyse a sample R file and print what was found.
    handler = RFileHandler("example.R")

    # Definitions: name, line span and parameters, plus a doc preview.
    found = handler.get_functions_and_classes()
    print("Functions and Classes:")
    for name, parent, start, end, doc, params in found:
        print(f" {name}: lines {start}-{end}, params: {params}")
        if doc:
            print(f" Doc: {doc[:50]}...")

    # library()/require()/source()/pkg:: references.
    imports = handler.get_imports()
    print(f"\nImports: {imports}")
|
bioguider/utils/utils.py
CHANGED
|
@@ -2,6 +2,7 @@ import logging
|
|
|
2
2
|
import re
|
|
3
3
|
import subprocess
|
|
4
4
|
from typing import Optional
|
|
5
|
+
from pydantic import BaseModel
|
|
5
6
|
import tiktoken
|
|
6
7
|
|
|
7
8
|
from bioguider.utils.constants import DEFAULT_TOKEN_USAGE
|
|
@@ -68,4 +69,36 @@ def increase_token_usage(
|
|
|
68
69
|
|
|
69
70
|
return token_usage
|
|
70
71
|
|
|
71
|
-
|
|
72
|
+
def clean_action_input(action_input: str) -> str:
    """Strip wrapping whitespace, backticks and double quotes, then normalise quotes.

    Leading and trailing whitespace, "`" and '"' characters are peeled off
    repeatedly until the text stops changing. Afterwards every remaining
    single quote and backtick is replaced with a double quote.
    """
    previous = None
    current = action_input
    # Each pass only removes characters from the ends, so the loop stops as
    # soon as neither end carries whitespace, a backtick or a double quote.
    while current != previous:
        previous = current
        current = current.strip().strip("`").strip('"').strip()

    return current.replace("'", '"').replace("`", '"')
|
|
90
|
+
|
|
91
|
+
def convert_to_serializable(obj):
    """Convert BaseModel objects to dictionaries for JSON serialization.

    Anything that is a ``BaseModel`` or exposes ``model_dump`` is replaced by
    the result of ``model_dump()``. Dicts are rebuilt with converted values,
    lists and tuples become lists of converted items, and every other object
    is returned unchanged.
    """
    if isinstance(obj, BaseModel) or hasattr(obj, 'model_dump'):
        return obj.model_dump()
    if isinstance(obj, dict):
        return {key: convert_to_serializable(value) for key, value in obj.items()}
    if isinstance(obj, (list, tuple)):
        # Tuples are deliberately flattened to lists, as JSON has no tuple type.
        return [convert_to_serializable(element) for element in obj]
    return obj
|