bioguider 0.2.25__tar.gz → 0.2.27__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of bioguider might be problematic.
- {bioguider-0.2.25 → bioguider-0.2.27}/PKG-INFO +1 -1
- {bioguider-0.2.25 → bioguider-0.2.27}/bioguider/agents/consistency_observe_step.py +2 -2
- {bioguider-0.2.25 → bioguider-0.2.27}/bioguider/agents/evaluation_tutorial_task.py +8 -8
- {bioguider-0.2.25 → bioguider-0.2.27}/bioguider/agents/evaluation_tutorial_task_prompts.py +14 -24
- {bioguider-0.2.25 → bioguider-0.2.27}/bioguider/agents/evaluation_userguide_prompts.py +12 -70
- {bioguider-0.2.25 → bioguider-0.2.27}/bioguider/agents/evaluation_userguide_task.py +4 -4
- {bioguider-0.2.25 → bioguider-0.2.27}/bioguider/generation/llm_content_generator.py +5 -2
- {bioguider-0.2.25 → bioguider-0.2.27}/bioguider/generation/llm_injector.py +74 -2
- {bioguider-0.2.25 → bioguider-0.2.27}/bioguider/generation/models.py +4 -0
- {bioguider-0.2.25 → bioguider-0.2.27}/bioguider/generation/test_metrics.py +85 -0
- {bioguider-0.2.25 → bioguider-0.2.27}/bioguider/managers/generation_manager.py +54 -8
- {bioguider-0.2.25 → bioguider-0.2.27}/bioguider/managers/generation_test_manager.py +41 -8
- {bioguider-0.2.25 → bioguider-0.2.27}/bioguider/utils/utils.py +9 -5
- {bioguider-0.2.25 → bioguider-0.2.27}/pyproject.toml +1 -1
- {bioguider-0.2.25 → bioguider-0.2.27}/LICENSE +0 -0
- {bioguider-0.2.25 → bioguider-0.2.27}/README.md +0 -0
- {bioguider-0.2.25 → bioguider-0.2.27}/bioguider/__init__.py +0 -0
- {bioguider-0.2.25 → bioguider-0.2.27}/bioguider/agents/__init__.py +0 -0
- {bioguider-0.2.25 → bioguider-0.2.27}/bioguider/agents/agent_task.py +0 -0
- {bioguider-0.2.25 → bioguider-0.2.27}/bioguider/agents/agent_tools.py +0 -0
- {bioguider-0.2.25 → bioguider-0.2.27}/bioguider/agents/agent_utils.py +0 -0
- {bioguider-0.2.25 → bioguider-0.2.27}/bioguider/agents/collection_execute_step.py +0 -0
- {bioguider-0.2.25 → bioguider-0.2.27}/bioguider/agents/collection_observe_step.py +0 -0
- {bioguider-0.2.25 → bioguider-0.2.27}/bioguider/agents/collection_plan_step.py +0 -0
- {bioguider-0.2.25 → bioguider-0.2.27}/bioguider/agents/collection_task.py +0 -0
- {bioguider-0.2.25 → bioguider-0.2.27}/bioguider/agents/collection_task_utils.py +0 -0
- {bioguider-0.2.25 → bioguider-0.2.27}/bioguider/agents/common_agent.py +0 -0
- {bioguider-0.2.25 → bioguider-0.2.27}/bioguider/agents/common_agent_2step.py +0 -0
- {bioguider-0.2.25 → bioguider-0.2.27}/bioguider/agents/common_conversation.py +0 -0
- {bioguider-0.2.25 → bioguider-0.2.27}/bioguider/agents/common_step.py +0 -0
- {bioguider-0.2.25 → bioguider-0.2.27}/bioguider/agents/consistency_collection_step.py +0 -0
- {bioguider-0.2.25 → bioguider-0.2.27}/bioguider/agents/consistency_evaluation_task.py +0 -0
- {bioguider-0.2.25 → bioguider-0.2.27}/bioguider/agents/consistency_evaluation_task_utils.py +0 -0
- {bioguider-0.2.25 → bioguider-0.2.27}/bioguider/agents/consistency_query_step.py +0 -0
- {bioguider-0.2.25 → bioguider-0.2.27}/bioguider/agents/dockergeneration_execute_step.py +0 -0
- {bioguider-0.2.25 → bioguider-0.2.27}/bioguider/agents/dockergeneration_observe_step.py +0 -0
- {bioguider-0.2.25 → bioguider-0.2.27}/bioguider/agents/dockergeneration_plan_step.py +0 -0
- {bioguider-0.2.25 → bioguider-0.2.27}/bioguider/agents/dockergeneration_task.py +0 -0
- {bioguider-0.2.25 → bioguider-0.2.27}/bioguider/agents/dockergeneration_task_utils.py +0 -0
- {bioguider-0.2.25 → bioguider-0.2.27}/bioguider/agents/evaluation_installation_task.py +0 -0
- {bioguider-0.2.25 → bioguider-0.2.27}/bioguider/agents/evaluation_readme_task.py +0 -0
- {bioguider-0.2.25 → bioguider-0.2.27}/bioguider/agents/evaluation_submission_requirements_task.py +0 -0
- {bioguider-0.2.25 → bioguider-0.2.27}/bioguider/agents/evaluation_task.py +0 -0
- {bioguider-0.2.25 → bioguider-0.2.27}/bioguider/agents/identification_execute_step.py +0 -0
- {bioguider-0.2.25 → bioguider-0.2.27}/bioguider/agents/identification_observe_step.py +0 -0
- {bioguider-0.2.25 → bioguider-0.2.27}/bioguider/agents/identification_plan_step.py +0 -0
- {bioguider-0.2.25 → bioguider-0.2.27}/bioguider/agents/identification_task.py +0 -0
- {bioguider-0.2.25 → bioguider-0.2.27}/bioguider/agents/identification_task_utils.py +0 -0
- {bioguider-0.2.25 → bioguider-0.2.27}/bioguider/agents/peo_common_step.py +0 -0
- {bioguider-0.2.25 → bioguider-0.2.27}/bioguider/agents/prompt_utils.py +0 -0
- {bioguider-0.2.25 → bioguider-0.2.27}/bioguider/agents/python_ast_repl_tool.py +0 -0
- {bioguider-0.2.25 → bioguider-0.2.27}/bioguider/agents/rag_collection_task.py +0 -0
- {bioguider-0.2.25 → bioguider-0.2.27}/bioguider/conversation.py +0 -0
- {bioguider-0.2.25 → bioguider-0.2.27}/bioguider/database/code_structure_db.py +0 -0
- {bioguider-0.2.25 → bioguider-0.2.27}/bioguider/database/summarized_file_db.py +0 -0
- {bioguider-0.2.25 → bioguider-0.2.27}/bioguider/generation/__init__.py +0 -0
- {bioguider-0.2.25 → bioguider-0.2.27}/bioguider/generation/change_planner.py +0 -0
- {bioguider-0.2.25 → bioguider-0.2.27}/bioguider/generation/document_renderer.py +0 -0
- {bioguider-0.2.25 → bioguider-0.2.27}/bioguider/generation/llm_cleaner.py +0 -0
- {bioguider-0.2.25 → bioguider-0.2.27}/bioguider/generation/output_manager.py +0 -0
- {bioguider-0.2.25 → bioguider-0.2.27}/bioguider/generation/repo_reader.py +0 -0
- {bioguider-0.2.25 → bioguider-0.2.27}/bioguider/generation/report_loader.py +0 -0
- {bioguider-0.2.25 → bioguider-0.2.27}/bioguider/generation/style_analyzer.py +0 -0
- {bioguider-0.2.25 → bioguider-0.2.27}/bioguider/generation/suggestion_extractor.py +0 -0
- {bioguider-0.2.25 → bioguider-0.2.27}/bioguider/managers/evaluation_manager.py +0 -0
- {bioguider-0.2.25 → bioguider-0.2.27}/bioguider/rag/__init__.py +0 -0
- {bioguider-0.2.25 → bioguider-0.2.27}/bioguider/rag/config.py +0 -0
- {bioguider-0.2.25 → bioguider-0.2.27}/bioguider/rag/data_pipeline.py +0 -0
- {bioguider-0.2.25 → bioguider-0.2.27}/bioguider/rag/embedder.py +0 -0
- {bioguider-0.2.25 → bioguider-0.2.27}/bioguider/rag/rag.py +0 -0
- {bioguider-0.2.25 → bioguider-0.2.27}/bioguider/settings.py +0 -0
- {bioguider-0.2.25 → bioguider-0.2.27}/bioguider/utils/code_structure_builder.py +0 -0
- {bioguider-0.2.25 → bioguider-0.2.27}/bioguider/utils/constants.py +0 -0
- {bioguider-0.2.25 → bioguider-0.2.27}/bioguider/utils/default.gitignore +0 -0
- {bioguider-0.2.25 → bioguider-0.2.27}/bioguider/utils/file_utils.py +0 -0
- {bioguider-0.2.25 → bioguider-0.2.27}/bioguider/utils/gitignore_checker.py +0 -0
- {bioguider-0.2.25 → bioguider-0.2.27}/bioguider/utils/notebook_utils.py +0 -0
- {bioguider-0.2.25 → bioguider-0.2.27}/bioguider/utils/pyphen_utils.py +0 -0
- {bioguider-0.2.25 → bioguider-0.2.27}/bioguider/utils/python_file_handler.py +0 -0
- {bioguider-0.2.25 → bioguider-0.2.27}/bioguider/utils/r_file_handler.py +0 -0
bioguider/agents/consistency_observe_step.py

@@ -21,7 +21,7 @@ and generate a structured consistency assessment based on the following criteria
 **Consistency**:
 * **Score**: [Poor / Fair / Good / Excellent]
 * **Assessment**: [Your evaluation of whether the {domain} documentation is consistent with the code definitions]
-* **Development**: [A list of inconsistent function/class/method name and inconsistent docstring, and describe how they are inconsistent]
+* **Development**: [A list of inconsistent function/class/method name and inconsistent docstring, and describe how they are inconsistent, please be as specific as possible]
 * **Strengths**: [A list of strengths of the {domain} documentation on consistency]

 ---

@@ -32,7 +32,7 @@ Your output **must exactly match** the following format:
 **Consistency**:
 * **Score**: [Poor / Fair / Good / Excellent]
 * **Assessment**: [Your evaluation of whether the {domain} documentation is consistent with the code definitions]
-* **Development**: [A list of inconsistent function/class/method name and inconsistent docstring, and describe how they are inconsistent]
+* **Development**: [A list of inconsistent function/class/method name and inconsistent docstring, and describe how they are inconsistent, please be as specific as possible]
 * **Strengths**: [A list of strengths of the {domain} documentation on consistency]
 ```

bioguider/agents/evaluation_tutorial_task.py

@@ -28,21 +28,21 @@ MAX_FILE_SIZE = 1024 * 100 # 100K
 class TutorialEvaluationResult(BaseModel):
     overall_score: str=Field(description="A string value, could be `Poor`, `Fair`, `Good`, or `Excellent`")
     overall_key_strengths: str=Field(description="A string value, the key strengths of the tutorial")
-    overall_improvement_suggestions: str=Field(description="Suggestions to improve the overall score if necessary")
+    # overall_improvement_suggestions: str=Field(description="Suggestions to improve the overall score if necessary")
     readability_score: str=Field(description="A string value, could be 'Poor', 'Fair', 'Good', or 'Excellent'")
-    readability_suggestions: str=Field(description="
+    readability_suggestions: list[str]=Field(description="A list of string values, suggestions to improve readability if necessary")
     setup_and_dependencies_score: str=Field(description="A string value, could be 'Poor', 'Fair', 'Good', or 'Excellent'")
-    setup_and_dependencies_suggestions: str=Field(description="
+    setup_and_dependencies_suggestions: list[str]=Field(description="A list of string values, suggestions to improve setup and dependencies if necessary")
     reproducibility_score: str=Field(description="A string value, could be 'Poor', 'Fair', 'Good', or 'Excellent'")
-    reproducibility_suggestions: str=Field(description="
+    reproducibility_suggestions: list[str]=Field(description="A list of string values, suggestions to improve reproducibility if necessary")
     structure_and_navigation_score: str=Field(description="A string value, could be 'Poor', 'Fair', 'Good', or 'Excellent'")
-    structure_and_navigation_suggestions: str=Field(description="
+    structure_and_navigation_suggestions: list[str]=Field(description="A list of string values, suggestions to improve structure and navigation if necessary")
     executable_code_quality_score: str=Field(description="A string value, could be 'Poor', 'Fair', 'Good', or 'Excellent'")
-    executable_code_quality_suggestions: str=Field(description="
+    executable_code_quality_suggestions: list[str]=Field(description="A list of string values, suggestions to improve executable code quality if necessary")
     result_verification_score: str=Field(description="A string value, could be 'Poor', 'Fair', 'Good', or 'Excellent'")
-    result_verification_suggestions: str=Field(description="
+    result_verification_suggestions: list[str]=Field(description="A list of string values, suggestions to improve result verification if necessary")
     performance_and_resource_notes_score: str=Field(description="A string value, could be 'Poor', 'Fair', 'Good', or 'Excellent'")
-    performance_and_resource_notes_suggestions: str=Field(description="
+    performance_and_resource_notes_suggestions: list[str]=Field(description="A list of string values, suggestions to improve performance and resource notes if necessary")

 class IndividualTutorialEvaluationResult(BaseModel):
     tutorial_evaluation: TutorialEvaluationResult | None=Field(description="The evaluation result of the tutorial")
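The substantive schema change above is the switch from free-text `str` suggestion fields to `list[str]`. A minimal sketch of how the new field type validates, assuming pydantic v2 and using a trimmed two-field stand-in for the real model:

```python
from pydantic import BaseModel, Field

class TutorialEvaluationResultLite(BaseModel):
    # Two-field stand-in for TutorialEvaluationResult; field names match the diff.
    readability_score: str = Field(description="'Poor', 'Fair', 'Good', or 'Excellent'")
    readability_suggestions: list[str] = Field(
        default_factory=list,
        description="A list of string values, suggestions to improve readability if necessary",
    )

result = TutorialEvaluationResultLite(
    readability_score="Good",
    readability_suggestions=[
        "Define domain terms on first use.",
        "Split the setup walkthrough into numbered steps.",
    ],
)
print(result.readability_suggestions[0])  # individual suggestions stay addressable
```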
bioguider/agents/evaluation_tutorial_task_prompts.py

@@ -15,37 +15,37 @@ Your task is to analyze the provided tutorial file and generate a structured qua

 2. **Coverage**:
   * **Assessment**: [Your evaluation of whether it covers all major steps needed to get started, and dependencies, prerequisites, setup steps, and example usage.]
-  * **Improvement Suggestions**:
+  * **Improvement Suggestions**: please be as specific as possible.
     * **Original text:** [Quote a specific line/section from the tutorial.]
     * **Improving comments:** [Provide your suggestions to improve clarity.]

 3. **Reproducibility**:
   * **Assessment**: [Your evaluation of whether it provides a clear **description** of reproducibility]
-  * **Improvement Suggestions**:
+  * **Improvement Suggestions**: please be as specific as possible.
     * **Original text:** [Quote a specific line/section from the tutorial.]
     * **Improving comments:** [Provide your suggestions to improve clarity.]

 4. **Structure & Navigation**:
   * **Assessment**: [Your evaluation of whether it provides logical sections (e.g., intro -> setup -> steps -> results -> next), TOC/anchors, estimated time, etc.]
-  * **Improvement Suggestions**:
+  * **Improvement Suggestions**: please be as specific as possible.
     * **Original text:** [Quote a specific line/section from the tutorial.]
     * **Improving comments:** [Provide your suggestions to improve clarity.]

 5. **Executable Code Quality**:
   * **Assessment**: [Your evaluation on whether the code snippets are executable and functional, idiomatic, no hard-coded paths, etc.]
-  * **Improvement Suggestions**:
+  * **Improvement Suggestions**: please be as specific as possible.
     * **Original text:** [Quote a specific line/section from the tutorial.]
     * **Improving comments:** [Provide your suggestions to improve clarity.]

 6. **Result Verification**:
   * **Assessment**: [Your evaluation on expected outputs shown (figures/tables/metrics), acceptance criteria, etc.]
-  * **Improvement Suggestions**:
+  * **Improvement Suggestions**: please be as specific as possible.
     * **Original text:** [Quote a specific line/section from the tutorial.]
     * **Improving comments:** [Provide your suggestions to improve clarity.]

 7. **Performance & Resource Notes**:
   * **Assessment**: [Your evaluation on performance and resource notes, e.g., CPU/GPU usage, memory usage, runtime estimates, small "lite" path provided.]
-  * **Improvement Suggestions**:
+  * **Improvement Suggestions**: please be as specific as possible.
     * **Original text:** [Quote a specific line/section from the tutorial.]
     * **Improving comments:** [Provide your suggestions to improve clarity.]

@@ -57,49 +57,39 @@ Your final report must **exactly match** the following format. Do not add or omi
 **FinalAnswer**
 * **Overall Score:** [Poor / Fair / Good / Excellent]
 * **Overall Key Strengths**: <brief summary of the Tutorial's strongest points in 2-3 sentences>
-
-  - "Original text snippet 1" - Improving comment 1
-  - "Original text snippet 2" - Improving comment 2
-  - ...
+
 * **Readability Score:** [Poor / Fair / Good / Excellent]
-* **Readability
-* **Readability Improvement Suggestions:**
+* **Readability Improvement Suggestions:** please be as specific as possible.
   - "Original text snippet 1" - Improving comment 1
   - "Original text snippet 2" - Improving comment 2
   - ...
 * **Coverage Score:** [Poor / Fair / Good / Excellent]
-* **Coverage
-* **Coverage Improvement Suggestions:**
+* **Coverage Improvement Suggestions:** please be as specific as possible.
   - "Original text snippet 1" - Improving comment 1
   - "Original text snippet 2" - Improving comment 2
   - ...
 * **Reproducibility Score:** [Poor / Fair / Good / Excellent]
-* **Reproducibility
-* **Reproducibility Improvement Suggestions:**
+* **Reproducibility Improvement Suggestions:** please be as specific as possible.
   - "Original text snippet 1" - Improving comment 1
   - "Original text snippet 2" - Improving comment 2
   - ...
 * **Structure & Navigation Score:** [Poor / Fair / Good / Excellent]
-* **Structure & Navigation
-* **Structure & Navigation Improvement Suggestions:**
+* **Structure & Navigation Improvement Suggestions:** please be as specific as possible.
   - "Original text snippet 1" - Improving comment 1
   - "Original text snippet 2" - Improving comment 2
   - ...
 * **Executable Code Quality Score:** [Poor / Fair / Good / Excellent]
-* **Executable Code Quality
-* **Executable Code Quality Improvement Suggestions:**
+* **Executable Code Quality Improvement Suggestions:** please be as specific as possible.
   - "Original text snippet 1" - Improving comment 1
   - "Original text snippet 2" - Improving comment 2
   - ...
 * **Result Verification Score:** [Poor / Fair / Good / Excellent]
-* **Result Verification
-* **Result Verification Improvement Suggestions:**
+* **Result Verification Improvement Suggestions:** please be as specific as possible.
   - "Original text snippet 1" - Improving comment 1
   - "Original text snippet 2" - Improving comment 2
   - ...
 * **Performance & Resource Notes Score:** [Poor / Fair / Good / Excellent]
-* **Performance & Resource Notes
-* **Performance & Resource Notes Improvement Suggestions:**
+* **Performance & Resource Notes Improvement Suggestions:** please be as specific as possible.
   - "Original text snippet 1" - Improving comment 1
   - "Original text snippet 2" - Improving comment 2
   - ...
bioguider/agents/evaluation_userguide_prompts.py

@@ -15,31 +15,31 @@ Your task is to analyze the provided files related to user guide and generate a

 2. **Arguments and Clarity**:
   * **Assessment**: [Your evaluation of whether it provides a clear **description** of arguments and their usage]
-  * **Improvement Suggestions**:
+  * **Improvement Suggestions**: please be as specific as possible.
     * **Original text:** [Quote a specific line/section from the user guide.]
     * **Improving comments:** [Provide your suggestions to improve clarity.]

 3. **Return Value and Clarity**:
   * **Assessment**: [Your evaluation of whether it provides a clear **description** of return value and its meaning]
-  * **Improvement Suggestions**:
+  * **Improvement Suggestions**: please be as specific as possible.
     * **Original text:** [Quote a specific line/section from the user guide.]
     * **Improving comments:** [Provide your suggestions to improve clarity.]

 4. **Context and Purpose**:
   * **Assessment**: [Your evaluation of whether it provides a clear **description** of the context and purpose of the module]
-  * **Improvement Suggestions**:
+  * **Improvement Suggestions**: please be as specific as possible.
     * **Original text:** [Quote a specific line/section from the user guide.]
     * **Improving comments:** [Provide your suggestions to improve clarity.]

 5. **Error Handling**:
   * **Assessment**: [Your evaluation of whether it provides a clear **description** of error handling]
-  * **Improvement Suggestions**:
+  * **Improvement Suggestions**: please be as specific as possible.
     * **Original text:** [Quote a specific line/section from the user guide.]
     * **Improving comments:** [Provide your suggestions to improve clarity.]

 6. **Usage Examples**:
   * **Assessment**: [Your evaluation of whether it provides a clear **description** of usage examples]
-  * **Improvement Suggestions**:
+  * **Improvement Suggestions**: please be as specific as possible.
     * **Original text:** [Quote a specific line/section from the user guide.]
     * **Improving comments:** [Provide your suggestions to improve clarity.]

@@ -54,43 +54,40 @@ Your final report must **exactly match** the following format. Do not add or omi
 **FinalAnswer**
 * **Overall Score:** [Poor / Fair / Good / Excellent]
 * **Overall Key Strengths**: <brief summary of the User Guide's strongest points in 2-3 sentences>
-
-  - "Original text snippet 1" - Improving comment 1
-  - "Original text snippet 2" - Improving comment 2
-  - ...
+
 * **Readability Analysis Score:** [Poor / Fair / Good / Excellent]
 * **Readability Analysis Key Strengths**: <brief summary of the User Guide's strongest points in 2-3 sentences>
-* **Readability Analysis Improvement Suggestions:**
+* **Readability Analysis Improvement Suggestions:** please be as specific as possible.
   - "Original text snippet 1" - Improving comment 1
   - "Original text snippet 2" - Improving comment 2
   - ...
 * **Arguments and Clarity Score:** [Poor / Fair / Good / Excellent]
 * **Arguments and Clarity Key Strengths**: <brief summary of the User Guide's strongest points in 2-3 sentences>
-* **Arguments and Clarity Improvement Suggestions:**
+* **Arguments and Clarity Improvement Suggestions:** please be as specific as possible.
   - "Original text snippet 1" - Improving comment 1
   - "Original text snippet 2" - Improving comment 2
   - ...
 * **Return Value and Clarity Score:** [Poor / Fair / Good / Excellent]
 * **Return Value and Clarity Key Strengths**: <brief summary of the User Guide's strongest points in 2-3 sentences>
-* **Return Value and Clarity Improvement Suggestions:**
+* **Return Value and Clarity Improvement Suggestions:** please be as specific as possible.
   - "Original text snippet 1" - Improving comment 1
   - "Original text snippet 2" - Improving comment 2
   - ...
 * **Context and Purpose Score:** [Poor / Fair / Good / Excellent]
 * **Context and Purpose Key Strengths**: <brief summary of the User Guide's strongest points in 2-3 sentences>
-* **Context and Purpose Improvement Suggestions:**
+* **Context and Purpose Improvement Suggestions:** please be as specific as possible.
   - "Original text snippet 1" - Improving comment 1
   - "Original text snippet 2" - Improving comment 2
   - ...
 * **Error Handling Score:** [Poor / Fair / Good / Excellent]
 * **Error Handling Key Strengths**: <brief summary of the User Guide's strongest points in 2-3 sentences>
-* **Error Handling Improvement Suggestions:**
+* **Error Handling Improvement Suggestions:** please be as specific as possible.
   - "Original text snippet 1" - Improving comment 1
   - "Original text snippet 2" - Improving comment 2
   - ...
 * **Usage Examples Score:** [Poor / Fair / Good / Excellent]
 * **Usage Examples Key Strengths**: <brief summary of the User Guide's strongest points in 2-3 sentences>
-* **Usage Examples Improvement Suggestions:**
+* **Usage Examples Improvement Suggestions:** please be as specific as possible.
   - "Original text snippet 1" - Improving comment 1
   - "Original text snippet 2" - Improving comment 2
   - ...

@@ -105,58 +102,3 @@ Your final report must **exactly match** the following format. Do not add or omi

 """

-CONSISTENCY_EVAL_SYSTEM_PROMPT = """
-You are an expert in evaluating the consistency of user guide in software repositories.
-Your task is to analyze both:
-1. the provided file related to user guide/API documentation,
-2. the code definitions related to the user guide/API documentation
-and generate a structured consistency assessment based on the following criteria.
-
----
-
-### **Evaluation Criteria**
-
-**Consistency**:
-* **Score**: [Poor / Fair / Good / Excellent]
-* **Assessment**: [Your evaluation of whether the user guide/API documentation is consistent with the code definitions]
-* **Development**: [A list of inconsistent function/class/method name and inconsistent docstring]
-* **Strengths**: [A list of strengths of the user guide/API documentation on consistency]
-
-### **Output Format**
-Your output **must exactly match** the following format:
-```
-**Consistency**:
-* **Score**: [Poor / Fair / Good / Excellent]
-* **Assessment**: [Your evaluation of whether the user guide/API documentation is consistent with the code definitions]
-* **Development**: [A list of inconsistent function/class/method name and inconsistent docstring]
-* **Strengths**: [A list of strengths of the user guide/API documentation on consistency]
-```
-
-### **Output Example**
-
-```
-**Consistency**:
-* **Assessment**: [Your evaluation of whether the user guide/API documentation is consistent with the code definitions]
-* **Development**:
-  - Inconsistent function/class/method name 1
-  - Inconsistent docstring 1
-  - Inconsistent function/class/method name 2
-  - Inconsistent docstring 2
-  - ...
-* **Strengths**:
-  - Strengths 1
-  - Strengths 2
-  - ...
-```
-
----
-
-### **Input User Guide/API Documentation**
-{user_guide_api_documentation}
-
-### **Code Definitions**
-{code_definitions}
-
----
-
-"""
bioguider/agents/evaluation_userguide_task.py

@@ -24,13 +24,13 @@ from .evaluation_userguide_prompts import INDIVIDUAL_USERGUIDE_EVALUATION_SYSTEM
 class UserGuideEvaluationResult(BaseModel):
     overall_score: str=Field(description="A string value, could be `Poor`, `Fair`, `Good`, or `Excellent`")
     overall_key_strengths: str=Field(description="A string value, the key strengths of the user guide")
-
+
     readability_score: str=Field(description="A string value, could be `Poor`, `Fair`, `Good`, or `Excellent`")
-    readability_suggestions: str=Field(description="
+    readability_suggestions: list[str]=Field(description="A list of string values, suggestions to improve readability if necessary")
     context_and_purpose_score: str=Field(description="A string value, could be `Poor`, `Fair`, `Good`, or `Excellent`")
-    context_and_purpose_suggestions: str=Field(description="
+    context_and_purpose_suggestions: list[str]=Field(description="A list of string values, suggestions to improve context and purpose if necessary")
     error_handling_score: str=Field(description="A string value, could be `Poor`, `Fair`, `Good`, or `Excellent`")
-    error_handling_suggestions: str=Field(description="
+    error_handling_suggestions: list[str]=Field(description="A list of string values, suggestions to improve error handling if necessary")

 class IndividualUserGuideEvaluationResult(BaseModel):
     user_guide_evaluation: UserGuideEvaluationResult | None=Field(description="The evaluation result of the user guide")
bioguider/generation/llm_content_generator.py

@@ -11,7 +11,7 @@ LLM_SECTION_PROMPT = """
 You are “BioGuider,” a concise documentation generator for biomedical/bioinformatics software.

 GOAL
-Write or refine a single documentation section named "{section}". Produce
+Write or refine a single documentation section named "{section}". Produce professional, comprehensive, style-consistent content that addresses only this section.

 INPUTS (use only what is provided; never invent)
 - suggestion_category: {suggestion_category}

@@ -21,6 +21,7 @@ INPUTS (use only what is provided; never invent)
 - repo_context_excerpt (analyze tone/formatting; do not paraphrase it blindly): <<{context}>>

 STYLE & CONSTRAINTS
+- Fix obvious errors in the content.
 - Preserve the existing tone and style markers: {tone_markers}
 - Use heading style "{heading_style}" and list style "{list_style}"; link style "{link_style}".
 - Neutral, professional tone; avoid marketing claims.

@@ -29,9 +30,11 @@ STYLE & CONSTRAINTS
 - Biomedical examples must avoid PHI; assume de-identified data.
 - Output must be plain markdown for this section only, with no commentary and no backticks.
 - Avoid duplication: if similar content exists in the repo context, rewrite succinctly instead of repeating.
+- Never remove, alter, or recreate top-of-file badges/shields/logos (e.g., CI, PyPI, Conda, Docs shields). Assume they remain unchanged; do not output replacements for them.
+- When targeting README content, do not rewrite the document title or header area; generate only the requested section body to be inserted below existing headers/badges.

 SECTION GUIDELINES
-- Dependencies: short bullet list; clearly separate Mandatory and Optional if applicable
+- Dependencies: short bullet list; clearly separate Mandatory and Optional if applicable.
 - System Requirements: runtime versions and supported OS; add hardware notes only if guidance provides specifics.
 - Hardware Requirements: brief bullets with RAM/CPU only if guidance includes numbers.
 - License: one sentence referencing the license and pointing to the LICENSE file.
bioguider/generation/llm_injector.py

@@ -26,6 +26,35 @@ ERROR CATEGORIES (inject all)
 - bio_term: slightly wrong domain term (e.g., “single sell” for “single cell”); do not invent new science
 - function: misspell a known function/API name **from the input README-lite only**
 - markdown_structure: break a header level, list indentation, or code fence (one-off)
+- list_structure: remove bullet space (e.g., “-item”), mix markers inconsistently
+- section_title: subtly change a section title casing or wording
+- image_syntax: break image markdown spacing (e.g., `![alt] (url)`)
+- inline_code: remove backticks around inline code
+- emphasis: break emphasis markers (e.g., missing closing `*`)
+- table_alignment: misalign or omit a `|` in a markdown table
+- code_lang_tag: use the wrong fenced code language (e.g., ```py for R)
+
+BIOLOGY-SPECIFIC ERROR CATEGORIES (inject all; keep realistic & subtle)
+- gene_symbol_case: change gene symbol casing or add suffix (e.g., “tp53”, “CD3e”), but **do not alter** protected keywords
+- species_swap: imply human vs mouse mix-up (e.g., “mm10” vs “GRCh38”) in a short phrase
+- ref_genome_mismatch: claim a reference genome that conflicts with the example file or text
+- modality_confusion: conflate RNA-seq with ATAC or proteomics in a brief phrase
+- normalization_error: misuse terms like CPM/TPM/CLR/log1p in a sentence
+- umi_vs_read: confuse UMI counts vs read counts in a short line
+- batch_effect: misstate “batch correction” vs “normalization” terminology
+- qc_threshold: use a common but slightly wrong QC gate (e.g., mito% 0.5 instead of 5)
+- file_format: mix up FASTQ/BAM/MTX/H5AD/RDS in a brief mention
+- strandedness: claim “stranded” when workflow is unstranded (or vice versa)
+- coordinates: confuse 0-based vs 1-based or chromosome naming style (chr1 vs 1)
+- units_scale: use the wrong scale/unit (e.g., μm vs mm; 10e6 instead of 1e6)
+- sample_type: conflate “primary tissue” with “cell line” in a single phrase
+- contamination: misuse “ambient RNA” vs “doublets” terminology
+
+CLI/CONFIG ERROR CATEGORIES (inject all)
+- param_name: slightly misspell a CLI flag or config key (e.g., `--min-cell` → `--min-cells`)
+- default_value: state a plausible but incorrect default value
+- path_hint: introduce a subtle path typo (e.g., `data/filtrd`)

 CONSTRAINTS
 - Keep edits minimal and local; **≥85% token overlap** with input.

@@ -43,6 +72,7 @@ CONSTRAINTS
 - Maintain a **concise length** (≤ {max_words} words).
 - Do **not** alter the protected keywords (exact casing/spelling): {keywords}
 - Keep at least **{min_per_category} errors per category** listed above.
+- Limit `duplicate` injections to at most **{min_per_category}**.
 - If the input contains runnable code, keep it mostly intact but introduce **one** realistic break
   (e.g., missing quote/paren or wrong function name) without adding new libraries.
 - Keep at least one **valid** URL so the fixer can compare.

@@ -204,8 +234,8 @@ class LLMErrorInjector:
             corrupted = corrupted.replace(orig, mut, 1)
             errors.append({"id": f"e_link_sup_{len(errors)}", "category": "link", "original_snippet": orig, "mutated_snippet": mut, "rationale": "scheme colon removed"})

-        # duplicate supplements
-        for _ in range(need("duplicate")):
+        # duplicate supplements (cap to min_per_category)
+        for _ in range(min(need("duplicate"), min_per_category)):
             lines = corrupted.splitlines()
             idx = next((i for i, ln in enumerate(lines) if ln.strip().startswith("- ") or ln.strip().startswith("## ")), None)
             if idx is None:

@@ -264,6 +294,48 @@ class LLMErrorInjector:
             else:
                 break

+        # list_structure supplements
+        for _ in range(need("list_structure")):
+            m = re.search(r"^\-\s+\S", corrupted, flags=re.M)
+            if not m:
+                break
+            orig = m.group(0)
+            mut = orig.replace("- ", "-", 1)
+            corrupted = corrupted.replace(orig, mut, 1)
+            errors.append({"id": f"e_list_sup_{len(errors)}", "category": "list_structure", "original_snippet": orig, "mutated_snippet": mut, "rationale": "bullet missing space"})
+
+        # section_title supplements
+        for _ in range(need("section_title")):
+            m = re.search(r"^##\s+(What is it\?|What can it do\?|Requirements|Install|Quick example|Learn more|License & Contact)$", corrupted, flags=re.M)
+            if not m:
+                break
+            orig = m.group(0)
+            mut = orig.replace("What is it?", "What is It?").replace("Install", "Installation")
+            if mut == orig:
+                break
+            corrupted = corrupted.replace(orig, mut, 1)
+            errors.append({"id": f"e_title_sup_{len(errors)}", "category": "section_title", "original_snippet": orig, "mutated_snippet": mut, "rationale": "subtle title change"})
+
+        # image_syntax supplements
+        for _ in range(need("image_syntax")):
+            m = re.search(r"!\[[^\]]*\]\([^\)]+\)", corrupted)
+            if not m:
+                break
+            orig = m.group(0)
+            mut = orig.replace("](", "] (")
+            corrupted = corrupted.replace(orig, mut, 1)
+            errors.append({"id": f"e_img_sup_{len(errors)}", "category": "image_syntax", "original_snippet": orig, "mutated_snippet": mut, "rationale": "broken image spacing"})
+
+        # inline_code supplements
+        for _ in range(need("inline_code")):
+            m = re.search(r"`[^`\n]+`", corrupted)
+            if not m:
+                break
+            orig = m.group(0)
+            mut = orig.strip("`")
+            corrupted = corrupted.replace(orig, mut, 1)
+            errors.append({"id": f"e_code_sup_{len(errors)}", "category": "inline_code", "original_snippet": orig, "mutated_snippet": mut, "rationale": "removed inline code backticks"})
+
         data["errors"] = errors
         return corrupted, data

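Each new supplement follows the same search-mutate-record loop. A standalone sketch of two of them (inline_code and list_structure) applied to a toy string, using the same regexes as the diff:

```python
import re

corrupted = "- `pip install bioguider` installs the package\n"
errors = []

# inline_code supplement: strip the backticks from the first inline code span.
m = re.search(r"`[^`\n]+`", corrupted)
if m:
    orig, mut = m.group(0), m.group(0).strip("`")
    corrupted = corrupted.replace(orig, mut, 1)
    errors.append({"category": "inline_code", "original_snippet": orig, "mutated_snippet": mut})

# list_structure supplement: drop the space after the first bullet marker.
m = re.search(r"^\-\s+\S", corrupted, flags=re.M)
if m:
    orig, mut = m.group(0), m.group(0).replace("- ", "-", 1)
    corrupted = corrupted.replace(orig, mut, 1)
    errors.append({"category": "list_structure", "original_snippet": orig, "mutated_snippet": mut})

print(corrupted)    # "-pip install bioguider installs the package"
print(len(errors))  # 2 manifest entries, mirroring the real injector
```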
bioguider/generation/models.py

@@ -14,6 +14,10 @@ class EvaluationReport(BaseModel):
     readme_evaluation: Optional[Dict[str, Any]] = None
     readme_files: Optional[List[str]] = None

+    # Optional: rich user guide evaluation content and any explicitly listed files
+    userguide_evaluation: Optional[Dict[str, Any]] = None
+    userguide_files: Optional[List[str]] = None
+
     submission_requirements_evaluation: Optional[Dict[str, Any]] = None
     submission_requirements_files: Optional[List[str]] = None

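Because both new fields default to `None`, reports produced before 0.2.27 still parse. A minimal sketch with a stand-in model, assuming pydantic v2:

```python
from typing import Any, Dict, List, Optional
from pydantic import BaseModel

class EvaluationReportLite(BaseModel):
    # Stand-in carrying only the two fields added in this hunk.
    userguide_evaluation: Optional[Dict[str, Any]] = None
    userguide_files: Optional[List[str]] = None

# A legacy report without user guide data still validates.
legacy = EvaluationReportLite.model_validate({})
assert legacy.userguide_files is None

# A newer report can key evaluation content by file path.
current = EvaluationReportLite.model_validate(
    {"userguide_evaluation": {"docs/usage.md": {"score": "Good"}}}
)
print(list(current.userguide_evaluation))  # ['docs/usage.md']
```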
bioguider/generation/test_metrics.py

@@ -22,11 +22,56 @@ def _count_markdown_issues(text: str) -> int:
 def evaluate_fixes(baseline: str, corrupted: str, revised: str, injection_manifest: Dict[str, Any]) -> Dict[str, Any]:
     per_error: List[Dict[str, Any]] = []
     per_cat: Dict[str, Dict[str, int]] = {}
+    # aggregate counters
+    totals = {"total_errors": 0, "fixed_to_baseline": 0, "fixed_to_valid": 0, "unchanged": 0, "worsened": 0}

     def mark(cat: str, key: str):
         per_cat.setdefault(cat, {"total": 0, "fixed_to_baseline": 0, "fixed_to_valid": 0, "unchanged": 0, "worsened": 0})
         per_cat[cat][key] += 1

+    # Precompute some structural counts
+    def count_malformed_bullets(text: str) -> int:
+        return len(re.findall(r"^[-*]\S", text, flags=re.M))
+
+    def count_bad_image_spacing(text: str) -> int:
+        return len(re.findall(r"!\[[^\]]*\]\s+\(", text))
+
+    def table_variance(text: str) -> int:
+        rows = [ln for ln in text.splitlines() if '|' in ln]
+        groups: List[List[str]] = []
+        cur: List[str] = []
+        for ln in rows:
+            if '|' in ln:
+                cur.append(ln)
+            else:
+                if len(cur) >= 2:
+                    groups.append(cur)
+                cur = []
+        if len(cur) >= 2:
+            groups.append(cur)
+        vari = 0
+        for g in groups:
+            counts = [ln.count('|') for ln in g]
+            vari += (max(counts) - min(counts))
+        return vari
+
+    malformed_bullets_before = count_malformed_bullets(corrupted)
+    malformed_bullets_after = count_malformed_bullets(revised)
+    bad_img_before = count_bad_image_spacing(corrupted)
+    bad_img_after = count_bad_image_spacing(revised)
+    table_var_before = table_variance(corrupted)
+    table_var_after = table_variance(revised)
+
+    canonical_titles = {
+        "## What is it?",
+        "## What can it do?",
+        "## Requirements",
+        "## Install",
+        "## Quick example",
+        "## Learn more",
+        "## License & Contact",
+    }
+
     for e in injection_manifest.get("errors", []):
         cat = e.get("category", "unknown")
         per_cat.setdefault(cat, {"total": 0, "fixed_to_baseline": 0, "fixed_to_valid": 0, "unchanged": 0, "worsened": 0})
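To make the new structural counters concrete, here is a runnable extract of `count_malformed_bullets` and `table_variance` on toy text. It follows the same logic as the diff; note the diff pre-filters rows to pipe-containing lines, so all table rows land in a single group, and the sketch below is simplified to that single-group case:

```python
import re

def count_malformed_bullets(text: str) -> int:
    # Bullets missing the space after the marker, e.g. "-item" or "*item".
    return len(re.findall(r"^[-*]\S", text, flags=re.M))

def table_variance(text: str) -> int:
    # Spread of '|' counts across table rows; 0 means every row is aligned.
    rows = [ln for ln in text.splitlines() if '|' in ln]
    if len(rows) < 2:
        return 0
    counts = [ln.count('|') for ln in rows]
    return max(counts) - min(counts)

text = "-broken bullet\n| a | b |\n| a | b\n"
print(count_malformed_bullets(text))  # 1
print(table_variance(text))           # 1: the second row is missing a '|'
```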
@@ -76,10 +121,43 @@ def evaluate_fixes(baseline: str, corrupted: str, revised: str, injection_manife
                 status = "unchanged"
             else:
                 status = "fixed_to_valid"
+        elif cat == "list_structure":
+            status = "fixed_to_valid" if malformed_bullets_after < malformed_bullets_before else "unchanged"
+        elif cat == "image_syntax":
+            status = "fixed_to_valid" if bad_img_after < bad_img_before else "unchanged"
+        elif cat == "section_title":
+            # valid if mutated title removed and any canonical title present
+            if mut and mut not in revised and any(t in revised for t in canonical_titles):
+                status = "fixed_to_valid"
+            else:
+                status = "unchanged"
+        elif cat == "inline_code":
+            # check that the raw content regained backticks somewhere
+            raw = mut.strip('`') if mut else ""
+            rewrapped = f"`{raw}`" if raw else ""
+            if raw and rewrapped and rewrapped in revised and mut not in revised:
+                status = "fixed_to_valid"
+            else:
+                status = "unchanged"
+        elif cat == "emphasis":
+            status = "fixed_to_valid" if mut and mut not in revised else "unchanged"
+        elif cat == "table_alignment":
+            status = "fixed_to_valid" if table_var_after < table_var_before else "unchanged"
+        elif cat == "code_lang_tag":
+            status = "fixed_to_valid" if mut and mut not in revised else "unchanged"
+        # Biology-specific and CLI/CONFIG categories: treat as fixed if mutated snippet removed
+        elif cat in {
+            "gene_symbol_case","species_swap","ref_genome_mismatch","modality_confusion","normalization_error",
+            "umi_vs_read","batch_effect","qc_threshold","file_format","strandedness","coordinates","units_scale",
+            "sample_type","contamination","param_name","default_value","path_hint"
+        }:
+            status = "fixed_to_valid" if mut and mut not in revised else "unchanged"
         else:
             status = "unchanged"

         mark(cat, status)
+        totals["total_errors"] += 1
+        totals[status] += 1
         per_error.append({
             "id": e.get("id"),
             "category": cat,
@@ -95,10 +173,17 @@ def evaluate_fixes(baseline: str, corrupted: str, revised: str, injection_manife
     global_metrics = {
         "markdown_validity_delta": issues_before - issues_after,
     }
+    success = totals["fixed_to_baseline"] + totals["fixed_to_valid"]
+    success_rate = (success / totals["total_errors"] * 100.0) if totals["total_errors"] else 0.0
+    summary = {
+        "totals": totals,
+        "success_rate": round(success_rate, 2),
+    }
     return {
         "per_error": per_error,
         "per_category": per_cat,
         "global": global_metrics,
+        "summary": summary,
     }

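The new `summary` block reduces the per-error statuses to one headline number. A standalone recap of the arithmetic with invented counts:

```python
# Invented counts for illustration; in practice these come from the
# per-error loop above.
totals = {"total_errors": 8, "fixed_to_baseline": 3, "fixed_to_valid": 4,
          "unchanged": 1, "worsened": 0}

success = totals["fixed_to_baseline"] + totals["fixed_to_valid"]
success_rate = (success / totals["total_errors"] * 100.0) if totals["total_errors"] else 0.0
summary = {"totals": totals, "success_rate": round(success_rate, 2)}
print(summary["success_rate"])  # 87.5
```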
bioguider/managers/generation_manager.py

@@ -2,7 +2,7 @@ from __future__ import annotations

 import os
 from pathlib import Path
-from typing import Tuple, Dict
+from typing import Tuple, Dict, List

 from bioguider.generation import (
     EvaluationReportLoader,

@@ -49,12 +49,24 @@ class DocumentationGenerationManager:

         self.print_step(step_name="ReadRepoFiles", step_output=f"repo_path={repo_path}")
         reader = RepoReader(repo_path)
-        # Prefer report-listed files if available
+        # Prefer report-listed files if available; include all report-declared file lists
         target_files = []
-        if report
+        if getattr(report, "readme_files", None):
             target_files.extend(report.readme_files)
-        if report
+        if getattr(report, "installation_files", None):
             target_files.extend(report.installation_files)
+        # If userguide_files not explicitly provided, derive from userguide_evaluation keys
+        userguide_files: list[str] = []
+        if getattr(report, "userguide_files", None):
+            userguide_files.extend([p for p in report.userguide_files if isinstance(p, str)])
+        elif getattr(report, "userguide_evaluation", None) and isinstance(report.userguide_evaluation, dict):
+            for key in report.userguide_evaluation.keys():
+                if isinstance(key, str) and key.strip():
+                    userguide_files.append(key)
+        target_files.extend(userguide_files)
+        if getattr(report, "submission_requirements_files", None):
+            target_files.extend(report.submission_requirements_files)
+        target_files = [p for p in target_files if isinstance(p, str) and p.strip()]
         target_files = list(dict.fromkeys(target_files))  # de-dup
         files, missing = reader.read_files(target_files) if target_files else reader.read_default_targets()

@@ -118,7 +130,10 @@ class DocumentationGenerationManager:

         self.print_step(step_name="WriteOutputs", step_output=f"repo_key={out_repo_key}")
         out_dir = self.output.prepare_output_dir(out_repo_key)
-
+        # Ensure all files we read (even without edits) are written to outputs alongside revisions
+        all_files_to_write: Dict[str, str] = dict(files)
+        all_files_to_write.update(revised)
+        artifacts = self.output.write_files(out_dir, all_files_to_write, diff_stats_by_file=diff_stats)

         manifest = GenerationManifest(
             repo_url=report.repo_url,

@@ -131,14 +146,31 @@ class DocumentationGenerationManager:
         )
         self.output.write_manifest(out_dir, manifest)
         # Write human-readable generation report
-        gen_report_path = self._write_generation_report(
+        gen_report_path = self._write_generation_report(
+            out_dir,
+            report.repo_url or str(self.repo_url_or_path or ""),
+            plan,
+            diff_stats,
+            suggestions,
+            artifacts,
+            missing,
+        )
         self.print_step(step_name="Done", step_output=f"output_dir={out_dir}")
         return out_dir

-    def _write_generation_report(
+    def _write_generation_report(
+        self,
+        out_dir: str,
+        repo_url: str,
+        plan,
+        diff_stats: Dict[str, dict],
+        suggestions,
+        artifacts,
+        skipped: List[str],
+    ):
         # Build a simple markdown report
         lines: list[str] = []
-        lines.append(f"# Documentation
+        lines.append(f"# Documentation Changelog\n")
         lines.append(f"Repo: {repo_url}\n")
         lines.append(f"Output: {out_dir}\n")
         lines.append("\n## Summary of Changes\n")

@@ -151,6 +183,20 @@ class DocumentationGenerationManager:
         lines.append("\n## Planned Edits\n")
         for e in plan.planned_edits:
             lines.append(f"- `{e.file_path}` -> {e.edit_type} -> {e.anchor.get('value','')}")
+
+        # Summarize all files written with basic status
+        lines.append("\n## Files Written\n")
+        for art in artifacts:
+            stats = art.diff_stats or {}
+            added = stats.get("added_lines", 0)
+            status = "Revised" if added and added > 0 else "Copied"
+            lines.append(f"- {art.dest_rel_path} | status: {status} | added_lines: {added}")
+
+        # Skipped or missing files
+        if skipped:
+            lines.append("\n## Skipped or Missing Files\n")
+            for rel in skipped:
+                lines.append(f"- {rel}")
         report_md = "\n".join(lines)
         dest = os.path.join(out_dir, "GENERATION_REPORT.md")
         with open(dest, "w", encoding="utf-8") as fobj:
bioguider/managers/generation_test_manager.py

@@ -19,14 +19,14 @@ class GenerationTestManager:
         if self.step_output:
             self.step_output(step_name=name, step_output=out)

-    def run_quant_test(self, report_path: str, baseline_repo_path: str, tmp_repo_path: str) -> str:
+    def run_quant_test(self, report_path: str, baseline_repo_path: str, tmp_repo_path: str, min_per_category: int = 3) -> str:
         self.print_step("QuantTest:LoadBaseline", baseline_repo_path)
         baseline_readme_path = os.path.join(baseline_repo_path, "README.md")
         baseline = read_file(baseline_readme_path) or ""

         self.print_step("QuantTest:Inject")
         injector = LLMErrorInjector(self.llm)
-        corrupted, inj_manifest = injector.inject(baseline, min_per_category=
+        corrupted, inj_manifest = injector.inject(baseline, min_per_category=min_per_category)

         # write corrupted into tmp repo path
         os.makedirs(tmp_repo_path, exist_ok=True)

@@ -49,13 +49,38 @@ class GenerationTestManager:
         # write results
         with open(os.path.join(out_dir, "GEN_TEST_RESULTS.json"), "w", encoding="utf-8") as fobj:
             json.dump(results, fobj, indent=2)
-        #
-
-
+        # slides-like markdown report
+        totals = results.get("summary", {}).get("totals", {})
+        success_rate = results.get("summary", {}).get("success_rate", 0.0)
+        lines = ["# 🔬 Quantifiable Testing Results\n",
+                 "\n## BioGuider Error Correction Performance Analysis\n",
+                 "\n---\n",
+                 "\n## 📊 Slide 1: Testing Results Overview\n",
+                 "\n### 🎯 Totals\n",
+                 f"- Total Errors: {totals.get('total_errors', 0)}\n",
+                 f"- Fixed to Baseline: {totals.get('fixed_to_baseline', 0)}\n",
+                 f"- Fixed to Valid: {totals.get('fixed_to_valid', 0)}\n",
+                 f"- Unchanged: {totals.get('unchanged', 0)}\n",
+                 f"- Success Rate: {success_rate}%\n",
+                 "\n### 📂 Per-Category Metrics\n"]
         for cat, m in results["per_category"].items():
-            lines.append(f"- {cat}: {m}")
-
-
+            lines.append(f"- {cat}: total={m.get('total',0)}, fixed_to_baseline={m.get('fixed_to_baseline',0)}, fixed_to_valid={m.get('fixed_to_valid',0)}, unchanged={m.get('unchanged',0)}")
+        # Per-file change counts (simple heuristic from manifest artifacts)
+        try:
+            manifest_path = os.path.join(out_dir, "manifest.json")
+            with open(manifest_path, "r", encoding="utf-8") as mf:
+                mani = json.load(mf)
+            lines.append("\n### 🗂️ Per-File Changes\n")
+            for art in mani.get("artifacts", []):
+                rel = art.get("dest_rel_path")
+                stats = art.get("diff_stats", {})
+                added = stats.get("added_lines", 0)
+                status = "Revised" if added and added > 0 else "Copied"
+                lines.append(f"- {rel}: {status}, added_lines={added}")
+        except Exception:
+            pass
+        lines.append("\n---\n\n## 📝 Notes\n")
+        lines.append("- README versions saved: README.original.md, README.corrupted.md, README.md (fixed).\n")
         with open(os.path.join(out_dir, "GEN_TEST_REPORT.md"), "w", encoding="utf-8") as fobj:
             fobj.write("\n".join(lines))
         # Save versioned files into output dir
@@ -71,4 +96,12 @@ class GenerationTestManager:
         self.print_step("QuantTest:Done", out_dir)
         return out_dir

+    def run_quant_suite(self, report_path: str, baseline_repo_path: str, base_tmp_repo_path: str, levels: dict[str, int]) -> dict:
+        results = {}
+        for level, min_cnt in levels.items():
+            tmp_repo_path = f"{base_tmp_repo_path}_{level}"
+            out_dir = self.run_quant_test(report_path, baseline_repo_path, tmp_repo_path, min_per_category=min_cnt)
+            results[level] = out_dir
+        return results
+
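A sketch of how the new `run_quant_suite` entry point might be driven; the level names, counts, and paths below are placeholders, not values shipped with the package:

```python
# Hypothetical severity levels: name -> min errors injected per category.
levels = {"light": 1, "medium": 3, "heavy": 5}
base_tmp_repo_path = "/tmp/bioguider_quant"  # placeholder path

# Each level gets its own scratch repo, exactly as run_quant_suite derives it:
for level, min_cnt in levels.items():
    tmp_repo_path = f"{base_tmp_repo_path}_{level}"
    print(f"{level}: tmp_repo={tmp_repo_path}, min_per_category={min_cnt}")
    # A real run would then call:
    # out_dir = manager.run_quant_test(report_path, baseline_repo_path,
    #                                  tmp_repo_path, min_per_category=min_cnt)
```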
bioguider/utils/utils.py

@@ -53,10 +53,14 @@ def run_command(command: list, cwd: str = None, timeout: int = None):
         return e.stdout or "", e.stderr or f"Command timed out after {timeout} seconds", -1

 def escape_braces(text: str) -> str:
-
-
-
-
+    def fix_braces(m):
+        s = m.group(0)
+        # If odd number of braces, double the last one
+        if len(s) % 2 == 1:
+            return s + s[-1]
+        return s
+    # Handle both { and } sequences
+    text = re.sub(r'{+|}+', fix_braces, text)
     return text

 def increase_token_usage(
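The restored `escape_braces` body doubles the last brace of any odd-length run so the text survives `str.format`-style templating. A quick check of the behavior:

```python
import re

def escape_braces(text: str) -> str:
    def fix_braces(m):
        s = m.group(0)
        # An odd-length run of braces would be treated as a format field
        # delimiter; doubling the last brace makes the run literal.
        if len(s) % 2 == 1:
            return s + s[-1]
        return s
    return re.sub(r'{+|}+', fix_braces, text)

print(escape_braces("gene set {placeholder}"))  # gene set {{placeholder}}
print(escape_braces("already {{safe}}"))        # already {{safe}} (unchanged)
print("{{safe}}".format())                      # prints {safe}
```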
@@ -105,7 +109,7 @@ def convert_to_serializable(obj):
     else:
         return obj

-def convert_html_to_text(html_path: str | Path, exclude_tags: list[str]
+def convert_html_to_text(html_path: str | Path, exclude_tags: list[str] = ["script", "style", "img", "svg", "meta", "link"]) -> str:
     """
     This function is used to convert html string to text, that is,
     extract text from html content, including tables.
All remaining files (listed above with +0 -0) are unchanged: they were only renamed under the new version prefix, bioguider-0.2.25 → bioguider-0.2.27.