bioguider 0.2.52__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bioguider/__init__.py +0 -0
- bioguider/agents/__init__.py +0 -0
- bioguider/agents/agent_task.py +92 -0
- bioguider/agents/agent_tools.py +176 -0
- bioguider/agents/agent_utils.py +504 -0
- bioguider/agents/collection_execute_step.py +182 -0
- bioguider/agents/collection_observe_step.py +125 -0
- bioguider/agents/collection_plan_step.py +156 -0
- bioguider/agents/collection_task.py +184 -0
- bioguider/agents/collection_task_utils.py +142 -0
- bioguider/agents/common_agent.py +137 -0
- bioguider/agents/common_agent_2step.py +215 -0
- bioguider/agents/common_conversation.py +61 -0
- bioguider/agents/common_step.py +85 -0
- bioguider/agents/consistency_collection_step.py +102 -0
- bioguider/agents/consistency_evaluation_task.py +57 -0
- bioguider/agents/consistency_evaluation_task_utils.py +14 -0
- bioguider/agents/consistency_observe_step.py +110 -0
- bioguider/agents/consistency_query_step.py +77 -0
- bioguider/agents/dockergeneration_execute_step.py +186 -0
- bioguider/agents/dockergeneration_observe_step.py +154 -0
- bioguider/agents/dockergeneration_plan_step.py +158 -0
- bioguider/agents/dockergeneration_task.py +158 -0
- bioguider/agents/dockergeneration_task_utils.py +220 -0
- bioguider/agents/evaluation_installation_task.py +270 -0
- bioguider/agents/evaluation_readme_task.py +767 -0
- bioguider/agents/evaluation_submission_requirements_task.py +172 -0
- bioguider/agents/evaluation_task.py +206 -0
- bioguider/agents/evaluation_tutorial_task.py +169 -0
- bioguider/agents/evaluation_tutorial_task_prompts.py +187 -0
- bioguider/agents/evaluation_userguide_prompts.py +179 -0
- bioguider/agents/evaluation_userguide_task.py +154 -0
- bioguider/agents/evaluation_utils.py +127 -0
- bioguider/agents/identification_execute_step.py +181 -0
- bioguider/agents/identification_observe_step.py +104 -0
- bioguider/agents/identification_plan_step.py +140 -0
- bioguider/agents/identification_task.py +270 -0
- bioguider/agents/identification_task_utils.py +22 -0
- bioguider/agents/peo_common_step.py +64 -0
- bioguider/agents/prompt_utils.py +253 -0
- bioguider/agents/python_ast_repl_tool.py +69 -0
- bioguider/agents/rag_collection_task.py +130 -0
- bioguider/conversation.py +67 -0
- bioguider/database/code_structure_db.py +500 -0
- bioguider/database/summarized_file_db.py +146 -0
- bioguider/generation/__init__.py +39 -0
- bioguider/generation/benchmark_metrics.py +610 -0
- bioguider/generation/change_planner.py +189 -0
- bioguider/generation/document_renderer.py +157 -0
- bioguider/generation/llm_cleaner.py +67 -0
- bioguider/generation/llm_content_generator.py +1128 -0
- bioguider/generation/llm_injector.py +809 -0
- bioguider/generation/models.py +85 -0
- bioguider/generation/output_manager.py +74 -0
- bioguider/generation/repo_reader.py +37 -0
- bioguider/generation/report_loader.py +166 -0
- bioguider/generation/style_analyzer.py +36 -0
- bioguider/generation/suggestion_extractor.py +436 -0
- bioguider/generation/test_metrics.py +189 -0
- bioguider/managers/benchmark_manager.py +785 -0
- bioguider/managers/evaluation_manager.py +215 -0
- bioguider/managers/generation_manager.py +686 -0
- bioguider/managers/generation_test_manager.py +107 -0
- bioguider/managers/generation_test_manager_v2.py +525 -0
- bioguider/rag/__init__.py +0 -0
- bioguider/rag/config.py +117 -0
- bioguider/rag/data_pipeline.py +651 -0
- bioguider/rag/embedder.py +24 -0
- bioguider/rag/rag.py +138 -0
- bioguider/settings.py +103 -0
- bioguider/utils/code_structure_builder.py +59 -0
- bioguider/utils/constants.py +135 -0
- bioguider/utils/default.gitignore +140 -0
- bioguider/utils/file_utils.py +215 -0
- bioguider/utils/gitignore_checker.py +175 -0
- bioguider/utils/notebook_utils.py +117 -0
- bioguider/utils/pyphen_utils.py +73 -0
- bioguider/utils/python_file_handler.py +65 -0
- bioguider/utils/r_file_handler.py +551 -0
- bioguider/utils/utils.py +163 -0
- bioguider-0.2.52.dist-info/LICENSE +21 -0
- bioguider-0.2.52.dist-info/METADATA +51 -0
- bioguider-0.2.52.dist-info/RECORD +84 -0
- bioguider-0.2.52.dist-info/WHEEL +4 -0
|
@@ -0,0 +1,187 @@
|
|
|
1
|
+
INDIVIDUAL_TUTORIAL_EVALUATION_SYSTEM_PROMPT = """
|
|
2
|
+
|
|
3
|
+
You are an expert in evaluating the quality of tutorials in software repositories.
|
|
4
|
+
Your task is to analyze the provided tutorial file and generate a structured quality assessment based on the following criteria.
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
### **Evaluation Criteria**
|
|
8
|
+
|
|
9
|
+
1. **Readability** AND **Error Detection**:
|
|
10
|
+
* **Flesch Reading Ease**: `{flesch_reading_ease}` (A higher score is better, with 60-70 being easily understood by most adults).
|
|
11
|
+
* **Flesch-Kincaid Grade Level**: `{flesch_kincaid_grade}` (Represents the US school-grade level needed to understand the text).
|
|
12
|
+
* **Gunning Fog Index**: `{gunning_fog_index}` (A score above 12 is generally considered too hard for most people).
|
|
13
|
+
* **SMOG Index**: `{smog_index}` (Estimates the years of education needed to understand the text).
|
|
14
|
+
* **Assessment**: Based on these scores, evaluate the overall readability and technical complexity of the language used.
|
|
15
|
+
* **CRITICAL - Error Detection**: You MUST scan for and identify ALL error INSTANCES (not just types):
|
|
16
|
+
- **Typos and spelling errors**: Misspelled words, truncated words (e.g., "analysi" → "analysis", "exampl" → "example")
|
|
17
|
+
* If the SAME typo appears multiple times, LIST EACH OCCURRENCE separately
|
|
18
|
+
- **Malformed links**: URLs missing colons (e.g., "https//..." should be "https://...")
|
|
19
|
+
* Check EVERY link/URL in the document
|
|
20
|
+
- **Markdown/RMarkdown syntax errors**:
|
|
21
|
+
* Missing code fence markers (e.g., missing opening ```)
|
|
22
|
+
* Headers without spaces
|
|
23
|
+
* Broken R chunk syntax (e.g., missing {{r or }})
|
|
24
|
+
* Check ALL code blocks and headers
|
|
25
|
+
- **Bio/domain term errors**: Wrong scientific terms (e.g., "single sell" → "single cell", "genomis" → "genomics")
|
|
26
|
+
* Pay special attention to biology/bioinformatics terminology
|
|
27
|
+
- **Function name errors**: Misspelled function/API names (e.g., "Dat()" → "Date()")
|
|
28
|
+
* Check ALL function calls in code blocks
|
|
29
|
+
- **Inline code formatting**: Missing backticks around code elements
|
|
30
|
+
* Check that all code references use proper backtick formatting
|
|
31
|
+
- **ANY OTHER ANOMALIES**: Trust your judgment - if something looks wrong, report it
|
|
32
|
+
* **IMPORTANT**: Report EVERY INDIVIDUAL ERROR INSTANCE
|
|
33
|
+
- If "analysi" appears 4 times, report it 4 times (with line references if possible)
|
|
34
|
+
- If 5 URLs are malformed, report all 5 individually
|
|
35
|
+
- Do NOT group similar errors - LIST EACH ONE SEPARATELY
|
|
36
|
+
- **NEVER use phrases like**: "multiple occurrences", "and elsewhere", "several instances"
|
|
37
|
+
- **INSTEAD**: List each occurrence as a separate numbered error
|
|
38
|
+
- **Do NOT** make up errors - only report errors that are actually present in the text
|
|
39
|
+
* **Grade Level** (based on TOTAL error instances, not types):
|
|
40
|
+
- **85-100**: The documentation is exceptionally clear, polished, engaging, and ERROR-FREE (0 errors).
|
|
41
|
+
- **65-84**: The documentation is clear with only minor errors (1-5 total error instances).
|
|
42
|
+
- **45-64**: The documentation has noticeable errors (6-15 total error instances).
|
|
43
|
+
- **0-44**: The documentation has numerous errors (16+ total error instances) making it unprofessional.
|
|
44
|
+
- **Note**: Count EVERY instance - if "analysi" appears 4 times, that's 4 errors, not 1.
|
|
45
|
+
|
|
46
|
+
2. **Coverage**:
|
|
47
|
+
* **Assessment**: [Your evaluation of whether it covers all major steps needed to get started, and dependencies, prerequisites, setup steps, and example usage.]
|
|
48
|
+
* **Improvement Suggestions**: please be as specific as possible.
|
|
49
|
+
* **Original text:** [Quote a specific line/section from the tutorial.]
|
|
50
|
+
* **Improving comments:** [Provide your suggestions to improve clarity.]
|
|
51
|
+
* **Grade Level**:
|
|
52
|
+
- **85-100**: The documentation covers all major steps needed to get started, and dependencies, prerequisites, setup steps, and example usage.
|
|
53
|
+
- **65-84**: The documentation covers most of the major steps needed to get started, and dependencies, prerequisites, setup steps, and example usage.
|
|
54
|
+
- **45-64**: The documentation covers some of the major steps needed to get started, and dependencies, prerequisites, setup steps, and example usage.
|
|
55
|
+
- **0-44**: The documentation does not cover any of the major steps needed to get started, and dependencies, prerequisites, setup steps, and example usage.
|
|
56
|
+
|
|
57
|
+
3. **Reproducibility**:
|
|
58
|
+
* **Assessment**: [Your evaluation of whether it provides a clear **description** of reproducibility]
|
|
59
|
+
* **Improvement Suggestions**: please be as specific as possible.
|
|
60
|
+
* **Original text:** [Quote a specific line/section from the tutorial.]
|
|
61
|
+
* **Improving comments:** [Provide your suggestions to improve clarity.]
|
|
62
|
+
* **Grade Level**:
|
|
63
|
+
- **85-100**: The documentation provides a clear and comprehensive guide to the tutorial, with all necessary steps and information provided.
|
|
64
|
+
- **65-84**: The documentation provides a clear and comprehensive guide to the tutorial, with most necessary steps and information provided.
|
|
65
|
+
- **45-64**: The documentation provides a clear and comprehensive guide to the tutorial, with some necessary steps and information provided.
|
|
66
|
+
- **0-44**: The documentation does not provide a clear and comprehensive guide to the tutorial, with no necessary steps and information provided.
|
|
67
|
+
|
|
68
|
+
4. **Structure & Navigation**:
|
|
69
|
+
* **Assessment**: [Your evaluation of whether it provides logical sections (e.g., intro -> setup -> steps -> results -> next), TOC/anchors, estimated time, etc.]
|
|
70
|
+
* **Improvement Suggestions**: please be as specific as possible.
|
|
71
|
+
* **Original text:** [Quote a specific line/section from the tutorial.]
|
|
72
|
+
* **Improving comments:** [Provide your suggestions to improve clarity.]
|
|
73
|
+
* **Grade Level**:
|
|
74
|
+
- **85-100**: The documentation provides a clear and comprehensive guide to the tutorial, with all necessary steps and information provided.
|
|
75
|
+
- **65-84**: The documentation provides a clear and comprehensive guide to the tutorial, with most necessary steps and information provided.
|
|
76
|
+
- **45-64**: The documentation provides a clear and comprehensive guide to the tutorial, with some necessary steps and information provided.
|
|
77
|
+
- **0-44**: The documentation does not provide a clear and comprehensive guide to the tutorial, with no necessary steps and information provided.
|
|
78
|
+
|
|
79
|
+
5. **Executable Code Quality**:
|
|
80
|
+
* **Assessment**: [Your evaluation on whether the code snippets are executable and functional, idiomatic, no hard-coded paths, etc.]
|
|
81
|
+
* **Improvement Suggestions**: please be as specific as possible.
|
|
82
|
+
* **Original text:** [Quote a specific line/section from the tutorial.]
|
|
83
|
+
* **Improving comments:** [Provide your suggestions to improve clarity.]
|
|
84
|
+
* **Grade Level**:
|
|
85
|
+
- **85-100**: The documentation provides a clear and comprehensive guide to the tutorial, with all necessary steps and information provided.
|
|
86
|
+
- **65-84**: The documentation provides a clear and comprehensive guide to the tutorial, with most necessary steps and information provided.
|
|
87
|
+
- **45-64**: The documentation provides a clear and comprehensive guide to the tutorial, with some necessary steps and information provided.
|
|
88
|
+
- **0-44**: The documentation does not provide a clear and comprehensive guide to the tutorial, with no necessary steps and information provided.
|
|
89
|
+
|
|
90
|
+
6. **Result Verification**:
|
|
91
|
+
* **Assessment**: [Your evaluation on expected outputs shown (figures/tables/metrics), acceptance criteria, etc.]
|
|
92
|
+
* **Improvement Suggestions**: please be as specific as possible.
|
|
93
|
+
* **Original text:** [Quote a specific line/section from the tutorial.]
|
|
94
|
+
* **Improving comments:** [Provide your suggestions to improve clarity.]
|
|
95
|
+
* **Grade Level**:
|
|
96
|
+
- **85-100**: The documentation provides a clear and comprehensive guide to the tutorial, with all necessary steps and information provided.
|
|
97
|
+
- **65-84**: The documentation provides a clear and comprehensive guide to the tutorial, with most necessary steps and information provided.
|
|
98
|
+
- **45-64**: The documentation provides a clear and comprehensive guide to the tutorial, with some necessary steps and information provided.
|
|
99
|
+
- **0-44**: The documentation does not provide a clear and comprehensive guide to the tutorial, with no necessary steps and information provided.
|
|
100
|
+
|
|
101
|
+
7. **Performance & Resource Notes**:
|
|
102
|
+
* **Assessment**: [Your evaluation on performance and resource notes, e.g., CPU/GPU usage, memory usage, runtime estimates, small "lite" path provided.]
|
|
103
|
+
* **Improvement Suggestions**: please be as specific as possible.
|
|
104
|
+
* **Original text:** [Quote a specific line/section from the tutorial.]
|
|
105
|
+
* **Improving comments:** [Provide your suggestions to improve clarity.]
|
|
106
|
+
* **Grade Level**:
|
|
107
|
+
- **85-100**: The documentation provides a clear and comprehensive guide to the tutorial, with all necessary steps and information provided.
|
|
108
|
+
- **65-84**: The documentation provides a clear and comprehensive guide to the tutorial, with most necessary steps and information provided.
|
|
109
|
+
- **45-64**: The documentation provides a clear and comprehensive guide to the tutorial, with some necessary steps and information provided.
|
|
110
|
+
- **0-44**: The documentation does not provide a clear and comprehensive guide to the tutorial, with no necessary steps and information provided.
|
|
111
|
+
|
|
112
|
+
---
|
|
113
|
+
|
|
114
|
+
### **Final Report Output**
|
|
115
|
+
Your final report must **exactly match** the following format. Do not add or omit any sections.
|
|
116
|
+
|
|
117
|
+
**FinalAnswer**
|
|
118
|
+
* **Overall Score:** [a number between 0 and 100 representing the overall quality rating.]
|
|
119
|
+
* **Overall Key Strengths**: <brief summary of the Tutorial's strongest points in 2-3 sentences>
|
|
120
|
+
|
|
121
|
+
* **Readability Score:** [a number between 0 and 100 representing the overall quality rating.]
|
|
122
|
+
* **Readability Error Count:** [TOTAL number of error INSTANCES found - count each occurrence]
|
|
123
|
+
* **Readability Errors Found:** [List of ALL individual error instances]
|
|
124
|
+
- **CRITICAL**: List EVERY INDIVIDUAL ERROR INSTANCE (not grouped)
|
|
125
|
+
- **WRONG EXAMPLE** (Do NOT do this):
|
|
126
|
+
❌ "TYPO: 'analysi' → 'analysis' - appears in multiple locations"
|
|
127
|
+
❌ "LINKS: Several URLs missing colons"
|
|
128
|
+
- **CORRECT EXAMPLE** (Do this instead):
|
|
129
|
+
✅ "TYPO: 'analysi' → 'analysis' - in section 'Perform DE analysi...'"
|
|
130
|
+
✅ "TYPO: 'analysi' → 'analysis' - in paragraph 'The analysi shows...'"
|
|
131
|
+
✅ "TYPO: 'analysi' → 'analysis' - in code comment 'Run analysi...'"
|
|
132
|
+
✅ "TYPO: 'analysi' → 'analysis' - in heading 'Results of analysi'"
|
|
133
|
+
✅ "LINK: 'https//www.nature.com/articles/nbt.4042' → 'https://www.nature.com/articles/nbt.4042'"
|
|
134
|
+
✅ "LINK: 'https//github.com/satijalab/seurat-data' → 'https://github.com/satijalab/seurat-data'"
|
|
135
|
+
|
|
136
|
+
**Format for each error** (list them ALL individually):
|
|
137
|
+
- **Typos**: "original misspelled text" → "corrected text" (location/context)
|
|
138
|
+
- **Links**: Complete URL → Fixed URL (one entry per link)
|
|
139
|
+
- **Markdown/RMarkdown**: "syntax error" → "correct syntax" (specific location)
|
|
140
|
+
- **Bio terms**: "wrong term" → "correct term" (where it appears)
|
|
141
|
+
- **Function names**: "misspelled function" → "correct function" (which code block)
|
|
142
|
+
- **Inline code**: "missing backticks around X" → "add backticks" (specific variable/function)
|
|
143
|
+
- **Other issues**: describe and provide corrections
|
|
144
|
+
- General readability improvements (sentence structure, clarity, etc.)
|
|
145
|
+
|
|
146
|
+
**Remember**: Each error instance = one separate entry in Readability Errors Found list
|
|
147
|
+
* **Readability Suggestions:** [General non-error readability improvements like sentence structure, clarity, etc.]
|
|
148
|
+
|
|
149
|
+
* **Coverage Score:** [a number between 0 and 100 representing the overall quality rating.]
|
|
150
|
+
* **Coverage Improvement Suggestions:** please be as specific as possible.
|
|
151
|
+
- "Original text snippet 1" - Improving comment 1
|
|
152
|
+
- "Original text snippet 2" - Improving comment 2
|
|
153
|
+
- ...
|
|
154
|
+
* **Reproducibility Score:** [a number between 0 and 100 representing the overall quality rating.]
|
|
155
|
+
* **Reproducibility Improvement Suggestions:** please be as specific as possible.
|
|
156
|
+
- "Original text snippet 1" - Improving comment 1
|
|
157
|
+
- "Original text snippet 2" - Improving comment 2
|
|
158
|
+
- ...
|
|
159
|
+
* **Structure & Navigation Score:** [a number between 0 and 100 representing the overall quality rating.]
|
|
160
|
+
* **Structure & Navigation Improvement Suggestions:** please be as specific as possible.
|
|
161
|
+
- "Original text snippet 1" - Improving comment 1
|
|
162
|
+
- "Original text snippet 2" - Improving comment 2
|
|
163
|
+
- ...
|
|
164
|
+
* **Executable Code Quality Score:** [a number between 0 and 100 representing the overall quality rating.]
|
|
165
|
+
* **Executable Code Quality Improvement Suggestions:** please be as specific as possible.
|
|
166
|
+
- "Original text snippet 1" - Improving comment 1
|
|
167
|
+
- "Original text snippet 2" - Improving comment 2
|
|
168
|
+
- ...
|
|
169
|
+
* **Result Verification Score:** [a number between 0 and 100 representing the overall quality rating.]
|
|
170
|
+
* **Result Verification Improvement Suggestions:** please be as specific as possible.
|
|
171
|
+
- "Original text snippet 1" - Improving comment 1
|
|
172
|
+
- "Original text snippet 2" - Improving comment 2
|
|
173
|
+
- ...
|
|
174
|
+
* **Performance & Resource Notes Score:** [a number between 0 and 100 representing the overall quality rating.]
|
|
175
|
+
* **Performance & Resource Notes Improvement Suggestions:** please be as specific as possible.
|
|
176
|
+
- "Original text snippet 1" - Improving comment 1
|
|
177
|
+
- "Original text snippet 2" - Improving comment 2
|
|
178
|
+
- ...
|
|
179
|
+
|
|
180
|
+
---
|
|
181
|
+
|
|
182
|
+
### **Tutorial File Content:**
|
|
183
|
+
{tutorial_file_content}
|
|
184
|
+
|
|
185
|
+
---
|
|
186
|
+
|
|
187
|
+
"""
|
|
@@ -0,0 +1,179 @@
|
|
|
1
|
+
|
|
2
|
+
INDIVIDUAL_USERGUIDE_EVALUATION_SYSTEM_PROMPT = """
|
|
3
|
+
You are an expert in evaluating the quality of user guides in software repositories.
|
|
4
|
+
Your task is to analyze the provided files related to user guide and generate a structured quality assessment based on the following criteria.
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
### **Evaluation Criteria**
|
|
8
|
+
|
|
9
|
+
1. **Readability** AND **Error Detection**:
|
|
10
|
+
* **Flesch Reading Ease**: `{flesch_reading_ease}` (A higher score is better, with 60-70 being easily understood by most adults).
|
|
11
|
+
* **Flesch-Kincaid Grade Level**: `{flesch_kincaid_grade}` (Represents the US school-grade level needed to understand the text).
|
|
12
|
+
* **Gunning Fog Index**: `{gunning_fog_index}` (A score above 12 is generally considered too hard for most people).
|
|
13
|
+
* **SMOG Index**: `{smog_index}` (Estimates the years of education needed to understand the text).
|
|
14
|
+
* **Assessment**: Based on these scores, evaluate the overall readability and technical complexity of the language used.
|
|
15
|
+
* **CRITICAL - Error Detection**: You MUST scan for and identify ALL error INSTANCES (not just types):
|
|
16
|
+
- **Typos and spelling errors**: Misspelled words, truncated words (e.g., "analysi" → "analysis", "exampl" → "example")
|
|
17
|
+
* If the SAME typo appears multiple times, LIST EACH OCCURRENCE separately
|
|
18
|
+
- **Malformed links**: URLs missing colons (e.g., "https//..." should be "https://...")
|
|
19
|
+
* Check EVERY link/URL in the document
|
|
20
|
+
- **Markdown/RMarkdown syntax errors**:
|
|
21
|
+
* Missing code fence markers (e.g., missing opening ```)
|
|
22
|
+
* Headers without spaces
|
|
23
|
+
* Broken R chunk syntax (e.g., missing {{r or }})
|
|
24
|
+
* Check ALL code blocks and headers
|
|
25
|
+
- **Bio/domain term errors**: Wrong scientific terms (e.g., "single sell" → "single cell", "genomis" → "genomics")
|
|
26
|
+
* Pay special attention to biology/bioinformatics terminology
|
|
27
|
+
- **Function name errors**: Misspelled function/API names (e.g., "Dat()" → "Date()")
|
|
28
|
+
* Check ALL function calls in code blocks
|
|
29
|
+
- **Inline code formatting**: Missing backticks around code elements
|
|
30
|
+
* Check that all code references use proper backtick formatting
|
|
31
|
+
- **ANY OTHER ANOMALIES**: Trust your judgment - if something looks wrong, report it
|
|
32
|
+
* **IMPORTANT**: Report EVERY INDIVIDUAL ERROR INSTANCE
|
|
33
|
+
- If "analysi" appears 4 times, report it 4 times (with line references if possible)
|
|
34
|
+
- If 5 URLs are malformed, report all 5 individually
|
|
35
|
+
- Do NOT group similar errors - LIST EACH ONE SEPARATELY
|
|
36
|
+
- **NEVER use phrases like**: "multiple occurrences", "and elsewhere", "several instances"
|
|
37
|
+
- **INSTEAD**: List each occurrence as a separate numbered error
|
|
38
|
+
- **Do NOT** make up errors - only report errors that are actually present in the text
|
|
39
|
+
* **Grade Level**:
|
|
40
|
+
- **85-100**: The user guide is exceptionally clear, polished, and engaging. It reads smoothly, with minimal effort required from the reader.
|
|
41
|
+
- **65-84**: The user guide is clear and easy to understand, with a natural flow and minimal jargon.
|
|
42
|
+
- **45-64**: The user guide is somewhat clear, but could benefit from more polish and consistency.
|
|
43
|
+
- **0-44**: The user guide is difficult to understand, with unclear language, jargon, or overly complex sentences.
|
|
44
|
+
|
|
45
|
+
2. **Arguments and Clarity**:
|
|
46
|
+
* **Assessment**: [Your evaluation of whether it provides a clear **description** of arguments and their usage]
|
|
47
|
+
* **Improvement Suggestions**: please be as specific as possible.
|
|
48
|
+
* **Original text:** [Quote a specific line/section from the user guide.]
|
|
49
|
+
* **Improving comments:** [Provide your suggestions to improve clarity.]
|
|
50
|
+
* **Grade Level**:
|
|
51
|
+
- **85-100**: The user guide provides a clear and comprehensive guide to the user guide, with all necessary steps and information provided.
|
|
52
|
+
- **65-84**: The user guide provides a clear and comprehensive guide to the user guide, with most necessary steps and information provided.
|
|
53
|
+
- **45-64**: The user guide provides a clear and comprehensive guide to the user guide, with some necessary steps and information provided.
|
|
54
|
+
- **0-44**: The user guide does not provide a clear and comprehensive guide to the user guide, with no necessary steps and information provided.
|
|
55
|
+
|
|
56
|
+
3. **Return Value and Clarity**:
|
|
57
|
+
* **Assessment**: [Your evaluation of whether it provides a clear **description** of return value and its meaning]
|
|
58
|
+
* **Improvement Suggestions**: please be as specific as possible.
|
|
59
|
+
* **Original text:** [Quote a specific line/section from the user guide.]
|
|
60
|
+
* **Improving comments:** [Provide your suggestions to improve clarity.]
|
|
61
|
+
* **Grade Level**:
|
|
62
|
+
- **85-100**: The user guide provides a clear and comprehensive guide to the user guide, with all necessary steps and information provided.
|
|
63
|
+
- **65-84**: The user guide provides a clear and comprehensive guide to the user guide, with most necessary steps and information provided.
|
|
64
|
+
- **45-64**: The user guide provides a clear and comprehensive guide to the user guide, with some necessary steps and information provided.
|
|
65
|
+
- **0-44**: The user guide does not provide a clear and comprehensive guide to the user guide, with no necessary steps and information provided.
|
|
66
|
+
|
|
67
|
+
4. **Context and Purpose**:
|
|
68
|
+
* **Assessment**: [Your evaluation of whether it provides a clear **description** of the context and purpose of the module]
|
|
69
|
+
* **Improvement Suggestions**: please be as specific as possible.
|
|
70
|
+
* **Original text:** [Quote a specific line/section from the user guide.]
|
|
71
|
+
* **Improving comments:** [Provide your suggestions to improve clarity.]
|
|
72
|
+
* **Grade Level**:
|
|
73
|
+
- **85-100**: The user guide provides a clear and comprehensive guide to the user guide, with all necessary steps and information provided.
|
|
74
|
+
- **65-84**: The user guide provides a clear and comprehensive guide to the user guide, with most necessary steps and information provided.
|
|
75
|
+
- **45-64**: The user guide provides a clear and comprehensive guide to the user guide, with some necessary steps and information provided.
|
|
76
|
+
- **0-44**: The user guide does not provide a clear and comprehensive guide to the user guide, with no necessary steps and information provided.
|
|
77
|
+
|
|
78
|
+
5. **Error Handling**:
|
|
79
|
+
* **Assessment**: [Your evaluation of whether it provides a clear **description** of error handling]
|
|
80
|
+
* **Improvement Suggestions**: please be as specific as possible.
|
|
81
|
+
* **Original text:** [Quote a specific line/section from the user guide.]
|
|
82
|
+
* **Improving comments:** [Provide your suggestions to improve clarity.]
|
|
83
|
+
* **Grade Level**:
|
|
84
|
+
- **85-100**: The user guide provides a clear and comprehensive guide to the user guide, with all necessary steps and information provided.
|
|
85
|
+
- **65-84**: The user guide provides a clear and comprehensive guide to the user guide, with most necessary steps and information provided.
|
|
86
|
+
- **45-64**: The user guide provides a clear and comprehensive guide to the user guide, with some necessary steps and information provided.
|
|
87
|
+
- **0-44**: The user guide does not provide a clear and comprehensive guide to the user guide, with no necessary steps and information provided.
|
|
88
|
+
|
|
89
|
+
6. **Usage Examples**:
|
|
90
|
+
* **Assessment**: [Your evaluation of whether it provides a clear **description** of usage examples]
|
|
91
|
+
* **Improvement Suggestions**: please be as specific as possible.
|
|
92
|
+
* **Original text:** [Quote a specific line/section from the user guide.]
|
|
93
|
+
* **Improving comments:** [Provide your suggestions to improve clarity.]
|
|
94
|
+
* **Grade Level**:
|
|
95
|
+
- **85-100**: The user guide provides a clear and comprehensive guide to the user guide, with all necessary steps and information provided.
|
|
96
|
+
- **65-84**: The user guide provides a clear and comprehensive guide to the user guide, with most necessary steps and information provided.
|
|
97
|
+
- **45-64**: The user guide provides a clear and comprehensive guide to the user guide, with some necessary steps and information provided.
|
|
98
|
+
- **0-44**: The user guide does not provide a clear and comprehensive guide to the user guide, with no necessary steps and information provided.
|
|
99
|
+
|
|
100
|
+
7. **Overall Score**: Give an overall quality rating of the User Guide information.
|
|
101
|
+
* Output: `0-44`, `45-64`, `65-84`, or `85-100`
|
|
102
|
+
|
|
103
|
+
---
|
|
104
|
+
|
|
105
|
+
### **Final Report Output**
|
|
106
|
+
Your final report must **exactly match** the following format. Do not add or omit any sections.
|
|
107
|
+
|
|
108
|
+
**FinalAnswer**
|
|
109
|
+
* **Overall Score:** [a number between 0 and 100 representing the overall quality rating.]
|
|
110
|
+
* **Overall Key Strengths**: <brief summary of the User Guide's strongest points in 2-3 sentences>
|
|
111
|
+
|
|
112
|
+
* **Readability Analysis Score:** [a number between 0 and 100 representing the overall quality rating.]
|
|
113
|
+
* **Readability Error Count:** [TOTAL number of error INSTANCES found - count each occurrence]
|
|
114
|
+
* **Readability Errors Found:** [List of ALL individual error instances]
|
|
115
|
+
- **CRITICAL**: List EVERY INDIVIDUAL ERROR INSTANCE (not grouped)
|
|
116
|
+
- **WRONG EXAMPLE** (Do NOT do this):
|
|
117
|
+
❌ "TYPO: 'analysi' → 'analysis' - appears in multiple locations"
|
|
118
|
+
❌ "LINKS: Several URLs missing colons"
|
|
119
|
+
- **CORRECT EXAMPLE** (Do this instead):
|
|
120
|
+
✅ "TYPO: 'analysi' → 'analysis' - in section 'Perform DE analysi...'"
|
|
121
|
+
✅ "TYPO: 'analysi' → 'analysis' - in paragraph 'The analysi shows...'"
|
|
122
|
+
✅ "TYPO: 'analysi' → 'analysis' - in code comment 'Run analysi...'"
|
|
123
|
+
✅ "TYPO: 'analysi' → 'analysis' - in heading 'Results of analysi'"
|
|
124
|
+
✅ "LINK: 'https//www.nature.com/articles/nbt.4042' → 'https://www.nature.com/articles/nbt.4042'"
|
|
125
|
+
✅ "LINK: 'https//github.com/satijalab/seurat-data' → 'https://github.com/satijalab/seurat-data'"
|
|
126
|
+
|
|
127
|
+
**Format for each error** (list them ALL individually):
|
|
128
|
+
- **Typos**: "original misspelled text" → "corrected text" (location/context)
|
|
129
|
+
- **Links**: Complete URL → Fixed URL (one entry per link)
|
|
130
|
+
- **Markdown/RMarkdown**: "syntax error" → "correct syntax" (specific location)
|
|
131
|
+
- **Bio terms**: "wrong term" → "correct term" (where it appears)
|
|
132
|
+
- **Function names**: "misspelled function" → "correct function" (which code block)
|
|
133
|
+
- **Inline code**: "missing backticks around X" → "add backticks" (specific variable/function)
|
|
134
|
+
- **Other issues**: describe and provide corrections
|
|
135
|
+
- General readability improvements (sentence structure, clarity, etc.)
|
|
136
|
+
|
|
137
|
+
**Remember**: Each error instance = one separate entry in Readability Errors Found list
|
|
138
|
+
* **Readability Suggestions:** [General non-error readability improvements like sentence structure, clarity, etc.]
|
|
139
|
+
* **Arguments and Clarity Score:** [a number between 0 and 100 representing the overall quality rating.]
|
|
140
|
+
* **Arguments and Clarity Key Strengths**: <brief summary of the User Guide's strongest points in 2-3 sentences>
|
|
141
|
+
* **Arguments and Clarity Improvement Suggestions:** please be as specific as possible.
|
|
142
|
+
- "Original text snippet 1" - Improving comment 1
|
|
143
|
+
- "Original text snippet 2" - Improving comment 2
|
|
144
|
+
- ...
|
|
145
|
+
* **Return Value and Clarity Score:** [a number between 0 and 100 representing the overall quality rating.]
|
|
146
|
+
* **Return Value and Clarity Key Strengths**: <brief summary of the User Guide's strongest points in 2-3 sentences>
|
|
147
|
+
* **Return Value and Clarity Improvement Suggestions:** please be as specific as possible.
|
|
148
|
+
- "Original text snippet 1" - Improving comment 1
|
|
149
|
+
- "Original text snippet 2" - Improving comment 2
|
|
150
|
+
- ...
|
|
151
|
+
* **Context and Purpose Score:** [a number between 0 and 100 representing the overall quality rating.]
|
|
152
|
+
* **Context and Purpose Key Strengths**: <brief summary of the User Guide's strongest points in 2-3 sentences>
|
|
153
|
+
* **Context and Purpose Improvement Suggestions:** please be as specific as possible.
|
|
154
|
+
- "Original text snippet 1" - Improving comment 1
|
|
155
|
+
- "Original text snippet 2" - Improving comment 2
|
|
156
|
+
- ...
|
|
157
|
+
* **Error Handling Score:** [a number between 0 and 100 representing the overall quality rating.]
|
|
158
|
+
* **Error Handling Key Strengths**: <brief summary of the User Guide's strongest points in 2-3 sentences>
|
|
159
|
+
* **Error Handling Improvement Suggestions:** please be as specific as possible.
|
|
160
|
+
- "Original text snippet 1" - Improving comment 1
|
|
161
|
+
- "Original text snippet 2" - Improving comment 2
|
|
162
|
+
- ...
|
|
163
|
+
* **Usage Examples Score:** [a number between 0 and 100 representing the overall quality rating.]
|
|
164
|
+
* **Usage Examples Key Strengths**: <brief summary of the User Guide's strongest points in 2-3 sentences>
|
|
165
|
+
* **Usage Examples Improvement Suggestions:** please be as specific as possible.
|
|
166
|
+
- "Original text snippet 1" - Improving comment 1
|
|
167
|
+
- "Original text snippet 2" - Improving comment 2
|
|
168
|
+
- ...
|
|
169
|
+
...
|
|
170
|
+
|
|
171
|
+
---
|
|
172
|
+
|
|
173
|
+
### **User Guide Content:**
|
|
174
|
+
{userguide_content}
|
|
175
|
+
|
|
176
|
+
---
|
|
177
|
+
|
|
178
|
+
"""
|
|
179
|
+
|
|
@@ -0,0 +1,154 @@
|
|
|
1
|
+
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
import logging
|
|
4
|
+
from typing import Optional
|
|
5
|
+
from langchain.prompts import ChatPromptTemplate
|
|
6
|
+
from pydantic import BaseModel, Field
|
|
7
|
+
|
|
8
|
+
from bioguider.agents.collection_task import CollectionTask
|
|
9
|
+
from bioguider.agents.consistency_evaluation_task import ConsistencyEvaluationResult
|
|
10
|
+
from bioguider.agents.prompt_utils import CollectionGoalItemEnum
|
|
11
|
+
from bioguider.utils.constants import (
|
|
12
|
+
DEFAULT_TOKEN_USAGE,
|
|
13
|
+
)
|
|
14
|
+
from bioguider.utils.file_utils import flatten_files
|
|
15
|
+
from .evaluation_utils import (
|
|
16
|
+
compute_readability_metrics,
|
|
17
|
+
default_consistency_result,
|
|
18
|
+
evaluate_consistency_on_content,
|
|
19
|
+
normalize_evaluation_content,
|
|
20
|
+
run_llm_evaluation,
|
|
21
|
+
sanitize_files,
|
|
22
|
+
)
|
|
23
|
+
|
|
24
|
+
from .evaluation_task import EvaluationTask
|
|
25
|
+
from bioguider.utils.utils import get_overall_score, increase_token_usage
|
|
26
|
+
from .evaluation_userguide_prompts import INDIVIDUAL_USERGUIDE_EVALUATION_SYSTEM_PROMPT
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class UserGuideEvaluationResult(BaseModel):
    """Structured LLM output for one user-guide quality evaluation.

    All score fields are integers in [0, 100]; suggestion fields carry
    free-form reviewer comments produced by the model.
    """

    # Aggregate rating; recomputed downstream as a weighted mean of sub-scores.
    overall_score: int = Field(description="A number between 0 and 100 representing the overall quality rating.")
    overall_key_strengths: str = Field(description="A string value, the key strengths of the user guide")

    # Readability sub-criterion.
    readability_score: int = Field(description="A number between 0 and 100 representing the readability quality rating.")
    readability_error_count: Optional[int] = Field(default=0, description="Total number of ERROR INSTANCES found (count every occurrence, not types)")
    readability_errors_found: list[str] = Field(default_factory=list, description="List of ALL individual error instances with format: 'ERROR_TYPE: original → corrected - location'")
    readability_suggestions: list[str] = Field(description="A list of string values, suggestions to improve readability if necessary")

    # Context-and-purpose sub-criterion.
    context_and_purpose_score: int = Field(description="A number between 0 and 100 representing the context and purpose quality rating.")
    context_and_purpose_suggestions: list[str] = Field(description="A list of string values, suggestions to improve context and purpose if necessary")

    # Error-handling sub-criterion.
    error_handling_score: int = Field(description="A number between 0 and 100 representing the error handling quality rating.")
    error_handling_suggestions: list[str] = Field(description="A list of string values, suggestions to improve error handling if necessary")
|
|
41
|
+
|
|
42
|
+
class IndividualUserGuideEvaluationResult(BaseModel):
    """Bundle of the two evaluation results produced for one documentation file."""

    # LLM-based quality assessment; None when the file could not be evaluated.
    user_guide_evaluation: UserGuideEvaluationResult | None = Field(description="The evaluation result of the user guide")
    # Code/documentation consistency assessment; None when unavailable.
    consistency_evaluation: ConsistencyEvaluationResult | None = Field(description="The evaluation result of the consistency of the user guide")
|
|
45
|
+
|
|
46
|
+
# Module-level logger for this evaluation task.
logger = logging.getLogger(__name__)

# Maximum documentation file size (in bytes) accepted for evaluation.
MAX_FILE_SIZE = 1024 * 100  # 100K
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
class EvaluationUserGuideTask(EvaluationTask):
    """Evaluate user guide / API documentation files of a repository.

    For every collected documentation file the task runs two assessments:
      1. an LLM evaluation against individual user-guide criteria
         (readability, context & purpose, error handling), and
      2. a consistency check of the documentation against the code
         structure database (when one is available).
    """

    def __init__(
        self,
        llm,
        repo_path,
        gitignore_path,
        meta_data=None,
        step_callback=None,
        summarized_files_db=None,
        code_structure_db=None,
        collected_files: list[str] | None = None,
    ):
        super().__init__(llm, repo_path, gitignore_path, meta_data, step_callback, summarized_files_db)
        self.evaluation_name = "User Guide Evaluation"
        self.code_structure_db = code_structure_db
        # Optional pre-collected file list; when provided, _collect_files skips
        # the expensive CollectionTask run.
        self.collected_files = collected_files

    def _collect_files(self):
        """Collect and sanitize the user-guide files to evaluate.

        Returns a list of file paths relative to ``self.repo_path``.
        """
        if self.collected_files is not None:
            return self.collected_files

        task = CollectionTask(
            llm=self.llm,
            step_callback=self.step_callback,
            summarized_files_db=self.summarized_files_db,
        )
        # NOTE(review): derives the gitignore path from repo_path instead of
        # using the gitignore_path passed to __init__ — confirm intentional.
        task.compile(
            repo_path=self.repo_path,
            gitignore_path=Path(self.repo_path, ".gitignore"),
            goal_item=CollectionGoalItemEnum.UserGuide.name,
        )
        files = task.collect()
        files = flatten_files(self.repo_path, files)
        # Drop missing, binary, and over-sized files.
        files = sanitize_files(self.repo_path, files, max_size_bytes=MAX_FILE_SIZE)
        return files

    def _evaluate_individual_userguide(self, file: str) -> tuple[IndividualUserGuideEvaluationResult | None, dict]:
        """Evaluate a single documentation file.

        Returns ``(result, token_usage)``; ``result`` is None when the file
        cannot be read.
        """
        content, readability_content = normalize_evaluation_content(
            self.repo_path, file
        )
        if content is None or readability_content is None:
            logger.error(f"Error in reading file {file}")
            return None, {**DEFAULT_TOKEN_USAGE}

        # Evaluate general criteria, injecting readability metrics into the prompt.
        flesch_reading_ease, flesch_kincaid_grade, gunning_fog_index, smog_index = \
            compute_readability_metrics(readability_content)
        system_prompt = ChatPromptTemplate.from_template(
            INDIVIDUAL_USERGUIDE_EVALUATION_SYSTEM_PROMPT
        ).format(
            flesch_reading_ease=flesch_reading_ease,
            flesch_kincaid_grade=flesch_kincaid_grade,
            gunning_fog_index=gunning_fog_index,
            smog_index=smog_index,
            userguide_content=readability_content,
        )
        res, token_usage, _reasoning_process = run_llm_evaluation(
            llm=self.llm,
            system_prompt=system_prompt,
            instruction_prompt="Now, let's begin the user guide/API documentation evaluation.",
            schema=UserGuideEvaluationResult,
        )
        res: UserGuideEvaluationResult = res

        # Evaluate consistency against the code structure database.
        consistency_evaluation_result, consistency_token_usage = evaluate_consistency_on_content(
            llm=self.llm,
            code_structure_db=self.code_structure_db,
            step_callback=self.step_callback,
            domain="user guide/API",
            content=content,
        )
        # FIX: the consistency step's token usage was previously discarded;
        # fold it into the usage reported to the caller.
        token_usage = increase_token_usage(token_usage, consistency_token_usage)
        if consistency_evaluation_result is None:
            # No sufficient information to evaluate the consistency of the
            # user guide/API documentation — fall back to a neutral default.
            consistency_evaluation_result = default_consistency_result("user guide/API")

        # Weighted overall score: consistency counts double the other criteria.
        res.overall_score = get_overall_score(
            [
                consistency_evaluation_result.score,
                res.readability_score,
                res.context_and_purpose_score,
                res.error_handling_score,
            ],
            [2, 1, 1, 1],
        )

        return IndividualUserGuideEvaluationResult(
            user_guide_evaluation=res,
            consistency_evaluation=consistency_evaluation_result,
        ), token_usage

    def _evaluate(self, files: list[str] | None = None) -> tuple[dict[str, IndividualUserGuideEvaluationResult] | None, dict, list[str]]:
        """Evaluate all given documentation files, skipping source-code files.

        Returns ``(results_by_file, total_token_usage, evaluated_files)``.
        """
        total_token_usage = {**DEFAULT_TOKEN_USAGE}
        user_guide_evaluation_results = {}
        files = flatten_files(self.repo_path, files)
        for file in files:
            # Source files can be collected alongside docs but are not user guides.
            if file.endswith(".py") or file.endswith(".R"):
                continue
            user_guide_evaluation_result, token_usage = self._evaluate_individual_userguide(file)
            total_token_usage = increase_token_usage(total_token_usage, token_usage)
            user_guide_evaluation_results[file] = user_guide_evaluation_result

        return user_guide_evaluation_results, total_token_usage, files
|
|
@@ -0,0 +1,127 @@
|
|
|
1
|
+
import json
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from typing import Iterable, Tuple
|
|
4
|
+
|
|
5
|
+
from bioguider.agents.agent_utils import read_file
|
|
6
|
+
from bioguider.agents.consistency_evaluation_task import (
|
|
7
|
+
ConsistencyEvaluationResult,
|
|
8
|
+
ConsistencyEvaluationTask,
|
|
9
|
+
)
|
|
10
|
+
from bioguider.utils.constants import DEFAULT_TOKEN_USAGE
|
|
11
|
+
from bioguider.utils.file_utils import detect_file_type
|
|
12
|
+
from bioguider.utils.notebook_utils import (
|
|
13
|
+
extract_markdown_from_notebook,
|
|
14
|
+
strip_notebook_to_code_and_markdown,
|
|
15
|
+
)
|
|
16
|
+
from bioguider.utils.pyphen_utils import PyphenReadability
|
|
17
|
+
from bioguider.utils.utils import convert_html_to_text
|
|
18
|
+
from .common_agent_2step import CommonAgentTwoChainSteps, CommonAgentTwoSteps
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _escape_template_braces(text: str) -> str:
|
|
22
|
+
return text.replace("{", "<<").replace("}", ">>")
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def sanitize_files(
    repo_path: str,
    files: Iterable[str],
    max_size_bytes: int,
    disallowed_exts: set[str] | None = None,
    check_ipynb_size: bool = False,
) -> list[str]:
    """Filter out files that should not be evaluated.

    Drops entries that are missing, not regular files, detected as binary,
    carry a disallowed extension, or exceed *max_size_bytes*. Notebooks
    (``.ipynb``) are exempt from the size check unless *check_ipynb_size*
    is set.
    """
    def _keep(path: Path) -> bool:
        # One guard per rejection reason, mirroring the filter order.
        if not (path.exists() and path.is_file()):
            return False
        if detect_file_type(path) == "binary":
            return False
        suffix = path.suffix.lower()
        if disallowed_exts and suffix in disallowed_exts:
            return False
        size_matters = suffix != ".ipynb" or check_ipynb_size
        if size_matters and path.stat().st_size > max_size_bytes:
            return False
        return True

    return [file for file in files if _keep(Path(repo_path, file))]
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def normalize_evaluation_content(
    repo_path: str,
    file: str,
) -> Tuple[str | None, str | None]:
    """Load *file* and return ``(llm_content, readability_content)``.

    Notebooks are reduced to code+markdown JSON for the LLM and to their
    markdown cells for readability metrics; HTML is converted to plain
    text. Other files are returned as-is for both uses. Returns
    ``(None, None)`` when the file cannot be read.
    """
    file_path = Path(repo_path, file)
    raw = read_file(file_path)
    if raw is None:
        return None, None

    suffix = file_path.suffix.lower()
    if suffix == ".ipynb":
        markdown_text = extract_markdown_from_notebook(file_path)
        stripped = json.dumps(strip_notebook_to_code_and_markdown(file_path))
        # Escape braces so the content can be embedded in a prompt template.
        return _escape_template_braces(stripped), markdown_text
    if suffix in {".html", ".htm"}:
        plain_text = convert_html_to_text(file_path)
        return _escape_template_braces(plain_text), plain_text
    return raw, raw
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def compute_readability_metrics(
    content: str,
) -> Tuple[float, float, float, float]:
    """Return (flesch_reading_ease, flesch_kincaid_grade, gunning_fog_index, smog_index).

    ``readability_metrics`` yields additional trailing values that are
    intentionally discarded here.
    """
    flesch_reading_ease, flesch_kincaid_grade, gunning_fog_index, smog_index, *_rest = \
        PyphenReadability().readability_metrics(content)
    return flesch_reading_ease, flesch_kincaid_grade, gunning_fog_index, smog_index
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def evaluate_consistency_on_content(
    llm,
    code_structure_db,
    step_callback,
    domain: str,
    content: str,
) -> Tuple[ConsistencyEvaluationResult | None, dict]:
    """Run the consistency evaluation task on *content*.

    Returns ``(result, token_usage)``. ``result`` is None when no code
    structure database is available to evaluate against.
    """
    if code_structure_db is None:
        return None, dict(DEFAULT_TOKEN_USAGE)

    task = ConsistencyEvaluationTask(
        llm=llm,
        code_structure_db=code_structure_db,
        step_callback=step_callback,
    )
    result = task.evaluate(domain=domain, documentation=content)
    return result, dict(DEFAULT_TOKEN_USAGE)
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def run_llm_evaluation(
    llm,
    system_prompt: str,
    instruction_prompt: str,
    schema,
    chain: bool = False,
) -> Tuple[object, dict, str | None]:
    """Run a two-step agent evaluation and return (result, token_usage, reasoning).

    When *chain* is true the chained two-step agent is used; otherwise the
    plain two-step agent. The agent's second return value is unused here.
    """
    if chain:
        agent = CommonAgentTwoChainSteps(llm=llm)
    else:
        agent = CommonAgentTwoSteps(llm=llm)
    result, _, token_usage, reasoning = agent.go(
        system_prompt=system_prompt,
        instruction_prompt=instruction_prompt,
        schema=schema,
    )
    return result, token_usage, reasoning
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def default_consistency_result(domain_label: str) -> ConsistencyEvaluationResult:
    """Build the neutral fallback result used when consistency cannot be evaluated."""
    assessment = (
        f"No sufficient information to evaluate the consistency of the {domain_label} documentation"
    )
    return ConsistencyEvaluationResult(
        score=0,
        assessment=assessment,
        development=[],
        strengths=[],
    )
|