roast-ai 0.4.8 → 0.4.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (81) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +1 -0
  3. data/.rubocop.yml +1 -0
  4. data/Gemfile.lock +3 -3
  5. data/README.md +9 -5
  6. data/dsl/less_simple.rb +112 -0
  7. data/dsl/prototype.rb +17 -0
  8. data/dsl/simple.rb +5 -7
  9. data/dsl/step_communication.rb +18 -0
  10. data/examples/grading/README.md +46 -0
  11. data/examples/grading/analyze_coverage/prompt.md +52 -0
  12. data/examples/grading/calculate_final_grade.rb +64 -0
  13. data/examples/grading/format_result.rb +61 -0
  14. data/examples/grading/generate_grades/prompt.md +105 -0
  15. data/examples/grading/generate_recommendations/output.txt +17 -0
  16. data/examples/grading/generate_recommendations/prompt.md +60 -0
  17. data/examples/grading/read_dependencies/prompt.md +15 -0
  18. data/examples/grading/verify_mocks_and_stubs/prompt.md +12 -0
  19. data/examples/grading/verify_test_helpers/prompt.md +53 -0
  20. data/examples/grading/workflow.md +5 -0
  21. data/examples/grading/workflow.yml +28 -0
  22. data/lib/roast/dsl/cog/config.rb +31 -0
  23. data/lib/roast/dsl/cog/stack.rb +21 -0
  24. data/lib/roast/dsl/cog/store.rb +26 -0
  25. data/lib/roast/dsl/cog.rb +70 -0
  26. data/lib/roast/dsl/cog_execution_context.rb +29 -0
  27. data/lib/roast/dsl/cogs/cmd.rb +55 -0
  28. data/lib/roast/dsl/cogs/graph.rb +53 -0
  29. data/lib/roast/dsl/cogs.rb +65 -0
  30. data/lib/roast/dsl/config_context.rb +54 -0
  31. data/lib/roast/dsl/executor.rb +62 -7
  32. data/lib/roast/dsl/workflow_execution_context.rb +47 -0
  33. data/lib/roast/error.rb +7 -0
  34. data/lib/roast/errors.rb +3 -3
  35. data/lib/roast/graph/edge.rb +25 -0
  36. data/lib/roast/graph/node.rb +40 -0
  37. data/lib/roast/graph/quantum_edge.rb +27 -0
  38. data/lib/roast/graph/threaded_exec.rb +93 -0
  39. data/lib/roast/graph.rb +233 -0
  40. data/lib/roast/resources/api_resource.rb +2 -2
  41. data/lib/roast/resources/url_resource.rb +2 -2
  42. data/lib/roast/tools/apply_diff.rb +1 -1
  43. data/lib/roast/tools/ask_user.rb +1 -1
  44. data/lib/roast/tools/bash.rb +1 -1
  45. data/lib/roast/tools/cmd.rb +2 -2
  46. data/lib/roast/tools/coding_agent.rb +2 -2
  47. data/lib/roast/tools/grep.rb +1 -1
  48. data/lib/roast/tools/read_file.rb +1 -1
  49. data/lib/roast/tools/search_file.rb +1 -1
  50. data/lib/roast/tools/swarm.rb +1 -1
  51. data/lib/roast/tools/update_files.rb +2 -2
  52. data/lib/roast/tools/write_file.rb +1 -1
  53. data/lib/roast/tools.rb +1 -1
  54. data/lib/roast/value_objects/api_token.rb +1 -1
  55. data/lib/roast/value_objects/uri_base.rb +1 -1
  56. data/lib/roast/value_objects/workflow_path.rb +1 -1
  57. data/lib/roast/version.rb +1 -1
  58. data/lib/roast/workflow/base_workflow.rb +38 -2
  59. data/lib/roast/workflow/command_executor.rb +1 -1
  60. data/lib/roast/workflow/configuration_loader.rb +1 -1
  61. data/lib/roast/workflow/error_handler.rb +1 -1
  62. data/lib/roast/workflow/step_executor_registry.rb +1 -1
  63. data/lib/roast/workflow/step_loader.rb +1 -1
  64. data/lib/roast/workflow/workflow_executor.rb +1 -1
  65. data/lib/roast.rb +1 -1
  66. data/sorbet/config +2 -0
  67. data/sorbet/rbi/annotations/.gitattributes +1 -0
  68. data/sorbet/rbi/annotations/activesupport.rbi +495 -0
  69. data/sorbet/rbi/annotations/faraday.rbi +17 -0
  70. data/sorbet/rbi/annotations/minitest.rbi +119 -0
  71. data/sorbet/rbi/annotations/mocha.rbi +34 -0
  72. data/sorbet/rbi/annotations/rainbow.rbi +269 -0
  73. data/sorbet/rbi/annotations/webmock.rbi +9 -0
  74. data/sorbet/rbi/gems/rbs-inline@0.12.0.rbi +2170 -0
  75. data/sorbet/rbi/gems/{rexml@3.4.1.rbi → rexml@3.4.2.rbi} +284 -239
  76. data/sorbet/rbi/shims/lib/roast/dsl/config_context.rbi +11 -0
  77. data/sorbet/rbi/shims/lib/roast/dsl/workflow_execution_context.rbi +11 -0
  78. data/sorbet/rbi/todo.rbi +7 -0
  79. metadata +46 -5
  80. data/package-lock.json +0 -6
  81. /data/sorbet/rbi/gems/{rack@2.2.17.rbi → rack@2.2.18.rbi} +0 -0
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: '0019e9658942394301e91581ecc45b259c76f1999f17cc8f1ddf4f97692ea639'
4
- data.tar.gz: feb3d19b4316a2ea2751347a614f816c22cbd53182821eed661488c760a9897a
3
+ metadata.gz: fc01ff6a90dbe17c735b17935bb75bcfda0ee47c0b9dbee6234292cc4f5f3799
4
+ data.tar.gz: caa21e581b5476a65c2dc8b0b7a27f1b20baf2f493260d954785724f73a3ff40
5
5
  SHA512:
6
- metadata.gz: b8c5442247e1e0aae1cb7aa4509a3724fb1867bf9b05b15cfe1da4a8287db1db8229026ad3be66cc450b57650e8dd00d4eca632021680f3320d2bbf8dd19fc89
7
- data.tar.gz: bbf18e07720a9cfe48abc2780f0f2cdba7d091bb3b2103318556910693dc54b09980ac0f2133876a811700345cdb6fa9e32d87cd4622f9039fe1bbbb7ffe3151
6
+ metadata.gz: 6ea65bb2381627119b5740f9353b064d046e26b2769e0be36baf3f13b9696e30e6bd6332fbbf8c4305fea33d1b2bb12d81a4a56871f3793c3718e2ac8407c6ae
7
+ data.tar.gz: 23ff60c01817d64bae76bb16c0395d8cc7aa95cd621550cebc5fc6763fcf5ba8a023542bf07b49067fa9210d8f228f538b0e82159d897397d08532deece73bc4
data/.gitignore CHANGED
@@ -8,6 +8,7 @@
8
8
  /pkg/
9
9
  /spec/reports/
10
10
  /tmp/
11
+ /sig/generated/
11
12
  .CLAUDE.local.md
12
13
  .rspec_status
13
14
  **/.claude/settings.local.json
data/.rubocop.yml CHANGED
@@ -15,6 +15,7 @@ AllCops:
15
15
  - 'bin/*'
16
16
  - 'spec/fixtures/**/*'
17
17
  - 'test/fixtures/**/*'
18
+ - 'examples/**/*'
18
19
  SuggestExtensions: false
19
20
  TargetRubyVersion: 3.4
20
21
  UseCache: true
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- roast-ai (0.4.8)
4
+ roast-ai (0.4.9)
5
5
  activesupport (>= 7.0)
6
6
  cli-kit (~> 5.0)
7
7
  cli-ui (= 2.3.0)
@@ -167,7 +167,7 @@ GEM
167
167
  method_source (~> 1.0)
168
168
  public_suffix (6.0.2)
169
169
  racc (1.8.1)
170
- rack (2.2.17)
170
+ rack (2.2.19)
171
171
  rainbow (3.1.1)
172
172
  raix (1.0.2)
173
173
  activesupport (>= 6.0)
@@ -185,7 +185,7 @@ GEM
185
185
  rbs (3.9.4)
186
186
  logger
187
187
  regexp_parser (2.10.0)
188
- rexml (3.4.1)
188
+ rexml (3.4.2)
189
189
  rubocop (1.77.0)
190
190
  json (~> 2.3)
191
191
  language_server-protocol (~> 3.17.0.2)
data/README.md CHANGED
@@ -374,20 +374,24 @@ Roast supports several types of steps:
374
374
  ```yaml
375
375
  steps:
376
376
  - analyze_code
377
- - get_user_feedback:
377
+ - input:
378
+ name: get_user_feedback
378
379
  prompt: "Should we proceed with the refactoring? (yes/no)"
379
380
  type: confirm
380
- - review_changes:
381
+ - input:
382
+ name: review_changes
381
383
  prompt: "Enter your review comments"
382
384
  type: text
383
- - select_strategy:
385
+ - input:
386
+ name: select_strategy
384
387
  prompt: "Choose optimization strategy"
385
- type: select
388
+ type: choice
386
389
  options:
387
390
  - "Performance optimization"
388
391
  - "Memory optimization"
389
392
  - "Code clarity"
390
- - api_configuration:
393
+ - input:
394
+ name: api_configuration
391
395
  prompt: "Enter API key"
392
396
  type: password
393
397
  ```
@@ -0,0 +1,112 @@
1
+ # typed: false
2
+ # frozen_string_literal: true
3
+
4
+ #### cmd
5
+
6
+ # Passing just the command to execute will run it and return the output
7
+ cmd <<~SHELLSTEP
8
+ echo "raw run without storing, should see me once""
9
+ SHELLSTEP
10
+
11
+ # Passing a name finds or creates an object and returns that
12
+ cmd_cog = cmd(:hello)
13
+ puts "This is our new cmd_cog named ':hello': #{cmd_cog}"
14
+
15
+ # We can set a command to run for later
16
+ cmd(:set_and_run).set("echo 'set_and_run, should see me once'")
17
+ cmd(:set_and_run).run
18
+
19
+ # Similarly, we can run immediately and then re-run later
20
+ cmd(:run_and_rerun).run("echo 'run_and_rerun, should see me twice'")
21
+ cmd(:run_and_rerun).run
22
+
23
+ #### graph
24
+
25
+ # We can open and re-open a graph, and then execute it
26
+ graph(:updatable) do |graph|
27
+ graph.node(:open_cmd) do |state|
28
+ state[:open] = cmd("echo 'From a node added in first open, should see me once'")
29
+ end
30
+ end
31
+
32
+ graph(:updatable) do |graph|
33
+ graph.node(:reopen_cmd) do |state|
34
+ state[:reopen] = cmd("echo 'From a node added in reopen, should see me once'")
35
+ end
36
+ end
37
+
38
+ graph(:updatable).execute
39
+
40
+ # We can also just populate and execute a graph in one go by calling graph.execute in the block.
41
+ graph(:define_and_exec) do |graph|
42
+ graph.node(:hi) do |state|
43
+ state[:hi_msg] = cmd("echo 'hi msg'")
44
+ end
45
+
46
+ graph.execute
47
+ end
48
+
49
+ # We can have subgraphs, because why not
50
+ graph(:outer) do |graph|
51
+ graph.subgraph(:inner) do |subgraph|
52
+ subgraph.node(:inner_node) do |inner_state|
53
+ inner_state[:foo] = cmd("echo 'inner_state foo'")
54
+ end
55
+ end
56
+
57
+ graph.node(:outer) do |outer_state|
58
+ outer_state[:bar] = cmd("echo 'outer_state bar'")
59
+ end
60
+
61
+ graph.execute
62
+ end
63
+
64
+ # We can specify our own edges
65
+ graph(:edges) do |graph|
66
+ graph.node(:thing1) do |state|
67
+ state[:thing1] = cmd("echo 'thing1'")
68
+ end
69
+
70
+ graph.node(:thing2) do |state|
71
+ state[:thing2] = cmd("echo 'thing2'")
72
+ end
73
+
74
+ graph.edge(from: :START, to: :thing1)
75
+ graph.edge(from: :thing1, to: :thing2)
76
+ graph.edge(from: :thing2, to: :DONE)
77
+
78
+ graph.execute
79
+ end
80
+
81
+ # We can have parallel execution
82
+ graph(:parallel) do |graph|
83
+ graph.node(:thing1) do |state|
84
+ state[:thing1] = cmd("sleep 0.5 && echo 'parallel thing1'")
85
+ end
86
+
87
+ graph.node(:thing2) do |state|
88
+ state[:thing2] = cmd("sleep 0.5 && echo 'parallel thing2'")
89
+ end
90
+
91
+ graph.edge(from: :START, to: [:thing1, :thing2])
92
+ graph.edge(from: [:thing1, :thing2], to: :DONE)
93
+
94
+ graph.execute
95
+ end
96
+
97
+ # We can have edges that are defined with a block
98
+ graph(:quantum) do |graph|
99
+ graph.node(:thing1) do |state|
100
+ state[:thing1] = cmd("echo 'quantum thing1'")
101
+ end
102
+
103
+ graph.edge(from: :START) do |_state|
104
+ :thing1
105
+ end
106
+
107
+ graph.edge(from: :thing1) do |_state|
108
+ :DONE
109
+ end
110
+
111
+ graph.execute
112
+ end
data/dsl/prototype.rb ADDED
@@ -0,0 +1,17 @@
1
+ # typed: true
2
+ # frozen_string_literal: true
3
+
4
+ #: self as Roast::DSL::Executor
5
+
6
+ config do
7
+ cmd(:echo) { print_all! }
8
+ end
9
+
10
+ execute do
11
+ # Anonymous cog. Added to the stack directly and given an autogenerated key in cog storage
12
+ # Use for actions you do once and forget about, don't need configuration
13
+ cmd { "touch tmp/#{SecureRandom.uuid} " }
14
+
15
+ # Named cog. Configuration for this specific instance will be looked up from config block
16
+ cmd(:echo) { "echo 'Hello World!'" }
17
+ end
data/dsl/simple.rb CHANGED
@@ -1,10 +1,8 @@
1
- # typed: true
1
+ # typed: false
2
2
  # frozen_string_literal: true
3
3
 
4
- #: self as Roast::DSL::Executor
5
-
6
- # This is a dead simple workflow that calls two shell scripts
7
- shell <<~SHELLSTEP
4
+ # This is a dead simple workflow that calls two commands
5
+ cmd <<~CMDSTEP
8
6
  echo "I have no idea what's going on"
9
- SHELLSTEP
10
- shell "pwd"
7
+ CMDSTEP
8
+ cmd "pwd"
@@ -0,0 +1,18 @@
1
+ # typed: false
2
+ # frozen_string_literal: true
3
+
4
+ # How do we pass information between steps?
5
+ # Demonstrate by passing result of a command output to another step
6
+
7
+ config do
8
+ cmd(:echo) { display! }
9
+ end
10
+
11
+ execute do
12
+ cmd(:ls) { "ls -al" }
13
+ cmd(:echo) do
14
+ # TODO: this is a bespoke output object for cmd, is there a generic one we can offer
15
+ first_line = cmd(:ls).command_output.split("\n").second
16
+ "echo '#{first_line}'"
17
+ end
18
+ end
@@ -0,0 +1,46 @@
1
+ # Test Grading Workflow
2
+
3
+ This workflow acts as a senior software engineer and testing expert to evaluate the quality of test files based on best practices and guidelines.
4
+
5
+ ## Usage
6
+
7
+ ```bash
8
+ # Run the grading workflow on a test file
9
+ roast execute examples/grading/workflow.yml path/to/your_test.rb
10
+ ```
11
+
12
+ ## How it Works
13
+
14
+ 1. **read_dependencies**: Analyzes the test file and its dependencies
15
+ 2. **run_coverage**: Executes the test with coverage tracking
16
+ 3. **generate_grades**: Evaluates test quality across multiple dimensions
17
+ 4. **verify_test_helpers**: Checks for proper test helper usage
18
+ 5. **verify_mocks_and_stubs**: Ensures appropriate use of test doubles
19
+ 6. **analyze_coverage**: Reviews code coverage metrics
20
+ 7. **generate_recommendations**: Provides improvement suggestions
21
+ 8. **calculate_final_grade**: Computes an overall grade (A-F scale)
22
+ 9. **format_result**: Formats the final output
23
+
24
+ ## Customization
25
+
26
+ Feel free to adapt this workflow to your testing environment:
27
+
28
+ - **Different test frameworks**: Modify `run_coverage.rb` to work with RSpec, Jest, pytest, etc.
29
+ - **Coverage tools**: Replace the coverage command with your preferred tool (SimpleCov, Istanbul, Coverage.py)
30
+ - **Grading criteria**: Adjust the prompts in each step to match your team's standards
31
+
32
+ ## Example Output
33
+
34
+ ```
35
+ ========== TEST GRADE REPORT ==========
36
+ Test file: test/example_test.rb
37
+
38
+ FINAL GRADE:
39
+ Score: 85/100
40
+ Letter Grade: B
41
+
42
+ RECOMMENDATIONS:
43
+ - Add edge case testing for error conditions
44
+ - Improve test descriptions for clarity
45
+ - Consider extracting common setup to helper methods
46
+ ```
@@ -0,0 +1,52 @@
1
+ <coverage_results>
2
+ <%= workflow.output["run_coverage"] %>
3
+ </coverage_results>
4
+
5
+ Analyze the results and score them on a scale of 1-10 using the following rubrics:
6
+
7
+ <line_coverage>
8
+ 0-1: Critical failure (0-20% coverage) - Core functionality remains completely untested
9
+ 2-3: Poor coverage (21-40%) - Major gaps; many key functions lack any testing
10
+ 4-5: Inadequate coverage (41-60%) - Several important code paths are not executed
11
+ 6-7: Moderate coverage (61-80%) - Notable gaps remain; some important functionality lacks coverage
12
+ 8-9: Good coverage (81-95%) - Only minor or edge case code paths remain untested
13
+ 10: Excellent coverage (96-100%)
14
+ </line_coverage>
15
+
16
+ <branch_coverage>
17
+ 0-1: Critical failure (0-20% branch coverage) - Almost no conditional branches are tested
18
+ 2-3: Poor coverage (21-40%) - Most conditional logic remains untested
19
+ 4-5: Inadequate coverage (41-60%) - Many conditions are only tested for one outcome
20
+ 6-7: Moderate coverage (61-80%) - Some conditions lack testing for all outcomes
21
+ 8-9: Good coverage (81-95%) - Most conditions are tested for most outcomes
22
+ 10: Excellent coverage (96-100%)
23
+ </branch_coverage>
24
+
25
+ <method_coverage>
26
+ 0-1: Critical failure (0-20% method coverage) - Most or core functionality methods are untested
27
+ 2-3: Poor coverage (21-40%) - Several public API methods remain untested
28
+ 4-5: Inadequate coverage (41-60%) - Some important public methods lack tests
29
+ 6-7: Moderate coverage (61-80%) - Notable gaps remain; some public methods may lack comprehensive testing
30
+ 8-9: Good coverage (81-95%) - Nearly all public methods are tested; private methods are mostly covered via public method tests
31
+ 10: Excellent coverage (96-100%)
32
+ </method_coverage>
33
+
34
+ RESPONSE FORMAT
35
+ You must respond in JSON format within <json> XML tags. Example:
36
+
37
+ <json>
38
+ {
39
+ "method_coverage": {
40
+ "score": "10",
41
+ "justification": "The source file has 100% method coverage, indicating all methods are being tested."
42
+ },
43
+ "line_coverage": {
44
+ "score": 10,
45
+ "justification": "The source file has 100% line coverage, indicating all executable lines are tested."
46
+ },
47
+ "branch_coverage": {
48
+ "score": 8,
49
+ "justification": "The source file has 80% branch coverage, indicating some branches need testing."
50
+ }
51
+ }
52
+ </json>
@@ -0,0 +1,64 @@
1
+ # frozen_string_literal: true
2
+
3
+ class CalculateFinalGrade < Roast::Workflow::BaseStep
4
+ WEIGHTS = {
5
+ test_helpers: 0.2,
6
+ mocks_and_stubs: 0.2,
7
+ readability: 0.2,
8
+ maintainability: 0.2,
9
+ effectiveness: 0.2,
10
+ }.freeze
11
+
12
+ def call
13
+ llm_analysis = workflow.output["generate_grades"]
14
+
15
+ weighted_sum = WEIGHTS.sum do |criterion, weight|
16
+ score = llm_analysis[criterion.to_s]["score"].to_f / 10.0
17
+ score * weight
18
+ end
19
+
20
+ {
21
+ final_score: {
22
+ weighted_score: weighted_sum,
23
+ letter_grade: calculate_letter_grade(weighted_sum),
24
+ },
25
+ rubric_scores: calculate_rubric_scores(llm_analysis),
26
+ }
27
+ end
28
+
29
+ private
30
+
31
+ def calculate_letter_grade(score)
32
+ case score
33
+ when 0.9..1.0
34
+ "A"
35
+ when 0.8...0.9
36
+ "B"
37
+ when 0.7...0.8
38
+ "C"
39
+ when 0.6...0.7
40
+ "D"
41
+ else
42
+ "F"
43
+ end
44
+ end
45
+
46
+ def calculate_rubric_scores(llm_analysis)
47
+ scores = {}
48
+
49
+ WEIGHTS.each_key do |criterion|
50
+ next unless llm_analysis[criterion.to_s]
51
+ raw_score = llm_analysis[criterion.to_s]["score"].to_f
52
+ normalized_score = raw_score / 10.0
53
+
54
+ scores[criterion] = {
55
+ raw_value: raw_score,
56
+ score: normalized_score,
57
+ description: llm_analysis[criterion.to_s]["justification"],
58
+ weighted_score: normalized_score * WEIGHTS[criterion],
59
+ }
60
+ end
61
+
62
+ scores
63
+ end
64
+ end
@@ -0,0 +1,61 @@
1
+ # frozen_string_literal: true
2
+
3
+ class FormatResult < Roast::Workflow::BaseStep
4
+ RUBRIC = {
5
+ test_helpers: { description: "Test Helpers Usage", weight: 0.2 },
6
+ mocks_and_stubs: { description: "Mocks and Stubs Usage", weight: 0.2 },
7
+ readability: { description: "Test Readability", weight: 0.2 },
8
+ maintainability: { description: "Test Maintainability", weight: 0.2 },
9
+ effectiveness: { description: "Test Effectiveness", weight: 0.2 },
10
+ }.freeze
11
+
12
+ def call
13
+ append_to_final_output(<<~OUTPUT)
14
+ ========== TEST GRADE REPORT ==========
15
+ Test file: #{workflow.file}
16
+ OUTPUT
17
+
18
+ format_results
19
+ append_to_final_output("\n\n")
20
+ end
21
+
22
+ private
23
+
24
+ def format_results
25
+ # With HashWithIndifferentAccess, we can simply access with either syntax
26
+ grade_data = workflow.output["calculate_final_grade"]
27
+
28
+ unless grade_data
29
+ return append_to_final_output("Error: Grading data not available. This may be because you're replaying the workflow from this step, but the previous step data is missing or not found in the selected session.")
30
+ end
31
+
32
+ format_grade(grade_data)
33
+
34
+ # Make sure rubric_scores exists before trying to iterate over it
35
+ unless grade_data[:rubric_scores]
36
+ return append_to_final_output("Error: Rubric scores data not available in the workflow output.")
37
+ end
38
+
39
+ append_to_final_output("RUBRIC SCORES:")
40
+ grade_data[:rubric_scores].each do |category, data|
41
+ # Safely access RUBRIC with a fallback for potentially missing categories
42
+ rubric_item = RUBRIC[category.to_sym] || { description: "Unknown Category", weight: 0 }
43
+
44
+ append_to_final_output(" #{rubric_item[:description]} (#{(rubric_item[:weight] * 100).round}% of grade):")
45
+ append_to_final_output(" Value: #{data[:raw_value] || "N/A"}")
46
+ append_to_final_output(" Score: #{data[:score] ? (data[:score] * 10).round : "N/A"}/10 - \"#{data[:description] || "No description available"}\"")
47
+ end
48
+ end
49
+
50
+ def format_grade(grade_data)
51
+ return append_to_final_output("\nError: Final grade data not available.") unless grade_data && grade_data[:final_score]
52
+
53
+ letter_grade = grade_data[:final_score][:letter_grade]
54
+ celebration_emoji = letter_grade == "A" ? "🎉" : ""
55
+ append_to_final_output(<<~OUTPUT)
56
+ \nFINAL GRADE:
57
+ Score: #{(grade_data[:final_score][:weighted_score] * 100).round}/100
58
+ Letter Grade: #{letter_grade} #{celebration_emoji}
59
+ OUTPUT
60
+ end
61
+ end
@@ -0,0 +1,105 @@
1
+ These are the key testing guidelines to consider in your evaluation:
2
+
3
+ - Tests should serve as specifications that define expected behaviors
4
+ - Tests should have descriptive names that clearly communicate intent
5
+ - Tests should focus on behavior rather than implementation details
6
+ - Excessive mocking/stubbing should be avoided in favor of testing real behavior
7
+ - Tests should be well-structured with minimal setup complexity
8
+ - Tests should be maintainable and not break when implementation details change
9
+ - Tests should cover edge cases and error conditions
10
+ - Tests should follow proper naming conventions and directory structure
11
+ - Tests should not modify the behaviour of the code being tested (e.g. making a private method public in tests)
12
+
13
+ Now consider the full transcript and evaluate the test being graded based on the following rubrics on a scale of 1-10:
14
+
15
+ <test_helpers>
16
+ 0-1: Extremely poor helper usage - Helpers used incorrectly or inappropriately, making tests harder to understand
17
+ 2-3: Poor helper usage - Helpers are poorly designed, tightly coupled to implementation, or used incorrectly
18
+ 4-5: Basic helper usage - Helpers work but may be poorly organized or not reusable
19
+ 6-7: Good helper usage - Helpers are well-designed and used appropriately
20
+ 8-9: Very good helper usage - Helpers are well-factored, reusable, and make tests clearer
21
+ 10: Excellent helper usage - Helpers are perfectly designed, highly reusable, and significantly improve test clarity and maintainability. Also give this score to tests that DO NOT use test helpers at all.
22
+ </test_helpers>
23
+
24
+ <mocks_and_stubs>
25
+ 0-1: Extremely poor mocking - Mocks/stubs used incorrectly or excessively, completely hiding real behavior
26
+ 2-3: Poor mocking - Heavy reliance on mocks that couple tests to implementation; mocks don't match real behavior
27
+ 4-5: Basic mocking - Mocks used appropriately but may be overused or not match implementation exactly
28
+ 6-7: Good mocking - Mocks used judiciously where needed; generally match implementation
29
+ 8-9: Very good mocking - Minimal mocking focused on external dependencies; accurately reflects real behavior
30
+ 10: Excellent mocking - Mocks used only where absolutely necessary (external APIs, etc); perfectly match real implementations; maintain loose coupling
31
+ </mocks_and_stubs>
32
+
33
+ <readability>
34
+ 0-1: Extremely poor readability - Test purpose is impossible to understand; no structure or organization
35
+ 2-3: Poor readability - Test names are vague or misleading; structure is confusing with no clear assertions
36
+ 4-5: Basic readability - Structure is understandable but not optimized for clarity
37
+ 6-7: Good readability - Structure is logical with clear assertions
38
+ 8-9: Very readable - Well-organized with explicit, meaningful test names and assertions
39
+ 10: Exceptionally readable - Test names serve as perfect specifications; elegant structure with context-providing descriptions; self-documenting with clear setup, execution, and verification phases
40
+ </readability>
41
+
42
+ <maintainability>
43
+ 0-1: Extremely brittle - Tests are completely coupled to implementation details
44
+ 2-3: Highly unmaintainable - Will require significant rework when code changes because of heavy coupling to implementation details
45
+ 4-5: Somewhat maintainable - Some coupling to implementation details
46
+ 6-7: Reasonably maintainable - Tests mostly focus on behavior over implementation; limited coupling to implementation details
47
+ 8-9: Highly maintainable - Tests focus on behavior rather than implementation; changes to implementation should rarely break tests
48
+ 10: Exceptionally maintainable - Tests purely focus on behavior and public interfaces; implementation can be completely refactored without breaking tests; well-factored test helpers and fixtures
49
+ </maintainability>
50
+
51
+ <effectiveness>
52
+ 0-1: Ineffective - Don't validate actual behavior and could pass even if code is broken
53
+ 2-3: Minimally effective - Only the most basic functionality validated. Many incorrect behaviors would not be caught
54
+ 4-5: Partially effective - Only catch obvious issues but miss subtle bugs; limited validation of actual outcomes
55
+ 6-7: Reasonably effective - Should catch most common bugs
56
+ 8-9: Highly effective - Should catch nearly all bugs
57
+ 10: Exceptionally effective - Should catch even subtle edge case bugs; validate both positive and negative cases
58
+ </effectiveness>
59
+
60
+ While grading, consider the following goals as being applicable across all rubrics:
61
+
62
+ SUBJECTIVE:
63
+ - Well-written: Organized, easy to understand, and follow best practices
64
+ - Real behavior: Validate what the code does rather than implementation details
65
+ - Isolated: Should not depend on external systems, services, or APIs. Note: The use of fixtures such as `shops(:snowdevil)` is expected and should not be penalized. The only exception is when the SUT is being loaded as a fixture unnecessarily when it could be instantiated directly.
66
+
67
+ OBJECTIVE
68
+ - Idempotent: Should be able to run repeatedly without affecting outcome or side effects.
69
+ - Deterministic: Should produce the same results across all runs and environments.
70
+ - No sleep: Does not include sleep calls or rely on timing for synchronization.
71
+ - Concurrent: Properly handles concurrent execution paths without errors.
72
+ - Timeless: Does not depend on the current date or time. Will not fail due to changes such as daylight savings or leap years. Specifically with regards to handling time, look for anti-patterns like `Time.current + 7.days.to_i`, which fails on DST changes. The correct approach is `7.days.from_now`.
73
+
74
+ VIOLATING ANY OBJECTIVE GOAL SHOULD RESULT IN AN OVERALL SCORE LESS THAN 5!
75
+
76
+ Provide a brief justification for each score, using a maximum of 1-3 sentences. (Note that specific recommendations for improvement are not needed at this step.)
77
+
78
+ You are acting as a stern and relentless striver for excellence in programming, so you must be highly critical. The point of this grading exercise is to facilitate substantial improvement, not just stroking the programmer's ego. Do not hesitate to give a failing overall score (0) for serious violations!
79
+
80
+ RESPONSE FORMAT: You must respond in JSON format within <json> XML tags.
81
+
82
+ <json>
83
+ {
84
+ "test_helpers": {
85
+ "score": 4,
86
+ "justification": "Helpers are used incorrectly in several places, reducing test maintainability and clarity. The assert_valid_record helper is being misused with hashes instead of model instances."
87
+ },
88
+ "mocks_and_stubs": {
89
+ "score": 4,
90
+ "justification": "Several mocks don't match the actual implementation, making tests brittle and potentially hiding production bugs. For example, mocking success: true when the service returns status: 'success'."
91
+ },
92
+ "readability": {
93
+ "score": 8,
94
+ "justification": "Test names clearly describe behavior being tested."
95
+ },
96
+ "maintainability": {
97
+ "score": 6,
98
+ "justification": "Tests mostly focus on behavior but have some coupling to implementation."
99
+ },
100
+ "effectiveness": {
101
+ "score": 7,
102
+ "justification": "Tests validate most expected behaviors and would catch common bugs."
103
+ }
104
+ }
105
+ </json>
@@ -0,0 +1,17 @@
1
+ ========== TEST RECOMMENDATIONS ==========
2
+ <%- if response.recommendations.empty? -%>
3
+ No recommendations found.
4
+ <%- else -%>
5
+ <%- response.recommendations.each_with_index do |rec, index| -%>
6
+ Recommendation #<%= index + 1 %>:
7
+ Description: <%= rec.description %>
8
+ Impact: <%= rec.impact %>
9
+ Priority: <%= rec.priority %>
10
+
11
+ Code Suggestion:
12
+
13
+ <%= rec.code_suggestion %>
14
+
15
+ <%- end -%>
16
+ <%- end -%>
17
+ ===========================================
@@ -0,0 +1,60 @@
1
+ Finally, based on the conversation transcript above, go ahead and provide specific, actionable recommendations that would most effectively improve the overall test score.
2
+
3
+ Focus on recommendations that would:
4
+
5
+ 1. Increase coverage
6
+ 2. Add more assertions where needed
7
+ 3. Make the tests more maintainable or readable
8
+ 4. Ensure tests serve as specifications by having clear, descriptive names
9
+ 5. Reduce excessive mocking/stubbing that couples tests to implementation details
10
+ 6. Improve test structure to reduce setup complexity
11
+ 7. Ensure tests focus on behavior rather than implementation details
12
+ 8. Ensure gaps in private methods are tested through public methods
13
+ 9. Fix any issues with test helpers that are used incorrectly or unnecessarily
14
+ 10. Improve efficiency by combining or deleting tests where appropriate (note that having more than one assertion per test is acceptable)
15
+ 11. Fix any violations of the objective criteria (idempotency, determinism, etc.)
16
+ 12. Be specific about edge cases that should be covered by tests. Write down in the recommendations which edge cases you are referring to.
17
+ 13. Do not recommend the use of RSpec features like `let` for Minitest tests.
18
+
19
+ IF YOU IDENTIFY EDGE CASES, YOU MUST BE SPECIFIC ABOUT THEM IN THE RECOMMENDATIONS.
20
+
21
+ RESPONSE FORMAT: You must respond in JSON format inside <json> XML tags without additional commentary.
22
+
23
+ Example:
24
+
25
+ <json>
26
+ {
27
+ "recommendations": [
28
+ {
29
+ "description": "Add tests for uncovered method X",
30
+ "impact": "Would increase method coverage by Y%",
31
+ "priority": "High",
32
+ "code_suggestion": "def test_method_x_with_valid_input\n result = subject.method_x('valid_input')\n assert_equal expected_result, result\nend"
33
+ },
34
+ {
35
+ "description": "Fix time handling to avoid DST issues",
36
+ "impact": "Would make tests deterministic across DST changes",
37
+ "priority": "High",
38
+ "code_suggestion": "# Replace\nexpiry_time = Time.current + 7.days.to_i\n\n# With\nexpiry_time = 7.days.from_now"
39
+ },
40
+ {
41
+ "description": "Add edge case tests for the show action for when the parameter X is blank",
42
+ "impact": "Would improve test completeness and effectiveness",
43
+ "priority": "Medium",
44
+ "code_suggestion": "..."
45
+ },
46
+ {
47
+ "description": "Improve test descriptions to better serve as specifications",
48
+ "impact": "Would make tests more valuable as documentation",
49
+ "priority": "Medium",
50
+ "code_suggestion": "# Replace\ndef test_process\n\n# With\ndef test_process_returns_success_with_valid_input"
51
+ },
52
+ {
53
+ "description": "Replace implementation-focused mocks with behavior assertions",
54
+ "impact": "Would make tests less brittle and more maintainable",
55
+ "priority": "High",
56
+ "code_suggestion": "# Replace\nUserNotifier.expects(:notify).with(user, 'welcome')\n\n# With\nassert_sends_notification(user, 'welcome') do\n subject.process\nend"
57
+ }
58
+ ]
59
+ }
60
+ </json>