roast-ai 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.claude/settings.json +12 -0
- data/.github/workflows/ci.yaml +29 -0
- data/.github/workflows/cla.yml +22 -0
- data/.gitignore +13 -0
- data/.rspec +1 -0
- data/.rubocop.yml +12 -0
- data/.ruby-version +1 -0
- data/CHANGELOG.md +0 -0
- data/CLAUDE.md +31 -0
- data/CODE_OF_CONDUCT.md +133 -0
- data/CONTRIBUTING.md +35 -0
- data/Gemfile +19 -0
- data/Gemfile.lock +194 -0
- data/LICENSE.md +21 -0
- data/README.md +27 -0
- data/Rakefile +24 -0
- data/bin/console +11 -0
- data/examples/grading/analyze_coverage/prompt.md +52 -0
- data/examples/grading/calculate_final_grade.rb +67 -0
- data/examples/grading/format_result.rb +48 -0
- data/examples/grading/generate_grades/prompt.md +105 -0
- data/examples/grading/generate_recommendations/output.txt +17 -0
- data/examples/grading/generate_recommendations/prompt.md +60 -0
- data/examples/grading/run_coverage.rb +47 -0
- data/examples/grading/verify_mocks_and_stubs/prompt.md +12 -0
- data/examples/grading/verify_test_helpers/prompt.md +53 -0
- data/examples/grading/workflow.md +8 -0
- data/examples/grading/workflow.rb.md +6 -0
- data/examples/grading/workflow.ts+tsx.md +6 -0
- data/examples/grading/workflow.yml +46 -0
- data/exe/roast +17 -0
- data/lib/roast/helpers/function_caching_interceptor.rb +27 -0
- data/lib/roast/helpers/logger.rb +104 -0
- data/lib/roast/helpers/minitest_coverage_runner.rb +244 -0
- data/lib/roast/helpers/path_resolver.rb +148 -0
- data/lib/roast/helpers/prompt_loader.rb +97 -0
- data/lib/roast/helpers.rb +12 -0
- data/lib/roast/tools/cmd.rb +72 -0
- data/lib/roast/tools/grep.rb +43 -0
- data/lib/roast/tools/read_file.rb +49 -0
- data/lib/roast/tools/search_file.rb +51 -0
- data/lib/roast/tools/write_file.rb +60 -0
- data/lib/roast/tools.rb +50 -0
- data/lib/roast/version.rb +5 -0
- data/lib/roast/workflow/base_step.rb +94 -0
- data/lib/roast/workflow/base_workflow.rb +79 -0
- data/lib/roast/workflow/configuration.rb +117 -0
- data/lib/roast/workflow/configuration_parser.rb +92 -0
- data/lib/roast/workflow/validator.rb +37 -0
- data/lib/roast/workflow/workflow_executor.rb +119 -0
- data/lib/roast/workflow.rb +13 -0
- data/lib/roast.rb +40 -0
- data/roast.gemspec +44 -0
- data/schema/workflow.json +92 -0
- data/shipit.rubygems.yml +0 -0
- metadata +171 -0
@@ -0,0 +1,67 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# Workflow step that merges the outputs of the "generate_grades" and
# "analyze_coverage" steps and turns the per-criterion scores into one weighted
# score, a letter grade, and a per-criterion breakdown.
class CalculateFinalGrade < Roast::Workflow::BaseStep
  attr_accessor :llm_analysis

  # Share of the final score contributed by each rubric criterion.
  WEIGHTS = {
    line_coverage: 0.1,
    method_coverage: 0.1,
    branch_coverage: 0.3,
    test_helpers: 0.1,
    mocks_and_stubs: 0.1,
    readability: 0.1,
    maintainability: 0.1,
    effectiveness: 0.1,
  }.freeze

  # Raw scores are divided by this value to obtain the normalized score.
  MAX_RAW_SCORE = 10.0

  # @return [Hash] `:final_score` (with `:weighted_score` and `:letter_grade`)
  #   and `:rubric_scores` (one entry per criterion in WEIGHTS)
  # @raise [KeyError] when the merged analysis has no entry for a criterion
  def call
    @llm_analysis = workflow.output["generate_grades"].merge(workflow.output["analyze_coverage"])

    # Build the breakdown once and derive the total from it so that the two
    # can never disagree.
    rubric_scores = calculate_rubric_scores
    weighted_sum = rubric_scores.sum { |_criterion, entry| entry[:weighted_score] }

    {
      final_score: {
        weighted_score: weighted_sum,
        letter_grade: calculate_letter_grade(weighted_sum),
      },
      rubric_scores: rubric_scores,
    }
  end

  private

  # Maps a weighted score to a letter. Only lower bounds are checked, so a
  # score above 1.0 (rounding error or an out-of-range raw score) is still an
  # "A" instead of falling through to "F".
  def calculate_letter_grade(score)
    if score >= 0.9
      "A"
    elsif score >= 0.8
      "B"
    elsif score >= 0.7
      "C"
    elsif score >= 0.6
      "D"
    else
      "F"
    end
  end

  # Builds the per-criterion breakdown: raw value, normalized score,
  # justification text and the criterion's weighted contribution.
  def calculate_rubric_scores
    WEIGHTS.each_with_object({}) do |(criterion, weight), scores|
      analysis = analysis_for(criterion)
      raw_score = analysis["score"].to_f
      normalized_score = raw_score / MAX_RAW_SCORE

      scores[criterion] = {
        raw_value: raw_score,
        score: normalized_score,
        description: analysis["justification"],
        weighted_score: normalized_score * weight,
      }
    end
  end

  # Looks up the analysis entry for a criterion, failing with a message that
  # names the missing criterion rather than a NoMethodError on nil.
  def analysis_for(criterion)
    llm_analysis[criterion.to_s] ||
      raise(KeyError, "No analysis found for criterion '#{criterion}' in generate_grades/analyze_coverage output")
  end
end
|
@@ -0,0 +1,48 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# Workflow step that appends a human-readable grade report (header, final
# grade, per-criterion scores) to the workflow's final output.
class FormatResult < Roast::Workflow::BaseStep
  # Display label and weight for each rubric criterion.
  RUBRIC = {
    line_coverage: { description: "Line Coverage", weight: 0.1 },
    method_coverage: { description: "Method Coverage", weight: 0.1 },
    branch_coverage: { description: "Branch Coverage", weight: 0.3 },
    test_helpers: { description: "Test Helpers Usage", weight: 0.1 },
    mocks_and_stubs: { description: "Mocks and Stubs Usage", weight: 0.1 },
    readability: { description: "Test Readability", weight: 0.1 },
    maintainability: { description: "Test Maintainability", weight: 0.1 },
    effectiveness: { description: "Test Effectiveness", weight: 0.1 },
  }.freeze

  # Emits the report header, then the results, then a blank-line separator.
  def call
    header = <<~OUTPUT
      ========== TEST GRADE REPORT ==========
      Test file: #{workflow.file}
      Source file: #{workflow.subject_file}
    OUTPUT
    append_to_final_output(header)

    format_results
    append_to_final_output("\n\n")
  end

  private

  # Writes the final grade followed by one three-line entry per rubric score
  # taken from the "calculate_final_grade" step output.
  def format_results
    format_grade

    append_to_final_output("RUBRIC SCORES:")
    rubric_scores = workflow.output["calculate_final_grade"][:rubric_scores]
    rubric_scores.each do |category, data|
      rubric = RUBRIC[category]
      label = rubric[:description]
      percent = (rubric[:weight] * 100).round
      append_to_final_output(" #{label} (#{percent}% of grade):")
      append_to_final_output(" Value: #{data[:raw_value]}")

      out_of_ten = (data[:score] * 10).round
      append_to_final_output(" Score: #{out_of_ten}/10 - \"#{data[:description]}\"")
    end
  end

  # Writes the overall score out of 100 and the letter grade; an "A" gets a
  # celebration emoji appended.
  def format_grade
    final_score = workflow.output["calculate_final_grade"][:final_score]
    letter_grade = final_score[:letter_grade]
    celebration_emoji = letter_grade == "A" ? "🎉" : ""
    score_out_of_100 = (final_score[:weighted_score] * 100).round

    summary = <<~OUTPUT
      \nFINAL GRADE:
      Score: #{score_out_of_100}/100
      Letter Grade: #{letter_grade} #{celebration_emoji}
    OUTPUT
    append_to_final_output(summary)
  end
end
|
@@ -0,0 +1,105 @@
|
|
1
|
+
These are the key testing guidelines to consider in your evaluation:
|
2
|
+
|
3
|
+
- Tests should serve as specifications that define expected behaviors
|
4
|
+
- Tests should have descriptive names that clearly communicate intent
|
5
|
+
- Tests should focus on behavior rather than implementation details
|
6
|
+
- Excessive mocking/stubbing should be avoided in favor of testing real behavior
|
7
|
+
- Tests should be well-structured with minimal setup complexity
|
8
|
+
- Tests should be maintainable and not break when implementation details change
|
9
|
+
- Tests should cover edge cases and error conditions
|
10
|
+
- Tests should follow proper naming conventions and directory structure
|
11
|
+
- Tests should not modify the behaviour of the code being tested (e.g. making a private method public in tests)
|
12
|
+
|
13
|
+
Now consider the full transcript and evaluate the test being graded based on the following rubrics on a scale of 0-10:
|
14
|
+
|
15
|
+
<test_helpers>
|
16
|
+
0-1: Extremely poor helper usage - Helpers used incorrectly or inappropriately, making tests harder to understand
|
17
|
+
2-3: Poor helper usage - Helpers are poorly designed, tightly coupled to implementation, or used incorrectly
|
18
|
+
4-5: Basic helper usage - Helpers work but may be poorly organized or not reusable
|
19
|
+
6-7: Good helper usage - Helpers are well-designed and used appropriately
|
20
|
+
8-9: Very good helper usage - Helpers are well-factored, reusable, and make tests clearer
|
21
|
+
10: Excellent helper usage - Helpers are perfectly designed, highly reusable, and significantly improve test clarity and maintainability. Also give this score to tests that DO NOT use test helpers at all.
|
22
|
+
</test_helpers>
|
23
|
+
|
24
|
+
<mocks_and_stubs>
|
25
|
+
0-1: Extremely poor mocking - Mocks/stubs used incorrectly or excessively, completely hiding real behavior
|
26
|
+
2-3: Poor mocking - Heavy reliance on mocks that couple tests to implementation; mocks don't match real behavior
|
27
|
+
4-5: Basic mocking - Mocks used appropriately but may be overused or not match implementation exactly
|
28
|
+
6-7: Good mocking - Mocks used judiciously where needed; generally match implementation
|
29
|
+
8-9: Very good mocking - Minimal mocking focused on external dependencies; accurately reflects real behavior
|
30
|
+
10: Excellent mocking - Mocks used only where absolutely necessary (external APIs, etc); perfectly match real implementations; maintain loose coupling
|
31
|
+
</mocks_and_stubs>
|
32
|
+
|
33
|
+
<readability>
|
34
|
+
0-1: Extremely poor readability - Test purpose is impossible to understand; no structure or organization
|
35
|
+
2-3: Poor readability - Test names are vague or misleading; structure is confusing with no clear assertions
|
36
|
+
4-5: Basic readability - Structure is understandable but not optimized for clarity
|
37
|
+
6-7: Good readability - Structure is logical with clear assertions
|
38
|
+
8-9: Very readable - Well-organized with explicit, meaningful test names and assertions
|
39
|
+
10: Exceptionally readable - Test names serve as perfect specifications; elegant structure with context-providing descriptions; self-documenting with clear setup, execution, and verification phases
|
40
|
+
</readability>
|
41
|
+
|
42
|
+
<maintainability>
|
43
|
+
0-1: Extremely brittle - Tests are completely coupled to implementation details
|
44
|
+
2-3: Highly unmaintainable - Will require significant rework when code changes because of heavy coupling to implementation details
|
45
|
+
4-5: Somewhat maintainable - Some coupling to implementation details
|
46
|
+
6-7: Reasonably maintainable - Tests mostly focus on behavior over implementation; limited coupling to implementation details
|
47
|
+
8-9: Highly maintainable - Tests focus on behavior rather than implementation; changes to implementation should rarely break tests
|
48
|
+
10: Exceptionally maintainable - Tests purely focus on behavior and public interfaces; implementation can be completely refactored without breaking tests; well-factored test helpers and fixtures
|
49
|
+
</maintainability>
|
50
|
+
|
51
|
+
<effectiveness>
|
52
|
+
0-1: Ineffective - Don't validate actual behavior and could pass even if code is broken
|
53
|
+
2-3: Minimally effective - Only the most basic functionality validated. Many incorrect behaviors would not be caught
|
54
|
+
4-5: Partially effective - Only catch obvious issues but miss subtle bugs; limited validation of actual outcomes
|
55
|
+
6-7: Reasonably effective - Should catch most common bugs
|
56
|
+
8-9: Highly effective - Should catch nearly all bugs
|
57
|
+
10: Exceptionally effective - Should catch even subtle edge case bugs; validate both positive and negative cases
|
58
|
+
</effectiveness>
|
59
|
+
|
60
|
+
While grading, consider the following goals as being applicable across all rubrics:
|
61
|
+
|
62
|
+
SUBJECTIVE:
|
63
|
+
- Well-written: Organized, easy to understand, and follow best practices
|
64
|
+
- Real behavior: Validate what the code does rather than implementation details
|
65
|
+
- Isolated: Should not depend on external systems, services, or APIs. Note: The use of fixtures such as `shops(:snowdevil)` is expected and should not be penalized. The only exception is when the SUT is being loaded as a fixture unnecessarily when it could be instantiated directly.
|
66
|
+
|
67
|
+
OBJECTIVE
|
68
|
+
- Idempotent: Should be able to run repeatedly without affecting outcome or side effects.
|
69
|
+
- Deterministic: Should produce the same results across all runs and environments.
|
70
|
+
- No sleep: Does not include sleep calls or rely on timing for synchronization.
|
71
|
+
- Concurrent: Properly handles concurrent execution paths without errors.
|
72
|
+
- Timeless: Does not depend on the current date or time. Will not fail due to changes such as daylight savings or leap years. Specifically with regards to handling time, look for anti-patterns like `Time.current + 7.days.to_i`, which fails on DST changes. The correct approach is `7.days.from_now`.
|
73
|
+
|
74
|
+
VIOLATING ANY OBJECTIVE GOAL SHOULD RESULT IN AN OVERALL SCORE LESS THAN 5!
|
75
|
+
|
76
|
+
Provide a brief justification for each score, using a maximum of 1-3 sentences. (Note that specific recommendations for improvement are not needed at this step.)
|
77
|
+
|
78
|
+
You are acting as a stern and relentless striver for excellence in programming, so you must be highly critical. The point of this grading exercise is to facilitate substantial improvement, not just stroking the programmer's ego. Do not hesitate to give a failing overall score (0) for serious violations!
|
79
|
+
|
80
|
+
RESPONSE FORMAT: You must respond in JSON format within <json> XML tags.
|
81
|
+
|
82
|
+
<json>
|
83
|
+
{
|
84
|
+
"test_helpers": {
|
85
|
+
"score": 4,
|
86
|
+
"justification": "Helpers are used incorrectly in several places, reducing test maintainability and clarity. The assert_valid_record helper is being misused with hashes instead of model instances."
|
87
|
+
},
|
88
|
+
"mocks_and_stubs": {
|
89
|
+
"score": 4,
|
90
|
+
"justification": "Several mocks don't match the actual implementation, making tests brittle and potentially hiding production bugs. For example, mocking success: true when the service returns status: 'success'."
|
91
|
+
},
|
92
|
+
"readability": {
|
93
|
+
"score": 8,
|
94
|
+
"justification": "Test names clearly describe behavior being tested."
|
95
|
+
},
|
96
|
+
"maintainability": {
|
97
|
+
"score": 6,
|
98
|
+
"justification": "Tests mostly focus on behavior but have some coupling to implementation."
|
99
|
+
},
|
100
|
+
"effectiveness": {
|
101
|
+
"score": 7,
|
102
|
+
"justification": "Tests validate most expected behaviors and would catch common bugs."
|
103
|
+
}
|
104
|
+
}
|
105
|
+
</json>
|
@@ -0,0 +1,17 @@
|
|
1
|
+
========== TEST RECOMMENDATIONS ==========
|
2
|
+
<%- if response["recommendations"].empty? -%>
|
3
|
+
No recommendations found.
|
4
|
+
<%- else -%>
|
5
|
+
<%- response["recommendations"].each_with_index do |rec, index| -%>
|
6
|
+
Recommendation #<%= index + 1 %>:
|
7
|
+
Description: <%= rec["description"] %>
|
8
|
+
Impact: <%= rec["impact"] %>
|
9
|
+
Priority: <%= rec["priority"] %>
|
10
|
+
|
11
|
+
Code Suggestion:
|
12
|
+
|
13
|
+
<%= rec["code_suggestion"] %>
|
14
|
+
|
15
|
+
<%- end -%>
|
16
|
+
<%- end -%>
|
17
|
+
===========================================
|
@@ -0,0 +1,60 @@
|
|
1
|
+
Finally, based on the conversation transcript above, go ahead and provide specific, actionable recommendations that would most effectively improve the overall test score.
|
2
|
+
|
3
|
+
Focus on recommendations that would:
|
4
|
+
|
5
|
+
1. Increase coverage
|
6
|
+
2. Add more assertions where needed
|
7
|
+
3. Make the tests more maintainable or readable
|
8
|
+
4. Ensure tests serve as specifications by having clear, descriptive names
|
9
|
+
5. Reduce excessive mocking/stubbing that couples tests to implementation details
|
10
|
+
6. Improve test structure to reduce setup complexity
|
11
|
+
7. Ensure tests focus on behavior rather than implementation details
|
12
|
+
8. Ensure gaps in private methods are tested through public methods
|
13
|
+
9. Fix any issues with test helpers that are used incorrectly or unnecessarily
|
14
|
+
10. Improve efficiency by combining or deleting tests where appropriate (note that having more than one assertion per test is acceptable)
|
15
|
+
11. Fix any violations of the objective criteria (idempotency, determinism, etc.)
|
16
|
+
12. Be specific about edge cases that should be covered by tests. Write down in the recommendations which edge cases you are referring to.
|
17
|
+
13. Do not recommend the use of RSpec features like `let` for Minitest tests.
|
18
|
+
|
19
|
+
IF YOU IDENTIFY EDGE CASES, YOU MUST BE SPECIFIC ABOUT THEM IN THE RECOMMENDATIONS.
|
20
|
+
|
21
|
+
RESPONSE FORMAT: You must respond in JSON format inside <json> XML tags without additional commentary.
|
22
|
+
|
23
|
+
Example:
|
24
|
+
|
25
|
+
<json>
|
26
|
+
{
|
27
|
+
"recommendations": [
|
28
|
+
{
|
29
|
+
"description": "Add tests for uncovered method X",
|
30
|
+
"impact": "Would increase method coverage by Y%",
|
31
|
+
"priority": "High",
|
32
|
+
"code_suggestion": "def test_method_x_with_valid_input\n result = subject.method_x('valid_input')\n assert_equal expected_result, result\nend"
|
33
|
+
},
|
34
|
+
{
|
35
|
+
"description": "Fix time handling to avoid DST issues",
|
36
|
+
"impact": "Would make tests deterministic across DST changes",
|
37
|
+
"priority": "High",
|
38
|
+
"code_suggestion": "# Replace\nexpiry_time = Time.current + 7.days.to_i\n\n# With\nexpiry_time = 7.days.from_now"
|
39
|
+
},
|
40
|
+
{
|
41
|
+
"description": "Add edge case tests for the show action for when the parameter X is blank",
|
42
|
+
"impact": "Would improve test completeness and effectiveness",
|
43
|
+
"priority": "Medium",
|
44
|
+
"code_suggestion": "..."
|
45
|
+
},
|
46
|
+
{
|
47
|
+
"description": "Improve test descriptions to better serve as specifications",
|
48
|
+
"impact": "Would make tests more valuable as documentation",
|
49
|
+
"priority": "Medium",
|
50
|
+
"code_suggestion": "# Replace\ndef test_process\n\n# With\ndef test_process_returns_success_with_valid_input"
|
51
|
+
},
|
52
|
+
{
|
53
|
+
"description": "Replace implementation-focused mocks with behavior assertions",
|
54
|
+
"impact": "Would make tests less brittle and more maintainable",
|
55
|
+
"priority": "High",
|
56
|
+
"code_suggestion": "# Replace\nUserNotifier.expects(:notify).with(user, 'welcome')\n\n# With\nassert_sends_notification(user, 'welcome') do\n subject.process\nend"
|
57
|
+
}
|
58
|
+
]
|
59
|
+
}
|
60
|
+
</json>
|
@@ -0,0 +1,47 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "open3"
|
4
|
+
|
5
|
+
# Workflow step that runs the test file through an extension-specific
# `<ext>_test_runner` executable (via `shadowenv exec`) and returns the
# runner's captured standard output.
class RunCoverage < Roast::Workflow::BaseStep
  # Extensions that all map to the "js" test runner.
  JS_EXTENSIONS = ["js", "jsx", "ts", "tsx"].freeze

  # @return [String] standard output of the test runner
  def call
    run_test_with_coverage
  end

  private

  # Locates the runner, resolves both file paths and executes the runner.
  # Terminates the process (after logging) when the runner is missing or
  # exits unsuccessfully.
  def run_test_with_coverage
    subject_file = workflow.output["read_dependencies"]
    # Use the path wrapped in <sut>...</sut> when present, otherwise the
    # whole value as-is.
    subject_file = subject_file[%r{<sut>(.*?)</sut>}, 1] || subject_file
    test_file = workflow.file

    test_runner_path = File.expand_path("../../bin/#{runner_name(test_file)}_test_runner", __dir__)
    unless File.exist?(test_runner_path)
      Roast::Helpers::Logger.error("Test runner executable not found: #{test_runner_path}")
      exit(1)
    end

    # Resolve paths to prevent issues when pwd differs from project root
    resolved_subject_file = Roast::Helpers::PathResolver.resolve(subject_file)
    resolved_test_file = Roast::Helpers::PathResolver.resolve(test_file)

    # Pass the command as an argument list so no shell is involved: paths
    # containing spaces or shell metacharacters reach the runner verbatim.
    output, status = Open3.capture2(
      "shadowenv", "exec", "--dir", ".", "--",
      test_runner_path, resolved_subject_file.to_s, resolved_test_file.to_s,
    )

    unless status.success?
      Roast::Helpers::Logger.error("Test runner exited with non-zero status: #{status.exitstatus}")
      Roast::Helpers::Logger.error(output)
      # exitstatus is nil when the runner was terminated by a signal; exit(nil)
      # would raise TypeError, so fall back to a generic failure code.
      exit(status.exitstatus || 1)
    end

    output
  end

  # Returns the runner prefix for a test file: its extension without the
  # leading dot, with all JS/TS variants collapsed to "js".
  def runner_name(test_file)
    extension = File.extname(test_file).delete_prefix(".")
    JS_EXTENSIONS.include?(extension) ? "js" : extension
  end
end
|
@@ -0,0 +1,12 @@
|
|
1
|
+
Find places in the provided test code where stubbing and mocking are used. Search for the corresponding implementation source code of those dependencies elsewhere in the codebase to validate that the stub or mock matches the implementation that it is doubling. Use the tool functions provided to find and read the dependencies.
|
2
|
+
|
3
|
+
Once you've found the dependencies, verify that any mocks and stubs accurately reflect the real implementation. If there are discrepancies, list them out alphabetically with:
|
4
|
+
|
5
|
+
1. The name of the mocked/stubbed method
|
6
|
+
2. What the mock/stub expects in terms of arguments and/or return values
|
7
|
+
3. What the real implementation actually takes as arguments and returns
|
8
|
+
4. Suggestions for fixing the discrepancy
|
9
|
+
|
10
|
+
Note: If there are no discrepancies, do not summarize those that accurately reflect their real implementations in the codebase, just respond "All mocks and stubs verified."
|
11
|
+
|
12
|
+
IMPORTANT: There's absolutely no need for you to waste time grepping for methods/functions that you know belong to testing libraries such as Mocha's `expects` and `stubs`. Only search for the implementation of things that are stubbed and/or mocked in the test to verify whether the test code matches the implementation code.
|
@@ -0,0 +1,53 @@
|
|
1
|
+
Now identify custom test helpers used in this test for the following purpose:
|
2
|
+
|
3
|
+
1. Analyzing if they are used correctly
|
4
|
+
2. Understanding test code that has had significant chunks of implementation abstracted away into helpers
|
5
|
+
3. Fully understanding custom assertions that are not included by default in Ruby on Rails or part of your base knowledge
|
6
|
+
|
7
|
+
Your grep tool function is vital for this work. It provides 4 lines of context before and after the matching line.
|
8
|
+
|
9
|
+
For example, if you call `grep(string: "def assert_sql")`, the output will include:
|
10
|
+
|
11
|
+
```
|
12
|
+
.test/support/helpers/sql_assertions.rb-101- end
|
13
|
+
.test/support/helpers/sql_assertions.rb-102- result
|
14
|
+
.test/support/helpers/sql_assertions.rb-103- end
|
15
|
+
.test/support/helpers/sql_assertions.rb-104-
|
16
|
+
.test/support/helpers/sql_assertions.rb:105: def assert_sql(*patterns_to_match, **kwargs, &block)
|
17
|
+
.test/support/helpers/sql_assertions.rb-106- mysql_only_test!
|
18
|
+
.test/support/helpers/sql_assertions.rb-107-
|
19
|
+
.test/support/helpers/sql_assertions.rb-108- result = T.let(nil, T.nilable(T::Boolean))
|
20
|
+
.test/support/helpers/sql_assertions.rb-109- counter = ActiveRecord::SQLCounter.new(**kwargs)
|
21
|
+
```
|
22
|
+
|
23
|
+
Unfortunately, many test helper methods are undocumented. In those cases (like the example above) the pre-context will be junk. However, there are a number of helper methods that do have very specific and narrow use cases, and those do tend to be well-documented. In those cases, you should use `read_file` to be able to read the full documentation.
|
24
|
+
|
25
|
+
For example, here is the result of calling `grep(string: "def assert_sql_events")`
|
26
|
+
|
27
|
+
```
|
28
|
+
.test/support/helpers/externals_helper.rb-93- # @example Logs events in the list that did not occur
|
29
|
+
.test/support/helpers/externals_helper.rb-94- # expected_queries = { "Shop Load" => 1, "User Load" => 1 }
|
30
|
+
.test/support/helpers/externals_helper.rb-95- # # Fails and reports that User Load occured 0 times instead of expected 1
|
31
|
+
.test/support/helpers/externals_helper.rb-96- # assert_sql_events(expected_queries) { Shop.current_or_find(shop.id) }
|
32
|
+
.test/support/helpers/externals_helper.rb:97: def assert_sql_events(expected_events, &block)
|
33
|
+
.test/support/helpers/externals_helper.rb-98- mysql_only_test!
|
34
|
+
.test/support/helpers/externals_helper.rb-99-
|
35
|
+
.test/support/helpers/externals_helper.rb-100- mysql_events = ExternalsCollector.new(&block).events
|
36
|
+
.test/support/helpers/externals_helper.rb-101- .select { |e| e.first == :mysql }
|
37
|
+
```
|
38
|
+
|
39
|
+
Notice that the documentation for the `assert_sql_events` method is cut off. Use your `read_file` tool function to get the whole test helper source code and gain better understanding of how it is intended to be used, with the side benefit of also being able to see how it is implemented.
|
40
|
+
|
41
|
+
Note: You will undoubtedly already be familiar with some of Minitest and RSpec's built-in helpers. There is no need to search for those: they are packaged as gems, so you won't find them anyway.
|
42
|
+
|
43
|
+
DO NOT FORGET TO PREPEND `def` TO YOUR QUERY TO FIND A METHOD DEFINITION INSTEAD OF USAGES, otherwise you may bring back a very large and useless result set!!!
|
44
|
+
|
45
|
+
Once you are done understanding the custom test helpers used in the test file, analyze and report on whether it seems like any of the helpers are:
|
46
|
+
|
47
|
+
1. Used incorrectly
|
48
|
+
2. Used unnecessarily
|
49
|
+
3. Any other problem related to the use of helper methods
|
50
|
+
|
51
|
+
Where possible, use your best judgment to make recommendations for how to fix problems that you find, but ONLY related to test helpers.
|
52
|
+
|
53
|
+
Note: You are only being used to help find problems so it is not necessary to report on correct usage of helpers or to make positive comments.
|
@@ -0,0 +1,8 @@
|
|
1
|
+
As a senior software engineer and testing expert, evaluate the quality of this test file based on guidelines that will be subsequently provided.
|
2
|
+
|
3
|
+
Next, I will provide the source code of the test that we will be analyzing, and then step you through a series of analysis activities, before finally asking you to provide a final report.
|
4
|
+
|
5
|
+
<test>
|
6
|
+
# <%= file %>
|
7
|
+
<%= File.read(file) %>
|
8
|
+
</test>
|
@@ -0,0 +1,6 @@
|
|
1
|
+
As a senior Ruby engineer and testing expert, evaluate the quality of this Ruby test file. Next, I will provide the source code of the test that we will be analyzing, and then step you through a series of analysis activities, before finally asking you to provide a final report.
|
2
|
+
|
3
|
+
<test>
|
4
|
+
# <%= file %>
|
5
|
+
<%= File.read(file) %>
|
6
|
+
</test>
|
@@ -0,0 +1,6 @@
|
|
1
|
+
As a senior front-end engineer and testing expert, evaluate the quality of this test file. Next, I will provide the source code of the test that we will be analyzing, and then step you through a series of analysis activities, before finally asking you to provide a final report.
|
2
|
+
|
3
|
+
<test>
|
4
|
+
# <%= file %>
|
5
|
+
<%= File.read(file) %>
|
6
|
+
</test>
|
@@ -0,0 +1,46 @@
|
|
1
|
+
name: Grading current test changes
|
2
|
+
|
3
|
+
tools:
|
4
|
+
- Roast::Tools::Grep
|
5
|
+
- Roast::Tools::ReadFile
|
6
|
+
- Roast::Tools::SearchFile
|
7
|
+
|
8
|
+
each: '% cd $(git rev-parse --show-toplevel) && git status --porcelain | grep "_test\.rb" | cut -c4- | xargs realpath'
|
9
|
+
|
10
|
+
steps:
|
11
|
+
- read_dependencies
|
12
|
+
- run_coverage
|
13
|
+
-
|
14
|
+
- analyze_coverage
|
15
|
+
- verify_test_helpers
|
16
|
+
- verify_mocks_and_stubs
|
17
|
+
- generate_grades
|
18
|
+
- calculate_final_grade
|
19
|
+
- format_result
|
20
|
+
- generate_recommendations
|
21
|
+
- annotate_pr_with_comments
|
22
|
+
|
23
|
+
# set non-default attributes for steps below
|
24
|
+
analyze_coverage:
|
25
|
+
model: gpt-4.1-mini
|
26
|
+
loop: false
|
27
|
+
json: true
|
28
|
+
|
29
|
+
generate_grades:
|
30
|
+
model: o3
|
31
|
+
json: true
|
32
|
+
|
33
|
+
generate_recommendations:
|
34
|
+
model: o3
|
35
|
+
loop: false
|
36
|
+
json: true
|
37
|
+
params:
|
38
|
+
max_completion_tokens: 5_000
|
39
|
+
|
40
|
+
annotate_pr_with_comments:
|
41
|
+
tools:
|
42
|
+
- Roast::Tools::Github::Annotator
|
43
|
+
model: o3
|
44
|
+
params:
|
45
|
+
max_completion_tokens: 5_000
|
46
|
+
if: "workflow.pr? && output.recommendations.any?"
|
data/exe/roast
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# frozen_string_literal: true
|
3
|
+
|
4
|
+
# Force UTF-8 for both external and internal string encoding.
Encoding.default_external = Encoding::UTF_8
Encoding.default_internal = Encoding::UTF_8

# Put the lib directory (relative to this executable) at the front of the
# load path unless it is already listed.
lib_dir = File.expand_path("../../lib", __FILE__)
$LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)

require "bundler/setup"
require "roast"

# Print the banner, then hand the command-line arguments to the CLI.
puts "🔥🔥🔥 Everyone loves a good roast 🔥🔥🔥\n\n"
Roast::CLI.start(ARGV)
|
@@ -0,0 +1,27 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "active_support/cache"
|
4
|
+
require "active_support/notifications"
|
5
|
+
require_relative "logger"
|
6
|
+
|
7
|
+
module Roast
  module Helpers
    # Mixin that sits in front of `dispatch_tool_function` and, when the
    # function's configuration enables caching, forwards the shared tool cache
    # to the underlying dispatch via `super`.
    module FunctionCachingInterceptor
      # @param function_name name of the tool function being dispatched
      # @param params parameters forwarded unchanged to the original dispatch
      # @return whatever the original `dispatch_tool_function` returns
      def dispatch_tool_function(function_name, params)
        # legacy workflows don't have a configuration
        return super(function_name, params) if configuration.blank?

        cache_enabled = configuration.function_config(function_name)&.dig("cache", "enabled")
        unless cache_enabled
          Roast::Helpers::Logger.debug("⚠️ Caching not enabled for #{function_name}")
          return super(function_name, params)
        end

        # Caching is on: call the original function and hand it the cache
        super(function_name, params, cache: Roast::Tools::CACHE)
      end
    end
  end
end
|
@@ -0,0 +1,104 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "logger"
|
4
|
+
require "forwardable"
|
5
|
+
|
6
|
+
module Roast
  module Helpers
    # Application-wide logger: wraps a stdlib ::Logger with a custom line
    # format and exposes a lazily created shared instance through class-level
    # logging methods.
    class Logger
      extend Forwardable

      VALID_LOG_LEVELS = ["DEBUG", "INFO", "WARN", "ERROR", "FATAL"].freeze

      attr_reader :logger, :log_level

      # info and warn go straight to the wrapped logger
      def_delegators :logger, :info, :warn

      class << self
        extend Forwardable

        # Class-level logging calls are forwarded to the shared instance
        def_delegators :instance, :debug, :info, :warn, :error, :fatal

        # Shared instance, created on first use
        def instance
          @instance ||= new
        end

        # For testing purposes: drops the shared instance
        def reset
          @instance = nil
        end
      end

      # @param stdout IO-like target handed to ::Logger.new
      # @param log_level level name (any case); defaults to ROAST_LOG_LEVEL or "INFO"
      # @raise [ArgumentError] when the level is not in VALID_LOG_LEVELS
      def initialize(stdout: $stdout, log_level: ENV.fetch("ROAST_LOG_LEVEL", "INFO"))
        @log_level = validate_log_level(log_level)
        @logger = create_logger(stdout)
      end

      # Validates the new level and applies it to the wrapped logger.
      def log_level=(level)
        @log_level = validate_log_level(level)
        logger.level = ::Logger.const_get(@log_level)
      end

      def debug(message)
        logger.debug(message)
      end

      def error(message)
        logger.error(message)
      end

      def fatal(message)
        logger.fatal(message)
      end

      private

      # Upcases the level and returns it, or raises if it is not a known level.
      def validate_log_level(level)
        normalized = level.to_s.upcase
        return normalized if VALID_LOG_LEVELS.include?(normalized)

        raise ArgumentError, "Invalid log level: #{level}. Valid levels are: #{VALID_LOG_LEVELS.join(", ")}"
      end

      # Builds the wrapped ::Logger with the current level and custom formatter.
      def create_logger(stdout)
        backend = ::Logger.new(stdout)
        backend.level = ::Logger.const_get(@log_level)
        backend.formatter = proc do |severity, datetime, _progname, msg|
          format_entry(severity, datetime, msg)
        end
        backend
      end

      # INFO messages that do not start with "[" are emitted verbatim; every
      # other entry gets a timestamp/severity prefix, has a leading "[" and a
      # trailing "]" stripped from its lines, and ends with a newline.
      def format_entry(severity, datetime, msg)
        text = format_message(msg)
        return text if severity == "INFO" && !text.start_with?("[")

        timestamp = datetime.strftime("%Y-%m-%d %H:%M:%S")
        body = text.gsub(/^\[|\]$/, "").strip
        "[#{timestamp}] #{severity}: #{body}\n"
      end

      # Coerces a log payload to a string; arrays are joined one item per line.
      def format_message(msg)
        case msg
        when String then msg
        when nil then ""
        when Array
          return msg.first if msg.length == 1 && msg.first.is_a?(String)

          lines = msg.map do |item|
            item.is_a?(String) ? item : item.inspect.gsub(/^\[|\]$/, "").strip
          end
          lines.join("\n")
        else
          msg.to_s
        end
      end
    end
  end
end
|