roast-ai 0.4.6 → 0.4.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (286)
  1. checksums.yaml +4 -4
  2. data/.github/workflows/ci.yaml +3 -1
  3. data/.gitignore +7 -0
  4. data/.rubocop.yml +14 -0
  5. data/Gemfile +2 -1
  6. data/Gemfile.lock +9 -1
  7. data/Rakefile +16 -4
  8. data/examples/README.md +9 -0
  9. data/examples/available_tools_demo/workflow.yml +2 -2
  10. data/examples/basic_prompt_workflow/workflow.md +1 -0
  11. data/examples/basic_prompt_workflow/workflow.yml +14 -0
  12. data/lib/roast/dsl/executor.rb +2 -1
  13. data/lib/roast/helpers/cmd_runner.rb +199 -0
  14. data/lib/roast/initializers.rb +1 -1
  15. data/lib/roast/tools/apply_diff.rb +1 -1
  16. data/lib/roast/tools/bash.rb +4 -4
  17. data/lib/roast/tools/cmd.rb +3 -5
  18. data/lib/roast/tools/coding_agent.rb +1 -1
  19. data/lib/roast/tools/grep.rb +6 -2
  20. data/lib/roast/tools/read_file.rb +2 -1
  21. data/lib/roast/tools/swarm.rb +2 -7
  22. data/lib/roast/tools.rb +10 -1
  23. data/lib/roast/version.rb +1 -1
  24. data/lib/roast/workflow/base_step.rb +2 -3
  25. data/lib/roast/workflow/command_executor.rb +3 -3
  26. data/lib/roast/workflow/resource_resolver.rb +1 -1
  27. data/lib/roast/workflow/shell_script_step.rb +1 -1
  28. data/lib/roast/workflow/step_loader.rb +2 -7
  29. data/lib/roast.rb +7 -1
  30. data/rubocop/cop/roast/use_cmd_runner.rb +93 -0
  31. data/rubocop/cop/roast.rb +4 -0
  32. data/sorbet/rbi/gems/docile@1.4.1.rbi +377 -0
  33. data/sorbet/rbi/gems/lint_roller@1.1.0.rbi +233 -2
  34. data/sorbet/rbi/gems/racc@1.8.1.rbi +6 -4
  35. data/sorbet/rbi/gems/rainbow@3.1.1.rbi +396 -2
  36. data/sorbet/rbi/gems/regexp_parser@2.10.0.rbi +3788 -2
  37. data/sorbet/rbi/gems/rubocop-ast@1.45.1.rbi +7747 -2
  38. data/sorbet/rbi/gems/rubocop-sorbet@0.10.5.rbi +2386 -0
  39. data/sorbet/rbi/gems/rubocop@1.77.0.rbi +62813 -2
  40. data/sorbet/rbi/gems/ruby-progressbar@1.13.0.rbi +1311 -2
  41. data/sorbet/rbi/gems/simplecov-html@0.13.2.rbi +225 -0
  42. data/sorbet/rbi/gems/simplecov@0.22.0.rbi +2259 -0
  43. data/sorbet/rbi/gems/simplecov_json_formatter@0.1.4.rbi +9 -0
  44. data/sorbet/rbi/gems/unicode-display_width@3.1.4.rbi +125 -2
  45. data/sorbet/rbi/gems/unicode-emoji@4.0.4.rbi +244 -2
  46. data/sorbet/tapioca/require.rb +2 -1
  47. metadata +12 -240
  48. data/CHANGELOG.md +0 -364
  49. data/examples/agent_continue/add_documentation/prompt.md +0 -5
  50. data/examples/agent_continue/add_error_handling/prompt.md +0 -5
  51. data/examples/agent_continue/analyze_codebase/prompt.md +0 -7
  52. data/examples/agent_continue/combined_workflow.yml +0 -24
  53. data/examples/agent_continue/continue_adding_features/prompt.md +0 -4
  54. data/examples/agent_continue/create_integration_tests/prompt.md +0 -3
  55. data/examples/agent_continue/document_with_context/prompt.md +0 -5
  56. data/examples/agent_continue/explore_api/prompt.md +0 -6
  57. data/examples/agent_continue/implement_client/prompt.md +0 -6
  58. data/examples/agent_continue/inline_workflow.yml +0 -20
  59. data/examples/agent_continue/refactor_code/prompt.md +0 -2
  60. data/examples/agent_continue/verify_changes/prompt.md +0 -6
  61. data/examples/agent_continue/workflow.yml +0 -27
  62. data/examples/agent_workflow/README.md +0 -75
  63. data/examples/agent_workflow/apply_refactorings/prompt.md +0 -22
  64. data/examples/agent_workflow/identify_code_smells/prompt.md +0 -15
  65. data/examples/agent_workflow/summarize_improvements/prompt.md +0 -18
  66. data/examples/agent_workflow/workflow.png +0 -0
  67. data/examples/agent_workflow/workflow.yml +0 -16
  68. data/examples/api_workflow/README.md +0 -85
  69. data/examples/api_workflow/fetch_api_data/prompt.md +0 -10
  70. data/examples/api_workflow/generate_report/prompt.md +0 -10
  71. data/examples/api_workflow/prompt.md +0 -10
  72. data/examples/api_workflow/transform_data/prompt.md +0 -10
  73. data/examples/api_workflow/workflow.png +0 -0
  74. data/examples/api_workflow/workflow.yml +0 -30
  75. data/examples/apply_diff_demo/README.md +0 -58
  76. data/examples/apply_diff_demo/apply_simple_change/prompt.md +0 -13
  77. data/examples/apply_diff_demo/create_sample_file/prompt.md +0 -11
  78. data/examples/apply_diff_demo/workflow.yml +0 -24
  79. data/examples/available_tools_demo/workflow.png +0 -0
  80. data/examples/bash_prototyping/README.md +0 -53
  81. data/examples/bash_prototyping/analyze_network/prompt.md +0 -13
  82. data/examples/bash_prototyping/analyze_system/prompt.md +0 -11
  83. data/examples/bash_prototyping/api_testing.png +0 -0
  84. data/examples/bash_prototyping/api_testing.yml +0 -14
  85. data/examples/bash_prototyping/check_processes/prompt.md +0 -11
  86. data/examples/bash_prototyping/generate_report/prompt.md +0 -16
  87. data/examples/bash_prototyping/process_json_response/prompt.md +0 -24
  88. data/examples/bash_prototyping/system_analysis.png +0 -0
  89. data/examples/bash_prototyping/system_analysis.yml +0 -14
  90. data/examples/bash_prototyping/test_public_api/prompt.md +0 -22
  91. data/examples/case_when/README.md +0 -58
  92. data/examples/case_when/detect_language/prompt.md +0 -16
  93. data/examples/case_when/workflow.png +0 -0
  94. data/examples/case_when/workflow.yml +0 -58
  95. data/examples/cmd/README.md +0 -99
  96. data/examples/cmd/analyze_project/prompt.md +0 -57
  97. data/examples/cmd/basic_demo/prompt.md +0 -48
  98. data/examples/cmd/basic_workflow.png +0 -0
  99. data/examples/cmd/basic_workflow.yml +0 -17
  100. data/examples/cmd/check_repository/prompt.md +0 -57
  101. data/examples/cmd/create_and_verify/prompt.md +0 -56
  102. data/examples/cmd/dev_workflow.png +0 -0
  103. data/examples/cmd/dev_workflow.yml +0 -26
  104. data/examples/cmd/explore_project/prompt.md +0 -67
  105. data/examples/cmd/explorer_workflow.png +0 -0
  106. data/examples/cmd/explorer_workflow.yml +0 -21
  107. data/examples/cmd/smart_tool_selection/prompt.md +0 -99
  108. data/examples/coding_agent_with_model.yml +0 -20
  109. data/examples/coding_agent_with_retries.yml +0 -30
  110. data/examples/conditional/README.md +0 -161
  111. data/examples/conditional/check_condition/prompt.md +0 -1
  112. data/examples/conditional/simple_workflow.png +0 -0
  113. data/examples/conditional/simple_workflow.yml +0 -15
  114. data/examples/conditional/workflow.png +0 -0
  115. data/examples/conditional/workflow.yml +0 -23
  116. data/examples/context_management_demo/README.md +0 -43
  117. data/examples/context_management_demo/workflow.yml +0 -42
  118. data/examples/direct_coerce_syntax/README.md +0 -32
  119. data/examples/direct_coerce_syntax/workflow.png +0 -0
  120. data/examples/direct_coerce_syntax/workflow.yml +0 -36
  121. data/examples/dot_notation/README.md +0 -37
  122. data/examples/dot_notation/workflow.png +0 -0
  123. data/examples/dot_notation/workflow.yml +0 -44
  124. data/examples/exit_on_error/README.md +0 -50
  125. data/examples/exit_on_error/analyze_lint_output/prompt.md +0 -9
  126. data/examples/exit_on_error/apply_fixes/prompt.md +0 -2
  127. data/examples/exit_on_error/workflow.png +0 -0
  128. data/examples/exit_on_error/workflow.yml +0 -19
  129. data/examples/grading/README.md +0 -71
  130. data/examples/grading/analyze_coverage/prompt.md +0 -52
  131. data/examples/grading/calculate_final_grade.rb +0 -67
  132. data/examples/grading/format_result.rb +0 -64
  133. data/examples/grading/generate_grades/prompt.md +0 -105
  134. data/examples/grading/generate_recommendations/output.txt +0 -17
  135. data/examples/grading/generate_recommendations/prompt.md +0 -60
  136. data/examples/grading/js_test_runner +0 -31
  137. data/examples/grading/rb_test_runner +0 -19
  138. data/examples/grading/read_dependencies/prompt.md +0 -16
  139. data/examples/grading/run_coverage.rb +0 -54
  140. data/examples/grading/verify_mocks_and_stubs/prompt.md +0 -12
  141. data/examples/grading/verify_test_helpers/prompt.md +0 -53
  142. data/examples/grading/workflow.md +0 -8
  143. data/examples/grading/workflow.png +0 -0
  144. data/examples/grading/workflow.rb.md +0 -6
  145. data/examples/grading/workflow.ts+tsx.md +0 -6
  146. data/examples/grading/workflow.yml +0 -41
  147. data/examples/instrumentation.rb +0 -76
  148. data/examples/interpolation/README.md +0 -50
  149. data/examples/interpolation/analyze_file/prompt.md +0 -1
  150. data/examples/interpolation/analyze_patterns/prompt.md +0 -27
  151. data/examples/interpolation/generate_report_for_js/prompt.md +0 -3
  152. data/examples/interpolation/generate_report_for_rb/prompt.md +0 -3
  153. data/examples/interpolation/sample.js +0 -48
  154. data/examples/interpolation/sample.rb +0 -42
  155. data/examples/interpolation/workflow.md +0 -1
  156. data/examples/interpolation/workflow.png +0 -0
  157. data/examples/interpolation/workflow.yml +0 -21
  158. data/examples/iteration/IMPLEMENTATION.md +0 -88
  159. data/examples/iteration/README.md +0 -68
  160. data/examples/iteration/analyze_complexity/prompt.md +0 -22
  161. data/examples/iteration/generate_recommendations/prompt.md +0 -21
  162. data/examples/iteration/generate_report/prompt.md +0 -129
  163. data/examples/iteration/implement_fix/prompt.md +0 -25
  164. data/examples/iteration/prioritize_issues/prompt.md +0 -24
  165. data/examples/iteration/prompts/analyze_file.md +0 -28
  166. data/examples/iteration/prompts/generate_summary.md +0 -24
  167. data/examples/iteration/prompts/update_report.md +0 -29
  168. data/examples/iteration/prompts/write_report.md +0 -22
  169. data/examples/iteration/read_file/prompt.md +0 -9
  170. data/examples/iteration/select_next_issue/prompt.md +0 -25
  171. data/examples/iteration/simple_workflow.md +0 -39
  172. data/examples/iteration/simple_workflow.yml +0 -58
  173. data/examples/iteration/update_fix_count/prompt.md +0 -26
  174. data/examples/iteration/verify_fix/prompt.md +0 -29
  175. data/examples/iteration/workflow.png +0 -0
  176. data/examples/iteration/workflow.yml +0 -42
  177. data/examples/json_handling/README.md +0 -32
  178. data/examples/json_handling/workflow.png +0 -0
  179. data/examples/json_handling/workflow.yml +0 -52
  180. data/examples/mcp/README.md +0 -223
  181. data/examples/mcp/analyze_changes/prompt.md +0 -8
  182. data/examples/mcp/analyze_issues/prompt.md +0 -4
  183. data/examples/mcp/analyze_schema/prompt.md +0 -4
  184. data/examples/mcp/check_data_quality/prompt.md +0 -5
  185. data/examples/mcp/check_documentation/prompt.md +0 -4
  186. data/examples/mcp/create_recommendations/prompt.md +0 -5
  187. data/examples/mcp/database_workflow.png +0 -0
  188. data/examples/mcp/database_workflow.yml +0 -29
  189. data/examples/mcp/env_demo/workflow.png +0 -0
  190. data/examples/mcp/env_demo/workflow.yml +0 -34
  191. data/examples/mcp/fetch_pr_context/prompt.md +0 -4
  192. data/examples/mcp/filesystem_demo/create_test_file/prompt.md +0 -2
  193. data/examples/mcp/filesystem_demo/list_files/prompt.md +0 -6
  194. data/examples/mcp/filesystem_demo/read_with_mcp/prompt.md +0 -7
  195. data/examples/mcp/filesystem_demo/workflow.png +0 -0
  196. data/examples/mcp/filesystem_demo/workflow.yml +0 -38
  197. data/examples/mcp/generate_insights/prompt.md +0 -4
  198. data/examples/mcp/generate_report/prompt.md +0 -6
  199. data/examples/mcp/generate_review/prompt.md +0 -16
  200. data/examples/mcp/github_workflow.png +0 -0
  201. data/examples/mcp/github_workflow.yml +0 -32
  202. data/examples/mcp/multi_mcp_workflow.png +0 -0
  203. data/examples/mcp/multi_mcp_workflow.yml +0 -58
  204. data/examples/mcp/post_review/prompt.md +0 -3
  205. data/examples/mcp/save_report/prompt.md +0 -6
  206. data/examples/mcp/search_issues/prompt.md +0 -2
  207. data/examples/mcp/summarize/prompt.md +0 -1
  208. data/examples/mcp/test_filesystem/prompt.md +0 -6
  209. data/examples/mcp/test_github/prompt.md +0 -8
  210. data/examples/mcp/test_read/prompt.md +0 -1
  211. data/examples/mcp/workflow.png +0 -0
  212. data/examples/mcp/workflow.yml +0 -35
  213. data/examples/no_model_fallback/README.md +0 -17
  214. data/examples/no_model_fallback/analyze_file/prompt.md +0 -1
  215. data/examples/no_model_fallback/analyze_patterns/prompt.md +0 -27
  216. data/examples/no_model_fallback/generate_report_for_md/prompt.md +0 -10
  217. data/examples/no_model_fallback/generate_report_for_rb/prompt.md +0 -3
  218. data/examples/no_model_fallback/sample.rb +0 -42
  219. data/examples/no_model_fallback/workflow.yml +0 -19
  220. data/examples/openrouter_example/README.md +0 -48
  221. data/examples/openrouter_example/analyze_input/prompt.md +0 -16
  222. data/examples/openrouter_example/generate_response/prompt.md +0 -9
  223. data/examples/openrouter_example/workflow.png +0 -0
  224. data/examples/openrouter_example/workflow.yml +0 -12
  225. data/examples/pre_post_processing/README.md +0 -111
  226. data/examples/pre_post_processing/analyze_test_file/prompt.md +0 -23
  227. data/examples/pre_post_processing/improve_test_coverage/prompt.md +0 -17
  228. data/examples/pre_post_processing/optimize_test_performance/prompt.md +0 -25
  229. data/examples/pre_post_processing/post_processing/aggregate_metrics/prompt.md +0 -31
  230. data/examples/pre_post_processing/post_processing/cleanup_environment/prompt.md +0 -28
  231. data/examples/pre_post_processing/post_processing/generate_summary_report/prompt.md +0 -32
  232. data/examples/pre_post_processing/post_processing/output.txt +0 -24
  233. data/examples/pre_post_processing/pre_processing/gather_baseline_metrics/prompt.md +0 -26
  234. data/examples/pre_post_processing/pre_processing/setup_test_environment/prompt.md +0 -11
  235. data/examples/pre_post_processing/validate_changes/prompt.md +0 -24
  236. data/examples/pre_post_processing/workflow.png +0 -0
  237. data/examples/pre_post_processing/workflow.yml +0 -21
  238. data/examples/retry/workflow.yml +0 -23
  239. data/examples/rspec_to_minitest/README.md +0 -68
  240. data/examples/rspec_to_minitest/analyze_spec/prompt.md +0 -30
  241. data/examples/rspec_to_minitest/create_minitest/prompt.md +0 -33
  242. data/examples/rspec_to_minitest/run_and_improve/prompt.md +0 -35
  243. data/examples/rspec_to_minitest/workflow.md +0 -10
  244. data/examples/rspec_to_minitest/workflow.png +0 -0
  245. data/examples/rspec_to_minitest/workflow.yml +0 -40
  246. data/examples/shared_config/README.md +0 -52
  247. data/examples/shared_config/example_with_shared_config/workflow.png +0 -0
  248. data/examples/shared_config/example_with_shared_config/workflow.yml +0 -6
  249. data/examples/shared_config/shared.png +0 -0
  250. data/examples/shared_config/shared.yml +0 -7
  251. data/examples/single_target_prepost/README.md +0 -36
  252. data/examples/single_target_prepost/post_processing/output.txt +0 -27
  253. data/examples/single_target_prepost/pre_processing/gather_dependencies/prompt.md +0 -11
  254. data/examples/single_target_prepost/workflow.png +0 -0
  255. data/examples/single_target_prepost/workflow.yml +0 -20
  256. data/examples/smart_coercion_defaults/README.md +0 -65
  257. data/examples/smart_coercion_defaults/workflow.png +0 -0
  258. data/examples/smart_coercion_defaults/workflow.yml +0 -44
  259. data/examples/step_configuration/README.md +0 -84
  260. data/examples/step_configuration/workflow.png +0 -0
  261. data/examples/step_configuration/workflow.yml +0 -57
  262. data/examples/swarm_example.yml +0 -25
  263. data/examples/tool_config_example/README.md +0 -109
  264. data/examples/tool_config_example/example_step/prompt.md +0 -42
  265. data/examples/tool_config_example/workflow.png +0 -0
  266. data/examples/tool_config_example/workflow.yml +0 -17
  267. data/examples/user_input/README.md +0 -90
  268. data/examples/user_input/funny_name/create_backstory/prompt.md +0 -10
  269. data/examples/user_input/funny_name/workflow.png +0 -0
  270. data/examples/user_input/funny_name/workflow.yml +0 -26
  271. data/examples/user_input/generate_summary/prompt.md +0 -11
  272. data/examples/user_input/simple_input_demo/workflow.png +0 -0
  273. data/examples/user_input/simple_input_demo/workflow.yml +0 -35
  274. data/examples/user_input/survey_workflow.png +0 -0
  275. data/examples/user_input/survey_workflow.yml +0 -71
  276. data/examples/user_input/welcome_message/prompt.md +0 -3
  277. data/examples/user_input/workflow.png +0 -0
  278. data/examples/user_input/workflow.yml +0 -73
  279. data/examples/workflow_generator/README.md +0 -27
  280. data/examples/workflow_generator/analyze_user_request/prompt.md +0 -34
  281. data/examples/workflow_generator/create_workflow_files/prompt.md +0 -32
  282. data/examples/workflow_generator/get_user_input/prompt.md +0 -14
  283. data/examples/workflow_generator/info_from_roast.rb +0 -22
  284. data/examples/workflow_generator/workflow.png +0 -0
  285. data/examples/workflow_generator/workflow.yml +0 -34
  286. data/lib/roast/helpers/timeout_handler.rb +0 -89
@@ -1,44 +0,0 @@
1
- name: dot_notation_example
2
- description: Example demonstrating dot notation access for workflow outputs
3
-
4
- steps:
5
- initialize:
6
- prompt: |
7
- Initialize the workflow with some sample data.
8
-
9
- Set output.counter to 0
10
- Set output.config.max_iterations to 5
11
- Set output.config.enabled to true
12
-
13
- process_items:
14
- repeat:
15
- # Using dot notation in conditions
16
- until: "counter >= config.max_iterations || !config.enabled?"
17
- steps:
18
- - increment_counter
19
- - check_status
20
-
21
- increment_counter:
22
- prompt: |
23
- Increment the counter by 1.
24
- Current counter value: {{counter}}
25
-
26
- Set output.counter to {{counter}} + 1
27
-
28
- check_status:
29
- prompt: |
30
- Check if we should continue processing.
31
-
32
- Current counter: {{counter}}
33
- Max iterations: {{config.max_iterations}}
34
-
35
- If counter is 3, set output.config.enabled to false
36
- Set output.status.last_checked to current counter value
37
-
38
- summarize:
39
- prompt: |
40
- Summarize the results:
41
- - Total iterations: {{counter}}
42
- - Last checked at: {{status.last_checked}}
43
- - Was enabled: {{config.enabled}}
44
- - Hit max iterations: {{counter >= config.max_iterations ? "Yes" : "No"}}
@@ -1,50 +0,0 @@
1
- # Exit on Error Example
2
-
3
- This example demonstrates how to use the `exit_on_error` configuration option to continue workflow execution even when a command fails.
4
-
5
- ## Use Case
6
-
7
- When running a linter like RuboCop on a file with syntax errors or style violations, the command will exit with a non-zero status. By default, this would halt the workflow. However, we often want to:
8
-
9
- 1. Capture the linter output (including errors)
10
- 2. Analyze what went wrong
11
- 3. Apply fixes based on the analysis
12
-
13
- ## Configuration
14
-
15
- The key configuration is in the step configuration section:
16
-
17
- ```yaml
18
- lint_check:
19
- exit_on_error: false
20
- ```
21
-
22
- This tells Roast to:
23
- - Continue workflow execution even if the command fails
24
- - Capture the full output (stdout and stderr)
25
- - Append the exit status to the output
26
-
27
- ## Output Format
28
-
29
- When a command fails with `exit_on_error: false`, the output will look like:
30
-
31
- ```
32
- lib/example.rb:5:3: C: Style/StringLiterals: Prefer double-quoted strings
33
- 'hello'
34
- ^^^^^^^
35
- [Exit status: 1]
36
- ```
37
-
38
- This allows subsequent steps to process both the error output and the exit status.
39
-
40
- ## Running the Example
41
-
42
- ```bash
43
- roast execute workflow.yml path/to/file.rb
44
- ```
45
-
46
- The workflow will:
47
- 1. Run RuboCop on the file
48
- 2. Continue even if RuboCop finds issues
49
- 3. Analyze the linter output
50
- 4. Apply fixes based on the analysis
@@ -1,9 +0,0 @@
1
- The linter output from the previous step shows issues with the code.
2
- Please analyze the output and identify the specific problems that need to be fixed.
3
-
4
- Focus on:
5
- - Syntax errors
6
- - Style violations
7
- - Best practice violations
8
-
9
- Provide a structured list of issues found.
@@ -1,2 +0,0 @@
1
- Based on the analysis, please generate the fixes for the identified issues.
2
- Use the CodingAgent tool to apply the necessary changes to the file.
Binary file
@@ -1,19 +0,0 @@
1
- name: Linting with Error Recovery
2
- tools:
3
- - Roast::Tools::ReadFile
4
- - Roast::Tools::WriteFile
5
- - Roast::Tools::CodingAgent
6
-
7
- steps:
8
- # Run linter on the file - may fail if there are syntax errors
9
- - lint_check: $(rubocop {{file}})
10
-
11
- # Analyze linter output and fix issues even if linter failed
12
- - analyze_lint_output
13
-
14
- # Apply fixes based on the analysis
15
- - apply_fixes
16
-
17
- # Step configuration
18
- lint_check:
19
- exit_on_error: false # Continue even if rubocop exits with non-zero status
@@ -1,71 +0,0 @@
1
- # Test Grading Workflow
2
-
3
- This workflow acts as a senior software engineer and testing expert to evaluate the quality of test files based on best practices and guidelines.
4
-
5
- ## Prerequisites
6
-
7
- This example uses `shadowenv` for environment management, which is specific to Shopify's development environment. If you're not using shadowenv, you'll need to adapt the commands to your own setup.
8
-
9
- ### If you're using shadowenv:
10
- ```bash
11
- brew install shadowenv
12
- ```
13
-
14
- ### If you're NOT using shadowenv:
15
- You'll need to modify the `run_coverage.rb` file to remove the shadowenv commands. Look for lines like:
16
- ```ruby
17
- command = "shadowenv exec -- bundle exec ruby ..."
18
- ```
19
-
20
- And change them to match your environment:
21
- ```ruby
22
- # For standard Ruby/Bundler setup:
23
- command = "bundle exec ruby ..."
24
-
25
- # Or if you're using rbenv/rvm:
26
- command = "ruby ..."
27
- ```
28
-
29
- ## Usage
30
-
31
- ```bash
32
- # Run the grading workflow on a test file
33
- roast execute examples/grading/workflow.yml path/to/your_test.rb
34
- ```
35
-
36
- ## How it Works
37
-
38
- 1. **read_dependencies**: Analyzes the test file and its dependencies
39
- 2. **run_coverage**: Executes the test with coverage tracking
40
- 3. **generate_grades**: Evaluates test quality across multiple dimensions
41
- 4. **verify_test_helpers**: Checks for proper test helper usage
42
- 5. **verify_mocks_and_stubs**: Ensures appropriate use of test doubles
43
- 6. **analyze_coverage**: Reviews code coverage metrics
44
- 7. **generate_recommendations**: Provides improvement suggestions
45
- 8. **calculate_final_grade**: Computes an overall grade (A-F scale)
46
- 9. **format_result**: Formats the final output
47
-
48
- ## Customization
49
-
50
- Feel free to adapt this workflow to your testing environment:
51
-
52
- - **Different test frameworks**: Modify `run_coverage.rb` to work with RSpec, Jest, pytest, etc.
53
- - **Coverage tools**: Replace the coverage command with your preferred tool (SimpleCov, Istanbul, Coverage.py)
54
- - **Grading criteria**: Adjust the prompts in each step to match your team's standards
55
- - **Environment setup**: Remove or replace shadowenv with your environment management tool
56
-
57
- ## Example Output
58
-
59
- ```
60
- ========== TEST GRADE REPORT ==========
61
- Test file: test/example_test.rb
62
-
63
- FINAL GRADE:
64
- Score: 85/100
65
- Letter Grade: B
66
-
67
- RECOMMENDATIONS:
68
- - Add edge case testing for error conditions
69
- - Improve test descriptions for clarity
70
- - Consider extracting common setup to helper methods
71
- ```
@@ -1,52 +0,0 @@
1
- <coverage_results>
2
- <%= workflow.output["run_coverage"] %>
3
- </coverage_results>
4
-
5
- Analyze the results and score them on a scale of 0-10 using the following rubrics:
6
-
7
- <line_coverage>
8
- 0-1: Critical failure (0-20% coverage) - Core functionality remains completely untested
9
- 2-3: Poor coverage (21-40%) - Major gaps; many key functions lack any testing
10
- 4-5: Inadequate coverage (41-60%) - Several important code paths are not executed
11
- 6-7: Moderate coverage (61-80%) - Notable gaps remain; some important functionality lacks coverage
12
- 8-9: Good coverage (81-95%) - Only minor or edge case code paths remain untested
13
- 10: Excellent coverage (96-100%)
14
- </line_coverage>
15
-
16
- <branch_coverage>
17
- 0-1: Critical failure (0-20% branch coverage) - Almost no conditional branches are tested
18
- 2-3: Poor coverage (21-40%) - Most conditional logic remains untested
19
- 4-5: Inadequate coverage (41-60%) - Many conditions are only tested for one outcome
20
- 6-7: Moderate coverage (61-80%) - Some conditions lack testing for all outcomes
21
- 8-9: Good coverage (81-95%) - Most conditions are tested for most outcomes
22
- 10: Excellent coverage (96-100%)
23
- </branch_coverage>
24
-
25
- <method_coverage>
26
- 0-1: Critical failure (0-20% method coverage) - Most or core functionality methods are untested
27
- 2-3: Poor coverage (21-40%) - Several public API methods remain untested
28
- 4-5: Inadequate coverage (41-60%) - Some important public methods lack tests
29
- 6-7: Moderate coverage (61-80%) - Notable gaps remain; some public methods may lack comprehensive testing
30
- 8-9: Good coverage (81-95%) - Nearly all public methods are tested; private methods are mostly covered via public method tests
31
- 10: Excellent coverage (96-100%)
32
- </method_coverage>
33
-
34
- RESPONSE FORMAT
35
- You must respond in JSON format within <json> XML tags. Example:
36
-
37
- <json>
38
- {
39
- "method_coverage": {
40
- "score": 10,
41
- "justification": "The source file has 100% method coverage, indicating all methods are being tested."
42
- },
43
- "line_coverage": {
44
- "score": 10,
45
- "justification": "The source file has 100% line coverage, indicating all executable lines are tested."
46
- },
47
- "branch_coverage": {
48
- "score": 8,
49
- "justification": "The source file has 80% branch coverage, indicating some branches need testing."
50
- }
51
- }
52
- </json>
@@ -1,67 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- class CalculateFinalGrade < Roast::Workflow::BaseStep
4
- attr_accessor :llm_analysis
5
-
6
- WEIGHTS = {
7
- line_coverage: 0.1,
8
- method_coverage: 0.1,
9
- branch_coverage: 0.3,
10
- test_helpers: 0.1,
11
- mocks_and_stubs: 0.1,
12
- readability: 0.1,
13
- maintainability: 0.1,
14
- effectiveness: 0.1,
15
- }.freeze
16
-
17
- def call
18
- @llm_analysis = workflow.output["generate_grades"].merge(workflow.output["analyze_coverage"])
19
- weighted_sum = WEIGHTS.sum do |criterion, weight|
20
- score = llm_analysis[criterion.to_s]["score"].to_f / 10.0
21
- score * weight
22
- end
23
-
24
- {
25
- final_score: {
26
- weighted_score: weighted_sum,
27
- letter_grade: calculate_letter_grade(weighted_sum),
28
- },
29
- rubric_scores: calculate_rubric_scores,
30
- }
31
- end
32
-
33
- private
34
-
35
- def calculate_letter_grade(score)
36
- case score
37
- when 0.9..1.0
38
- "A"
39
- when 0.8...0.9
40
- "B"
41
- when 0.7...0.8
42
- "C"
43
- when 0.6...0.7
44
- "D"
45
- else
46
- "F"
47
- end
48
- end
49
-
50
- def calculate_rubric_scores
51
- scores = {}
52
-
53
- WEIGHTS.each_key do |criterion|
54
- raw_score = llm_analysis[criterion.to_s]["score"].to_f
55
- normalized_score = raw_score / 10.0
56
-
57
- scores[criterion] = {
58
- raw_value: raw_score,
59
- score: normalized_score,
60
- description: llm_analysis[criterion.to_s]["justification"],
61
- weighted_score: normalized_score * WEIGHTS[criterion],
62
- }
63
- end
64
-
65
- scores
66
- end
67
- end
@@ -1,64 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- class FormatResult < Roast::Workflow::BaseStep
4
- RUBRIC = {
5
- line_coverage: { description: "Line Coverage", weight: 0.1 },
6
- method_coverage: { description: "Method Coverage", weight: 0.1 },
7
- branch_coverage: { description: "Branch Coverage", weight: 0.3 },
8
- test_helpers: { description: "Test Helpers Usage", weight: 0.1 },
9
- mocks_and_stubs: { description: "Mocks and Stubs Usage", weight: 0.1 },
10
- readability: { description: "Test Readability", weight: 0.1 },
11
- maintainability: { description: "Test Maintainability", weight: 0.1 },
12
- effectiveness: { description: "Test Effectiveness", weight: 0.1 },
13
- }.freeze
14
-
15
- def call
16
- append_to_final_output(<<~OUTPUT)
17
- ========== TEST GRADE REPORT ==========
18
- Test file: #{workflow.file}
19
- OUTPUT
20
-
21
- format_results
22
- append_to_final_output("\n\n")
23
- end
24
-
25
- private
26
-
27
- def format_results
28
- # With HashWithIndifferentAccess, we can simply access with either syntax
29
- grade_data = workflow.output["calculate_final_grade"]
30
-
31
- unless grade_data
32
- return append_to_final_output("Error: Grading data not available. This may be because you're replaying the workflow from this step, but the previous step data is missing or not found in the selected session.")
33
- end
34
-
35
- format_grade(grade_data)
36
-
37
- # Make sure rubric_scores exists before trying to iterate over it
38
- unless grade_data[:rubric_scores]
39
- return append_to_final_output("Error: Rubric scores data not available in the workflow output.")
40
- end
41
-
42
- append_to_final_output("RUBRIC SCORES:")
43
- grade_data[:rubric_scores].each do |category, data|
44
- # Safely access RUBRIC with a fallback for potentially missing categories
45
- rubric_item = RUBRIC[category.to_sym] || { description: "Unknown Category", weight: 0 }
46
-
47
- append_to_final_output(" #{rubric_item[:description]} (#{(rubric_item[:weight] * 100).round}% of grade):")
48
- append_to_final_output(" Value: #{data[:raw_value] || "N/A"}")
49
- append_to_final_output(" Score: #{data[:score] ? (data[:score] * 10).round : "N/A"}/10 - \"#{data[:description] || "No description available"}\"")
50
- end
51
- end
52
-
53
- def format_grade(grade_data)
54
- return append_to_final_output("\nError: Final grade data not available.") unless grade_data && grade_data[:final_score]
55
-
56
- letter_grade = grade_data[:final_score][:letter_grade]
57
- celebration_emoji = letter_grade == "A" ? "🎉" : ""
58
- append_to_final_output(<<~OUTPUT)
59
- \nFINAL GRADE:
60
- Score: #{(grade_data[:final_score][:weighted_score] * 100).round}/100
61
- Letter Grade: #{letter_grade} #{celebration_emoji}
62
- OUTPUT
63
- end
64
- end
@@ -1,105 +0,0 @@
1
- These are the key testing guidelines to consider in your evaluation:
2
-
3
- - Tests should serve as specifications that define expected behaviors
4
- - Tests should have descriptive names that clearly communicate intent
5
- - Tests should focus on behavior rather than implementation details
6
- - Excessive mocking/stubbing should be avoided in favor of testing real behavior
7
- - Tests should be well-structured with minimal setup complexity
8
- - Tests should be maintainable and not break when implementation details change
9
- - Tests should cover edge cases and error conditions
10
- - Tests should follow proper naming conventions and directory structure
11
- - Tests should not modify the behaviour of the code being tested (e.g. making a private method public in tests)
12
-
13
- Now consider the full transcript and evaluate the test being graded based on the following rubrics on a scale of 0-10:
14
-
15
- <test_helpers>
16
- 0-1: Extremely poor helper usage - Helpers used incorrectly or inappropriately, making tests harder to understand
17
- 2-3: Poor helper usage - Helpers are poorly designed, tightly coupled to implementation, or used incorrectly
18
- 4-5: Basic helper usage - Helpers work but may be poorly organized or not reusable
19
- 6-7: Good helper usage - Helpers are well-designed and used appropriately
20
- 8-9: Very good helper usage - Helpers are well-factored, reusable, and make tests clearer
21
- 10: Excellent helper usage - Helpers are perfectly designed, highly reusable, and significantly improve test clarity and maintainability. Also give this score to tests that DO NOT use test helpers at all.
22
- </test_helpers>
23
-
24
- <mocks_and_stubs>
25
- 0-1: Extremely poor mocking - Mocks/stubs used incorrectly or excessively, completely hiding real behavior
26
- 2-3: Poor mocking - Heavy reliance on mocks that couple tests to implementation; mocks don't match real behavior
27
- 4-5: Basic mocking - Mocks used appropriately but may be overused or not match implementation exactly
28
- 6-7: Good mocking - Mocks used judiciously where needed; generally match implementation
29
- 8-9: Very good mocking - Minimal mocking focused on external dependencies; accurately reflects real behavior
30
- 10: Excellent mocking - Mocks used only where absolutely necessary (external APIs, etc); perfectly match real implementations; maintain loose coupling
31
- </mocks_and_stubs>
32
-
33
- <readability>
34
- 0-1: Extremely poor readability - Test purpose is impossible to understand; no structure or organization
35
- 2-3: Poor readability - Test names are vague or misleading; structure is confusing with no clear assertions
36
- 4-5: Basic readability - Structure is understandable but not optimized for clarity
37
- 6-7: Good readability - Structure is logical with clear assertions
38
- 8-9: Very readable - Well-organized with explicit, meaningful test names and assertions
39
- 10: Exceptionally readable - Test names serve as perfect specifications; elegant structure with context-providing descriptions; self-documenting with clear setup, execution, and verification phases
40
- </readability>
41
-
42
- <maintainability>
43
- 0-1: Extremely brittle - Tests are completely coupled to implementation details
44
- 2-3: Highly unmaintainable - Will require significant rework when code changes because of heavy coupling to implementation details
45
- 4-5: Somewhat maintainable - Some coupling to implementation details
46
- 6-7: Reasonably maintainable - Tests mostly focus on behavior over implementation; limited coupling to implementation details
47
- 8-9: Highly maintainable - Tests focus on behavior rather than implementation; changes to implementation should rarely break tests
48
- 10: Exceptionally maintainable - Tests purely focus on behavior and public interfaces; implementation can be completely refactored without breaking tests; well-factored test helpers and fixtures
49
- </maintainability>
50
-
51
- <effectiveness>
52
- 0-1: Ineffective - Don't validate actual behavior and could pass even if code is broken
53
- 2-3: Minimally effective - Only the most basic functionality validated. Many incorrect behaviors would not be caught
54
- 4-5: Partially effective - Only catch obvious issues but miss subtle bugs; limited validation of actual outcomes
55
- 6-7: Reasonably effective - Should catch most common bugs
56
- 8-9: Highly effective - Should catch nearly all bugs
57
- 10: Exceptionally effective - Should catch even subtle edge case bugs; validate both positive and negative cases
58
- </effectiveness>
59
-
60
- While grading, consider the following goals as being applicable across all rubrics:
61
-
62
- SUBJECTIVE:
63
- - Well-written: Organized, easy to understand, and follow best practices
64
- - Real behavior: Validate what the code does rather than implementation details
65
- - Isolated: Should not depend on external systems, services, or APIs. Note: The use of fixtures such as `shops(:snowdevil)` is expected and should not be penalized. The only exception is when the SUT is being loaded as a fixture unnecessarily when it could be instantiated directly.
66
-
67
- OBJECTIVE:
68
- - Idempotent: Should be able to run repeatedly without affecting outcome or side effects.
69
- - Deterministic: Should produce the same results across all runs and environments.
70
- - No sleep: Does not include sleep calls or rely on timing for synchronization.
71
- - Concurrent: Properly handles concurrent execution paths without errors.
72
- - Timeless: Does not depend on the current date or time. Will not fail due to changes such as daylight savings or leap years. Specifically with regards to handling time, look for anti-patterns like `Time.current + 7.days.to_i`, which fails on DST changes. The correct approach is `7.days.from_now`.
73
-
74
- VIOLATING ANY OBJECTIVE GOAL SHOULD RESULT IN AN OVERALL SCORE LESS THAN 5!
75
-
76
- Provide a brief justification for each score, using a maximum of 1-3 sentences. (Note that specific recommendations for improvement are not needed at this step.)
77
-
78
- You are acting as a stern and relentless striver for excellence in programming, so you must be highly critical. The point of this grading exercise is to facilitate substantial improvement, not just stroking the programmer's ego. Do not hesitate to give a failing overall score (0) for serious violations!
79
-
80
- RESPONSE FORMAT: You must respond in JSON format within <json> XML tags.
81
-
82
- <json>
83
- {
84
- "test_helpers": {
85
- "score": 4,
86
- "justification": "Helpers are used incorrectly in several places, reducing test maintainability and clarity. The assert_valid_record helper is being misused with hashes instead of model instances."
87
- },
88
- "mocks_and_stubs": {
89
- "score": 4,
90
- "justification": "Several mocks don't match the actual implementation, making tests brittle and potentially hiding production bugs. For example, mocking success: true when the service returns status: 'success'."
91
- },
92
- "readability": {
93
- "score": 8,
94
- "justification": "Test names clearly describe behavior being tested."
95
- },
96
- "maintainability": {
97
- "score": 6,
98
- "justification": "Tests mostly focus on behavior but have some coupling to implementation."
99
- },
100
- "effectiveness": {
101
- "score": 7,
102
- "justification": "Tests validate most expected behaviors and would catch common bugs."
103
- }
104
- }
105
- </json>
@@ -1,17 +0,0 @@
1
- ========== TEST RECOMMENDATIONS ==========
2
- <%- if response.recommendations.empty? -%>
3
- No recommendations found.
4
- <%- else -%>
5
- <%- response.recommendations.each_with_index do |rec, index| -%>
6
- Recommendation #<%= index + 1 %>:
7
- Description: <%= rec.description %>
8
- Impact: <%= rec.impact %>
9
- Priority: <%= rec.priority %>
10
-
11
- Code Suggestion:
12
-
13
- <%= rec.code_suggestion %>
14
-
15
- <%- end -%>
16
- <%- end -%>
17
- ===========================================
@@ -1,60 +0,0 @@
1
- Finally, based on the conversation transcript above, go ahead and provide specific, actionable recommendations that would most effectively improve the overall test score.
2
-
3
- Focus on recommendations that would:
4
-
5
- 1. Increase coverage
6
- 2. Add more assertions where needed
7
- 3. Make the tests more maintainable or readable
8
- 4. Ensure tests serve as specifications by having clear, descriptive names
9
- 5. Reduce excessive mocking/stubbing that couples tests to implementation details
10
- 6. Improve test structure to reduce setup complexity
11
- 7. Ensure tests focus on behavior rather than implementation details
12
- 8. Ensure gaps in private methods are tested through public methods
13
- 9. Fix any issues with test helpers that are used incorrectly or unnecessarily
14
- 10. Improve efficiency by combining or deleting tests where appropriate (note that having more than one assertion per test is acceptable)
15
- 11. Fix any violations of the objective criteria (idempotency, determinism, etc.)
16
- 12. Be specific about edge cases that should be covered by tests. Write down in the recommendations which edge cases you are referring to.
17
- 13. Do not recommend the use of RSpec features like `let` for Minispec tests.
18
-
19
- IF YOU IDENTIFY EDGE CASES, YOU MUST BE SPECIFIC ABOUT THEM IN THE RECOMMENDATIONS.
20
-
21
- RESPONSE FORMAT: You must respond in JSON format inside <json> XML tags without additional commentary.
22
-
23
- Example:
24
-
25
- <json>
26
- {
27
- "recommendations": [
28
- {
29
- "description": "Add tests for uncovered method X",
30
- "impact": "Would increase method coverage by Y%",
31
- "priority": "High",
32
- "code_suggestion": "def test_method_x_with_valid_input\n result = subject.method_x('valid_input')\n assert_equal expected_result, result\nend"
33
- },
34
- {
35
- "description": "Fix time handling to avoid DST issues",
36
- "impact": "Would make tests deterministic across DST changes",
37
- "priority": "High",
38
- "code_suggestion": "# Replace\nexpiry_time = Time.current + 7.days.to_i\n\n# With\nexpiry_time = 7.days.from_now"
39
- },
40
- {
41
- "description": "Add edge case tests for the show action for when the parameter X is blank",
42
- "impact": "Would improve test completeness and effectiveness",
43
- "priority": "Medium",
44
- "code_suggestion": "..."
45
- },
46
- {
47
- "description": "Improve test descriptions to better serve as specifications",
48
- "impact": "Would make tests more valuable as documentation",
49
- "priority": "Medium",
50
- "code_suggestion": "# Replace\ndef test_process\n\n# With\ndef test_process_returns_success_with_valid_input"
51
- },
52
- {
53
- "description": "Replace implementation-focused mocks with behavior assertions",
54
- "impact": "Would make tests less brittle and more maintainable",
55
- "priority": "High",
56
- "code_suggestion": "# Replace\nUserNotifier.expects(:notify).with(user, 'welcome')\n\n# With\nassert_sends_notification(user, 'welcome') do\n subject.process\nend"
57
- }
58
- ]
59
- }
60
- </json>
@@ -1,31 +0,0 @@
1
- #!/usr/bin/env ruby
2
- # frozen_string_literal: true
3
-
4
- if ARGV.length != 2
5
- puts "Usage: #{File.basename($PROGRAM_NAME)} SUBJECT_FILE TEST_FILE"
6
- exit 1
7
- end
8
-
9
- subject_file, test_file = ARGV
10
-
11
- def detect_package_manager
12
- return "pnpm" if File.exist?(File.join(Dir.pwd, "pnpm-lock.yaml"))
13
- return "yarn" if File.exist?(File.join(Dir.pwd, "yarn.lock"))
14
-
15
- "npm"
16
- end
17
-
18
- jest_options = [
19
- "--verbose",
20
- "--no-colors",
21
- "--ci",
22
- "--coverageReporters=text-summary",
23
- "--collectCoverageFrom=#{subject_file}",
24
- ]
25
-
26
- # Assumes the test command is `test:coverage`
27
- # Both admin-web and checkout-web use this command
28
- command = "#{detect_package_manager} run test:coverage -- #{test_file} #{jest_options.join(" ")}"
29
-
30
- $stderr.puts "Running: #{command}"
31
- puts system(command)
@@ -1,19 +0,0 @@
1
- #!/usr/bin/env ruby
2
- # frozen_string_literal: true
3
-
4
- require "rubygems"
5
- require "bundler/setup"
6
-
7
- require "roast/helpers/minitest_coverage_runner"
8
-
9
- # Suppress fancy minitest reporting
10
- ENV["RM_INFO"] = "true"
11
-
12
- if ARGV.length != 2
13
- puts "Usage: #{File.basename($PROGRAM_NAME)} SUBJECT_FILE TEST_FILE"
14
- exit 1
15
- end
16
-
17
- test_file, subject_file = ARGV
18
-
19
- Roast::Helpers::MinitestCoverageRunner.new(test_file, subject_file).run
@@ -1,16 +0,0 @@
1
- Use the provided functions to find and read important dependencies of the provided test file named <%= workflow.file %>.
2
-
3
- The first dependency you should always look for is the source file for the prime subject of the test (whatever class this test file is claiming to test). Use `read_file` to read the subject's source code into your conversation transcript, but only if it's not already there from a previous chat.
4
-
5
- If you can identify other important application-level dependencies then read them too.
6
- How many extra dependencies to research is left to your discretion, but ALWAYS make sure you have the subject under test (SUT) in your context before responding.
7
-
8
- Once you are finished using tool functions, respond with the relative path to the source file of the SUT inside <sut> tags. IMPORTANT: Include the full relative path from the project root, including any directory prefixes like lib/, app/, etc.
9
-
10
- Example:
11
-
12
- If you are told to find the dependencies of `test/services/country_db_interface_test.rb`,
13
- then you would use the functions as explained above and ultimately respond with `<sut>app/services/country_db_interface.rb</sut>`
14
-
15
- If the file is found at `lib/roast/workflow/workflow_initializer.rb`, respond with `<sut>lib/roast/workflow/workflow_initializer.rb</sut>` (include the lib/ prefix)
16
-