RubyGems - ace-test-runner-e2e - Versions diffs - 0.29.8 → 0.38.11 - Mend

ace-test-runner-e2e 0.29.8 → 0.38.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (49) hide show

data/lib/ace/test/end_to_end_runner/molecules/suite_report_writer.rb CHANGED Viewed

@@ -1,7 +1,10 @@
 # frozen_string_literal: true
 require "fileutils"
+require "ostruct"
 require "yaml"
+require "set"
+require "date"
 require "ace/llm"
 require "ace/llm/query_interface"
@@ -9,7 +12,7 @@ module Ace
   module Test
     module EndToEndRunner
       module Molecules
-        # Writes a suite-level final report aggregating all test results
+        # Writes an aggregated package or suite report
         #
         # Uses LLM synthesis to generate rich reports with root cause analysis,
         # friction insights, and improvement suggestions. Falls back to a static
@@ -22,7 +25,12 @@ module Ace
             @timeout = reporting["timeout"] || 60
           end
-          # Write a suite-level final report
+          REPORT_KINDS = {
+            package: ->(timestamp, package) { "#{timestamp}-#{package}-report.md" },
+            suite: ->(timestamp, _package) { "#{timestamp}-suite-report.md" }
+          }.freeze
+          # Write an aggregated report
           #
           # @param results [Array<Models::TestResult>] Test results (ordered)
           # @param scenarios [Array<Models::TestScenario>] Corresponding scenarios
@@ -30,22 +38,31 @@ module Ace
           # @param timestamp [String] Timestamp ID for this run
           # @param base_dir [String] Base directory for cache output
           # @return [String] Path to the written report file
-          def write(results, scenarios, package:, timestamp:, base_dir:)
+          def write(results, scenarios, package:, timestamp:, base_dir:, report_kind: :package, diagnostics: nil)
             cache_dir = File.join(base_dir, ".ace-local", "test-e2e")
             FileUtils.mkdir_p(cache_dir)
-            report_path = File.join(cache_dir, "#{timestamp}-final-report.md")
+            report_path = File.join(cache_dir, report_filename(report_kind, timestamp, package))
             overall_status = compute_status(results)
             executed_at = Time.now.utc.strftime("%Y-%m-%dT%H:%M:%SZ")
-            content = synthesize_report(
-              results, scenarios,
+            results_data = build_results_data(results, scenarios)
+            narrative_sections = synthesize_narrative_sections(
+              results_data,
               package: package,
               timestamp: timestamp,
               overall_status: overall_status,
               executed_at: executed_at
             )
+            content = build_report(
+              results_data,
+              package: package,
+              timestamp: timestamp,
+              overall_status: overall_status,
+              executed_at: executed_at,
+              narrative_sections: narrative_sections,
+              diagnostics: diagnostics
+            )
             File.write(report_path, content)
             report_path
@@ -53,10 +70,16 @@ module Ace
           private
-          # Attempt LLM synthesis, falling back to static template
-          def synthesize_report(results, scenarios, package:, timestamp:, overall_status:, executed_at:)
-            results_data = build_results_data(results, scenarios)
+          def report_filename(report_kind, timestamp, package)
+            builder = REPORT_KINDS[report_kind.to_sym]
+            raise ArgumentError, "Unknown report kind: #{report_kind}" unless builder
+            builder.call(timestamp, package)
+          end
+          # Attempt LLM synthesis for narrative sections only, falling back to
+          # deterministic defaults when the model is unavailable or malformed.
+          def synthesize_narrative_sections(results_data, package:, timestamp:, overall_status:, executed_at:)
             prompt_builder = Atoms::SuiteReportPromptBuilder.new
             user_prompt = prompt_builder.build(
               results_data,
@@ -73,51 +96,35 @@ module Ace
               timeout: @timeout,
               temperature: 0.3
             )
-            total_passed = results.sum(&:passed_count)
-            total_tc = results.sum(&:total_count)
-            validate_overall_line(response[:text], total_passed, total_tc)
+            extract_narrative_sections(response[:text])
           rescue => e
-            # LLM failed — fall back to static report
-            warn "Warning: LLM synthesis failed (#{e.class}: #{e.message}), using static report" if ENV["DEBUG"]
-            executed_date = Time.now.utc.strftime("%Y-%m-%d")
-            total_passed = results.sum(&:passed_count)
-            total_failed = results.sum(&:failed_count)
-            total_tc = results.sum(&:total_count)
-            build_static_report(
-              results, scenarios,
-              package: package,
-              timestamp: timestamp,
-              overall_status: overall_status,
-              executed_at: executed_at,
-              executed_date: executed_date,
-              total_passed: total_passed,
-              total_failed: total_failed,
-              total_tc: total_tc
-            )
+            warn "Warning: LLM synthesis failed (#{e.class}: #{e.message}), using deterministic narrative" if ENV["DEBUG"]
+            fallback_narrative_sections(results_data)
           end
           # Read summary and experience report content from each result's report dir
           def build_results_data(results, scenarios)
             results.each_with_index.map do |result, i|
-              scenario = scenarios[i]
+              scenario = scenario_for_result(result, scenarios, i)
               report_dir = result.report_dir
               summary_content = read_report_file(report_dir, "summary.r.md")
               experience_content = read_report_file(report_dir, "experience.r.md")
+              report_metadata = read_report_frontmatter(report_dir)
               {
                 test_id: result.test_id,
                 title: scenario.title,
                 status: result.status,
-                passed: result.passed_count,
-                failed: result.failed_count,
-                total: result.total_count,
-                test_cases: result.test_cases,
+                passed: reported_count(report_metadata, result, "passed"),
+                failed: reported_count(report_metadata, result, "failed"),
+                total: reported_count(report_metadata, result, "total"),
+                test_cases: canonical_test_cases(report_metadata, result),
                 report_dir_name: report_dir ? File.basename(report_dir) : nil,
                 summary_content: summary_content,
-                experience_content: experience_content
+                experience_content: experience_content,
+                canonical_tc_source: !report_metadata.empty?
               }
             end
           end
@@ -132,21 +139,71 @@ module Ace
             File.read(path)
           end
-          # Validate the LLM-generated Overall line against deterministic totals.
-          # If the LLM hallucinated wrong numbers, replace the line with correct values.
-          def validate_overall_line(report_text, expected_passed, expected_total)
-            expected_pct = (expected_total > 0) ? (expected_passed * 100.0 / expected_total).round(0) : 0
-            correct_line = "**Overall:** #{expected_passed}/#{expected_total} test cases passed (#{expected_pct}%)"
+          def read_report_frontmatter(report_dir)
+            return {} unless report_dir
-            # Match patterns like "**Overall:** X/Y test cases passed (Z%)"
-            overall_pattern = /\*\*Overall:\*\*\s*\d+\/\d+\s+test cases passed\s*\(\d+%\)/
+            path = File.join(report_dir, "report.md")
+            return {} unless File.exist?(path)
-            if report_text.match?(overall_pattern)
-              report_text.gsub(overall_pattern, correct_line)
-            else
-              # No Overall line found — append the correct one after the summary table
-              "#{report_text.rstrip}\n\n#{correct_line}\n"
+            content = File.read(path)
+            match = content.match(/\A---\s*\n(.*?)\n---\s*\n/m)
+            return {} unless match
+            YAML.safe_load(match[1], permitted_classes: [Time, Date]) || {}
+          rescue
+            {}
+          end
+          def reported_count(report_metadata, result, kind)
+            key = "tcs-#{kind}"
+            fallback =
+              case kind
+              when "passed" then result.passed_count
+              when "failed" then result.failed_count
+              else result.total_count
+              end
+            report_metadata[key] || fallback
+          end
+          def canonical_test_cases(report_metadata, result)
+            return result.test_cases if report_metadata.empty?
+            failed_entries = Array(report_metadata["failed"]).filter_map do |entry|
+              next unless entry.is_a?(Hash)
+              id = entry["tc"] || entry[:tc]
+              next unless id
+              {
+                id: id,
+                description: "",
+                status: "fail",
+                notes: entry["evidence"] || entry[:evidence] || "See scenario report for details",
+                category: entry["category"] || entry[:category] || "runner-error"
+              }
+            end
+            failed_ids = failed_entries.map { |entry| entry[:id] }.to_set
+            Array(report_metadata["canonical-failed-tcs"]).each do |tc_id|
+              next if failed_ids.include?(tc_id)
+              failed_entries << {
+                id: tc_id,
+                description: "",
+                status: "fail",
+                notes: "See scenario report for details",
+                category: "runner-error"
+              }
             end
+            passed_entries = Array(report_metadata["passed"]).filter_map do |tc_id|
+              next if failed_ids.include?(tc_id)
+              {id: tc_id, description: "", status: "pass", notes: ""}
+            end
+            canonical = passed_entries + failed_entries
+            canonical.empty? ? result.test_cases : canonical
           end
           def compute_status(results)
@@ -163,22 +220,26 @@ module Ace
             end
           end
-          # Static fallback report (original template-based approach)
-          def build_static_report(results, scenarios, package:, timestamp:, overall_status:,
-            executed_at:, executed_date:, total_passed:, total_failed:, total_tc:)
-            total_skipped = results.count(&:skipped?)
+          def build_report(results_data, package:, timestamp:, overall_status:, executed_at:, narrative_sections:, diagnostics:)
+            total_skipped = results_data.count { |r| r[:status] == "skip" }
+            total_passed = results_data.sum { |r| r[:passed] }
+            total_tc = results_data.sum { |r| r[:total] }
             parts = []
             parts << build_frontmatter(
               timestamp: timestamp, package: package, overall_status: overall_status,
-              tests_run: results.size, executed_at: executed_at, skipped: total_skipped
+              tests_run: results_data.size, executed_at: executed_at, skipped: total_skipped
             )
-            parts << build_header(package: package, tests_run: results.size, executed_date: executed_date, skipped: total_skipped)
-            parts << build_summary_table(results, scenarios)
+            parts << build_header(package: package)
+            parts << build_summary_table(results_data)
             parts << build_overall_line(total_passed: total_passed, total_tc: total_tc)
-            parts << build_failed_section(results, scenarios) if results.any?(&:failed?)
-            parts << build_reports_section(results, scenarios)
-            parts.join("\n")
+            parts << build_failed_section(results_data) if results_data.any? { |r| r[:failed].positive? }
+            parts << build_runner_diagnostics_section(diagnostics)
+            parts << build_narrative_section("Friction Analysis", narrative_sections[:friction])
+            parts << build_narrative_section("Improvement Suggestions", narrative_sections[:improvements])
+            parts << build_narrative_section("Positive Observations", narrative_sections[:positive])
+            parts << build_reports_section(results_data)
+            parts.compact.join("\n")
           end
           def build_frontmatter(timestamp:, package:, overall_status:, tests_run:, executed_at:, skipped: 0)
@@ -194,82 +255,165 @@ module Ace
             FRONTMATTER
           end
-          def build_header(package:, tests_run:, executed_date:, skipped: 0)
-            skipped_info = (skipped > 0) ? " (#{skipped} skipped)" : ""
+          def build_header(package:)
             <<~HEADER
-              # E2E Test Suite Report
-              **Package:** #{package}
-              **Tests:** #{tests_run}#{skipped_info}
-              **Executed:** #{executed_date}
+              # E2E Suite Report: `#{package}`
             HEADER
           end
-          def build_summary_table(results, scenarios)
-            rows = results.each_with_index.map do |result, i|
-              scenario = scenarios[i]
-              status_label = result.status.capitalize
-              passed = result.skipped? ? "-" : result.passed_count.to_s
-              failed = result.skipped? ? "-" : result.failed_count.to_s
-              total = result.skipped? ? "-" : result.total_count.to_s
-              "| #{result.test_id} | #{scenario.title} | #{status_label} | #{passed} | #{failed} | #{total} |"
+          def build_summary_table(results_data)
+            rows = results_data.map do |result|
+              status_label = result[:status].capitalize
+              passed = (result[:status] == "skip") ? "-" : result[:passed].to_s
+              failed = (result[:status] == "skip") ? "-" : result[:failed].to_s
+              total = (result[:status] == "skip") ? "-" : result[:total].to_s
+              "| #{result[:test_id]} | #{result[:title]} | #{status_label} | #{passed} | #{failed} | #{total} |"
             end
             <<~TABLE
-              ## Summary
+              ## Summary Table
               | Test ID | Title | Status | Passed | Failed | Total |
-              |---------|-------|--------|--------|--------|-------|
+              |---|---|---:|---:|---:|---:|
               #{rows.join("\n")}
             TABLE
           end
           def build_overall_line(total_passed:, total_tc:)
-            pct = (total_tc > 0) ? (total_passed * 100.0 / total_tc).round(0) : 0
-            "**Overall:** #{total_passed}/#{total_tc} test cases passed (#{pct}%)\n"
+            pct = (total_tc > 0) ? (total_passed * 100.0 / total_tc).round(1) : 0.0
+            formatted_pct = (pct % 1).zero? ? pct.to_i.to_s : format("%.1f", pct)
+            <<~OVERALL
+              ## Overall Line
+              **Overall:** #{total_passed}/#{total_tc} test cases passed (#{formatted_pct}%)
+            OVERALL
           end
-          def build_failed_section(results, scenarios)
+          def build_failed_section(results_data)
             parts = ["\n## Failed Tests\n"]
-            results.each_with_index do |result, i|
-              next if result.success? || result.skipped?
+            results_data.each do |result|
+              next unless result[:failed].positive?
-              scenario = scenarios[i]
-              parts << "### #{result.test_id}: #{scenario.title} (#{result.passed_count}/#{result.total_count})\n"
+              parts << "### #{result[:test_id]}"
+              parts << ""
+              parts << "**Failed test case details**"
-              failed_tcs = result.test_cases.select { |tc| tc[:status] == "fail" }
+              failed_tcs = result[:test_cases].select { |tc| tc[:status] == "fail" }
               if failed_tcs.any?
-                parts << "**Failed Test Cases:**"
                 failed_tcs.each do |tc|
-                  parts << "- #{tc[:id]}: #{tc[:description]}"
+                  category = tc[:category] || "runner-error"
+                  details = tc[:notes].to_s.strip
+                  details = tc[:description].to_s if details.empty?
+                  parts << "- `#{tc[:id]}` (#{category}) — #{details}"
                 end
-                parts << ""
+              else
+                parts << "- Exact failed TC mapping unavailable in aggregate view — see scenario report for canonical details."
               end
-              if result.report_dir
-                parts << "**Report:** #{result.report_dir}\n"
+              if result[:report_dir_name]
+                parts << ""
+                parts << "**Report directory:** `#{result[:report_dir_name]}`"
               end
+              parts << ""
             end
             parts.join("\n")
           end
-          def build_reports_section(results, scenarios)
-            rows = results.each_with_index.map do |result, i|
-              dir = result.report_dir ? File.basename(result.report_dir) : "N/A"
-              "| #{result.test_id} | #{dir} |"
+          def build_runner_diagnostics_section(diagnostics)
+            return nil unless diagnostics.is_a?(Hash) && diagnostics[:dirty_worktree]
+            entries = Array(diagnostics[:new_tracked_entries]).map { |line| "- `#{line}`" }.join("\n")
+            entries = "- No specific entries captured." if entries.empty?
+            <<~SECTION
+              ## Runner Diagnostics
+              Suite execution introduced new tracked working-tree changes relative to the pre-run snapshot.
+              #{entries}
+            SECTION
+          end
+          def build_narrative_section(title, content)
+            return nil if content.to_s.strip.empty?
+            <<~SECTION
+              ## #{title}
+              #{content.to_s.strip}
+            SECTION
+          end
+          def build_reports_section(results_data)
+            rows = results_data.map do |result|
+              dir = result[:report_dir_name] || "N/A"
+              "| #{result[:test_id]} | `#{dir}` |"
             end
             <<~SECTION
-              ## Reports
+              ## Reports Table
-              | Test ID | Reports Folder |
-              |---------|----------------|
+              | Test ID | Report Directory |
+              |---|---|
               #{rows.join("\n")}
             SECTION
           end
+          def extract_narrative_sections(report_text)
+            text = report_text.to_s
+            sections = {
+              friction: extract_markdown_section(text, "Friction Analysis"),
+              improvements: extract_markdown_section(text, "Improvement Suggestions"),
+              positive: extract_markdown_section(text, "Positive Observations")
+            }
+            fallback = strip_canonical_sections(text)
+            has_markdown_sections = text.match?(/^\#{2,3}\s+/)
+            sections[:positive] = fallback if sections.values.all? { |value| value.to_s.strip.empty? } &&
+              !fallback.empty? && !has_markdown_sections
+            sections
+          end
+          def extract_markdown_section(text, heading)
+            match = text.match(/^\#{2,3}\s+#{Regexp.escape(heading)}\s*$\n?(.*?)(?=^\#{1,3}\s|\z)/mi)
+            return "" unless match
+            match[1].to_s.strip
+          end
+          def strip_canonical_sections(text)
+            body = text.to_s.dup
+            body.sub!(/\A---.*?^---\s*/m, "")
+            body.gsub!(/^\#{1,3}\s+.*$/, "")
+            body.gsub!(/^\|.*\|\s*$/, "")
+            body.gsub!(/^\*\*Overall:\*\*.*$/, "")
+            body.lines.map(&:rstrip).reject(&:empty?).join("\n").strip
+          end
+          def fallback_narrative_sections(results_data)
+            failed_results = results_data.select { |result| result[:failed].positive? }
+            {
+              friction: failed_results.empty? ? "" : failed_results.map { |result|
+                "- #{result[:test_id]} had #{result[:failed]} failing test case(s); inspect `#{result[:report_dir_name]}` for scenario details."
+              }.join("\n"),
+              improvements: failed_results.empty? ? "" : failed_results.map { |result|
+                "- Re-run #{result[:test_id]} after the targeted fix and confirm the failing test case set is empty."
+              }.join("\n"),
+              positive: results_data.select { |result| result[:failed].zero? }.map { |result|
+                "- #{result[:test_id]} passed #{result[:passed]}/#{result[:total]} test cases."
+              }.join("\n")
+            }
+          end
+          def scenario_for_result(result, scenarios, index)
+            scenarios[index] || OpenStruct.new(
+              title: result.metadata[:phase] == "preflight" || result.metadata["phase"] == "preflight" ? "Preflight" : result.test_id
+            )
+          end
         end
       end
     end

data/lib/ace/test/end_to_end_runner/molecules/test_discoverer.rb CHANGED Viewed

@@ -6,16 +6,18 @@ module Ace
   module Test
     module EndToEndRunner
       module Molecules
-        # Discovers E2E test scenario directories (TS-*/scenario.yml) in packages
+        # Discovers deterministic preflight tests and agent E2E scenarios in packages
         #
         # Finds test scenarios in the TS-format directory structure:
+        #   {package}/test/feat/**/*_test.rb
         #   {package}/test/e2e/TS-*/scenario.yml
         #
         # Note: This is a Molecule (not an Atom) because it performs filesystem
         # I/O via Dir.glob.
         class TestDiscoverer
-          TEST_DIR = "test/e2e"
+          TEST_DIRS = ["test/e2e"].freeze
           SCENARIO_FILE = "scenario.yml"
+          DEFAULT_PREFLIGHT_GLOBS = ["test/feat/**/*_test.rb"].freeze
           SCENARIO_DIR_PATTERN = "TS-*"
           # Find E2E test scenario files matching criteria
@@ -47,6 +49,17 @@ module Ace
             ).map(&:file_path).sort
           end
+          # @return [Array<String>] Sorted list of matching deterministic preflight test files
+          def find_integration_tests(package:, base_dir: Dir.pwd)
+            package_path = File.join(base_dir, package)
+            preflight_globs.each do |glob|
+              files = Dir.glob(File.join(package_path, glob)).sort
+              return files unless files.empty?
+            end
+            []
+          end
           # Find TS-format scenario directories and load them as TestScenario models
           #
           # @param package [String] Package name
@@ -56,9 +69,11 @@ module Ace
           # @param base_dir [String] Base directory to search from
           # @return [Array<Models::TestScenario>] Loaded scenario models with test_cases
           def find_scenarios(package:, test_id: nil, tags: nil, exclude_tags: nil, base_dir: Dir.pwd)
-            test_dir = File.join(base_dir, package, TEST_DIR)
-            pattern = File.join(test_dir, SCENARIO_DIR_PATTERN, SCENARIO_FILE)
-            scenario_files = Dir.glob(pattern).sort
+            patterns = TEST_DIRS.map do |test_dir_name|
+              test_dir = File.join(base_dir, package, test_dir_name)
+              File.join(test_dir, SCENARIO_DIR_PATTERN, SCENARIO_FILE)
+            end
+            scenario_files = Dir.glob(patterns).sort
             loader = ScenarioLoader.new
             scenarios = scenario_files.map do |yml_path|
@@ -82,11 +97,13 @@ module Ace
           # @param base_dir [String] Base directory to search from
           # @return [Array<String>] Sorted list of package names
           def list_packages(base_dir: Dir.pwd)
-            pattern = File.join(base_dir, "*/#{TEST_DIR}/#{SCENARIO_DIR_PATTERN}/#{SCENARIO_FILE}")
+            patterns = TEST_DIRS.map do |test_dir_name|
+              File.join(base_dir, "*/#{test_dir_name}/#{SCENARIO_DIR_PATTERN}/#{SCENARIO_FILE}")
+            end
             base = Pathname.new(base_dir)
-            Dir.glob(pattern)
+            Dir.glob(patterns)
               .map { |f| Pathname.new(f).relative_path_from(base).each_filename.first }
               .uniq
               .sort
@@ -96,12 +113,14 @@ module Ace
           # Build glob pattern for finding TS-format scenario.yml files
           def build_scenario_pattern(package, test_id, base_dir)
-            test_dir = File.join(base_dir, package, TEST_DIR)
-            if test_id
-              File.join(test_dir, "*#{test_id}*", SCENARIO_FILE)
-            else
-              File.join(test_dir, SCENARIO_DIR_PATTERN, SCENARIO_FILE)
+            TEST_DIRS.map do |test_dir_name|
+              test_dir = File.join(base_dir, package, test_dir_name)
+              if test_id
+                File.join(test_dir, "*#{test_id}*", SCENARIO_FILE)
+              else
+                File.join(test_dir, SCENARIO_DIR_PATTERN, SCENARIO_FILE)
+              end
             end
           end
@@ -129,6 +148,12 @@ module Ace
             filtered
           end
+          def preflight_globs
+            configured = Molecules::ConfigLoader.load.dig("patterns", "preflight")
+            globs = [configured, *DEFAULT_PREFLIGHT_GLOBS].compact.uniq
+            globs.reject(&:empty?)
+          end
         end
       end
     end

data/lib/ace/test/end_to_end_runner/molecules/test_executor.rb CHANGED Viewed

@@ -16,12 +16,18 @@ module Ace
           # @param provider [String] LLM provider:model string
           # @param timeout [Integer] Request timeout in seconds
           # @param config [Hash] Configuration hash (string keys) from ConfigLoader
-          def initialize(provider: nil, timeout: nil, config: nil)
+          def initialize(provider: nil, timeout: nil, config: nil, sandbox_backend_factory: nil)
             config ||= Molecules::ConfigLoader.load
-            @provider = provider || config.dig("execution", "provider") || "claude:sonnet"
+            @provider = provider || config.dig("execution", "runner_provider") ||
+              config.dig("execution", "provider") || "claude:sonnet"
+            @verifier_provider = config.dig("execution", "verifier_provider") ||
+              config.dig("execution", "provider") || @provider
             @timeout = timeout || config.dig("execution", "timeout") || 300
             @prompt_builder = Atoms::PromptBuilder.new
             @cli_provider_adapter = Atoms::CliProviderAdapter.new(config)
+            @sandbox_backend_factory = sandbox_backend_factory || lambda { |sandbox_path, source_root: nil|
+              Molecules::BwrapSandboxBackend.new(sandbox_root: sandbox_path, source_root: source_root)
+            }
           end
           # Execute a single test scenario via LLM
@@ -192,9 +198,10 @@ module Ace
           # Execute TC via skill invocation for CLI providers
           def execute_tc_via_skill(test_case, sandbox_path, scenario, cli_args: nil, run_id: nil, env_vars: nil)
             with_tc_error_handling(scenario) do |started_at|
+              sandbox_backend, prepared_env = prepared_env_for(sandbox_path, env_vars)
               prompt = @cli_provider_adapter.build_tc_skill_prompt(
                 test_case: test_case, scenario: scenario,
-                sandbox_path: sandbox_path, run_id: run_id, env_vars: env_vars
+                sandbox_path: sandbox_path, run_id: run_id, env_vars: prepared_env
               )
               response = Ace::LLM::QueryInterface.query(
@@ -202,7 +209,8 @@ module Ace
                 system: nil, cli_args: cli_args,
                 timeout: @timeout, fallback: false,
                 working_dir: sandbox_path,
-                subprocess_env: env_vars
+                subprocess_env: prepared_env,
+                subprocess_command_prefix: sandbox_backend.command_prefix(chdir: sandbox_path, env: prepared_env)
               )
               invocation_error = detect_skill_invocation_error(response[:text])
@@ -322,9 +330,23 @@ module Ace
             @pipeline_executors ||= {}
             @pipeline_executors[timeout] ||= Molecules::PipelineExecutor.new(
               provider: @provider,
-              timeout: timeout
+              verifier_provider: @verifier_provider,
+              timeout: timeout,
+              sandbox_backend_factory: @sandbox_backend_factory
             )
           end
+          def build_sandbox_backend(sandbox_path, env_vars)
+            @sandbox_backend_factory.call(
+              sandbox_path,
+              source_root: env_vars&.dig("ACE_E2E_SOURCE_ROOT") || env_vars&.dig(:ACE_E2E_SOURCE_ROOT)
+            )
+          end
+          def prepared_env_for(sandbox_path, env_vars)
+            sandbox_backend = build_sandbox_backend(sandbox_path, env_vars || {})
+            [sandbox_backend, sandbox_backend.prepared_env(env_vars || {})]
+          end
         end
       end
     end