RubyGems - ace-test-runner-e2e - Version diff - 0.29.6 → 0.38.11 - Mend

ace-test-runner-e2e 0.29.6 → 0.38.11

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (49)

data/lib/ace/test/end_to_end_runner/molecules/integration_runner.rb ADDED

@@ -0,0 +1,122 @@
+# frozen_string_literal: true
+require "fileutils"
+require "open3"
+require "shellwords"
+require "ace/test_support/sandbox_package_copy"
+module Ace
+  module Test
+    module EndToEndRunner
+      module Molecules
+        # Runs deterministic preflight tests inside a sandboxed package copy.
+        class IntegrationRunner
+          def initialize(base_dir: Dir.pwd, package_copy: nil)
+            @base_dir = File.expand_path(base_dir)
+            @package_copy = package_copy || Ace::TestSupport::SandboxPackageCopy.new(source_root: @base_dir)
+          end
+          def run(package:, files:, timestamp:, output: $stdout)
+            return nil if files.nil? || files.empty?
+            started_at = Time.now
+            sandbox_root = File.join(@base_dir, ".ace-local", "test-e2e", "#{timestamp}-#{package}-preflight")
+            FileUtils.mkdir_p(sandbox_root)
+            package_copy_result = @package_copy.prepare(package_name: package, sandbox_root: sandbox_root)
+            package_root = resolve_package_root(sandbox_root, package)
+            env = package_copy_result[:env] || {}
+            test_cases = files.map do |file|
+              run_file(package_root, file, env, output)
+            end
+            status = if test_cases.any? { |tc| tc[:status] == "error" }
+              "error"
+            elsif test_cases.any? { |tc| tc[:status] == "fail" }
+              "fail"
+            else
+              "pass"
+            end
+            Models::TestResult.new(
+              test_id: "PREFLIGHT",
+              status: status,
+              test_cases: test_cases,
+              summary: preflight_summary(status, test_cases),
+              started_at: started_at,
+              completed_at: Time.now,
+              metadata: {
+                phase: "preflight",
+                package: package,
+                sandbox_root: sandbox_root
+              }
+            )
+          end
+          private
+          def run_file(package_root, file, env, output)
+            relative = file.sub(%r{\A#{Regexp.escape(@base_dir)}/?}, "")
+            package_relative = relative.sub(%r{\A[^/]+/}, "")
+            stdout, stderr, status = Open3.capture3(
+              env,
+              "ace-test",
+              package_relative,
+              chdir: package_root
+            )
+            output.puts "Preflight: #{package_relative} (#{status.success? ? "pass" : "fail"})"
+            {
+              id: package_relative,
+              description: package_relative,
+              status: status.success? ? "pass" : "fail",
+              actual: stdout,
+              notes: stderr,
+              metadata: {
+                phase: "preflight",
+                exit_status: status.exitstatus,
+                command: Shellwords.join(["ace-test", package_relative])
+              }
+            }
+          rescue StandardError => e
+            output.puts "Preflight: #{package_relative} (error)"
+            {
+              id: package_relative,
+              description: package_relative,
+              status: "error",
+              actual: "",
+              notes: e.message,
+              metadata: {
+                phase: "preflight",
+                command: Shellwords.join(["ace-test", package_relative])
+              }
+            }
+          end
+          def resolve_package_root(sandbox_root, package)
+            candidate = File.join(sandbox_root, package)
+            return candidate if Dir.exist?(candidate)
+            sandbox_root
+          end
+          def preflight_summary(status, test_cases)
+            passed = test_cases.count { |tc| tc[:status] == "pass" }
+            total = test_cases.size
+            prefix =
+              case status
+              when "pass" then "Preflight passed"
+              when "fail" then "Preflight failed"
+              else "Preflight errored"
+              end
+            "#{prefix}: #{passed}/#{total} files passed"
+          end
+        end
+      end
+    end
+  end
+end

data/lib/ace/test/end_to_end_runner/molecules/pipeline_executor.rb CHANGED

@@ -1,5 +1,8 @@
 # frozen_string_literal: true
+require "fileutils"
+require "json"
+require "time"
 require "ace/llm"
 require "ace/llm/query_interface"
@@ -9,17 +12,25 @@ module Ace
       module Molecules
         # Executes standalone scenarios using the deterministic pipeline.
         class PipelineExecutor
+          AMBIENT_TMUX_ENV_VARS = %w[TMUX TMUX_PANE].freeze
           # @param provider [String]
+          # @param verifier_provider [String, nil]
           # @param timeout [Integer]
           # @param sandbox_builder [Molecules::PipelineSandboxBuilder]
           # @param prompt_bundler [Molecules::PipelinePromptBundler]
           # @param report_generator [Molecules::PipelineReportGenerator]
-          def initialize(provider:, timeout:, sandbox_builder: nil, prompt_bundler: nil, report_generator: nil)
+          def initialize(provider:, verifier_provider: nil, timeout:, sandbox_builder: nil, prompt_bundler: nil,
+            report_generator: nil, sandbox_backend_factory: nil)
             @provider = provider
+            @verifier_provider = verifier_provider || provider
             @timeout = timeout
             @sandbox_builder = sandbox_builder || PipelineSandboxBuilder.new
             @prompt_bundler = prompt_bundler || PipelinePromptBundler.new
             @report_generator = report_generator || PipelineReportGenerator.new
+            @sandbox_backend_factory = sandbox_backend_factory || lambda { |sandbox_path, source_root: nil|
+              Molecules::BwrapSandboxBackend.new(sandbox_root: sandbox_path, source_root: source_root)
+            }
           end
           # @param scenario [Models::TestScenario]
@@ -31,57 +42,89 @@ module Ace
           # @return [Models::TestResult]
           def execute(scenario:, cli_args:, sandbox_path:, report_dir:, env_vars: nil, test_cases: nil)
             started_at = Time.now
+            FileUtils.mkdir_p(report_dir)
+            write_command_record(report_dir, "runner", provider: @provider, cli_args: cli_args)
+            write_tc_manifests(report_dir, scenario, test_cases: test_cases)
-            build_env = @sandbox_builder.build(
-              scenario: scenario,
-              sandbox_path: sandbox_path,
-              test_cases: test_cases
+            build_env = if prepared_sandbox?(sandbox_path, env_vars)
+              @sandbox_builder.prepare_existing_sandbox(
+                scenario: scenario,
+                sandbox_path: sandbox_path,
+                test_cases: test_cases
+              )
+            else
+              @sandbox_builder.build(
+                scenario: scenario,
+                sandbox_path: sandbox_path,
+                test_cases: test_cases
+              )
+            end
+            merged_env = sanitize_subprocess_env((env_vars || {}).merge(build_env))
+            sandbox_backend = @sandbox_backend_factory.call(
+              sandbox_path,
+              source_root: merged_env["ACE_E2E_SOURCE_ROOT"] || merged_env[:ACE_E2E_SOURCE_ROOT]
             )
-            merged_env = (env_vars || {}).merge(build_env)
+            merged_env = sandbox_backend.prepared_env(merged_env)
             runner = @prompt_bundler.prepare_runner(
               scenario: scenario,
               sandbox_path: sandbox_path,
               test_cases: test_cases
             )
-            run_llm(
+            runner_response = run_llm(
               prompt_path: runner[:prompt_path],
               system_path: runner[:system_path],
               output_path: runner[:output_path],
               cli_args: cli_args,
-              env_vars: merged_env
+              env_vars: merged_env,
+              subprocess_command_prefix: sandbox_backend.command_prefix(chdir: sandbox_path, env: merged_env),
+              provider: @provider
             )
+            runner_observations = extract_runner_observations(runner_response[:text])
+            artifact_contract = snapshot_artifacts(report_dir, sandbox_path, scenario, test_cases: test_cases)
             verifier = @prompt_bundler.prepare_verifier(
               scenario: scenario,
               sandbox_path: sandbox_path,
-              test_cases: test_cases
+              test_cases: test_cases,
+              runner_observations: runner_observations,
+              artifact_contract: artifact_contract
             )
+            write_command_record(report_dir, "verifier", provider: @verifier_provider, cli_args: cli_args)
             verifier_response = run_llm(
               prompt_path: verifier[:prompt_path],
               system_path: verifier[:system_path],
               output_path: verifier[:output_path],
               cli_args: cli_args,
-              env_vars: merged_env
+              env_vars: merged_env,
+              subprocess_command_prefix: sandbox_backend.command_prefix(chdir: sandbox_path, env: merged_env),
+              provider: @verifier_provider
             )
             @report_generator.generate(
               scenario: scenario,
               verifier_output: verifier_response[:text],
               report_dir: report_dir,
-              provider: @provider,
+              provider: @verifier_provider,
               started_at: started_at,
-              completed_at: Time.now
+              completed_at: Time.now,
+              metadata: base_metadata(
+                report_dir,
+                runner_observations: runner_observations,
+                artifact_contract: artifact_contract
+              )
             )
           rescue => e
             begin
               @report_generator.write_failure_report(
                 scenario: scenario,
                 report_dir: report_dir,
-                provider: @provider,
+                provider: @verifier_provider,
                 started_at: started_at || Time.now,
                 completed_at: Time.now,
-                error_message: "#{e.class}: #{e.message}"
+                error_message: "#{e.class}: #{e.message}",
+                failure_category: "runner-error",
+                metadata: base_metadata(report_dir)
               )
             rescue => write_error
               Models::TestResult.new(
@@ -97,24 +140,121 @@ module Ace
           private
-          def run_llm(prompt_path:, system_path:, output_path:, cli_args:, env_vars:)
+          def run_llm(prompt_path:, system_path:, output_path:, cli_args:, env_vars:, subprocess_command_prefix:, provider:)
             prompt = File.read(prompt_path)
             system = File.read(system_path)
             sandbox_dir = env_vars["PROJECT_ROOT_PATH"] || env_vars[:PROJECT_ROOT_PATH]
-            Dir.chdir(sandbox_dir) do
-              Ace::LLM::QueryInterface.query(
-                @provider,
-                prompt,
-                system: system,
-                cli_args: cli_args,
-                timeout: @timeout,
-                fallback: false,
-                output: output_path,
-                subprocess_env: env_vars
+            Ace::LLM::QueryInterface.query(
+              provider,
+              prompt,
+              system: system,
+              cli_args: cli_args,
+              timeout: @timeout,
+              fallback: false,
+              output: output_path,
+              subprocess_env: env_vars,
+              subprocess_command_prefix: subprocess_command_prefix,
+              working_dir: sandbox_dir
+            )
+          end
+          def write_tc_manifests(report_dir, scenario, test_cases:)
+            selected = select_test_cases(scenario, test_cases)
+            selected.each do |test_case|
+              manifest = {
+                tc_id: test_case.tc_id,
+                title: test_case.title,
+                declared_artifacts: Array(test_case.declared_artifacts),
+                optional_artifacts: Array(test_case.optional_artifacts),
+                goal_format: test_case.goal_format
+              }
+              File.write(
+                File.join(report_dir, "#{test_case.short_id}.manifest.json"),
+                JSON.pretty_generate(manifest)
               )
             end
           end
+          def write_command_record(report_dir, phase, provider:, cli_args:)
+            record = {
+              phase: phase,
+              provider: provider,
+              cli_args: cli_args,
+              timeout: @timeout,
+              recorded_at: Time.now.utc.iso8601
+            }
+            File.write(
+              File.join(report_dir, "#{phase}.command.json"),
+              JSON.pretty_generate(record)
+            )
+          end
+          def snapshot_artifacts(report_dir, sandbox_path, scenario, test_cases:)
+            snapshot = select_test_cases(scenario, test_cases).to_h do |test_case|
+              required = Array(test_case.declared_artifacts).sort
+              optional = Array(test_case.optional_artifacts).sort
+              present_required = required.select { |path| File.exist?(File.join(sandbox_path, path)) }
+              present_optional = optional.select { |path| File.exist?(File.join(sandbox_path, path)) }
+              missing_required = required - present_required
+              [test_case.tc_id, {
+                "present_artifacts" => (present_required + present_optional).sort,
+                "required_artifacts" => required,
+                "present_required_artifacts" => present_required,
+                "missing_required_artifacts" => missing_required,
+                "optional_artifacts" => optional,
+                "present_optional_artifacts" => present_optional
+              }]
+            end
+            File.write(File.join(report_dir, "artifact-snapshot.json"), JSON.pretty_generate(snapshot))
+            snapshot
+          end
+          def select_test_cases(scenario, test_cases)
+            return Array(scenario.test_cases) if test_cases.nil? || test_cases.empty?
+            wanted = test_cases.map { |value| value.to_s.upcase }
+            Array(scenario.test_cases).select { |tc| wanted.include?(tc.tc_id.to_s.upcase) }
+          end
+          def base_metadata(report_dir, runner_observations: nil, artifact_contract: nil)
+            metadata = {
+              "runner_provider" => @provider,
+              "verifier_provider" => @verifier_provider,
+              "report_dir" => report_dir
+            }
+            if runner_observations && !runner_observations.empty?
+              metadata["runner_observations"] = runner_observations
+            end
+            if artifact_contract
+              metadata["missing_required_artifacts"] = artifact_contract.to_h.transform_values do |entry|
+                Array(entry["missing_required_artifacts"])
+              end.reject { |_tc_id, paths| paths.empty? }
+            end
+            metadata
+          end
+          def sanitize_subprocess_env(env_vars)
+            sanitized = env_vars.reject { |key, _value| AMBIENT_TMUX_ENV_VARS.include?(key.to_s) }
+            AMBIENT_TMUX_ENV_VARS.each { |key| sanitized[key] = nil }
+            sanitized
+          end
+          def prepared_sandbox?(sandbox_path, env_vars)
+            return false unless env_vars.is_a?(Hash) && !env_vars.empty?
+            env_root = env_vars["PROJECT_ROOT_PATH"] || env_vars[:PROJECT_ROOT_PATH]
+            return false if env_root.to_s.strip.empty?
+            File.expand_path(env_root) == File.expand_path(sandbox_path)
+          end
+          def extract_runner_observations(text)
+            Atoms::SkillResultParser.parse(text)[:observations].to_s
+          rescue Atoms::ResultParser::ParseError
+            ""
+          end
         end
       end
     end

data/lib/ace/test/end_to_end_runner/molecules/pipeline_prompt_bundler.rb CHANGED

@@ -2,6 +2,7 @@
 require "date"
 require "fileutils"
+require "time"
 require "yaml"
 module Ace
@@ -15,21 +16,35 @@ module Ace
             Rules:
             - Execute each goal in order
-            - Save all artifacts to results/tc/{NN}/ directories as specified
-            - Treat the initial working directory as SANDBOX_ROOT; if a goal needs commands in a created worktree, cd there for execution but keep artifact writes under SANDBOX_ROOT/results
+            - Treat the initial working directory as SANDBOX_ROOT; if a goal needs commands in a created worktree, cd there for execution but keep any declared outcome artifacts under SANDBOX_ROOT/results
+            - Preserve the sandbox runtime environment; do not reset PATH, HOME, or other provided env vars
+            - If `ACE_E2E_SANDBOX_RUNTIME_ROOT` is set, make sure command execution uses `$ACE_E2E_SANDBOX_RUNTIME_ROOT/bin` on PATH in the shell where you run scenario commands
+            - Run `ace-*` commands directly; do not wrap them with `timeout`, `env -i`, or other execution wrappers that can change behavior or hide diagnostics
+            - Do not bypass the public CLI with repo-local executables such as `./exe/ace-*`, `bin/ace-*`, or `ruby .../exe/ace-*`
             - Do not fabricate output - all artifacts must come from real tool execution
+            - Never background commands or start dependent verification captures before the command they verify has completed
+            - When a goal requires command captures, keep stdout and stderr separate; do not merge streams and do not use `2>&1`
+            - A command capture set is incomplete unless the matching `.stdout`, `.stderr`, and `.exit` files all exist
+            - Persist each command's `.stdout`, `.stderr`, and `.exit` files immediately after that command finishes, before starting the next command
+            - For commands that establish state, write that command's `.exit` file before running any list/status/fs-check/tmux verification for the same goal
+            - When a successful command prints a filesystem path to a generated artifact, copy that artifact into `results/` if the goal asks for supporting evidence from the generated file
             - If a goal fails, note the failure and continue to the next goal
-            - After all goals, output a brief summary of what you produced for each goal
+            - Do not create synthetic helper reports or temp input files under results/ unless the scenario explicitly treats them as product outcomes
+            - After all goals, return concise runner observations describing what you did and what happened
           PROMPT
           VERIFIER_SYSTEM_PROMPT = <<~PROMPT
             You are an E2E test verifier. You inspect artifacts and render PASS/FAIL verdicts.
             Rules:
-            - Evaluate each goal independently based solely on the artifacts provided
-            - Do not speculate about what the runner did - only judge what exists
+            - Evaluate each goal independently based on sandbox state first, then runner observations, then raw debug captures only when needed
+            - Treat declared artifacts and helper filenames as hints, not as the source of truth
+            - If a helper file is missing or stale, inspect the sandbox directly before failing the goal
+            - Use artifact mtimes to detect runner ordering mistakes; if postcondition captures are older than the primary command's stdout/stderr/exit, classify the goal as `runner-error` unless direct sandbox state proves a product failure after the command completed
+            - Use read-only commands in the sandbox when they materially improve confidence (for example: git log/status/show, ls/find/cat)
+            - Do not speculate beyond the provided sandbox evidence and runner observations
             - For each failed goal, include a category:
-              test-spec-error | tool-bug | runner-error | infrastructure-error
+              test-spec-error | tool-bug | runner-error | infrastructure-error | missing-artifact
             - For each goal, cite specific evidence (filenames, content snippets)
             - Follow the output format exactly
           PROMPT
@@ -60,16 +75,20 @@ module Ace
           # @param sandbox_path [String]
           # @param test_cases [Array<String>, nil]
           # @return [Hash]
-          def prepare_verifier(scenario:, sandbox_path:, test_cases: nil)
+          def prepare_verifier(scenario:, sandbox_path:, test_cases: nil, runner_observations: nil, artifact_contract: nil)
             cache_dir = ensure_cache_dir(sandbox_path)
             system_path = File.join(cache_dir, "verifier-system.md")
             prompt_path = File.join(cache_dir, "verifier-prompt.md")
             File.write(system_path, VERIFIER_SYSTEM_PROMPT)
+            project_context = build_project_context_section(scenario)
+            sandbox_context = build_sandbox_context_section(sandbox_path)
             artifacts = build_artifact_section(sandbox_path)
+            contract = build_artifact_contract_section(artifact_contract)
+            observations = build_runner_observation_section(runner_observations)
             criteria = bundle_markdown_file(File.join(scenario.dir_path, "verifier.yml.md"), test_cases: test_cases)
-            File.write(prompt_path, [artifacts, criteria].join("\n\n---\n\n"))
+            File.write(prompt_path, [project_context, sandbox_context, artifacts, contract, observations, criteria].join("\n\n---\n\n"))
             {
               system_path: system_path,
@@ -154,6 +173,13 @@ module Ace
             parts.concat(tree_entries)
             parts << "```"
             parts << ""
+            parts << "## File metadata"
+            parts << "```"
+            files.each do |file|
+              parts << "#{relative_path(file, sandbox_path)}\tmtime=#{File.mtime(file).utc.iso8601}"
+            end
+            parts << "```"
+            parts << ""
             parts << "## File contents"
             parts << ""
@@ -168,6 +194,93 @@ module Ace
             parts.join("\n").rstrip
           end
+          def build_project_context_section(scenario)
+            package_root = File.expand_path("../../..", scenario.dir_path)
+            source_root = File.expand_path("..", package_root)
+            files = [
+              File.join(package_root, "README.md"),
+              File.join(package_root, "docs", "usage.md"),
+              File.join(package_root, "docs", "getting-started.md"),
+              File.join(source_root, "CLAUDE.md")
+            ].select { |path| File.file?(path) }.first(3)
+            parts = []
+            parts << "# Project Context"
+            parts << ""
+            parts << "- Package: `#{scenario.package}`"
+            parts << "- Test ID: `#{scenario.test_id}`"
+            parts << "- Sandbox profile: `#{scenario.sandbox_profile}`"
+            parts << ""
+            files.each do |file|
+              parts << "## `#{File.basename(file)}`"
+              parts << "```"
+              parts << safe_read(file)
+              parts << "```"
+              parts << ""
+            end
+            parts.join("\n").rstrip
+          end
+          def build_sandbox_context_section(sandbox_path)
+            sandbox_path = File.expand_path(sandbox_path)
+            entries = Dir.glob(File.join(sandbox_path, "*"), File::FNM_DOTMATCH)
+              .reject { |path| %w[. ..].include?(File.basename(path)) }
+              .sort
+            parts = []
+            parts << "# Sandbox Context"
+            parts << ""
+            parts << "- Sandbox root: `#{sandbox_path}`"
+            parts << "- Inspect the sandbox directly when verifying source-of-truth state."
+            parts << ""
+            parts << "## Top-level entries"
+            parts << "```"
+            parts.concat(entries.map { |path| relative_path(path, sandbox_path) })
+            parts << "```"
+            parts.join("\n").rstrip
+          end
+          def build_runner_observation_section(runner_observations)
+            <<~MARKDOWN.rstrip
+              # Runner Observations
+              #{runner_observations.to_s.strip.empty? ? "(none provided)" : runner_observations.to_s.strip}
+            MARKDOWN
+          end
+          def build_artifact_contract_section(artifact_contract)
+            return "# Artifact Contract\n\n(no snapshot provided)" if artifact_contract.nil? || artifact_contract.empty?
+            parts = []
+            parts << "# Artifact Contract"
+            parts << ""
+            parts << "Use this only as supporting context. Missing helper artifacts may be acceptable when sandbox state still proves the goal."
+            parts << ""
+            artifact_contract.sort.each do |tc_id, entry|
+              parts << "## #{tc_id}"
+              parts << ""
+              parts << "- Required artifacts: #{format_artifact_list(entry["required_artifacts"])}"
+              parts << "- Present required artifacts: #{format_artifact_list(entry["present_required_artifacts"])}"
+              parts << "- Missing required artifacts: #{format_artifact_list(entry["missing_required_artifacts"])}"
+              parts << "- Optional artifacts: #{format_artifact_list(entry["optional_artifacts"])}"
+              parts << "- Present optional artifacts: #{format_artifact_list(entry["present_optional_artifacts"])}"
+              parts << ""
+            end
+            parts.join("\n").rstrip
+          end
+          def format_artifact_list(paths)
+            items = Array(paths)
+            return "(none)" if items.empty?
+            items.map { |path| "`#{path}`" }.join(", ")
+          end
           def relative_path(path, root)
             File.expand_path(path).sub("#{File.expand_path(root)}/", "")
           end