RubyGems - ace-test-runner-e2e - Versions diffs - 0.29.6 → 0.38.11 - Mend

ace-test-runner-e2e 0.29.6 → 0.38.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (49)

data/lib/ace/test/end_to_end_runner/molecules/pipeline_report_generator.rb CHANGED

@@ -9,7 +9,16 @@ module Ace
       module Molecules
         # Generates TC-first reports from standalone verifier output.
         class PipelineReportGenerator
-          FAILURE_CATEGORIES = %w[test-spec-error tool-bug runner-error infrastructure-error].freeze
+          FAILURE_CATEGORIES = %w[
+            test-spec-error
+            tool-bug
+            runner-error
+            infrastructure-error
+            missing-artifact
+            state-drift
+            behavior-regression
+            discoverability-gap
+          ].freeze
           # @param report_writer [Molecules::ReportWriter]
           def initialize(report_writer: nil)
@@ -23,8 +32,9 @@ module Ace
           # @param started_at [Time]
           # @param completed_at [Time]
           # @return [Models::TestResult]
-          def generate(scenario:, verifier_output:, report_dir:, provider:, started_at:, completed_at:)
+          def generate(scenario:, verifier_output:, report_dir:, provider:, started_at:, completed_at:, metadata: {})
             parsed = parse_verifier_output(verifier_output, scenario)
+            merged_metadata = metadata.merge(parsed[:metadata] || {})
             result = Models::TestResult.new(
               test_id: scenario.test_id,
@@ -32,8 +42,10 @@ module Ace
               test_cases: parsed[:test_cases],
               summary: parsed[:summary],
               error: parsed[:error],
+              observations: parsed[:observations].to_s,
               started_at: started_at,
-              completed_at: completed_at
+              completed_at: completed_at,
+              metadata: merged_metadata
             )
             FileUtils.mkdir_p(report_dir)
@@ -57,15 +69,18 @@ module Ace
           # @param completed_at [Time]
           # @param error_message [String]
           # @return [Models::TestResult]
-          def write_failure_report(scenario:, report_dir:, provider:, started_at:, completed_at:, error_message:)
+          def write_failure_report(scenario:, report_dir:, provider:, started_at:, completed_at:, error_message:,
+            failure_category: "runner-error", test_cases: [], metadata: {})
             result = Models::TestResult.new(
               test_id: scenario.test_id,
               status: "error",
-              test_cases: [],
+              test_cases: test_cases,
               summary: "Execution pipeline failed",
               error: error_message,
+              observations: metadata["runner_observations"].to_s,
               started_at: started_at,
-              completed_at: completed_at
+              completed_at: completed_at,
+              metadata: metadata.merge("failure_category" => failure_category)
             )
             FileUtils.mkdir_p(report_dir)
@@ -83,14 +98,16 @@ module Ace
           def parse_verifier_output(text, scenario)
             goals = parse_goal_sections(text, scenario)
-            return build_result_from_goals(goals) unless goals.empty?
+            return build_result_from_goals(goals, text) unless goals.empty?
             parsed = Atoms::SkillResultParser.parse_verifier(text)
             {
               status: parsed[:status],
               test_cases: parsed[:test_cases],
               summary: parsed[:summary],
-              error: parsed[:observations]
+              error: parsed[:observations],
+              observations: parsed[:observations],
+              metadata: extract_overall_user_outcome(text)
             }
           rescue Atoms::ResultParser::ParseError => e
             issue = summarize_unstructured_verifier_output(text)
@@ -98,7 +115,9 @@ module Ace
               status: "error",
               test_cases: [],
               summary: "Verifier returned unstructured output",
-              error: issue || e.message
+              error: issue || e.message,
+              observations: "",
+              metadata: {}
             }
           end
@@ -121,7 +140,9 @@ module Ace
               evidence = extract_evidence(block)
               next if verdict.nil?
-              tc_id = scenario_test_cases[goal_number - 1]&.tc_id || format("TC-%03d", goal_number)
+              direct_goal_id = format("TC-%03d", goal_number)
+              direct_match = scenario_test_cases.find { |tc| tc.tc_id.to_s.upcase == direct_goal_id }
+              tc_id = direct_match&.tc_id || scenario_test_cases[goal_number - 1]&.tc_id || direct_goal_id
               category = extract_category(block, evidence)
               {
@@ -180,10 +201,14 @@ module Ace
             explicit = extract_field_token(block, %w[Category])
             return normalize_category(explicit) if explicit
-            inline = block.to_s.match(/`(test-spec-error|tool-bug|runner-error|infrastructure-error)`/i)
+            inline = block.to_s.match(
+              /`(test-spec-error|tool-bug|runner-error|infrastructure-error|missing-artifact|state-drift|behavior-regression|discoverability-gap)`/i
+            )
             return normalize_category(inline[1]) if inline
-            paren = block.to_s.match(/\((test-spec-error|tool-bug|runner-error|infrastructure-error)\)/i)
+            paren = block.to_s.match(
+              /\((test-spec-error|tool-bug|runner-error|infrastructure-error|missing-artifact|state-drift|behavior-regression|discoverability-gap)\)/i
+            )
             return normalize_category(paren[1]) if paren
             normalize_category("#{block}\n#{evidence}")
@@ -191,7 +216,9 @@ module Ace
           def normalize_category(value)
             category = value.to_s.strip.downcase
-            match = category.match(/\b(test-spec-error|tool-bug|runner-error|infrastructure-error)\b/)
+            match = category.match(
+              /\b(test-spec-error|tool-bug|runner-error|infrastructure-error|missing-artifact|state-drift|behavior-regression|discoverability-gap)\b/
+            )
             return match[1] if match
             "runner-error"
@@ -222,7 +249,7 @@ module Ace
             nil
           end
-          def build_result_from_goals(goals)
+          def build_result_from_goals(goals, text)
             passed = goals.count { |goal| goal[:status] == "pass" }
             total = goals.size
             status = if passed == total
@@ -236,10 +263,25 @@ module Ace
             {
               status: status,
               test_cases: goals,
-              summary: "#{passed}/#{total} passed"
+              summary: "#{passed}/#{total} passed",
+              observations: "",
+              error: nil,
+              metadata: extract_overall_user_outcome(text)
             }
           end
+          def extract_overall_user_outcome(text)
+            works = text.to_s.match(/\*\*Works for end user\*\*:\s*(yes|partial|no)/i)&.captures&.first
+            friction = text.to_s.match(/^\s*[-*]?\s*\*\*Friction\*\*:\s*(.+?)\s*$/im)&.captures&.first
+            feedback = text.to_s.match(/^\s*[-*]?\s*\*\*Feedback\*\*:\s*(.+?)\s*$/im)&.captures&.first
+            metadata = {}
+            metadata["works_for_end_user"] = works.to_s.downcase unless works.to_s.empty?
+            metadata["user_friction"] = friction.to_s.strip unless friction.to_s.strip.empty?
+            metadata["user_feedback"] = feedback.to_s.strip unless feedback.to_s.strip.empty?
+            metadata
+          end
           def summarize_unstructured_verifier_output(text)
             summary = text.to_s.lines.map(&:strip).reject(&:empty?).first(3).join(" ")
             return nil if summary.empty?
@@ -266,8 +308,8 @@ module Ace
               "test-id" => scenario.test_id,
               "title" => scenario.title,
               "package" => scenario.package,
-              "runner-provider" => provider,
-              "verifier-provider" => provider,
+              "runner-provider" => result.metadata["runner_provider"] || provider,
+              "verifier-provider" => result.metadata["verifier_provider"] || provider,
               "timestamp" => result.completed_at.utc.strftime("%Y-%m-%dT%H:%M:%SZ"),
               "tcs-passed" => passed,
               "tcs-failed" => failed,
@@ -281,12 +323,21 @@ module Ace
                   "category" => tc[:category] || "runner-error",
                   "evidence" => tc[:notes].to_s
                 }
-              end
+              end,
+              "canonical-failed-tcs" => result.failed_test_case_ids
             }
+            frontmatter["works-for-end-user"] = result.metadata["works_for_end_user"] if result.metadata["works_for_end_user"]
+            frontmatter["user-friction"] = result.metadata["user_friction"] if result.metadata["user_friction"]
+            frontmatter["user-feedback"] = result.metadata["user_feedback"] if result.metadata["user_feedback"]
+            frontmatter["missing-required-artifacts"] = result.metadata["missing_required_artifacts"] if result.metadata["missing_required_artifacts"]
             frontmatter_yaml = YAML.dump(frontmatter).sub(/\A---\s*\n/, "").sub(/\.\.\.\s*\n\z/, "")
             rows = result.test_cases.map do |tc|
-              "| #{tc[:id]} | #{tc[:status].upcase} | #{tc[:notes]} |"
+              "| #{tc[:id]} | #{tc[:status].upcase} | #{canonical_goal_evidence(tc)} |"
+            end.join("\n")
+            verdict_rows = result.test_cases.map do |tc|
+              "| #{tc[:id]} | #{tc[:status].upcase} |"
             end.join("\n")
             content = <<~REPORT
@@ -310,10 +361,31 @@ module Ace
               | Failed | #{failed} |
               | Total  | #{total} |
               | Score  | #{(score * 100).round(1)}% |
+              ## Canonical Goal Verdicts
+              | Goal | Canonical Verdict |
+              |------|-------------------|
+              #{verdict_rows}
+              ## Overall User Outcome
+              | Field | Value |
+              |-------|-------|
+              | Works for end user | #{result.metadata["works_for_end_user"] || "unspecified"} |
+              | Friction | #{result.metadata["user_friction"] || "None"} |
+              | Feedback | #{result.metadata["user_feedback"] || "None"} |
             REPORT
             File.write(path, content)
           end
+          def canonical_goal_evidence(test_case)
+            notes = test_case[:notes].to_s.strip
+            return notes unless notes.match?(/\bverdict\s+correction\b/i)
+            "Canonical verdict #{test_case[:status].to_s.upcase}. Preserved verifier note: #{notes}"
+          end
         end
       end
     end

data/lib/ace/test/end_to_end_runner/molecules/pipeline_sandbox_builder.rb CHANGED

@@ -2,6 +2,8 @@
 require "fileutils"
 require "open3"
+require "yaml"
+require "ace/test_support/sandbox_package_copy"
 module Ace
   module Test
@@ -10,8 +12,14 @@ module Ace
         # Builds deterministic sandbox state for standalone execution.
         class PipelineSandboxBuilder
           # @param config_root [String] Project root used for provider symlink/bin path
-          def initialize(config_root: Dir.pwd)
+          def initialize(config_root: Dir.pwd, package_copy: nil, runtime_builder: nil, config: nil)
             @config_root = File.expand_path(config_root)
+            @config = config || Molecules::ConfigLoader.load
+            @package_copy = package_copy || Ace::TestSupport::SandboxPackageCopy.new(source_root: @config_root)
+            @runtime_builder = runtime_builder || Molecules::SandboxRuntimeBuilder.new(
+              source_root: @config_root,
+              ruby_version: @config.dig("sandbox", "ruby_version") || Molecules::ConfigLoader.default_sandbox_ruby_version
+            )
           end
           # @param scenario [Models::TestScenario]
@@ -25,32 +33,69 @@ module Ace
             FileUtils.mkdir_p(File.join(sandbox_path, "reports"))
             initialize_git_repo(sandbox_path)
-            ensure_package_available(scenario.package, sandbox_path)
+            package_copy_result = ensure_package_available(scenario.package, sandbox_path)
+            sync_protocol_sources(sandbox_path)
+            runtime_result = @runtime_builder.prepare(
+              sandbox_root: sandbox_path,
+              env: package_copy_result[:env],
+              tool_names: scenario.requires.fetch("tools", [])
+            )
             link_provider_configs(sandbox_path)
             create_result_directories(scenario, sandbox_path, test_cases: test_cases)
-            verify_tool_access(scenario, sandbox_path)
+            run_default_bootstrap(scenario, sandbox_path, runtime_result[:env])
+            verify_tool_access(scenario, sandbox_path, runtime_result[:env])
-            {
-              "PROJECT_ROOT_PATH" => sandbox_path
-            }
+            runtime_result[:env]
+          end
+          # Prepare only the runner/verifier layout for a sandbox that was
+          # already created by the deterministic setup path.
+          #
+          # This must not mutate tracked sandbox contents by copying packages,
+          # syncing protocol sources, or replacing config directories with
+          # symlinks after the scenario setup has already established git state.
+          #
+          # @param scenario [Models::TestScenario]
+          # @param sandbox_path [String]
+          # @param test_cases [Array<String>, nil] Optional TC filter
+          # @return [Hash] Additional environment variables (none required)
+          def prepare_existing_sandbox(scenario:, sandbox_path:, test_cases: nil)
+            sandbox_path = File.expand_path(sandbox_path)
+            FileUtils.mkdir_p(sandbox_path)
+            FileUtils.mkdir_p(File.join(sandbox_path, ".ace-local", "e2e"))
+            FileUtils.mkdir_p(File.join(sandbox_path, "reports"))
+            create_result_directories(scenario, sandbox_path, test_cases: test_cases)
+            {}
+          end
+          # Sync protocol source manifests and backing directories into a
+          # prepared sandbox before deterministic setup runs.
+          #
+          # This is safe before setup because no scenario-owned git baseline has
+          # been established yet. It is intentionally separate from
+          # prepare_existing_sandbox so the post-setup pipeline path remains
+          # non-mutating.
+          #
+          # @param sandbox_path [String]
+          # @return [void]
+          def sync_protocol_sources_into(sandbox_path)
+            sync_protocol_sources(File.expand_path(sandbox_path))
           end
           private
           def ensure_package_available(package_name, sandbox_path)
             package_name = package_name.to_s.strip
-            return if package_name.empty?
-            package_source = File.join(@config_root, package_name)
-            package_target = File.join(sandbox_path, package_name)
-            return if File.exist?(package_target)
-            unless File.directory?(package_source)
-              raise "Scenario package not found: #{package_name} (expected #{package_source})"
+            if package_name.empty?
+              return {
+                env: {
+                  "PROJECT_ROOT_PATH" => sandbox_path,
+                  "ACE_E2E_SOURCE_ROOT" => @config_root
+                }
+              }
             end
-            FileUtils.cp_r(package_source, package_target)
+            @package_copy.prepare(package_name: package_name, sandbox_root: sandbox_path)
           end
           def initialize_git_repo(sandbox_path)
@@ -62,6 +107,45 @@ module Ace
             raise "Sandbox git init failed: #{stderr}".strip
           end
+          def sync_protocol_sources(sandbox_path)
+            %w[skill wfi].each do |protocol|
+              Dir.glob(File.join(@config_root, "*", ".ace-defaults", "nav", "protocols",
+                "#{protocol}-sources", "*.yml")).sort.each do |manifest_path|
+                sync_protocol_source_manifest(protocol, manifest_path, sandbox_path)
+              end
+            end
+          end
+          def sync_protocol_source_manifest(protocol, manifest_path, sandbox_path)
+            source_data = YAML.safe_load_file(manifest_path) || {}
+            relative_path = source_data.dig("config", "relative_path").to_s.strip
+            return if relative_path.empty?
+            package_root = File.expand_path("../../../../..", manifest_path)
+            package_name = File.basename(package_root)
+            target_package_root = File.join(sandbox_path, package_name)
+            target_manifest_path = File.join(
+              target_package_root,
+              ".ace-defaults",
+              "nav",
+              "protocols",
+              "#{protocol}-sources",
+              File.basename(manifest_path)
+            )
+            source_dir = File.join(package_root, relative_path)
+            target_dir = File.join(target_package_root, relative_path)
+            FileUtils.mkdir_p(File.dirname(target_manifest_path))
+            FileUtils.cp(manifest_path, target_manifest_path) unless File.exist?(target_manifest_path)
+            return unless File.directory?(source_dir)
+            return if File.exist?(target_dir)
+            FileUtils.mkdir_p(File.dirname(target_dir))
+            FileUtils.cp_r(source_dir, target_dir)
+          rescue Psych::SyntaxError
+            nil
+          end
           def link_provider_configs(sandbox_path)
             source = File.join(@config_root, ".ace", "llm", "providers")
             target = File.join(sandbox_path, ".ace", "llm", "providers")
@@ -115,15 +199,32 @@ module Ace
             match ? match[1].to_i : nil
           end
-          def verify_tool_access(scenario, sandbox_path)
+          def verify_tool_access(scenario, sandbox_path, env)
             tool = scenario.tool_under_test.to_s.strip
             return if tool.empty?
-            _stdout, stderr, status = Open3.capture3(tool, "--help", chdir: sandbox_path)
+            _stdout, stderr, status = Open3.capture3(env, tool, "--help", chdir: sandbox_path)
             return if status.success?
             raise "Sandbox tool check failed for #{tool}: #{stderr}".strip
           end
+          def run_default_bootstrap(scenario, sandbox_path, env)
+            return unless scenario.sandbox_profile == "ace-default"
+            stdout, stderr, status = Open3.capture3(
+              env,
+              "bash", "--noprofile", "--norc", "-c", "ace-config init && ace-handbook sync",
+              chdir: sandbox_path
+            )
+            return if status.success?
+            raise [
+              "Default sandbox bootstrap failed for #{scenario.test_id}",
+              stdout.to_s.strip,
+              stderr.to_s.strip
+            ].reject(&:empty?).join("\n")
+          end
         end
       end
     end

data/lib/ace/test/end_to_end_runner/molecules/report_writer.rb CHANGED

@@ -130,6 +130,8 @@ module Ace
           # @return [String] Path to written file
           def write_experience(result, scenario, report_dir, test_case = nil)
             path = File.join(report_dir, "experience.r.md")
+            runner_observations = result.metadata["runner_observations"].to_s.strip
+            verifier_observations = result.observations.to_s.strip
             tc_title_suffix = test_case ? " / #{test_case.tc_id}" : ""
@@ -154,22 +156,15 @@ module Ace
               ## Summary
-              Executed via ace-test-e2e CLI using LLM provider.
-              #{(result.status == "pass") ? "No significant friction encountered." : "Test execution completed with issues noted below."}
+              Runner observations captured by the harness for this scenario.
-              ## Friction Points
+              ## Runner Observations
-              ### Documentation Gaps
+              #{runner_observations.empty? ? "- None provided by runner." : runner_observations}
-              - Automated execution via LLM - no documentation gaps observed
+              ## Verifier Notes
-              ### Tool Behavior Issues
-              - #{result.error || "None observed"}
-              ## Positive Observations
-              - Automated test execution completed successfully via LLM
+              - #{verifier_observations.empty? ? (result.error || "None recorded.") : verifier_observations}
             REPORT
             File.write(path, content)
@@ -219,6 +214,12 @@ module Ace
               end,
               "failed_test_cases" => result.failed_test_case_ids
             }
+            metadata["runner_observations"] = result.metadata["runner_observations"] if result.metadata.key?("runner_observations")
+            metadata["verifier_observations"] = result.observations unless result.observations.to_s.empty?
+            metadata["missing_required_artifacts"] = result.metadata["missing_required_artifacts"] if result.metadata.key?("missing_required_artifacts")
+            metadata["works_for_end_user"] = result.metadata["works_for_end_user"] if result.metadata.key?("works_for_end_user")
+            metadata["user_friction"] = result.metadata["user_friction"] if result.metadata.key?("user_friction")
+            metadata["user_feedback"] = result.metadata["user_feedback"] if result.metadata.key?("user_feedback")
             if test_case
               metadata["scenario-id"] = scenario.test_id