RubyGems - phronomy - Versions diffs - 0.1.2 → 0.1.4 - Mend

phronomy 0.1.2 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (44)

checksums.yaml +4 -4
data/lib/generators/phronomy/install/templates/create_phronomy_messages.rb.tt +1 -1
data/lib/phronomy/agent/base.rb +68 -35
data/lib/phronomy/agent/handoff.rb +6 -2
data/lib/phronomy/agent/react_agent.rb +57 -31
data/lib/phronomy/agent/runner.rb +6 -4
data/lib/phronomy/configuration.rb +6 -0
data/lib/phronomy/context/assembler.rb +11 -3
data/lib/phronomy/context/compaction_context.rb +1 -3
data/lib/phronomy/context/context_version_cache.rb +22 -8
data/lib/phronomy/context/token_estimator.rb +19 -2
data/lib/phronomy/eval/eval_result.rb +15 -5
data/lib/phronomy/eval/runner.rb +46 -11
data/lib/phronomy/eval/scorer/llm_judge.rb +7 -2
data/lib/phronomy/graph/compiled_graph.rb +9 -1
data/lib/phronomy/graph/parallel_node.rb +53 -18
data/lib/phronomy/graph/state_graph.rb +7 -1
data/lib/phronomy/guardrail/builtin/pii_pattern_detector.rb +47 -3
data/lib/phronomy/guardrail/builtin/prompt_injection_detector.rb +15 -1
data/lib/phronomy/memory/compression/summary.rb +4 -3
data/lib/phronomy/memory/compression/tool_output_pruner.rb +11 -6
data/lib/phronomy/memory/conversation_manager.rb +59 -14
data/lib/phronomy/memory/retrieval/base.rb +4 -3
data/lib/phronomy/memory/retrieval/composite.rb +5 -4
data/lib/phronomy/memory/retrieval/recent.rb +4 -3
data/lib/phronomy/memory/retrieval/semantic.rb +50 -17
data/lib/phronomy/memory/storage/active_record.rb +18 -13
data/lib/phronomy/memory/storage/in_memory.rb +25 -16
data/lib/phronomy/rails/agent_job.rb +20 -3
data/lib/phronomy/runnable.rb +4 -1
data/lib/phronomy/state_store/active_record.rb +7 -3
data/lib/phronomy/state_store/base.rb +16 -2
data/lib/phronomy/state_store/in_memory.rb +5 -4
data/lib/phronomy/tool/base.rb +19 -3
data/lib/phronomy/tool/mcp_tool.rb +67 -9
data/lib/phronomy/tracing/base.rb +0 -2
data/lib/phronomy/tracing/langfuse_tracer.rb +24 -4
data/lib/phronomy/tracing/null_tracer.rb +6 -3
data/lib/phronomy/trust_pipeline.rb +32 -4
data/lib/phronomy/vector_store/in_memory.rb +7 -5
data/lib/phronomy/vector_store/redis_search.rb +30 -23
data/lib/phronomy/version.rb +1 -1
data/lib/phronomy.rb +39 -0
metadata +2 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 4bd9ee98ca8c05a22a5488a6996edb95dcffc26d14b1af92d6551bb24b66a530
-  data.tar.gz: 14967148ee9764e8502ba8b45d28aa1640f9a086b80ac1a49d9bab3c228a3a70
+  metadata.gz: d95954b46d12542673b5a319b338c7733d579b72105499a55dc4251628bc807f
+  data.tar.gz: 174341a0e329d861066d475b062260c3d78fac86da3d024ebc1594d7a37ec348
 SHA512:
-  metadata.gz: b005dd5bac44045180bdbf9945c773155e27ee95d69c90b35e3864d01685d831fcd3617762b16871fc637efc86daca27224dda7040e9ef50120675fd7f18b986
-  data.tar.gz: 2732036e9ed83a86eb75b2e3b55d0a2ccce53b65e354967c1220f1f98b1fb84d09ad42c82673d415dba263a698453827551fe27831e1278252d675e271a67369
+  metadata.gz: ee299b8d67fec8cb268683ffe672daab04a5c5b4794728dbeea3877e6c5216cefe91f292acac977d7293b837cd0b159445d58a87f925781a26f24438faecd010
+  data.tar.gz: 6da71943dc65b3671f5bd34ff18509a8ee2e0b12bfb5fcb206df77ba2c7345c9fd1b59dec90088dc3e8588e9327cb162046a803a9ac7cd405f4d30fc6712ebc3

data/lib/generators/phronomy/install/templates/create_phronomy_messages.rb.tt CHANGED Viewed

@@ -3,7 +3,7 @@ class CreatePhronomyMessages < ActiveRecord::Migration[<%= ActiveRecord::Migrati
     create_table :phronomy_messages do |t|
       t.string :thread_id,       null: false
       t.string :role,            null: false
-      t.text   :content,         null: false
+      t.text   :content
       t.text   :tool_calls_json
       t.string :model_id
       t.timestamps

data/lib/phronomy/agent/base.rb CHANGED Viewed

@@ -446,55 +446,88 @@ module Phronomy
       def stream(input, config: {}, &block)
         return invoke(input, config: config) unless block
-        run_input_guardrails!(input)
+        caller_meta = {}
+        caller_meta[:user_id] = config[:user_id] if config[:user_id]
+        caller_meta[:session_id] = config[:session_id] if config[:session_id]
+        trace("agent.invoke", input: input, **caller_meta) do |_span|
+          run_input_guardrails!(input)
-        memory = config[:memory]
-        thread_id = config[:thread_id]
+          memory = config[:memory]
+          thread_id = config[:thread_id]
-        chat = build_chat
-        user_message = extract_message(input)
+          chat = build_chat
+          user_message = extract_message(input)
+          budget = build_token_budget
-        # Assemble context via Assembler (same as invoke_once).
-        assembler = Context::Assembler.new(budget: build_token_budget)
-        system_msg = build_instructions(input)
-        assembler.add_instruction(system_msg) if system_msg
+          # Assemble context via Assembler (same as invoke_once).
+          assembler = Context::Assembler.new(budget: budget)
+          system_msg = build_instructions(input)
+          assembler.add_instruction(system_msg) if system_msg
-        Array(config[:knowledge_sources]).each do |ks|
-          ks.fetch(query: user_message).each do |chunk|
-            assembler.add_knowledge(chunk[:content], type: chunk[:type], source: chunk[:source])
+          Array(config[:knowledge_sources]).each do |ks|
+            ks.fetch(query: user_message).each do |chunk|
+              assembler.add_knowledge(chunk[:content], type: chunk[:type], source: chunk[:source])
+            end
           end
-        end
-        if memory && thread_id
-          msgs = load_from_memory(memory, thread_id: thread_id, query: user_message)
-          assembler.add_messages(msgs)
-        end
+          if memory && thread_id
+            msgs = load_from_memory(memory, thread_id: thread_id, query: user_message)
+            message_elements = build_message_elements(msgs)
-        context = assembler.build
-        apply_instructions(chat, context[:system]) if context[:system]
-        context[:messages].each { |msg| chat.messages << msg }
+            # Run on_trim: app may call ctx.remove(seqs) to drop messages this turn.
+            if (trim_cb = self.class._on_trim_callback)
+              trim_ctx = Context::TrimContext.new(message_elements: message_elements, budget: budget)
+              trim_cb.call(trim_ctx)
+              message_elements = trim_ctx.message_elements
+            end
-        # Wire per-event callbacks to yield StreamEvents.
-        chat.on_tool_call { |tool_call| block.call(StreamEvent.new(type: :tool_call, payload: {tool_call: tool_call})) }
-        chat.on_tool_result { |tool_result| block.call(StreamEvent.new(type: :tool_result, payload: {tool_result: tool_result})) }
+            # Run on_compaction_trigger → on_compact pipeline before calling the LLM.
+            if (trigger_cb = self.class._on_compaction_trigger_callback)
+              trigger_ctx = Context::TriggerContext.new(message_elements: message_elements, budget: budget)
+              if trigger_cb.call(trigger_ctx)
+                if (compact_cb = self.class._on_compact_callback)
+                  compact_ctx = Context::CompactionContext.new(
+                    message_elements: message_elements,
+                    budget: budget,
+                    thread_id: thread_id,
+                    memory: memory
+                  )
+                  compact_cb.call(compact_ctx)
+                  message_elements = build_message_elements(compact_ctx.result_messages)
+                end
+              end
+            end
-        # Run before_completion hooks (global → class → instance) before the LLM call.
-        run_before_completion_hooks!(chat, config)
+            assembler.add_messages(message_elements.map { |e| e[:message] })
+          end
-        response = chat.ask(user_message) do |chunk|
-          block.call(StreamEvent.new(type: :token, payload: {content: chunk.content}))
-        end
+          context = assembler.build
+          apply_instructions(chat, context[:system]) if context[:system]
+          context[:messages].each { |msg| chat.messages << msg }
-        save_to_memory(memory, thread_id: thread_id, messages: chat.messages) if memory && thread_id
+          # Wire per-event callbacks to yield StreamEvents.
+          chat.before_tool_call { |tool_call| block.call(StreamEvent.new(type: :tool_call, payload: {tool_call: tool_call})) }
+          chat.after_tool_result { |tool_result| block.call(StreamEvent.new(type: :tool_result, payload: {tool_result: tool_result})) }
-        output = response.content
-        usage = Phronomy::TokenUsage.from_tokens(response.tokens)
+          # Run before_completion hooks (global → class → instance) before the LLM call.
+          run_before_completion_hooks!(chat, config)
-        run_output_guardrails!(output)
+          response = chat.ask(user_message) do |chunk|
+            block.call(StreamEvent.new(type: :token, payload: {content: chunk.content}))
+          end
-        result = {output: output, messages: chat.messages, usage: usage}
-        block.call(StreamEvent.new(type: :done, payload: result))
-        result
+          save_to_memory(memory, thread_id: thread_id, messages: chat.messages) if memory && thread_id
+          output = response.content
+          usage = Phronomy::TokenUsage.from_tokens(response.tokens)
+          run_output_guardrails!(output)
+          result = {output: output, messages: chat.messages, usage: usage}
+          block.call(StreamEvent.new(type: :done, payload: result))
+          [result, usage]
+        end
       rescue => e
         block&.call(StreamEvent.new(type: :error, payload: {error: e}))
         raise

data/lib/phronomy/agent/handoff.rb CHANGED Viewed

@@ -1,5 +1,7 @@
 # frozen_string_literal: true
+require "securerandom"
 module Phronomy
   module Agent
     # Represents a transfer edge from one agent to another.
@@ -23,7 +25,9 @@ module Phronomy
       def initialize(target_agent:, description: nil)
         @target_agent = target_agent
         klass_name = target_agent.class.name&.split("::")&.last || "Agent"
-        @tool_name = "transfer_to_#{snake_case(klass_name)}"
+        # Use a UUID so that two handoffs targeting the same class remain distinct.
+        @uuid = SecureRandom.uuid
+        @tool_name = "transfer_to_#{snake_case(klass_name)}_#{@uuid.delete("-")[0, 8]}"
         @description = description || "Transfer the conversation to #{klass_name}."
       end
@@ -43,7 +47,7 @@ module Phronomy
       # The sentinel string embedded in the tool result.
       # @return [String]
       def sentinel
-        "#{SENTINEL_PREFIX}:#{target_agent.class.name}"
+        "#{SENTINEL_PREFIX}:#{target_agent.class.name}:#{@uuid}"
       end
       private

data/lib/phronomy/agent/react_agent.rb CHANGED Viewed

@@ -5,7 +5,11 @@ module Phronomy
     # ReAct pattern (Reasoning + Acting) agent.
     # Repeats the LLM <-> Tool loop until no more tool calls are made.
     class ReactAgent < Base
-      def invoke(input, config: {})
+      private
+      # Performs a single (non-retried) ReAct invocation.
+      # Overrides Base#invoke_once so that Base#invoke's retry loop is inherited.
+      def invoke_once(input, config: {})
         caller_meta = {}
         caller_meta[:user_id] = config[:user_id] if config[:user_id]
         caller_meta[:session_id] = config[:session_id] if config[:session_id]
@@ -28,27 +32,37 @@ module Phronomy
           messages = initial_messages.dup
           user_asked = false
           total_usage = Phronomy::TokenUsage.zero
+          iterations_exhausted = true
           max_iter.times do
             response = step(messages, input, user_asked: user_asked, config: config)
             user_asked = true
             messages = response[:messages]
             total_usage += response[:usage]
-            break if response[:done]
+            if response[:done]
+              iterations_exhausted = false
+              break
+            end
           end
           save_to_memory(memory, thread_id: thread_id, messages: messages) if memory && thread_id
-          output = messages.last&.content
+          # Fall back to the last message that carries non-nil content. This
+          # guards against the case where the final message is a tool-call or
+          # tool-result message (content == nil) when max_iterations is
+          # exhausted before the model produces a text reply.
+          output = messages.reverse.find { |m| m.content && !m.content.empty? }&.content
           # Run output guardrails before returning to the caller.
           run_output_guardrails!(output)
-          result = {output: output, messages: messages, usage: total_usage}
+          result = {output: output, messages: messages, usage: total_usage, iterations_exhausted: iterations_exhausted}
           [result, total_usage]
         end
       end
+      public
       # Streaming version of #invoke for the ReAct loop.
       # Yields {Phronomy::Agent::StreamEvent} events while the LLM-tool loop runs.
       #
@@ -59,38 +73,50 @@ module Phronomy
       def stream(input, config: {}, &block)
         return invoke(input, config: config) unless block
-        run_input_guardrails!(input)
+        caller_meta = {}
+        caller_meta[:user_id] = config[:user_id] if config[:user_id]
+        caller_meta[:session_id] = config[:session_id] if config[:session_id]
+        trace("agent.invoke", input: input, **caller_meta) do |_span|
+          run_input_guardrails!(input)
-        memory = config[:memory]
-        thread_id = config[:thread_id]
-        max_iter = self.class.max_iterations
+          memory = config[:memory]
+          thread_id = config[:thread_id]
+          max_iter = self.class.max_iterations
-        initial_messages = if memory && thread_id
-          load_from_memory(memory, thread_id: thread_id, query: extract_message(input))
-        else
-          []
-        end
+          initial_messages = if memory && thread_id
+            load_from_memory(memory, thread_id: thread_id, query: extract_message(input))
+          else
+            []
+          end
-        messages = initial_messages.dup
-        user_asked = false
-        total_usage = Phronomy::TokenUsage.zero
+          messages = initial_messages.dup
+          user_asked = false
+          total_usage = Phronomy::TokenUsage.zero
+          iterations_exhausted = true
-        max_iter.times do
-          response = stream_step(messages, input, user_asked: user_asked, config: config, &block)
-          user_asked = true
-          messages = response[:messages]
-          total_usage += response[:usage]
-          break if response[:done]
-        end
+          max_iter.times do
+            response = stream_step(messages, input, user_asked: user_asked, config: config, &block)
+            user_asked = true
+            messages = response[:messages]
+            total_usage += response[:usage]
+            if response[:done]
+              iterations_exhausted = false
+              break
+            end
+          end
-        save_to_memory(memory, thread_id: thread_id, messages: messages) if memory && thread_id
+          save_to_memory(memory, thread_id: thread_id, messages: messages) if memory && thread_id
-        output = messages.last&.content
-        run_output_guardrails!(output)
+          # Fall back to the last message that carries non-nil content (same as
+          # the non-streaming path above).
+          output = messages.reverse.find { |m| m.content && !m.content.empty? }&.content
+          run_output_guardrails!(output)
-        result = {output: output, messages: messages, usage: total_usage}
-        block.call(StreamEvent.new(type: :done, payload: result))
-        result
+          result = {output: output, messages: messages, usage: total_usage, iterations_exhausted: iterations_exhausted}
+          block.call(StreamEvent.new(type: :done, payload: result))
+          [result, total_usage]
+        end
       rescue => e
         block&.call(StreamEvent.new(type: :error, payload: {error: e}))
         raise
@@ -128,8 +154,8 @@ module Phronomy
         chat = build_chat
         messages.each { |m| chat.add_message(m) }
-        chat.on_tool_call { |tc| block.call(StreamEvent.new(type: :tool_call, payload: {tool_call: tc})) }
-        chat.on_tool_result { |tr| block.call(StreamEvent.new(type: :tool_result, payload: {tool_result: tr})) }
+        chat.before_tool_call { |tc| block.call(StreamEvent.new(type: :tool_call, payload: {tool_call: tc})) }
+        chat.after_tool_result { |tr| block.call(StreamEvent.new(type: :tool_result, payload: {tool_result: tr})) }
         # Run before_completion hooks before each LLM call in the streaming loop.
         run_before_completion_hooks!(chat, config)

data/lib/phronomy/agent/runner.rb CHANGED Viewed

@@ -52,14 +52,16 @@ module Phronomy
         handoffs_taken = 0
         loop do
-          result = current.invoke(input, config: config)
-          target = find_handoff_target(result[:messages])
-          return result.merge(agent: current) unless target
+          # Check before invoking so we raise after exactly MAX_HANDOFFS handoffs,
+          # not after MAX_HANDOFFS + 1 LLM calls.
           if handoffs_taken >= MAX_HANDOFFS
             raise Phronomy::HandoffError, "Exceeded maximum handoffs (#{MAX_HANDOFFS})"
           end
+          result = current.invoke(input, config: config)
+          target = find_handoff_target(result[:messages])
+          return result.merge(agent: current) unless target
           current = target
           handoffs_taken += 1
         end

data/lib/phronomy/configuration.rb CHANGED Viewed

@@ -42,11 +42,17 @@ module Phronomy
     # Recursion limit for graph execution (default: 25)
     attr_accessor :recursion_limit
+    # When true (default), user input and LLM output are recorded in trace spans.
+    # Set to false in privacy-sensitive environments to prevent PII from reaching
+    # the tracing backend (OTel, Langfuse, etc.).
+    attr_accessor :trace_pii
     def initialize
       @recursion_limit = 25
       @tracer = Phronomy::Tracing::NullTracer.new
       @memory_async = false
       @memory_job_queue = :default
+      @trace_pii = true
     end
   end
 end

data/lib/phronomy/context/assembler.rb CHANGED Viewed

@@ -1,5 +1,7 @@
 # frozen_string_literal: true
+require "cgi"
 module Phronomy
   module Context
     # Assembler collects all four context regions and produces the final
@@ -34,7 +36,7 @@ module Phronomy
       # @param trusted [Boolean]
       # @return [String]
       def self.xml_tag(text, type:, trusted: false)
-        "<context type=\"#{type}\" trusted=\"#{trusted}\">\n#{text}\n</context>"
+        "<context type=\"#{CGI.escapeHTML(type.to_s)}\" trusted=\"#{trusted}\">\n#{CGI.escapeHTML(text.to_s)}\n</context>"
       end
       # @param budget [Phronomy::Context::TokenBudget, nil]
@@ -104,8 +106,8 @@ module Phronomy
       private
       def xml_context_tag(chunk)
-        src_attr = chunk[:source] ? " source=\"#{chunk[:source]}\"" : ""
-        "<context type=\"#{chunk[:type]}\"#{src_attr} trusted=\"#{chunk[:trusted]}\">\n#{chunk[:text]}\n</context>"
+        src_attr = chunk[:source] ? " source=\"#{CGI.escapeHTML(chunk[:source].to_s)}\"" : ""
+        "<context type=\"#{CGI.escapeHTML(chunk[:type].to_s)}\"#{src_attr} trusted=\"#{chunk[:trusted]}\">\n#{CGI.escapeHTML(chunk[:text].to_s)}\n</context>"
       end
       def trim_messages_to_budget(messages, system_text)
@@ -122,6 +124,12 @@ module Phronomy
           accumulated += tokens
           result.push(msg)
         end
+        if result.empty? && messages.any?
+          warn "[Phronomy::Assembler] All #{messages.length} conversation message(s) dropped: " \
+               "token budget exhausted by system context (budget=#{@budget.context_window}, used_by_system=#{used})"
+        end
         result.reverse
       end
     end

data/lib/phronomy/context/compaction_context.rb CHANGED Viewed

@@ -1,7 +1,5 @@
 # frozen_string_literal: true
-require "ostruct"
 module Phronomy
   module Context
     # Context object passed to the +on_compact+ callback registered on an agent.
@@ -103,7 +101,7 @@ module Phronomy
         end
         remaining = (@message_elements[(last_idx + 1)..] || []).map { |e| e[:message] }
-        summary_msg = OpenStruct.new(role: :system, content: summary_text)
+        summary_msg = RubyLLM::Message.new(role: :system, content: summary_text)
         @result_messages = [summary_msg] + remaining
       end
     end

data/lib/phronomy/context/context_version_cache.rb CHANGED Viewed

@@ -27,32 +27,46 @@ module Phronomy
       attr_reader :system_tokens
       def initialize
-        reset
+        @mutex = Mutex.new
+        @fingerprint = nil
+        @system_text = nil
+        @system_tokens = 0
       end
       # Returns true when the given fingerprint matches the stored one.
+      # The check is performed under a mutex so that a concurrent #update cannot
+      # expose a partially-written state where fingerprint is new but system_text
+      # is still nil (Issue #55).
       #
       # @param fingerprint [String] SHA-256 hex digest to compare
       # @return [Boolean]
       def valid?(fingerprint)
-        !@fingerprint.nil? && @fingerprint == fingerprint
+        @mutex.synchronize do
+          !@fingerprint.nil? && !@system_text.nil? && @fingerprint == fingerprint
+        end
       end
       # Update the cache with a new fingerprint and system text.
+      # All three assignments are performed atomically under a mutex so that
+      # concurrent readers never observe a partial state (Issue #55).
       #
       # @param fingerprint  [String] new SHA-256 hex digest
       # @param system_text  [String] fully assembled system prompt text
       def update(fingerprint:, system_text:)
-        @fingerprint = fingerprint
-        @system_text = system_text.to_s
-        @system_tokens = TokenEstimator.estimate(@system_text)
+        @mutex.synchronize do
+          @fingerprint = fingerprint
+          @system_text = system_text.to_s
+          @system_tokens = TokenEstimator.estimate(@system_text)
+        end
       end
       # Clear all cached values (used for testing and forced invalidation).
       def reset
-        @fingerprint = nil
-        @system_text = nil
-        @system_tokens = 0
+        @mutex.synchronize do
+          @fingerprint = nil
+          @system_text = nil
+          @system_tokens = 0
+        end
       end
     end
   end

data/lib/phronomy/context/token_estimator.rb CHANGED Viewed

@@ -23,13 +23,29 @@ module Phronomy
     #   Phronomy::Context::TokenEstimator.tokenizer = nil
     module TokenEstimator
       @tokenizer = nil
+      @tokenizer_mutex = Mutex.new
       class << self
         # Replace the built-in heuristic with a callable that takes a String
         # and returns an Integer token count.  Set to nil to restore the default.
         #
+        # @note This is a process-wide setting. Set it once at application startup.
+        #   In tests, call +TokenEstimator.reset_tokenizer!+ after each test to
+        #   prevent cross-test contamination.
         # @param callable [#call, nil]
-        attr_accessor :tokenizer
+        def tokenizer=(callable)
+          @tokenizer_mutex.synchronize { @tokenizer = callable }
+        end
+        # @return [#call, nil]
+        def tokenizer
+          @tokenizer_mutex.synchronize { @tokenizer }
+        end
+        # Resets the tokenizer to the built-in heuristic. Intended for test isolation.
+        def reset_tokenizer!
+          @tokenizer_mutex.synchronize { @tokenizer = nil }
+        end
         # Estimate the number of tokens for the given input.
         #
@@ -37,9 +53,10 @@ module Phronomy
         #   or an Array of message-like objects (each must respond to #content).
         # @return [Integer] estimated token count (>= 0)
         def estimate(input)
+          tok = @tokenizer_mutex.synchronize { @tokenizer }
           case input
           when String
-            @tokenizer ? @tokenizer.call(input) : (input.length / 4.0).ceil
+            tok ? tok.call(input) : (input.length / 4.0).ceil
           when Array
             input.sum { |m| estimate(m.content.to_s) }
           else

data/lib/phronomy/eval/eval_result.rb CHANGED Viewed

@@ -4,16 +4,26 @@ module Phronomy
   module Eval
     # An immutable record holding the outcome of evaluating one EvalCase.
     #
-    # @!attribute eval_case [EvalCase]  the original sample
-    # @!attribute actual    [String]    the callable's output
-    # @!attribute score     [Float]     scorer-assigned value in [0.0, 1.0]
-    # @!attribute usage     [Phronomy::TokenUsage, nil]
+    # @!attribute eval_case  [EvalCase]  the original sample
+    # @!attribute actual     [String]    the callable's output
+    # @!attribute score      [Float]     scorer-assigned value in [0.0, 1.0]
+    # @!attribute usage      [Phronomy::TokenUsage, nil]
     # @!attribute latency_ms [Integer]  wall-clock time of the callable in ms
-    EvalResult = Data.define(:eval_case, :actual, :score, :usage, :latency_ms) do
+    # @!attribute error      [Exception, nil] set when the scorer raised an exception
+    EvalResult = Data.define(:eval_case, :actual, :score, :usage, :latency_ms, :error) do
+      def initialize(eval_case:, actual:, score:, usage:, latency_ms:, error: nil)
+        super
+      end
       # Returns true when the scorer assigned a perfect score of 1.0.
       def pass?
         score >= 1.0
       end
+      # Returns true when the scorer raised an exception.
+      def scorer_error?
+        !error.nil?
+      end
     end
   end
 end

data/lib/phronomy/eval/runner.rb CHANGED Viewed

@@ -22,24 +22,52 @@ module Phronomy
         @scorer = scorer
       end
-      # @param dataset  [Dataset]  collection of EvalCase objects
-      # @param callable [#call]    accepts a single String argument
+      # @param dataset     [Dataset]  collection of EvalCase objects
+      # @param callable    [#call]    accepts a single String argument
+      # @param concurrency [Integer]  number of parallel threads (default: 1, sequential)
       # @return [Array<EvalResult>]
-      def run(dataset, callable)
-        dataset.map do |eval_case|
-          t0 = Process.clock_gettime(Process::CLOCK_MONOTONIC, :millisecond)
-          result = callable.call(eval_case.input)
-          latency_ms = Process.clock_gettime(Process::CLOCK_MONOTONIC, :millisecond) - t0
+      def run(dataset, callable, concurrency: 1)
+        cases = dataset.to_a
+        return cases.map { |eval_case| run_one(eval_case, callable) } if concurrency <= 1
-          actual, usage = extract(result)
-          score = @scorer.score(actual: actual, expected: eval_case.expected, input: eval_case.input)
-          EvalResult.new(eval_case: eval_case, actual: actual, score: score, usage: usage, latency_ms: latency_ms)
+        # Run cases in slices of +concurrency+ threads. Each slice is joined
+        # before the next starts, bounding peak thread count to +concurrency+.
+        # Writing to pre-allocated slots (one per thread) is safe because each
+        # thread writes to a unique index and all threads in a slice are joined
+        # before the next slice begins.
+        # Exceptions in worker threads are collected and re-raised after all
+        # threads in the slice are joined, preventing orphaned threads.
+        results = Array.new(cases.length)
+        cases.each_with_index.each_slice(concurrency) do |batch|
+          errors = []
+          errors_mu = Mutex.new
+          threads = batch.map do |eval_case, i|
+            Thread.new do
+              results[i] = run_one(eval_case, callable)
+            rescue => e
+              errors_mu.synchronize { errors << e }
+            end
+          end
+          threads.each(&:join)
+          raise errors.first if errors.any?
         end
+        results
       end
       private
+      # Evaluate a single EvalCase with the given callable and return an EvalResult.
+      def run_one(eval_case, callable)
+        t0 = Process.clock_gettime(Process::CLOCK_MONOTONIC, :millisecond)
+        result = callable.call(eval_case.input)
+        latency_ms = Process.clock_gettime(Process::CLOCK_MONOTONIC, :millisecond) - t0
+        actual, usage = extract(result)
+        score, score_error = score_safely(@scorer, actual: actual, expected: eval_case.expected, input: eval_case.input)
+        EvalResult.new(eval_case: eval_case, actual: actual, score: score, usage: usage, latency_ms: latency_ms, error: score_error)
+      end
       # Normalises the callable's return value into [actual_string, usage_or_nil].
       def extract(result)
         if result.is_a?(Hash)
@@ -48,6 +76,13 @@ module Phronomy
           [result.to_s, nil]
         end
       end
+      # Calls the scorer and returns [score, error]. On failure, returns [0.0, exception].
+      def score_safely(scorer, **kwargs)
+        [scorer.score(**kwargs), nil]
+      rescue => e
+        [0.0, e]
+      end
     end
   end
 end

data/lib/phronomy/eval/scorer/llm_judge.rb CHANGED Viewed

@@ -34,17 +34,22 @@ module Phronomy
         # @param model           [String]  RubyLLM model identifier
         # @param prompt_template [String]  format string with %<input>s, %<expected>s, %<actual>s
-        def initialize(model:, prompt_template: DEFAULT_PROMPT)
+        # @param raise_on_error  [Boolean] when true, re-raises scoring exceptions instead of
+        #   returning 0.0. Use this in batch eval pipelines where silent failures are unacceptable.
+        def initialize(model:, prompt_template: DEFAULT_PROMPT, raise_on_error: false)
           @model = model
           @prompt_template = prompt_template
+          @raise_on_error = raise_on_error
         end
-        # @return [Float] score in [0.0, 1.0]; 0.0 on any error
+        # @return [Float] score in [0.0, 1.0]; 0.0 on error when raise_on_error is false
         def score(actual:, expected:, input: nil)
           prompt = format(@prompt_template, input: input.to_s, expected: expected.to_s, actual: actual.to_s)
           response = RubyLLM.chat(model: @model).ask(prompt)
           response.content.to_s.strip.scan(/-?\d+\.?\d*/).first.to_f.clamp(0.0, 1.0)
         rescue => e
+          raise if @raise_on_error
           warn "[LlmJudge] Scoring failed: #{e.message}"
           0.0
         end