phronomy 0.1.2 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. checksums.yaml +4 -4
  2. data/lib/generators/phronomy/install/templates/create_phronomy_messages.rb.tt +1 -1
  3. data/lib/phronomy/agent/base.rb +68 -35
  4. data/lib/phronomy/agent/handoff.rb +6 -2
  5. data/lib/phronomy/agent/react_agent.rb +57 -31
  6. data/lib/phronomy/agent/runner.rb +6 -4
  7. data/lib/phronomy/configuration.rb +6 -0
  8. data/lib/phronomy/context/assembler.rb +11 -3
  9. data/lib/phronomy/context/compaction_context.rb +1 -3
  10. data/lib/phronomy/context/context_version_cache.rb +22 -8
  11. data/lib/phronomy/context/token_estimator.rb +19 -2
  12. data/lib/phronomy/eval/eval_result.rb +15 -5
  13. data/lib/phronomy/eval/runner.rb +46 -11
  14. data/lib/phronomy/eval/scorer/llm_judge.rb +7 -2
  15. data/lib/phronomy/graph/compiled_graph.rb +9 -1
  16. data/lib/phronomy/graph/parallel_node.rb +53 -18
  17. data/lib/phronomy/graph/state_graph.rb +7 -1
  18. data/lib/phronomy/guardrail/builtin/pii_pattern_detector.rb +47 -3
  19. data/lib/phronomy/guardrail/builtin/prompt_injection_detector.rb +15 -1
  20. data/lib/phronomy/memory/compression/summary.rb +4 -3
  21. data/lib/phronomy/memory/compression/tool_output_pruner.rb +11 -6
  22. data/lib/phronomy/memory/conversation_manager.rb +59 -14
  23. data/lib/phronomy/memory/retrieval/base.rb +4 -3
  24. data/lib/phronomy/memory/retrieval/composite.rb +5 -4
  25. data/lib/phronomy/memory/retrieval/recent.rb +4 -3
  26. data/lib/phronomy/memory/retrieval/semantic.rb +50 -17
  27. data/lib/phronomy/memory/storage/active_record.rb +18 -13
  28. data/lib/phronomy/memory/storage/in_memory.rb +25 -16
  29. data/lib/phronomy/rails/agent_job.rb +20 -3
  30. data/lib/phronomy/runnable.rb +4 -1
  31. data/lib/phronomy/state_store/active_record.rb +7 -3
  32. data/lib/phronomy/state_store/base.rb +16 -2
  33. data/lib/phronomy/state_store/in_memory.rb +5 -4
  34. data/lib/phronomy/tool/base.rb +19 -3
  35. data/lib/phronomy/tool/mcp_tool.rb +67 -9
  36. data/lib/phronomy/tracing/base.rb +0 -2
  37. data/lib/phronomy/tracing/langfuse_tracer.rb +24 -4
  38. data/lib/phronomy/tracing/null_tracer.rb +6 -3
  39. data/lib/phronomy/trust_pipeline.rb +32 -4
  40. data/lib/phronomy/vector_store/in_memory.rb +7 -5
  41. data/lib/phronomy/vector_store/redis_search.rb +30 -23
  42. data/lib/phronomy/version.rb +1 -1
  43. data/lib/phronomy.rb +39 -0
  44. metadata +2 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 4bd9ee98ca8c05a22a5488a6996edb95dcffc26d14b1af92d6551bb24b66a530
4
- data.tar.gz: 14967148ee9764e8502ba8b45d28aa1640f9a086b80ac1a49d9bab3c228a3a70
3
+ metadata.gz: d95954b46d12542673b5a319b338c7733d579b72105499a55dc4251628bc807f
4
+ data.tar.gz: 174341a0e329d861066d475b062260c3d78fac86da3d024ebc1594d7a37ec348
5
5
  SHA512:
6
- metadata.gz: b005dd5bac44045180bdbf9945c773155e27ee95d69c90b35e3864d01685d831fcd3617762b16871fc637efc86daca27224dda7040e9ef50120675fd7f18b986
7
- data.tar.gz: 2732036e9ed83a86eb75b2e3b55d0a2ccce53b65e354967c1220f1f98b1fb84d09ad42c82673d415dba263a698453827551fe27831e1278252d675e271a67369
6
+ metadata.gz: ee299b8d67fec8cb268683ffe672daab04a5c5b4794728dbeea3877e6c5216cefe91f292acac977d7293b837cd0b159445d58a87f925781a26f24438faecd010
7
+ data.tar.gz: 6da71943dc65b3671f5bd34ff18509a8ee2e0b12bfb5fcb206df77ba2c7345c9fd1b59dec90088dc3e8588e9327cb162046a803a9ac7cd405f4d30fc6712ebc3
@@ -3,7 +3,7 @@ class CreatePhronomyMessages < ActiveRecord::Migration[<%= ActiveRecord::Migrati
3
3
  create_table :phronomy_messages do |t|
4
4
  t.string :thread_id, null: false
5
5
  t.string :role, null: false
6
- t.text :content, null: false
6
+ t.text :content
7
7
  t.text :tool_calls_json
8
8
  t.string :model_id
9
9
  t.timestamps
@@ -446,55 +446,88 @@ module Phronomy
446
446
  def stream(input, config: {}, &block)
447
447
  return invoke(input, config: config) unless block
448
448
 
449
- run_input_guardrails!(input)
449
+ caller_meta = {}
450
+ caller_meta[:user_id] = config[:user_id] if config[:user_id]
451
+ caller_meta[:session_id] = config[:session_id] if config[:session_id]
452
+
453
+ trace("agent.invoke", input: input, **caller_meta) do |_span|
454
+ run_input_guardrails!(input)
450
455
 
451
- memory = config[:memory]
452
- thread_id = config[:thread_id]
456
+ memory = config[:memory]
457
+ thread_id = config[:thread_id]
453
458
 
454
- chat = build_chat
455
- user_message = extract_message(input)
459
+ chat = build_chat
460
+ user_message = extract_message(input)
461
+ budget = build_token_budget
456
462
 
457
- # Assemble context via Assembler (same as invoke_once).
458
- assembler = Context::Assembler.new(budget: build_token_budget)
459
- system_msg = build_instructions(input)
460
- assembler.add_instruction(system_msg) if system_msg
463
+ # Assemble context via Assembler (same as invoke_once).
464
+ assembler = Context::Assembler.new(budget: budget)
465
+ system_msg = build_instructions(input)
466
+ assembler.add_instruction(system_msg) if system_msg
461
467
 
462
- Array(config[:knowledge_sources]).each do |ks|
463
- ks.fetch(query: user_message).each do |chunk|
464
- assembler.add_knowledge(chunk[:content], type: chunk[:type], source: chunk[:source])
468
+ Array(config[:knowledge_sources]).each do |ks|
469
+ ks.fetch(query: user_message).each do |chunk|
470
+ assembler.add_knowledge(chunk[:content], type: chunk[:type], source: chunk[:source])
471
+ end
465
472
  end
466
- end
467
473
 
468
- if memory && thread_id
469
- msgs = load_from_memory(memory, thread_id: thread_id, query: user_message)
470
- assembler.add_messages(msgs)
471
- end
474
+ if memory && thread_id
475
+ msgs = load_from_memory(memory, thread_id: thread_id, query: user_message)
476
+ message_elements = build_message_elements(msgs)
472
477
 
473
- context = assembler.build
474
- apply_instructions(chat, context[:system]) if context[:system]
475
- context[:messages].each { |msg| chat.messages << msg }
478
+ # Run on_trim: app may call ctx.remove(seqs) to drop messages this turn.
479
+ if (trim_cb = self.class._on_trim_callback)
480
+ trim_ctx = Context::TrimContext.new(message_elements: message_elements, budget: budget)
481
+ trim_cb.call(trim_ctx)
482
+ message_elements = trim_ctx.message_elements
483
+ end
476
484
 
477
- # Wire per-event callbacks to yield StreamEvents.
478
- chat.on_tool_call { |tool_call| block.call(StreamEvent.new(type: :tool_call, payload: {tool_call: tool_call})) }
479
- chat.on_tool_result { |tool_result| block.call(StreamEvent.new(type: :tool_result, payload: {tool_result: tool_result})) }
485
+ # Run on_compaction_trigger on_compact pipeline before calling the LLM.
486
+ if (trigger_cb = self.class._on_compaction_trigger_callback)
487
+ trigger_ctx = Context::TriggerContext.new(message_elements: message_elements, budget: budget)
488
+ if trigger_cb.call(trigger_ctx)
489
+ if (compact_cb = self.class._on_compact_callback)
490
+ compact_ctx = Context::CompactionContext.new(
491
+ message_elements: message_elements,
492
+ budget: budget,
493
+ thread_id: thread_id,
494
+ memory: memory
495
+ )
496
+ compact_cb.call(compact_ctx)
497
+ message_elements = build_message_elements(compact_ctx.result_messages)
498
+ end
499
+ end
500
+ end
480
501
 
481
- # Run before_completion hooks (global class instance) before the LLM call.
482
- run_before_completion_hooks!(chat, config)
502
+ assembler.add_messages(message_elements.map { |e| e[:message] })
503
+ end
483
504
 
484
- response = chat.ask(user_message) do |chunk|
485
- block.call(StreamEvent.new(type: :token, payload: {content: chunk.content}))
486
- end
505
+ context = assembler.build
506
+ apply_instructions(chat, context[:system]) if context[:system]
507
+ context[:messages].each { |msg| chat.messages << msg }
487
508
 
488
- save_to_memory(memory, thread_id: thread_id, messages: chat.messages) if memory && thread_id
509
+ # Wire per-event callbacks to yield StreamEvents.
510
+ chat.before_tool_call { |tool_call| block.call(StreamEvent.new(type: :tool_call, payload: {tool_call: tool_call})) }
511
+ chat.after_tool_result { |tool_result| block.call(StreamEvent.new(type: :tool_result, payload: {tool_result: tool_result})) }
489
512
 
490
- output = response.content
491
- usage = Phronomy::TokenUsage.from_tokens(response.tokens)
513
+ # Run before_completion hooks (global → class → instance) before the LLM call.
514
+ run_before_completion_hooks!(chat, config)
492
515
 
493
- run_output_guardrails!(output)
516
+ response = chat.ask(user_message) do |chunk|
517
+ block.call(StreamEvent.new(type: :token, payload: {content: chunk.content}))
518
+ end
494
519
 
495
- result = {output: output, messages: chat.messages, usage: usage}
496
- block.call(StreamEvent.new(type: :done, payload: result))
497
- result
520
+ save_to_memory(memory, thread_id: thread_id, messages: chat.messages) if memory && thread_id
521
+
522
+ output = response.content
523
+ usage = Phronomy::TokenUsage.from_tokens(response.tokens)
524
+
525
+ run_output_guardrails!(output)
526
+
527
+ result = {output: output, messages: chat.messages, usage: usage}
528
+ block.call(StreamEvent.new(type: :done, payload: result))
529
+ [result, usage]
530
+ end
498
531
  rescue => e
499
532
  block&.call(StreamEvent.new(type: :error, payload: {error: e}))
500
533
  raise
@@ -1,5 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require "securerandom"
4
+
3
5
  module Phronomy
4
6
  module Agent
5
7
  # Represents a transfer edge from one agent to another.
@@ -23,7 +25,9 @@ module Phronomy
23
25
  def initialize(target_agent:, description: nil)
24
26
  @target_agent = target_agent
25
27
  klass_name = target_agent.class.name&.split("::")&.last || "Agent"
26
- @tool_name = "transfer_to_#{snake_case(klass_name)}"
28
+ # Use a UUID so that two handoffs targeting the same class remain distinct.
29
+ @uuid = SecureRandom.uuid
30
+ @tool_name = "transfer_to_#{snake_case(klass_name)}_#{@uuid.delete("-")[0, 8]}"
27
31
  @description = description || "Transfer the conversation to #{klass_name}."
28
32
  end
29
33
 
@@ -43,7 +47,7 @@ module Phronomy
43
47
  # The sentinel string embedded in the tool result.
44
48
  # @return [String]
45
49
  def sentinel
46
- "#{SENTINEL_PREFIX}:#{target_agent.class.name}"
50
+ "#{SENTINEL_PREFIX}:#{target_agent.class.name}:#{@uuid}"
47
51
  end
48
52
 
49
53
  private
@@ -5,7 +5,11 @@ module Phronomy
5
5
  # ReAct pattern (Reasoning + Acting) agent.
6
6
  # Repeats the LLM <-> Tool loop until no more tool calls are made.
7
7
  class ReactAgent < Base
8
- def invoke(input, config: {})
8
+ private
9
+
10
+ # Performs a single (non-retried) ReAct invocation.
11
+ # Overrides Base#invoke_once so that Base#invoke's retry loop is inherited.
12
+ def invoke_once(input, config: {})
9
13
  caller_meta = {}
10
14
  caller_meta[:user_id] = config[:user_id] if config[:user_id]
11
15
  caller_meta[:session_id] = config[:session_id] if config[:session_id]
@@ -28,27 +32,37 @@ module Phronomy
28
32
  messages = initial_messages.dup
29
33
  user_asked = false
30
34
  total_usage = Phronomy::TokenUsage.zero
35
+ iterations_exhausted = true
31
36
 
32
37
  max_iter.times do
33
38
  response = step(messages, input, user_asked: user_asked, config: config)
34
39
  user_asked = true
35
40
  messages = response[:messages]
36
41
  total_usage += response[:usage]
37
- break if response[:done]
42
+ if response[:done]
43
+ iterations_exhausted = false
44
+ break
45
+ end
38
46
  end
39
47
 
40
48
  save_to_memory(memory, thread_id: thread_id, messages: messages) if memory && thread_id
41
49
 
42
- output = messages.last&.content
50
+ # Fall back to the last message that carries non-nil content. This
51
+ # guards against the case where the final message is a tool-call or
52
+ # tool-result message (content == nil) when max_iterations is
53
+ # exhausted before the model produces a text reply.
54
+ output = messages.reverse.find { |m| m.content && !m.content.empty? }&.content
43
55
 
44
56
  # Run output guardrails before returning to the caller.
45
57
  run_output_guardrails!(output)
46
58
 
47
- result = {output: output, messages: messages, usage: total_usage}
59
+ result = {output: output, messages: messages, usage: total_usage, iterations_exhausted: iterations_exhausted}
48
60
  [result, total_usage]
49
61
  end
50
62
  end
51
63
 
64
+ public
65
+
52
66
  # Streaming version of #invoke for the ReAct loop.
53
67
  # Yields {Phronomy::Agent::StreamEvent} events while the LLM-tool loop runs.
54
68
  #
@@ -59,38 +73,50 @@ module Phronomy
59
73
  def stream(input, config: {}, &block)
60
74
  return invoke(input, config: config) unless block
61
75
 
62
- run_input_guardrails!(input)
76
+ caller_meta = {}
77
+ caller_meta[:user_id] = config[:user_id] if config[:user_id]
78
+ caller_meta[:session_id] = config[:session_id] if config[:session_id]
79
+
80
+ trace("agent.invoke", input: input, **caller_meta) do |_span|
81
+ run_input_guardrails!(input)
63
82
 
64
- memory = config[:memory]
65
- thread_id = config[:thread_id]
66
- max_iter = self.class.max_iterations
83
+ memory = config[:memory]
84
+ thread_id = config[:thread_id]
85
+ max_iter = self.class.max_iterations
67
86
 
68
- initial_messages = if memory && thread_id
69
- load_from_memory(memory, thread_id: thread_id, query: extract_message(input))
70
- else
71
- []
72
- end
87
+ initial_messages = if memory && thread_id
88
+ load_from_memory(memory, thread_id: thread_id, query: extract_message(input))
89
+ else
90
+ []
91
+ end
73
92
 
74
- messages = initial_messages.dup
75
- user_asked = false
76
- total_usage = Phronomy::TokenUsage.zero
93
+ messages = initial_messages.dup
94
+ user_asked = false
95
+ total_usage = Phronomy::TokenUsage.zero
96
+ iterations_exhausted = true
77
97
 
78
- max_iter.times do
79
- response = stream_step(messages, input, user_asked: user_asked, config: config, &block)
80
- user_asked = true
81
- messages = response[:messages]
82
- total_usage += response[:usage]
83
- break if response[:done]
84
- end
98
+ max_iter.times do
99
+ response = stream_step(messages, input, user_asked: user_asked, config: config, &block)
100
+ user_asked = true
101
+ messages = response[:messages]
102
+ total_usage += response[:usage]
103
+ if response[:done]
104
+ iterations_exhausted = false
105
+ break
106
+ end
107
+ end
85
108
 
86
- save_to_memory(memory, thread_id: thread_id, messages: messages) if memory && thread_id
109
+ save_to_memory(memory, thread_id: thread_id, messages: messages) if memory && thread_id
87
110
 
88
- output = messages.last&.content
89
- run_output_guardrails!(output)
111
+ # Fall back to the last message that carries non-nil content (same as
112
+ # the non-streaming path above).
113
+ output = messages.reverse.find { |m| m.content && !m.content.empty? }&.content
114
+ run_output_guardrails!(output)
90
115
 
91
- result = {output: output, messages: messages, usage: total_usage}
92
- block.call(StreamEvent.new(type: :done, payload: result))
93
- result
116
+ result = {output: output, messages: messages, usage: total_usage, iterations_exhausted: iterations_exhausted}
117
+ block.call(StreamEvent.new(type: :done, payload: result))
118
+ [result, total_usage]
119
+ end
94
120
  rescue => e
95
121
  block&.call(StreamEvent.new(type: :error, payload: {error: e}))
96
122
  raise
@@ -128,8 +154,8 @@ module Phronomy
128
154
  chat = build_chat
129
155
  messages.each { |m| chat.add_message(m) }
130
156
 
131
- chat.on_tool_call { |tc| block.call(StreamEvent.new(type: :tool_call, payload: {tool_call: tc})) }
132
- chat.on_tool_result { |tr| block.call(StreamEvent.new(type: :tool_result, payload: {tool_result: tr})) }
157
+ chat.before_tool_call { |tc| block.call(StreamEvent.new(type: :tool_call, payload: {tool_call: tc})) }
158
+ chat.after_tool_result { |tr| block.call(StreamEvent.new(type: :tool_result, payload: {tool_result: tr})) }
133
159
 
134
160
  # Run before_completion hooks before each LLM call in the streaming loop.
135
161
  run_before_completion_hooks!(chat, config)
@@ -52,14 +52,16 @@ module Phronomy
52
52
  handoffs_taken = 0
53
53
 
54
54
  loop do
55
- result = current.invoke(input, config: config)
56
- target = find_handoff_target(result[:messages])
57
- return result.merge(agent: current) unless target
58
-
55
+ # Check before invoking so we raise after exactly MAX_HANDOFFS handoffs,
56
+ # not after MAX_HANDOFFS + 1 LLM calls.
59
57
  if handoffs_taken >= MAX_HANDOFFS
60
58
  raise Phronomy::HandoffError, "Exceeded maximum handoffs (#{MAX_HANDOFFS})"
61
59
  end
62
60
 
61
+ result = current.invoke(input, config: config)
62
+ target = find_handoff_target(result[:messages])
63
+ return result.merge(agent: current) unless target
64
+
63
65
  current = target
64
66
  handoffs_taken += 1
65
67
  end
@@ -42,11 +42,17 @@ module Phronomy
42
42
  # Recursion limit for graph execution (default: 25)
43
43
  attr_accessor :recursion_limit
44
44
 
45
+ # When true (default), user input and LLM output are recorded in trace spans.
46
+ # Set to false in privacy-sensitive environments to prevent PII from reaching
47
+ # the tracing backend (OTel, Langfuse, etc.).
48
+ attr_accessor :trace_pii
49
+
45
50
  def initialize
46
51
  @recursion_limit = 25
47
52
  @tracer = Phronomy::Tracing::NullTracer.new
48
53
  @memory_async = false
49
54
  @memory_job_queue = :default
55
+ @trace_pii = true
50
56
  end
51
57
  end
52
58
  end
@@ -1,5 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require "cgi"
4
+
3
5
  module Phronomy
4
6
  module Context
5
7
  # Assembler collects all four context regions and produces the final
@@ -34,7 +36,7 @@ module Phronomy
34
36
  # @param trusted [Boolean]
35
37
  # @return [String]
36
38
  def self.xml_tag(text, type:, trusted: false)
37
- "<context type=\"#{type}\" trusted=\"#{trusted}\">\n#{text}\n</context>"
39
+ "<context type=\"#{CGI.escapeHTML(type.to_s)}\" trusted=\"#{trusted}\">\n#{CGI.escapeHTML(text.to_s)}\n</context>"
38
40
  end
39
41
 
40
42
  # @param budget [Phronomy::Context::TokenBudget, nil]
@@ -104,8 +106,8 @@ module Phronomy
104
106
  private
105
107
 
106
108
  def xml_context_tag(chunk)
107
- src_attr = chunk[:source] ? " source=\"#{chunk[:source]}\"" : ""
108
- "<context type=\"#{chunk[:type]}\"#{src_attr} trusted=\"#{chunk[:trusted]}\">\n#{chunk[:text]}\n</context>"
109
+ src_attr = chunk[:source] ? " source=\"#{CGI.escapeHTML(chunk[:source].to_s)}\"" : ""
110
+ "<context type=\"#{CGI.escapeHTML(chunk[:type].to_s)}\"#{src_attr} trusted=\"#{chunk[:trusted]}\">\n#{CGI.escapeHTML(chunk[:text].to_s)}\n</context>"
109
111
  end
110
112
 
111
113
  def trim_messages_to_budget(messages, system_text)
@@ -122,6 +124,12 @@ module Phronomy
122
124
  accumulated += tokens
123
125
  result.push(msg)
124
126
  end
127
+
128
+ if result.empty? && messages.any?
129
+ warn "[Phronomy::Assembler] All #{messages.length} conversation message(s) dropped: " \
130
+ "token budget exhausted by system context (budget=#{@budget.context_window}, used_by_system=#{used})"
131
+ end
132
+
125
133
  result.reverse
126
134
  end
127
135
  end
@@ -1,7 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- require "ostruct"
4
-
5
3
  module Phronomy
6
4
  module Context
7
5
  # Context object passed to the +on_compact+ callback registered on an agent.
@@ -103,7 +101,7 @@ module Phronomy
103
101
  end
104
102
 
105
103
  remaining = (@message_elements[(last_idx + 1)..] || []).map { |e| e[:message] }
106
- summary_msg = OpenStruct.new(role: :system, content: summary_text)
104
+ summary_msg = RubyLLM::Message.new(role: :system, content: summary_text)
107
105
  @result_messages = [summary_msg] + remaining
108
106
  end
109
107
  end
@@ -27,32 +27,46 @@ module Phronomy
27
27
  attr_reader :system_tokens
28
28
 
29
29
  def initialize
30
- reset
30
+ @mutex = Mutex.new
31
+ @fingerprint = nil
32
+ @system_text = nil
33
+ @system_tokens = 0
31
34
  end
32
35
 
33
36
  # Returns true when the given fingerprint matches the stored one.
37
+ # The check is performed under a mutex so that a concurrent #update cannot
38
+ # expose a partially-written state where fingerprint is new but system_text
39
+ # is still nil (Issue #55).
34
40
  #
35
41
  # @param fingerprint [String] SHA-256 hex digest to compare
36
42
  # @return [Boolean]
37
43
  def valid?(fingerprint)
38
- !@fingerprint.nil? && @fingerprint == fingerprint
44
+ @mutex.synchronize do
45
+ !@fingerprint.nil? && !@system_text.nil? && @fingerprint == fingerprint
46
+ end
39
47
  end
40
48
 
41
49
  # Update the cache with a new fingerprint and system text.
50
+ # All three assignments are performed atomically under a mutex so that
51
+ # concurrent readers never observe a partial state (Issue #55).
42
52
  #
43
53
  # @param fingerprint [String] new SHA-256 hex digest
44
54
  # @param system_text [String] fully assembled system prompt text
45
55
  def update(fingerprint:, system_text:)
46
- @fingerprint = fingerprint
47
- @system_text = system_text.to_s
48
- @system_tokens = TokenEstimator.estimate(@system_text)
56
+ @mutex.synchronize do
57
+ @fingerprint = fingerprint
58
+ @system_text = system_text.to_s
59
+ @system_tokens = TokenEstimator.estimate(@system_text)
60
+ end
49
61
  end
50
62
 
51
63
  # Clear all cached values (used for testing and forced invalidation).
52
64
  def reset
53
- @fingerprint = nil
54
- @system_text = nil
55
- @system_tokens = 0
65
+ @mutex.synchronize do
66
+ @fingerprint = nil
67
+ @system_text = nil
68
+ @system_tokens = 0
69
+ end
56
70
  end
57
71
  end
58
72
  end
@@ -23,13 +23,29 @@ module Phronomy
23
23
  # Phronomy::Context::TokenEstimator.tokenizer = nil
24
24
  module TokenEstimator
25
25
  @tokenizer = nil
26
+ @tokenizer_mutex = Mutex.new
26
27
 
27
28
  class << self
28
29
  # Replace the built-in heuristic with a callable that takes a String
29
30
  # and returns an Integer token count. Set to nil to restore the default.
30
31
  #
32
+ # @note This is a process-wide setting. Set it once at application startup.
33
+ # In tests, call +TokenEstimator.reset_tokenizer!+ after each test to
34
+ # prevent cross-test contamination.
31
35
  # @param callable [#call, nil]
32
- attr_accessor :tokenizer
36
+ def tokenizer=(callable)
37
+ @tokenizer_mutex.synchronize { @tokenizer = callable }
38
+ end
39
+
40
+ # @return [#call, nil]
41
+ def tokenizer
42
+ @tokenizer_mutex.synchronize { @tokenizer }
43
+ end
44
+
45
+ # Resets the tokenizer to the built-in heuristic. Intended for test isolation.
46
+ def reset_tokenizer!
47
+ @tokenizer_mutex.synchronize { @tokenizer = nil }
48
+ end
33
49
 
34
50
  # Estimate the number of tokens for the given input.
35
51
  #
@@ -37,9 +53,10 @@ module Phronomy
37
53
  # or an Array of message-like objects (each must respond to #content).
38
54
  # @return [Integer] estimated token count (>= 0)
39
55
  def estimate(input)
56
+ tok = @tokenizer_mutex.synchronize { @tokenizer }
40
57
  case input
41
58
  when String
42
- @tokenizer ? @tokenizer.call(input) : (input.length / 4.0).ceil
59
+ tok ? tok.call(input) : (input.length / 4.0).ceil
43
60
  when Array
44
61
  input.sum { |m| estimate(m.content.to_s) }
45
62
  else
@@ -4,16 +4,26 @@ module Phronomy
4
4
  module Eval
5
5
  # An immutable record holding the outcome of evaluating one EvalCase.
6
6
  #
7
- # @!attribute eval_case [EvalCase] the original sample
8
- # @!attribute actual [String] the callable's output
9
- # @!attribute score [Float] scorer-assigned value in [0.0, 1.0]
10
- # @!attribute usage [Phronomy::TokenUsage, nil]
7
+ # @!attribute eval_case [EvalCase] the original sample
8
+ # @!attribute actual [String] the callable's output
9
+ # @!attribute score [Float] scorer-assigned value in [0.0, 1.0]
10
+ # @!attribute usage [Phronomy::TokenUsage, nil]
11
11
  # @!attribute latency_ms [Integer] wall-clock time of the callable in ms
12
- EvalResult = Data.define(:eval_case, :actual, :score, :usage, :latency_ms) do
12
+ # @!attribute error [Exception, nil] set when the scorer raised an exception
13
+ EvalResult = Data.define(:eval_case, :actual, :score, :usage, :latency_ms, :error) do
14
+ def initialize(eval_case:, actual:, score:, usage:, latency_ms:, error: nil)
15
+ super
16
+ end
17
+
13
18
  # Returns true when the scorer assigned a perfect score of 1.0.
14
19
  def pass?
15
20
  score >= 1.0
16
21
  end
22
+
23
+ # Returns true when the scorer raised an exception.
24
+ def scorer_error?
25
+ !error.nil?
26
+ end
17
27
  end
18
28
  end
19
29
  end
@@ -22,24 +22,52 @@ module Phronomy
22
22
  @scorer = scorer
23
23
  end
24
24
 
25
- # @param dataset [Dataset] collection of EvalCase objects
26
- # @param callable [#call] accepts a single String argument
25
+ # @param dataset [Dataset] collection of EvalCase objects
26
+ # @param callable [#call] accepts a single String argument
27
+ # @param concurrency [Integer] number of parallel threads (default: 1, sequential)
27
28
  # @return [Array<EvalResult>]
28
- def run(dataset, callable)
29
- dataset.map do |eval_case|
30
- t0 = Process.clock_gettime(Process::CLOCK_MONOTONIC, :millisecond)
31
- result = callable.call(eval_case.input)
32
- latency_ms = Process.clock_gettime(Process::CLOCK_MONOTONIC, :millisecond) - t0
29
+ def run(dataset, callable, concurrency: 1)
30
+ cases = dataset.to_a
31
+ return cases.map { |eval_case| run_one(eval_case, callable) } if concurrency <= 1
33
32
 
34
- actual, usage = extract(result)
35
- score = @scorer.score(actual: actual, expected: eval_case.expected, input: eval_case.input)
36
-
37
- EvalResult.new(eval_case: eval_case, actual: actual, score: score, usage: usage, latency_ms: latency_ms)
33
+ # Run cases in slices of +concurrency+ threads. Each slice is joined
34
+ # before the next starts, bounding peak thread count to +concurrency+.
35
+ # Writing to pre-allocated slots (one per thread) is safe because each
36
+ # thread writes to a unique index and all threads in a slice are joined
37
+ # before the next slice begins.
38
+ # Exceptions in worker threads are collected and re-raised after all
39
+ # threads in the slice are joined, preventing orphaned threads.
40
+ results = Array.new(cases.length)
41
+ cases.each_with_index.each_slice(concurrency) do |batch|
42
+ errors = []
43
+ errors_mu = Mutex.new
44
+ threads = batch.map do |eval_case, i|
45
+ Thread.new do
46
+ results[i] = run_one(eval_case, callable)
47
+ rescue => e
48
+ errors_mu.synchronize { errors << e }
49
+ end
50
+ end
51
+ threads.each(&:join)
52
+ raise errors.first if errors.any?
38
53
  end
54
+ results
39
55
  end
40
56
 
41
57
  private
42
58
 
59
+ # Evaluate a single EvalCase with the given callable and return an EvalResult.
60
+ def run_one(eval_case, callable)
61
+ t0 = Process.clock_gettime(Process::CLOCK_MONOTONIC, :millisecond)
62
+ result = callable.call(eval_case.input)
63
+ latency_ms = Process.clock_gettime(Process::CLOCK_MONOTONIC, :millisecond) - t0
64
+
65
+ actual, usage = extract(result)
66
+ score, score_error = score_safely(@scorer, actual: actual, expected: eval_case.expected, input: eval_case.input)
67
+
68
+ EvalResult.new(eval_case: eval_case, actual: actual, score: score, usage: usage, latency_ms: latency_ms, error: score_error)
69
+ end
70
+
43
71
  # Normalises the callable's return value into [actual_string, usage_or_nil].
44
72
  def extract(result)
45
73
  if result.is_a?(Hash)
@@ -48,6 +76,13 @@ module Phronomy
48
76
  [result.to_s, nil]
49
77
  end
50
78
  end
79
+
80
+ # Calls the scorer and returns [score, error]. On failure, returns [0.0, exception].
81
+ def score_safely(scorer, **kwargs)
82
+ [scorer.score(**kwargs), nil]
83
+ rescue => e
84
+ [0.0, e]
85
+ end
51
86
  end
52
87
  end
53
88
  end
@@ -34,17 +34,22 @@ module Phronomy
34
34
 
35
35
  # @param model [String] RubyLLM model identifier
36
36
  # @param prompt_template [String] format string with %<input>s, %<expected>s, %<actual>s
37
- def initialize(model:, prompt_template: DEFAULT_PROMPT)
37
+ # @param raise_on_error [Boolean] when true, re-raises scoring exceptions instead of
38
+ # returning 0.0. Use this in batch eval pipelines where silent failures are unacceptable.
39
+ def initialize(model:, prompt_template: DEFAULT_PROMPT, raise_on_error: false)
38
40
  @model = model
39
41
  @prompt_template = prompt_template
42
+ @raise_on_error = raise_on_error
40
43
  end
41
44
 
42
- # @return [Float] score in [0.0, 1.0]; 0.0 on any error
45
+ # @return [Float] score in [0.0, 1.0]; 0.0 on error when raise_on_error is false
43
46
  def score(actual:, expected:, input: nil)
44
47
  prompt = format(@prompt_template, input: input.to_s, expected: expected.to_s, actual: actual.to_s)
45
48
  response = RubyLLM.chat(model: @model).ask(prompt)
46
49
  response.content.to_s.strip.scan(/-?\d+\.?\d*/).first.to_f.clamp(0.0, 1.0)
47
50
  rescue => e
51
+ raise if @raise_on_error
52
+
48
53
  warn "[LlmJudge] Scoring failed: #{e.message}"
49
54
  0.0
50
55
  end