phronomy 0.1.2 → 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/generators/phronomy/install/templates/create_phronomy_messages.rb.tt +1 -1
- data/lib/phronomy/agent/base.rb +68 -35
- data/lib/phronomy/agent/handoff.rb +6 -2
- data/lib/phronomy/agent/react_agent.rb +57 -31
- data/lib/phronomy/agent/runner.rb +6 -4
- data/lib/phronomy/configuration.rb +6 -0
- data/lib/phronomy/context/assembler.rb +11 -3
- data/lib/phronomy/context/compaction_context.rb +1 -3
- data/lib/phronomy/context/context_version_cache.rb +22 -8
- data/lib/phronomy/context/token_estimator.rb +19 -2
- data/lib/phronomy/eval/eval_result.rb +15 -5
- data/lib/phronomy/eval/runner.rb +46 -11
- data/lib/phronomy/eval/scorer/llm_judge.rb +7 -2
- data/lib/phronomy/graph/compiled_graph.rb +9 -1
- data/lib/phronomy/graph/parallel_node.rb +53 -18
- data/lib/phronomy/graph/state_graph.rb +7 -1
- data/lib/phronomy/guardrail/builtin/pii_pattern_detector.rb +47 -3
- data/lib/phronomy/guardrail/builtin/prompt_injection_detector.rb +15 -1
- data/lib/phronomy/memory/compression/summary.rb +4 -3
- data/lib/phronomy/memory/compression/tool_output_pruner.rb +11 -6
- data/lib/phronomy/memory/conversation_manager.rb +59 -14
- data/lib/phronomy/memory/retrieval/base.rb +4 -3
- data/lib/phronomy/memory/retrieval/composite.rb +5 -4
- data/lib/phronomy/memory/retrieval/recent.rb +4 -3
- data/lib/phronomy/memory/retrieval/semantic.rb +50 -17
- data/lib/phronomy/memory/storage/active_record.rb +18 -13
- data/lib/phronomy/memory/storage/in_memory.rb +25 -16
- data/lib/phronomy/rails/agent_job.rb +20 -3
- data/lib/phronomy/runnable.rb +4 -1
- data/lib/phronomy/state_store/active_record.rb +7 -3
- data/lib/phronomy/state_store/base.rb +16 -2
- data/lib/phronomy/state_store/in_memory.rb +5 -4
- data/lib/phronomy/tool/base.rb +19 -3
- data/lib/phronomy/tool/mcp_tool.rb +67 -9
- data/lib/phronomy/tracing/base.rb +0 -2
- data/lib/phronomy/tracing/langfuse_tracer.rb +24 -4
- data/lib/phronomy/tracing/null_tracer.rb +6 -3
- data/lib/phronomy/trust_pipeline.rb +32 -4
- data/lib/phronomy/vector_store/in_memory.rb +7 -5
- data/lib/phronomy/vector_store/redis_search.rb +30 -23
- data/lib/phronomy/version.rb +1 -1
- data/lib/phronomy.rb +39 -0
- metadata +2 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: d95954b46d12542673b5a319b338c7733d579b72105499a55dc4251628bc807f
|
|
4
|
+
data.tar.gz: 174341a0e329d861066d475b062260c3d78fac86da3d024ebc1594d7a37ec348
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: ee299b8d67fec8cb268683ffe672daab04a5c5b4794728dbeea3877e6c5216cefe91f292acac977d7293b837cd0b159445d58a87f925781a26f24438faecd010
|
|
7
|
+
data.tar.gz: 6da71943dc65b3671f5bd34ff18509a8ee2e0b12bfb5fcb206df77ba2c7345c9fd1b59dec90088dc3e8588e9327cb162046a803a9ac7cd405f4d30fc6712ebc3
|
|
@@ -3,7 +3,7 @@ class CreatePhronomyMessages < ActiveRecord::Migration[<%= ActiveRecord::Migrati
|
|
|
3
3
|
create_table :phronomy_messages do |t|
|
|
4
4
|
t.string :thread_id, null: false
|
|
5
5
|
t.string :role, null: false
|
|
6
|
-
t.text :content
|
|
6
|
+
t.text :content
|
|
7
7
|
t.text :tool_calls_json
|
|
8
8
|
t.string :model_id
|
|
9
9
|
t.timestamps
|
data/lib/phronomy/agent/base.rb
CHANGED
|
@@ -446,55 +446,88 @@ module Phronomy
|
|
|
446
446
|
def stream(input, config: {}, &block)
|
|
447
447
|
return invoke(input, config: config) unless block
|
|
448
448
|
|
|
449
|
-
|
|
449
|
+
caller_meta = {}
|
|
450
|
+
caller_meta[:user_id] = config[:user_id] if config[:user_id]
|
|
451
|
+
caller_meta[:session_id] = config[:session_id] if config[:session_id]
|
|
452
|
+
|
|
453
|
+
trace("agent.invoke", input: input, **caller_meta) do |_span|
|
|
454
|
+
run_input_guardrails!(input)
|
|
450
455
|
|
|
451
|
-
|
|
452
|
-
|
|
456
|
+
memory = config[:memory]
|
|
457
|
+
thread_id = config[:thread_id]
|
|
453
458
|
|
|
454
|
-
|
|
455
|
-
|
|
459
|
+
chat = build_chat
|
|
460
|
+
user_message = extract_message(input)
|
|
461
|
+
budget = build_token_budget
|
|
456
462
|
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
|
|
463
|
+
# Assemble context via Assembler (same as invoke_once).
|
|
464
|
+
assembler = Context::Assembler.new(budget: budget)
|
|
465
|
+
system_msg = build_instructions(input)
|
|
466
|
+
assembler.add_instruction(system_msg) if system_msg
|
|
461
467
|
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
468
|
+
Array(config[:knowledge_sources]).each do |ks|
|
|
469
|
+
ks.fetch(query: user_message).each do |chunk|
|
|
470
|
+
assembler.add_knowledge(chunk[:content], type: chunk[:type], source: chunk[:source])
|
|
471
|
+
end
|
|
465
472
|
end
|
|
466
|
-
end
|
|
467
473
|
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
end
|
|
474
|
+
if memory && thread_id
|
|
475
|
+
msgs = load_from_memory(memory, thread_id: thread_id, query: user_message)
|
|
476
|
+
message_elements = build_message_elements(msgs)
|
|
472
477
|
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
|
|
478
|
+
# Run on_trim: app may call ctx.remove(seqs) to drop messages this turn.
|
|
479
|
+
if (trim_cb = self.class._on_trim_callback)
|
|
480
|
+
trim_ctx = Context::TrimContext.new(message_elements: message_elements, budget: budget)
|
|
481
|
+
trim_cb.call(trim_ctx)
|
|
482
|
+
message_elements = trim_ctx.message_elements
|
|
483
|
+
end
|
|
476
484
|
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
|
|
485
|
+
# Run on_compaction_trigger → on_compact pipeline before calling the LLM.
|
|
486
|
+
if (trigger_cb = self.class._on_compaction_trigger_callback)
|
|
487
|
+
trigger_ctx = Context::TriggerContext.new(message_elements: message_elements, budget: budget)
|
|
488
|
+
if trigger_cb.call(trigger_ctx)
|
|
489
|
+
if (compact_cb = self.class._on_compact_callback)
|
|
490
|
+
compact_ctx = Context::CompactionContext.new(
|
|
491
|
+
message_elements: message_elements,
|
|
492
|
+
budget: budget,
|
|
493
|
+
thread_id: thread_id,
|
|
494
|
+
memory: memory
|
|
495
|
+
)
|
|
496
|
+
compact_cb.call(compact_ctx)
|
|
497
|
+
message_elements = build_message_elements(compact_ctx.result_messages)
|
|
498
|
+
end
|
|
499
|
+
end
|
|
500
|
+
end
|
|
480
501
|
|
|
481
|
-
|
|
482
|
-
|
|
502
|
+
assembler.add_messages(message_elements.map { |e| e[:message] })
|
|
503
|
+
end
|
|
483
504
|
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
|
|
505
|
+
context = assembler.build
|
|
506
|
+
apply_instructions(chat, context[:system]) if context[:system]
|
|
507
|
+
context[:messages].each { |msg| chat.messages << msg }
|
|
487
508
|
|
|
488
|
-
|
|
509
|
+
# Wire per-event callbacks to yield StreamEvents.
|
|
510
|
+
chat.before_tool_call { |tool_call| block.call(StreamEvent.new(type: :tool_call, payload: {tool_call: tool_call})) }
|
|
511
|
+
chat.after_tool_result { |tool_result| block.call(StreamEvent.new(type: :tool_result, payload: {tool_result: tool_result})) }
|
|
489
512
|
|
|
490
|
-
|
|
491
|
-
|
|
513
|
+
# Run before_completion hooks (global → class → instance) before the LLM call.
|
|
514
|
+
run_before_completion_hooks!(chat, config)
|
|
492
515
|
|
|
493
|
-
|
|
516
|
+
response = chat.ask(user_message) do |chunk|
|
|
517
|
+
block.call(StreamEvent.new(type: :token, payload: {content: chunk.content}))
|
|
518
|
+
end
|
|
494
519
|
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
|
|
520
|
+
save_to_memory(memory, thread_id: thread_id, messages: chat.messages) if memory && thread_id
|
|
521
|
+
|
|
522
|
+
output = response.content
|
|
523
|
+
usage = Phronomy::TokenUsage.from_tokens(response.tokens)
|
|
524
|
+
|
|
525
|
+
run_output_guardrails!(output)
|
|
526
|
+
|
|
527
|
+
result = {output: output, messages: chat.messages, usage: usage}
|
|
528
|
+
block.call(StreamEvent.new(type: :done, payload: result))
|
|
529
|
+
[result, usage]
|
|
530
|
+
end
|
|
498
531
|
rescue => e
|
|
499
532
|
block&.call(StreamEvent.new(type: :error, payload: {error: e}))
|
|
500
533
|
raise
|
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
+
require "securerandom"
|
|
4
|
+
|
|
3
5
|
module Phronomy
|
|
4
6
|
module Agent
|
|
5
7
|
# Represents a transfer edge from one agent to another.
|
|
@@ -23,7 +25,9 @@ module Phronomy
|
|
|
23
25
|
def initialize(target_agent:, description: nil)
|
|
24
26
|
@target_agent = target_agent
|
|
25
27
|
klass_name = target_agent.class.name&.split("::")&.last || "Agent"
|
|
26
|
-
|
|
28
|
+
# Use a UUID so that two handoffs targeting the same class remain distinct.
|
|
29
|
+
@uuid = SecureRandom.uuid
|
|
30
|
+
@tool_name = "transfer_to_#{snake_case(klass_name)}_#{@uuid.delete("-")[0, 8]}"
|
|
27
31
|
@description = description || "Transfer the conversation to #{klass_name}."
|
|
28
32
|
end
|
|
29
33
|
|
|
@@ -43,7 +47,7 @@ module Phronomy
|
|
|
43
47
|
# The sentinel string embedded in the tool result.
|
|
44
48
|
# @return [String]
|
|
45
49
|
def sentinel
|
|
46
|
-
"#{SENTINEL_PREFIX}:#{target_agent.class.name}"
|
|
50
|
+
"#{SENTINEL_PREFIX}:#{target_agent.class.name}:#{@uuid}"
|
|
47
51
|
end
|
|
48
52
|
|
|
49
53
|
private
|
|
@@ -5,7 +5,11 @@ module Phronomy
|
|
|
5
5
|
# ReAct pattern (Reasoning + Acting) agent.
|
|
6
6
|
# Repeats the LLM <-> Tool loop until no more tool calls are made.
|
|
7
7
|
class ReactAgent < Base
|
|
8
|
-
|
|
8
|
+
private
|
|
9
|
+
|
|
10
|
+
# Performs a single (non-retried) ReAct invocation.
|
|
11
|
+
# Overrides Base#invoke_once so that Base#invoke's retry loop is inherited.
|
|
12
|
+
def invoke_once(input, config: {})
|
|
9
13
|
caller_meta = {}
|
|
10
14
|
caller_meta[:user_id] = config[:user_id] if config[:user_id]
|
|
11
15
|
caller_meta[:session_id] = config[:session_id] if config[:session_id]
|
|
@@ -28,27 +32,37 @@ module Phronomy
|
|
|
28
32
|
messages = initial_messages.dup
|
|
29
33
|
user_asked = false
|
|
30
34
|
total_usage = Phronomy::TokenUsage.zero
|
|
35
|
+
iterations_exhausted = true
|
|
31
36
|
|
|
32
37
|
max_iter.times do
|
|
33
38
|
response = step(messages, input, user_asked: user_asked, config: config)
|
|
34
39
|
user_asked = true
|
|
35
40
|
messages = response[:messages]
|
|
36
41
|
total_usage += response[:usage]
|
|
37
|
-
|
|
42
|
+
if response[:done]
|
|
43
|
+
iterations_exhausted = false
|
|
44
|
+
break
|
|
45
|
+
end
|
|
38
46
|
end
|
|
39
47
|
|
|
40
48
|
save_to_memory(memory, thread_id: thread_id, messages: messages) if memory && thread_id
|
|
41
49
|
|
|
42
|
-
|
|
50
|
+
# Fall back to the last message that carries non-nil content. This
|
|
51
|
+
# guards against the case where the final message is a tool-call or
|
|
52
|
+
# tool-result message (content == nil) when max_iterations is
|
|
53
|
+
# exhausted before the model produces a text reply.
|
|
54
|
+
output = messages.reverse.find { |m| m.content && !m.content.empty? }&.content
|
|
43
55
|
|
|
44
56
|
# Run output guardrails before returning to the caller.
|
|
45
57
|
run_output_guardrails!(output)
|
|
46
58
|
|
|
47
|
-
result = {output: output, messages: messages, usage: total_usage}
|
|
59
|
+
result = {output: output, messages: messages, usage: total_usage, iterations_exhausted: iterations_exhausted}
|
|
48
60
|
[result, total_usage]
|
|
49
61
|
end
|
|
50
62
|
end
|
|
51
63
|
|
|
64
|
+
public
|
|
65
|
+
|
|
52
66
|
# Streaming version of #invoke for the ReAct loop.
|
|
53
67
|
# Yields {Phronomy::Agent::StreamEvent} events while the LLM-tool loop runs.
|
|
54
68
|
#
|
|
@@ -59,38 +73,50 @@ module Phronomy
|
|
|
59
73
|
def stream(input, config: {}, &block)
|
|
60
74
|
return invoke(input, config: config) unless block
|
|
61
75
|
|
|
62
|
-
|
|
76
|
+
caller_meta = {}
|
|
77
|
+
caller_meta[:user_id] = config[:user_id] if config[:user_id]
|
|
78
|
+
caller_meta[:session_id] = config[:session_id] if config[:session_id]
|
|
79
|
+
|
|
80
|
+
trace("agent.invoke", input: input, **caller_meta) do |_span|
|
|
81
|
+
run_input_guardrails!(input)
|
|
63
82
|
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
83
|
+
memory = config[:memory]
|
|
84
|
+
thread_id = config[:thread_id]
|
|
85
|
+
max_iter = self.class.max_iterations
|
|
67
86
|
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
87
|
+
initial_messages = if memory && thread_id
|
|
88
|
+
load_from_memory(memory, thread_id: thread_id, query: extract_message(input))
|
|
89
|
+
else
|
|
90
|
+
[]
|
|
91
|
+
end
|
|
73
92
|
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
93
|
+
messages = initial_messages.dup
|
|
94
|
+
user_asked = false
|
|
95
|
+
total_usage = Phronomy::TokenUsage.zero
|
|
96
|
+
iterations_exhausted = true
|
|
77
97
|
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
98
|
+
max_iter.times do
|
|
99
|
+
response = stream_step(messages, input, user_asked: user_asked, config: config, &block)
|
|
100
|
+
user_asked = true
|
|
101
|
+
messages = response[:messages]
|
|
102
|
+
total_usage += response[:usage]
|
|
103
|
+
if response[:done]
|
|
104
|
+
iterations_exhausted = false
|
|
105
|
+
break
|
|
106
|
+
end
|
|
107
|
+
end
|
|
85
108
|
|
|
86
|
-
|
|
109
|
+
save_to_memory(memory, thread_id: thread_id, messages: messages) if memory && thread_id
|
|
87
110
|
|
|
88
|
-
|
|
89
|
-
|
|
111
|
+
# Fall back to the last message that carries non-nil content (same as
|
|
112
|
+
# the non-streaming path above).
|
|
113
|
+
output = messages.reverse.find { |m| m.content && !m.content.empty? }&.content
|
|
114
|
+
run_output_guardrails!(output)
|
|
90
115
|
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
116
|
+
result = {output: output, messages: messages, usage: total_usage, iterations_exhausted: iterations_exhausted}
|
|
117
|
+
block.call(StreamEvent.new(type: :done, payload: result))
|
|
118
|
+
[result, total_usage]
|
|
119
|
+
end
|
|
94
120
|
rescue => e
|
|
95
121
|
block&.call(StreamEvent.new(type: :error, payload: {error: e}))
|
|
96
122
|
raise
|
|
@@ -128,8 +154,8 @@ module Phronomy
|
|
|
128
154
|
chat = build_chat
|
|
129
155
|
messages.each { |m| chat.add_message(m) }
|
|
130
156
|
|
|
131
|
-
chat.
|
|
132
|
-
chat.
|
|
157
|
+
chat.before_tool_call { |tc| block.call(StreamEvent.new(type: :tool_call, payload: {tool_call: tc})) }
|
|
158
|
+
chat.after_tool_result { |tr| block.call(StreamEvent.new(type: :tool_result, payload: {tool_result: tr})) }
|
|
133
159
|
|
|
134
160
|
# Run before_completion hooks before each LLM call in the streaming loop.
|
|
135
161
|
run_before_completion_hooks!(chat, config)
|
|
@@ -52,14 +52,16 @@ module Phronomy
|
|
|
52
52
|
handoffs_taken = 0
|
|
53
53
|
|
|
54
54
|
loop do
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
return result.merge(agent: current) unless target
|
|
58
|
-
|
|
55
|
+
# Check before invoking so we raise after exactly MAX_HANDOFFS handoffs,
|
|
56
|
+
# not after MAX_HANDOFFS + 1 LLM calls.
|
|
59
57
|
if handoffs_taken >= MAX_HANDOFFS
|
|
60
58
|
raise Phronomy::HandoffError, "Exceeded maximum handoffs (#{MAX_HANDOFFS})"
|
|
61
59
|
end
|
|
62
60
|
|
|
61
|
+
result = current.invoke(input, config: config)
|
|
62
|
+
target = find_handoff_target(result[:messages])
|
|
63
|
+
return result.merge(agent: current) unless target
|
|
64
|
+
|
|
63
65
|
current = target
|
|
64
66
|
handoffs_taken += 1
|
|
65
67
|
end
|
|
@@ -42,11 +42,17 @@ module Phronomy
|
|
|
42
42
|
# Recursion limit for graph execution (default: 25)
|
|
43
43
|
attr_accessor :recursion_limit
|
|
44
44
|
|
|
45
|
+
# When true (default), user input and LLM output are recorded in trace spans.
|
|
46
|
+
# Set to false in privacy-sensitive environments to prevent PII from reaching
|
|
47
|
+
# the tracing backend (OTel, Langfuse, etc.).
|
|
48
|
+
attr_accessor :trace_pii
|
|
49
|
+
|
|
45
50
|
def initialize
|
|
46
51
|
@recursion_limit = 25
|
|
47
52
|
@tracer = Phronomy::Tracing::NullTracer.new
|
|
48
53
|
@memory_async = false
|
|
49
54
|
@memory_job_queue = :default
|
|
55
|
+
@trace_pii = true
|
|
50
56
|
end
|
|
51
57
|
end
|
|
52
58
|
end
|
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
+
require "cgi"
|
|
4
|
+
|
|
3
5
|
module Phronomy
|
|
4
6
|
module Context
|
|
5
7
|
# Assembler collects all four context regions and produces the final
|
|
@@ -34,7 +36,7 @@ module Phronomy
|
|
|
34
36
|
# @param trusted [Boolean]
|
|
35
37
|
# @return [String]
|
|
36
38
|
def self.xml_tag(text, type:, trusted: false)
|
|
37
|
-
"<context type=\"#{type}\" trusted=\"#{trusted}\">\n#{text}\n</context>"
|
|
39
|
+
"<context type=\"#{CGI.escapeHTML(type.to_s)}\" trusted=\"#{trusted}\">\n#{CGI.escapeHTML(text.to_s)}\n</context>"
|
|
38
40
|
end
|
|
39
41
|
|
|
40
42
|
# @param budget [Phronomy::Context::TokenBudget, nil]
|
|
@@ -104,8 +106,8 @@ module Phronomy
|
|
|
104
106
|
private
|
|
105
107
|
|
|
106
108
|
def xml_context_tag(chunk)
|
|
107
|
-
src_attr = chunk[:source] ? " source=\"#{chunk[:source]}\"" : ""
|
|
108
|
-
"<context type=\"#{chunk[:type]}\"#{src_attr} trusted=\"#{chunk[:trusted]}\">\n#{chunk[:text]}\n</context>"
|
|
109
|
+
src_attr = chunk[:source] ? " source=\"#{CGI.escapeHTML(chunk[:source].to_s)}\"" : ""
|
|
110
|
+
"<context type=\"#{CGI.escapeHTML(chunk[:type].to_s)}\"#{src_attr} trusted=\"#{chunk[:trusted]}\">\n#{CGI.escapeHTML(chunk[:text].to_s)}\n</context>"
|
|
109
111
|
end
|
|
110
112
|
|
|
111
113
|
def trim_messages_to_budget(messages, system_text)
|
|
@@ -122,6 +124,12 @@ module Phronomy
|
|
|
122
124
|
accumulated += tokens
|
|
123
125
|
result.push(msg)
|
|
124
126
|
end
|
|
127
|
+
|
|
128
|
+
if result.empty? && messages.any?
|
|
129
|
+
warn "[Phronomy::Assembler] All #{messages.length} conversation message(s) dropped: " \
|
|
130
|
+
"token budget exhausted by system context (budget=#{@budget.context_window}, used_by_system=#{used})"
|
|
131
|
+
end
|
|
132
|
+
|
|
125
133
|
result.reverse
|
|
126
134
|
end
|
|
127
135
|
end
|
|
@@ -1,7 +1,5 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
-
require "ostruct"
|
|
4
|
-
|
|
5
3
|
module Phronomy
|
|
6
4
|
module Context
|
|
7
5
|
# Context object passed to the +on_compact+ callback registered on an agent.
|
|
@@ -103,7 +101,7 @@ module Phronomy
|
|
|
103
101
|
end
|
|
104
102
|
|
|
105
103
|
remaining = (@message_elements[(last_idx + 1)..] || []).map { |e| e[:message] }
|
|
106
|
-
summary_msg =
|
|
104
|
+
summary_msg = RubyLLM::Message.new(role: :system, content: summary_text)
|
|
107
105
|
@result_messages = [summary_msg] + remaining
|
|
108
106
|
end
|
|
109
107
|
end
|
|
@@ -27,32 +27,46 @@ module Phronomy
|
|
|
27
27
|
attr_reader :system_tokens
|
|
28
28
|
|
|
29
29
|
def initialize
|
|
30
|
-
|
|
30
|
+
@mutex = Mutex.new
|
|
31
|
+
@fingerprint = nil
|
|
32
|
+
@system_text = nil
|
|
33
|
+
@system_tokens = 0
|
|
31
34
|
end
|
|
32
35
|
|
|
33
36
|
# Returns true when the given fingerprint matches the stored one.
|
|
37
|
+
# The check is performed under a mutex so that a concurrent #update cannot
|
|
38
|
+
# expose a partially-written state where fingerprint is new but system_text
|
|
39
|
+
# is still nil (Issue #55).
|
|
34
40
|
#
|
|
35
41
|
# @param fingerprint [String] SHA-256 hex digest to compare
|
|
36
42
|
# @return [Boolean]
|
|
37
43
|
def valid?(fingerprint)
|
|
38
|
-
|
|
44
|
+
@mutex.synchronize do
|
|
45
|
+
!@fingerprint.nil? && !@system_text.nil? && @fingerprint == fingerprint
|
|
46
|
+
end
|
|
39
47
|
end
|
|
40
48
|
|
|
41
49
|
# Update the cache with a new fingerprint and system text.
|
|
50
|
+
# All three assignments are performed atomically under a mutex so that
|
|
51
|
+
# concurrent readers never observe a partial state (Issue #55).
|
|
42
52
|
#
|
|
43
53
|
# @param fingerprint [String] new SHA-256 hex digest
|
|
44
54
|
# @param system_text [String] fully assembled system prompt text
|
|
45
55
|
def update(fingerprint:, system_text:)
|
|
46
|
-
@
|
|
47
|
-
|
|
48
|
-
|
|
56
|
+
@mutex.synchronize do
|
|
57
|
+
@fingerprint = fingerprint
|
|
58
|
+
@system_text = system_text.to_s
|
|
59
|
+
@system_tokens = TokenEstimator.estimate(@system_text)
|
|
60
|
+
end
|
|
49
61
|
end
|
|
50
62
|
|
|
51
63
|
# Clear all cached values (used for testing and forced invalidation).
|
|
52
64
|
def reset
|
|
53
|
-
@
|
|
54
|
-
|
|
55
|
-
|
|
65
|
+
@mutex.synchronize do
|
|
66
|
+
@fingerprint = nil
|
|
67
|
+
@system_text = nil
|
|
68
|
+
@system_tokens = 0
|
|
69
|
+
end
|
|
56
70
|
end
|
|
57
71
|
end
|
|
58
72
|
end
|
|
@@ -23,13 +23,29 @@ module Phronomy
|
|
|
23
23
|
# Phronomy::Context::TokenEstimator.tokenizer = nil
|
|
24
24
|
module TokenEstimator
|
|
25
25
|
@tokenizer = nil
|
|
26
|
+
@tokenizer_mutex = Mutex.new
|
|
26
27
|
|
|
27
28
|
class << self
|
|
28
29
|
# Replace the built-in heuristic with a callable that takes a String
|
|
29
30
|
# and returns an Integer token count. Set to nil to restore the default.
|
|
30
31
|
#
|
|
32
|
+
# @note This is a process-wide setting. Set it once at application startup.
|
|
33
|
+
# In tests, call +TokenEstimator.reset_tokenizer!+ after each test to
|
|
34
|
+
# prevent cross-test contamination.
|
|
31
35
|
# @param callable [#call, nil]
|
|
32
|
-
|
|
36
|
+
def tokenizer=(callable)
|
|
37
|
+
@tokenizer_mutex.synchronize { @tokenizer = callable }
|
|
38
|
+
end
|
|
39
|
+
|
|
40
|
+
# @return [#call, nil]
|
|
41
|
+
def tokenizer
|
|
42
|
+
@tokenizer_mutex.synchronize { @tokenizer }
|
|
43
|
+
end
|
|
44
|
+
|
|
45
|
+
# Resets the tokenizer to the built-in heuristic. Intended for test isolation.
|
|
46
|
+
def reset_tokenizer!
|
|
47
|
+
@tokenizer_mutex.synchronize { @tokenizer = nil }
|
|
48
|
+
end
|
|
33
49
|
|
|
34
50
|
# Estimate the number of tokens for the given input.
|
|
35
51
|
#
|
|
@@ -37,9 +53,10 @@ module Phronomy
|
|
|
37
53
|
# or an Array of message-like objects (each must respond to #content).
|
|
38
54
|
# @return [Integer] estimated token count (>= 0)
|
|
39
55
|
def estimate(input)
|
|
56
|
+
tok = @tokenizer_mutex.synchronize { @tokenizer }
|
|
40
57
|
case input
|
|
41
58
|
when String
|
|
42
|
-
|
|
59
|
+
tok ? tok.call(input) : (input.length / 4.0).ceil
|
|
43
60
|
when Array
|
|
44
61
|
input.sum { |m| estimate(m.content.to_s) }
|
|
45
62
|
else
|
|
@@ -4,16 +4,26 @@ module Phronomy
|
|
|
4
4
|
module Eval
|
|
5
5
|
# An immutable record holding the outcome of evaluating one EvalCase.
|
|
6
6
|
#
|
|
7
|
-
# @!attribute eval_case
|
|
8
|
-
# @!attribute actual
|
|
9
|
-
# @!attribute score
|
|
10
|
-
# @!attribute usage
|
|
7
|
+
# @!attribute eval_case [EvalCase] the original sample
|
|
8
|
+
# @!attribute actual [String] the callable's output
|
|
9
|
+
# @!attribute score [Float] scorer-assigned value in [0.0, 1.0]
|
|
10
|
+
# @!attribute usage [Phronomy::TokenUsage, nil]
|
|
11
11
|
# @!attribute latency_ms [Integer] wall-clock time of the callable in ms
|
|
12
|
-
|
|
12
|
+
# @!attribute error [Exception, nil] set when the scorer raised an exception
|
|
13
|
+
EvalResult = Data.define(:eval_case, :actual, :score, :usage, :latency_ms, :error) do
|
|
14
|
+
def initialize(eval_case:, actual:, score:, usage:, latency_ms:, error: nil)
|
|
15
|
+
super
|
|
16
|
+
end
|
|
17
|
+
|
|
13
18
|
# Returns true when the scorer assigned a perfect score of 1.0.
|
|
14
19
|
def pass?
|
|
15
20
|
score >= 1.0
|
|
16
21
|
end
|
|
22
|
+
|
|
23
|
+
# Returns true when the scorer raised an exception.
|
|
24
|
+
def scorer_error?
|
|
25
|
+
!error.nil?
|
|
26
|
+
end
|
|
17
27
|
end
|
|
18
28
|
end
|
|
19
29
|
end
|
data/lib/phronomy/eval/runner.rb
CHANGED
|
@@ -22,24 +22,52 @@ module Phronomy
|
|
|
22
22
|
@scorer = scorer
|
|
23
23
|
end
|
|
24
24
|
|
|
25
|
-
# @param dataset
|
|
26
|
-
# @param callable
|
|
25
|
+
# @param dataset [Dataset] collection of EvalCase objects
|
|
26
|
+
# @param callable [#call] accepts a single String argument
|
|
27
|
+
# @param concurrency [Integer] number of parallel threads (default: 1, sequential)
|
|
27
28
|
# @return [Array<EvalResult>]
|
|
28
|
-
def run(dataset, callable)
|
|
29
|
-
dataset.
|
|
30
|
-
|
|
31
|
-
result = callable.call(eval_case.input)
|
|
32
|
-
latency_ms = Process.clock_gettime(Process::CLOCK_MONOTONIC, :millisecond) - t0
|
|
29
|
+
def run(dataset, callable, concurrency: 1)
|
|
30
|
+
cases = dataset.to_a
|
|
31
|
+
return cases.map { |eval_case| run_one(eval_case, callable) } if concurrency <= 1
|
|
33
32
|
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
33
|
+
# Run cases in slices of +concurrency+ threads. Each slice is joined
|
|
34
|
+
# before the next starts, bounding peak thread count to +concurrency+.
|
|
35
|
+
# Writing to pre-allocated slots (one per thread) is safe because each
|
|
36
|
+
# thread writes to a unique index and all threads in a slice are joined
|
|
37
|
+
# before the next slice begins.
|
|
38
|
+
# Exceptions in worker threads are collected and re-raised after all
|
|
39
|
+
# threads in the slice are joined, preventing orphaned threads.
|
|
40
|
+
results = Array.new(cases.length)
|
|
41
|
+
cases.each_with_index.each_slice(concurrency) do |batch|
|
|
42
|
+
errors = []
|
|
43
|
+
errors_mu = Mutex.new
|
|
44
|
+
threads = batch.map do |eval_case, i|
|
|
45
|
+
Thread.new do
|
|
46
|
+
results[i] = run_one(eval_case, callable)
|
|
47
|
+
rescue => e
|
|
48
|
+
errors_mu.synchronize { errors << e }
|
|
49
|
+
end
|
|
50
|
+
end
|
|
51
|
+
threads.each(&:join)
|
|
52
|
+
raise errors.first if errors.any?
|
|
38
53
|
end
|
|
54
|
+
results
|
|
39
55
|
end
|
|
40
56
|
|
|
41
57
|
private
|
|
42
58
|
|
|
59
|
+
# Evaluate a single EvalCase with the given callable and return an EvalResult.
|
|
60
|
+
def run_one(eval_case, callable)
|
|
61
|
+
t0 = Process.clock_gettime(Process::CLOCK_MONOTONIC, :millisecond)
|
|
62
|
+
result = callable.call(eval_case.input)
|
|
63
|
+
latency_ms = Process.clock_gettime(Process::CLOCK_MONOTONIC, :millisecond) - t0
|
|
64
|
+
|
|
65
|
+
actual, usage = extract(result)
|
|
66
|
+
score, score_error = score_safely(@scorer, actual: actual, expected: eval_case.expected, input: eval_case.input)
|
|
67
|
+
|
|
68
|
+
EvalResult.new(eval_case: eval_case, actual: actual, score: score, usage: usage, latency_ms: latency_ms, error: score_error)
|
|
69
|
+
end
|
|
70
|
+
|
|
43
71
|
# Normalises the callable's return value into [actual_string, usage_or_nil].
|
|
44
72
|
def extract(result)
|
|
45
73
|
if result.is_a?(Hash)
|
|
@@ -48,6 +76,13 @@ module Phronomy
|
|
|
48
76
|
[result.to_s, nil]
|
|
49
77
|
end
|
|
50
78
|
end
|
|
79
|
+
|
|
80
|
+
# Calls the scorer and returns [score, error]. On failure, returns [0.0, exception].
|
|
81
|
+
def score_safely(scorer, **kwargs)
|
|
82
|
+
[scorer.score(**kwargs), nil]
|
|
83
|
+
rescue => e
|
|
84
|
+
[0.0, e]
|
|
85
|
+
end
|
|
51
86
|
end
|
|
52
87
|
end
|
|
53
88
|
end
|
|
@@ -34,17 +34,22 @@ module Phronomy
|
|
|
34
34
|
|
|
35
35
|
# @param model [String] RubyLLM model identifier
|
|
36
36
|
# @param prompt_template [String] format string with %<input>s, %<expected>s, %<actual>s
|
|
37
|
-
|
|
37
|
+
# @param raise_on_error [Boolean] when true, re-raises scoring exceptions instead of
|
|
38
|
+
# returning 0.0. Use this in batch eval pipelines where silent failures are unacceptable.
|
|
39
|
+
def initialize(model:, prompt_template: DEFAULT_PROMPT, raise_on_error: false)
|
|
38
40
|
@model = model
|
|
39
41
|
@prompt_template = prompt_template
|
|
42
|
+
@raise_on_error = raise_on_error
|
|
40
43
|
end
|
|
41
44
|
|
|
42
|
-
# @return [Float] score in [0.0, 1.0]; 0.0 on
|
|
45
|
+
# @return [Float] score in [0.0, 1.0]; 0.0 on error when raise_on_error is false
|
|
43
46
|
def score(actual:, expected:, input: nil)
|
|
44
47
|
prompt = format(@prompt_template, input: input.to_s, expected: expected.to_s, actual: actual.to_s)
|
|
45
48
|
response = RubyLLM.chat(model: @model).ask(prompt)
|
|
46
49
|
response.content.to_s.strip.scan(/-?\d+\.?\d*/).first.to_f.clamp(0.0, 1.0)
|
|
47
50
|
rescue => e
|
|
51
|
+
raise if @raise_on_error
|
|
52
|
+
|
|
48
53
|
warn "[LlmJudge] Scoring failed: #{e.message}"
|
|
49
54
|
0.0
|
|
50
55
|
end
|