phronomy 0.6.0 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.mutant.yml +21 -0
- data/CHANGELOG.md +338 -0
- data/CONTRIBUTING.md +102 -0
- data/README.md +242 -27
- data/RELEASE_CHECKLIST.md +86 -0
- data/SECURITY.md +80 -0
- data/benchmark/baseline.json +9 -0
- data/benchmark/bench_agent_invoke.rb +105 -0
- data/benchmark/bench_context_assembler.rb +46 -0
- data/benchmark/bench_regression.rb +171 -0
- data/benchmark/bench_token_estimator.rb +44 -0
- data/benchmark/bench_tool_schema.rb +69 -0
- data/benchmark/bench_vector_store.rb +39 -0
- data/benchmark/bench_workflow.rb +55 -0
- data/benchmark/run_all.rb +118 -0
- data/docs/decisions/001-rubyllm-as-provider-layer.md +42 -0
- data/docs/decisions/002-workflow-context-immutability.md +42 -0
- data/docs/decisions/003-event-loop-singleton.md +48 -0
- data/docs/decisions/004-invoke-timeout-is-not-cancellation.md +51 -0
- data/docs/decisions/005-static-knowledge-class-level-cache.md +45 -0
- data/docs/decisions/006-no-built-in-guardrails.md +48 -0
- data/docs/decisions/007-mcp-is-beta-stability.md +51 -0
- data/docs/decisions/008-orchestrator-uses-os-threads.md +52 -0
- data/docs/decisions/009-state-store-abstraction.md +141 -0
- data/lib/phronomy/agent/base.rb +194 -12
- data/lib/phronomy/agent/before_completion_context.rb +1 -0
- data/lib/phronomy/agent/checkpoint.rb +1 -0
- data/lib/phronomy/agent/concerns/before_completion.rb +6 -0
- data/lib/phronomy/agent/concerns/error_translation.rb +45 -0
- data/lib/phronomy/agent/concerns/guardrailable.rb +3 -0
- data/lib/phronomy/agent/concerns/retryable.rb +12 -1
- data/lib/phronomy/agent/concerns/suspendable.rb +4 -0
- data/lib/phronomy/agent/fsm.rb +15 -0
- data/lib/phronomy/agent/handoff.rb +3 -0
- data/lib/phronomy/agent/orchestrator.rb +123 -11
- data/lib/phronomy/agent/parallel_tool_chat.rb +21 -4
- data/lib/phronomy/agent/react_agent.rb +8 -6
- data/lib/phronomy/agent/runner.rb +2 -0
- data/lib/phronomy/agent/shared_state.rb +11 -0
- data/lib/phronomy/agent/suspend_signal.rb +2 -0
- data/lib/phronomy/agent/team_coordinator.rb +17 -5
- data/lib/phronomy/cancellation_token.rb +92 -0
- data/lib/phronomy/configuration.rb +26 -2
- data/lib/phronomy/context/assembler.rb +6 -0
- data/lib/phronomy/context/compaction_context.rb +2 -0
- data/lib/phronomy/context/context_version_cache.rb +2 -0
- data/lib/phronomy/context/token_budget.rb +3 -0
- data/lib/phronomy/context/token_estimator.rb +9 -2
- data/lib/phronomy/context/trigger_context.rb +1 -0
- data/lib/phronomy/context/trim_context.rb +4 -0
- data/lib/phronomy/embeddings/base.rb +5 -2
- data/lib/phronomy/embeddings/ruby_llm_embeddings.rb +6 -2
- data/lib/phronomy/eval/comparison.rb +2 -0
- data/lib/phronomy/eval/dataset.rb +4 -0
- data/lib/phronomy/eval/metrics.rb +6 -0
- data/lib/phronomy/eval/runner.rb +2 -0
- data/lib/phronomy/eval/scorer/base.rb +1 -0
- data/lib/phronomy/eval/scorer/exact_match.rb +2 -0
- data/lib/phronomy/eval/scorer/includes_scorer.rb +2 -0
- data/lib/phronomy/eval/scorer/llm_judge.rb +2 -0
- data/lib/phronomy/event_loop.rb +114 -7
- data/lib/phronomy/fsm_session.rb +8 -1
- data/lib/phronomy/generator_verifier.rb +2 -0
- data/lib/phronomy/guardrail/base.rb +3 -0
- data/lib/phronomy/knowledge_source/base.rb +6 -2
- data/lib/phronomy/knowledge_source/entity_knowledge.rb +7 -2
- data/lib/phronomy/knowledge_source/rag_knowledge.rb +8 -4
- data/lib/phronomy/knowledge_source/static_knowledge.rb +7 -2
- data/lib/phronomy/loader/base.rb +1 -0
- data/lib/phronomy/loader/csv_loader.rb +2 -0
- data/lib/phronomy/loader/markdown_loader.rb +2 -0
- data/lib/phronomy/loader/plain_text_loader.rb +1 -0
- data/lib/phronomy/output_parser/base.rb +1 -0
- data/lib/phronomy/output_parser/json_parser.rb +22 -3
- data/lib/phronomy/output_parser/structured_parser.rb +2 -0
- data/lib/phronomy/prompt_template.rb +5 -0
- data/lib/phronomy/runnable.rb +20 -3
- data/lib/phronomy/splitter/base.rb +2 -0
- data/lib/phronomy/splitter/fixed_size_splitter.rb +2 -0
- data/lib/phronomy/splitter/recursive_splitter.rb +2 -0
- data/lib/phronomy/state_store/base.rb +48 -0
- data/lib/phronomy/state_store/in_memory.rb +62 -0
- data/lib/phronomy/tool/agent_tool.rb +1 -0
- data/lib/phronomy/tool/base.rb +189 -27
- data/lib/phronomy/tool/mcp_tool.rb +68 -13
- data/lib/phronomy/tracing/base.rb +3 -0
- data/lib/phronomy/tracing/langfuse_tracer.rb +2 -0
- data/lib/phronomy/tracing/open_telemetry_tracer.rb +2 -0
- data/lib/phronomy/vector_store/base.rb +33 -7
- data/lib/phronomy/vector_store/in_memory.rb +16 -7
- data/lib/phronomy/vector_store/pgvector.rb +40 -9
- data/lib/phronomy/vector_store/redis_search.rb +29 -8
- data/lib/phronomy/version.rb +1 -1
- data/lib/phronomy/workflow.rb +96 -7
- data/lib/phronomy/workflow_context.rb +54 -4
- data/lib/phronomy/workflow_runner.rb +35 -7
- data/lib/phronomy.rb +70 -1
- data/scripts/api_snapshot.rb +91 -0
- data/scripts/check_api_annotations.rb +68 -0
- data/scripts/check_private_enforcement.rb +93 -0
- data/scripts/check_readme_runnable.rb +98 -0
- data/scripts/run_mutation.sh +46 -0
- metadata +45 -2
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Phronomy
|
|
4
|
+
# Provides cooperative cancellation for agent invocations.
|
|
5
|
+
#
|
|
6
|
+
# Pass a token to an agent via +config: { cancellation_token: token }+.
|
|
7
|
+
# The agent checks the token before each LLM call and raises
|
|
8
|
+
# {Phronomy::CancellationError} when the token is cancelled or the
|
|
9
|
+
# optional deadline has passed.
|
|
10
|
+
#
|
|
11
|
+
# A token may be shared across multiple agent invocations and across threads;
|
|
12
|
+
# the mutable cancelled flag is guarded by a Mutex, and deadlines are fixed at construction.
|
|
13
|
+
#
|
|
14
|
+
# @example Explicit cancel from another thread
|
|
15
|
+
# token = Phronomy::CancellationToken.new
|
|
16
|
+
# Thread.new { sleep 5; token.cancel! }
|
|
17
|
+
# result = agent.invoke("...", config: { cancellation_token: token })
|
|
18
|
+
#
|
|
19
|
+
# @example Hard deadline via monotonic clock (recommended)
|
|
20
|
+
# token = Phronomy::CancellationToken.timeout_after(30)
|
|
21
|
+
# result = agent.invoke("...", config: { cancellation_token: token })
|
|
22
|
+
#
|
|
23
|
+
# @example Hard deadline via wall-clock (legacy)
|
|
24
|
+
# token = Phronomy::CancellationToken.new(deadline: Time.now + 30)
|
|
25
|
+
# result = agent.invoke("...", config: { cancellation_token: token })
|
|
26
|
+
#
|
|
27
|
+
# @example Propagate to parallel workers
|
|
28
|
+
# token = Phronomy::CancellationToken.new
|
|
29
|
+
# orchestrator.dispatch_parallel(task1, task2, cancellation_token: token)
|
|
30
|
+
class CancellationToken
|
|
31
|
+
# Returns a new token that will expire after +seconds+ seconds, measured
|
|
32
|
+
# with the monotonic clock (+Process::CLOCK_MONOTONIC+). Unlike constructing
|
|
33
|
+
# a token with +deadline: Time.now + seconds+, this factory is immune to NTP
|
|
34
|
+
# adjustments and other system (wall-clock) time changes.
|
|
35
|
+
#
|
|
36
|
+
# @param seconds [Numeric] duration in seconds until the token expires.
|
|
37
|
+
# @return [CancellationToken]
|
|
38
|
+
# @api public
|
|
39
|
+
def self.timeout_after(seconds)
|
|
40
|
+
monotonic_deadline = Process.clock_gettime(Process::CLOCK_MONOTONIC) + seconds
|
|
41
|
+
new(monotonic_deadline: monotonic_deadline)
|
|
42
|
+
end
|
|
43
|
+
|
|
44
|
+
# @param deadline [Time, nil] optional wall-clock deadline; the token reports
|
|
45
|
+
# +cancelled?+ as +true+ once +Time.now >= deadline+. Prefer
|
|
46
|
+
# {.timeout_after} for duration-based cancellation.
|
|
47
|
+
# @param monotonic_deadline [Float, nil] internal monotonic timestamp set by
|
|
48
|
+
# {.timeout_after}; prefer that factory method over passing this directly.
|
|
49
|
+
# @api public
|
|
50
|
+
def initialize(deadline: nil, monotonic_deadline: nil)
|
|
51
|
+
@cancelled = false
|
|
52
|
+
@deadline = deadline
|
|
53
|
+
@monotonic_deadline = monotonic_deadline
|
|
54
|
+
@mutex = Mutex.new
|
|
55
|
+
end
|
|
56
|
+
|
|
57
|
+
# @return [Time, nil] the wall-clock deadline passed to {#initialize}, or +nil+.
|
|
58
|
+
attr_reader :deadline
|
|
59
|
+
|
|
60
|
+
# Mark the token as cancelled. Thread-safe; may be called from any thread.
|
|
61
|
+
# @return [self]
|
|
62
|
+
# @api public
|
|
63
|
+
def cancel!
|
|
64
|
+
@mutex.synchronize { @cancelled = true }
|
|
65
|
+
self
|
|
66
|
+
end
|
|
67
|
+
|
|
68
|
+
# Returns +true+ when the token has been explicitly cancelled via {#cancel!},
|
|
69
|
+
# when the wall-clock deadline has passed, or when the monotonic deadline
|
|
70
|
+
# (set by {.timeout_after}) has elapsed. Thread-safe.
|
|
71
|
+
# @return [Boolean]
|
|
72
|
+
# @api public
|
|
73
|
+
def cancelled?
|
|
74
|
+
return true if @mutex.synchronize { @cancelled }
|
|
75
|
+
return true if !@deadline.nil? && Time.now >= @deadline
|
|
76
|
+
!@monotonic_deadline.nil? &&
|
|
77
|
+
Process.clock_gettime(Process::CLOCK_MONOTONIC) >= @monotonic_deadline
|
|
78
|
+
end
|
|
79
|
+
|
|
80
|
+
# Raises {Phronomy::CancellationError} if the token is cancelled.
|
|
81
|
+
# A convenience method for cooperative cancellation checks inside tools,
|
|
82
|
+
# RAG loaders, and hooks, replacing the +if cancelled? then raise+ pattern.
|
|
83
|
+
#
|
|
84
|
+
# @param message [String] optional error message
|
|
85
|
+
# @return [nil] when the token is not cancelled
|
|
86
|
+
# @raise [Phronomy::CancellationError] when the token is cancelled
|
|
87
|
+
# @api public
|
|
88
|
+
def raise_if_cancelled!(message = "invocation cancelled")
|
|
89
|
+
raise Phronomy::CancellationError, message if cancelled?
|
|
90
|
+
end
|
|
91
|
+
end
|
|
92
|
+
end
|
|
@@ -33,16 +33,40 @@ module Phronomy
|
|
|
33
33
|
# @see Phronomy::EventLoop
|
|
34
34
|
attr_accessor :event_loop
|
|
35
35
|
|
|
36
|
-
# When true
|
|
36
|
+
# When true, user input and LLM output are recorded in trace spans.
|
|
37
|
+
# Defaults to false; set to true only in environments where PII capture is acceptable.
|
|
37
38
|
# Keep it false in privacy-sensitive environments to prevent PII from reaching
|
|
38
39
|
# the tracing backend (OTel, Langfuse, etc.).
|
|
39
40
|
attr_accessor :trace_pii
|
|
40
41
|
|
|
42
|
+
# Optional logger for framework diagnostic messages (e.g. unreachable-state warnings).
|
|
43
|
+
# Must respond to +#warn(message)+. When nil (default), messages are written to +$stderr+
|
|
44
|
+
# via +Kernel#warn+.
|
|
45
|
+
# @example
|
|
46
|
+
# Phronomy.configure { |c| c.logger = Rails.logger }
|
|
47
|
+
attr_accessor :logger
|
|
48
|
+
|
|
49
|
+
# Grace period (in seconds) before the EventLoop background thread is force-killed
|
|
50
|
+
# after a cooperative stop request. Applies both to the overall thread join
|
|
51
|
+
# and to the drain-and-cancel phase when +stop(drain: true)+ is used.
|
|
52
|
+
# Default: 5 seconds.
|
|
53
|
+
# @see Phronomy::EventLoop#stop
|
|
54
|
+
attr_accessor :event_loop_stop_grace_seconds
|
|
55
|
+
|
|
56
|
+
# Global state store for workflow persistence.
|
|
57
|
+
# When set, WorkflowRunner routes all state reads and writes through this store.
|
|
58
|
+
# Must be an instance of a class that inherits from Phronomy::StateStore::Base.
|
|
59
|
+
# Defaults to +nil+ (no persistence — state lives only for the duration of invoke).
|
|
60
|
+
# @example
|
|
61
|
+
# Phronomy.configure { |c| c.state_store = Phronomy::StateStore::InMemory.new }
|
|
62
|
+
attr_accessor :state_store
|
|
63
|
+
|
|
41
64
|
def initialize
|
|
42
65
|
@recursion_limit = 25
|
|
43
66
|
@tracer = Phronomy::Tracing::NullTracer.new
|
|
44
|
-
@trace_pii =
|
|
67
|
+
@trace_pii = false
|
|
45
68
|
@event_loop = false
|
|
69
|
+
@event_loop_stop_grace_seconds = 5
|
|
46
70
|
end
|
|
47
71
|
end
|
|
48
72
|
end
|
|
@@ -35,12 +35,14 @@ module Phronomy
|
|
|
35
35
|
# @param type [Symbol, String]
|
|
36
36
|
# @param trusted [Boolean]
|
|
37
37
|
# @return [String]
|
|
38
|
+
# @api private
|
|
38
39
|
def self.xml_tag(text, type:, trusted: false)
|
|
39
40
|
"<context type=\"#{CGI.escapeHTML(type.to_s)}\" trusted=\"#{trusted}\">\n#{CGI.escapeHTML(text.to_s)}\n</context>"
|
|
40
41
|
end
|
|
41
42
|
|
|
42
43
|
# @param budget [Phronomy::Context::TokenBudget, nil]
|
|
43
44
|
# when nil no token trimming is performed
|
|
45
|
+
# @api private
|
|
44
46
|
def initialize(budget: nil)
|
|
45
47
|
@budget = budget
|
|
46
48
|
@instruction = nil
|
|
@@ -53,6 +55,7 @@ module Phronomy
|
|
|
53
55
|
#
|
|
54
56
|
# @param text [String]
|
|
55
57
|
# @return [self]
|
|
58
|
+
# @api private
|
|
56
59
|
def add_instruction(text)
|
|
57
60
|
@instruction = text.to_s
|
|
58
61
|
self
|
|
@@ -67,6 +70,7 @@ module Phronomy
|
|
|
67
70
|
# @param source [String, nil] optional source label (e.g. filename); included in the
|
|
68
71
|
# XML tag so the LLM can produce grounded citations. Omitted when nil.
|
|
69
72
|
# @return [self]
|
|
73
|
+
# @api private
|
|
70
74
|
def add_knowledge(text, type:, trusted: false, source: nil)
|
|
71
75
|
@knowledge_chunks << {text: text.to_s, type: type.to_s, trusted: trusted, source: source}
|
|
72
76
|
self
|
|
@@ -76,6 +80,7 @@ module Phronomy
|
|
|
76
80
|
#
|
|
77
81
|
# @param messages [Array] message-like objects with #role and #content
|
|
78
82
|
# @return [self]
|
|
83
|
+
# @api private
|
|
79
84
|
def add_messages(messages)
|
|
80
85
|
@messages = Array(messages)
|
|
81
86
|
self
|
|
@@ -86,6 +91,7 @@ module Phronomy
|
|
|
86
91
|
# @return [Hash{Symbol => Object}]
|
|
87
92
|
# :system [String, nil] combined system prompt (instruction + knowledge XML tags)
|
|
88
93
|
# :messages [Array] conversation messages, trimmed to budget if set
|
|
94
|
+
# @api private
|
|
89
95
|
def build
|
|
90
96
|
knowledge_text = @knowledge_chunks.map { |c| xml_context_tag(c) }.join("\n\n")
|
|
91
97
|
system_parts = [@instruction, knowledge_text.empty? ? nil : knowledge_text].compact
|
|
@@ -45,6 +45,7 @@ module Phronomy
|
|
|
45
45
|
# @param thread_id [String, nil] used when saving compaction records
|
|
46
46
|
# @param memory [Object, nil] memory object; must respond to #save_compaction
|
|
47
47
|
# for compaction records to be persisted
|
|
48
|
+
# @api private
|
|
48
49
|
def initialize(message_elements:, budget:, thread_id: nil, memory: nil)
|
|
49
50
|
@message_elements = message_elements.dup
|
|
50
51
|
@budget = budget
|
|
@@ -67,6 +68,7 @@ module Phronomy
|
|
|
67
68
|
# @yieldparam elements [Array<Hash>] the selected message elements
|
|
68
69
|
# @yieldreturn [String] summary text to replace the selected messages
|
|
69
70
|
# @return [Array] the updated result_messages array
|
|
71
|
+
# @api private
|
|
70
72
|
def compact(range)
|
|
71
73
|
# Normalise: Integer index → single-element Array; Range → Array slice.
|
|
72
74
|
raw = @message_elements[range]
|
|
@@ -25,6 +25,7 @@ module Phronomy
|
|
|
25
25
|
#
|
|
26
26
|
# @param fingerprint [String] SHA-256 hex digest to compare
|
|
27
27
|
# @return [Boolean]
|
|
28
|
+
# @api private
|
|
28
29
|
def valid?(fingerprint)
|
|
29
30
|
!@fingerprint.nil? && !@system_text.nil? && @fingerprint == fingerprint
|
|
30
31
|
end
|
|
@@ -33,6 +34,7 @@ module Phronomy
|
|
|
33
34
|
#
|
|
34
35
|
# @param fingerprint [String] new SHA-256 hex digest
|
|
35
36
|
# @param system_text [String] fully assembled system prompt text
|
|
37
|
+
# @api private
|
|
36
38
|
def update(fingerprint:, system_text:)
|
|
37
39
|
@fingerprint = fingerprint
|
|
38
40
|
@system_text = system_text.to_s
|
|
@@ -45,6 +45,7 @@ module Phronomy
|
|
|
45
45
|
# @param max_output_tokens [Integer, nil] explicit output reservation; when nil
|
|
46
46
|
# and model is given, uses max_output_tokens
|
|
47
47
|
# @param overhead [Integer] tokens reserved for instructions/tools
|
|
48
|
+
# @api private
|
|
48
49
|
def initialize(model: nil, context_window: nil, max_output_tokens: nil, overhead: 0)
|
|
49
50
|
@overhead = overhead.to_i
|
|
50
51
|
|
|
@@ -65,6 +66,7 @@ module Phronomy
|
|
|
65
66
|
# Always >= 0.
|
|
66
67
|
#
|
|
67
68
|
# @return [Integer]
|
|
69
|
+
# @api private
|
|
68
70
|
def effective_input_limit
|
|
69
71
|
[@context_window - @max_output_tokens - @overhead, 0].max
|
|
70
72
|
end
|
|
@@ -73,6 +75,7 @@ module Phronomy
|
|
|
73
75
|
#
|
|
74
76
|
# @param used [Integer] tokens already committed (e.g. from knowledge injection)
|
|
75
77
|
# @return [Integer] remaining tokens (always >= 0)
|
|
78
|
+
# @api private
|
|
76
79
|
def available(used: 0)
|
|
77
80
|
[effective_input_limit - used.to_i, 0].max
|
|
78
81
|
end
|
|
@@ -9,8 +9,12 @@ module Phronomy
|
|
|
9
9
|
# any other class.
|
|
10
10
|
#
|
|
11
11
|
# Default approximation: ceil(char_count / 4).
|
|
12
|
-
#
|
|
13
|
-
#
|
|
12
|
+
# This heuristic is calibrated for ASCII/Latin text (~4 chars/token).
|
|
13
|
+
# For CJK languages (Chinese, Japanese, Korean) the actual token count is
|
|
14
|
+
# approximately 4× higher than the estimate because CJK characters are
|
|
15
|
+
# typically 1 token each in GPT-4/Claude tokenizers (~1 char/token vs the
|
|
16
|
+
# 4 char/token assumed here). Use a tokenizer-backed callable via
|
|
17
|
+
# +.tokenizer=+ for accurate CJK token counting.
|
|
14
18
|
#
|
|
15
19
|
# Replace the built-in heuristic with any callable via .tokenizer=:
|
|
16
20
|
#
|
|
@@ -33,11 +37,13 @@ module Phronomy
|
|
|
33
37
|
# In tests, call +TokenEstimator.reset_tokenizer!+ after each test to
|
|
34
38
|
# prevent cross-test contamination.
|
|
35
39
|
# @param callable [#call, nil]
|
|
40
|
+
# @api private
|
|
36
41
|
def tokenizer=(callable)
|
|
37
42
|
@tokenizer_mutex.synchronize { @tokenizer = callable }
|
|
38
43
|
end
|
|
39
44
|
|
|
40
45
|
# @return [#call, nil]
|
|
46
|
+
# @api private
|
|
41
47
|
def tokenizer
|
|
42
48
|
@tokenizer_mutex.synchronize { @tokenizer }
|
|
43
49
|
end
|
|
@@ -52,6 +58,7 @@ module Phronomy
|
|
|
52
58
|
# @param input [String, Array, #content] a string, a message-like object,
|
|
53
59
|
# or an Array of message-like objects (each must respond to #content).
|
|
54
60
|
# @return [Integer] estimated token count (>= 0)
|
|
61
|
+
# @api private
|
|
55
62
|
def estimate(input)
|
|
56
63
|
tok = @tokenizer_mutex.synchronize { @tokenizer }
|
|
57
64
|
case input
|
|
@@ -28,6 +28,7 @@ module Phronomy
|
|
|
28
28
|
# @param message_elements [Array<Hash>]
|
|
29
29
|
# each element: { seq: Integer, message: Object, tokens: Integer, role: Symbol }
|
|
30
30
|
# @param budget [Phronomy::Context::TokenBudget, nil]
|
|
31
|
+
# @api private
|
|
31
32
|
def initialize(message_elements:, budget:)
|
|
32
33
|
@message_elements = message_elements.dup
|
|
33
34
|
@budget = budget
|
|
@@ -38,6 +39,7 @@ module Phronomy
|
|
|
38
39
|
# Each element is a Hash with +:seq+, +:message+, +:tokens+, and +:role+.
|
|
39
40
|
#
|
|
40
41
|
# @return [Array<Hash>]
|
|
42
|
+
# @api private
|
|
41
43
|
def message_elements
|
|
42
44
|
@message_elements.dup
|
|
43
45
|
end
|
|
@@ -47,6 +49,7 @@ module Phronomy
|
|
|
47
49
|
#
|
|
48
50
|
# @param seqs [Integer, Array<Integer>] seq number(s) to remove
|
|
49
51
|
# @return [self]
|
|
52
|
+
# @api private
|
|
50
53
|
def remove(seqs)
|
|
51
54
|
seqs_set = Array(seqs).to_set
|
|
52
55
|
@message_elements.reject! { |e| seqs_set.include?(e[:seq]) }
|
|
@@ -57,6 +60,7 @@ module Phronomy
|
|
|
57
60
|
# Convenience: returns the plain message objects (without element metadata).
|
|
58
61
|
#
|
|
59
62
|
# @return [Array]
|
|
63
|
+
# @api private
|
|
60
64
|
def messages
|
|
61
65
|
@message_elements.map { |e| e[:message] }
|
|
62
66
|
end
|
|
@@ -9,9 +9,12 @@ module Phronomy
|
|
|
9
9
|
class Base
|
|
10
10
|
# Embed the given text and return a vector representation.
|
|
11
11
|
#
|
|
12
|
-
# @param text
|
|
12
|
+
# @param text [String] the text to embed
|
|
13
|
+
# @param cancellation_token [Phronomy::CancellationToken, nil] optional; raises CancellationError when cancelled
|
|
13
14
|
# @return [Array<Float>] the embedding vector
|
|
14
|
-
|
|
15
|
+
# @api public
|
|
16
|
+
def embed(text, cancellation_token = nil)
|
|
17
|
+
cancellation_token&.raise_if_cancelled!
|
|
15
18
|
raise NotImplementedError, "#{self.class}#embed is not implemented"
|
|
16
19
|
end
|
|
17
20
|
end
|
|
@@ -19,6 +19,7 @@ module Phronomy
|
|
|
19
19
|
# @param provider [Symbol, nil] provider override (e.g. :openai); nil uses the RubyLLM default
|
|
20
20
|
# @param assume_model_exists [Boolean] when true, skips RubyLLM model-registry validation
|
|
21
21
|
# (useful for locally hosted models not in the registry)
|
|
22
|
+
# @api public
|
|
22
23
|
def initialize(model: nil, provider: nil, assume_model_exists: false)
|
|
23
24
|
@model = model
|
|
24
25
|
@provider = provider
|
|
@@ -27,9 +28,12 @@ module Phronomy
|
|
|
27
28
|
|
|
28
29
|
# Embed text via RubyLLM.
|
|
29
30
|
#
|
|
30
|
-
# @param text
|
|
31
|
+
# @param text [String]
|
|
32
|
+
# @param cancellation_token [Phronomy::CancellationToken, nil] optional; raises CancellationError when cancelled
|
|
31
33
|
# @return [Array<Float>]
|
|
32
|
-
|
|
34
|
+
# @api public
|
|
35
|
+
def embed(text, cancellation_token = nil)
|
|
36
|
+
cancellation_token&.raise_if_cancelled!
|
|
33
37
|
opts = {}
|
|
34
38
|
opts[:model] = @model if @model
|
|
35
39
|
opts[:provider] = @provider if @provider
|
|
@@ -19,6 +19,7 @@ module Phronomy
|
|
|
19
19
|
ComparisonPair = Data.define(:eval_case, :result_a, :result_b)
|
|
20
20
|
|
|
21
21
|
# @param scorer [Scorer::Base]
|
|
22
|
+
# @api public
|
|
22
23
|
def initialize(scorer: Scorer::ExactMatch.new)
|
|
23
24
|
@scorer = scorer
|
|
24
25
|
end
|
|
@@ -29,6 +30,7 @@ module Phronomy
|
|
|
29
30
|
# @param callable_a [#call]
|
|
30
31
|
# @param callable_b [#call]
|
|
31
32
|
# @return [Array<ComparisonPair>]
|
|
33
|
+
# @api public
|
|
32
34
|
def compare(dataset, callable_a, callable_b)
|
|
33
35
|
runner_a = Runner.new(scorer: @scorer)
|
|
34
36
|
runner_b = Runner.new(scorer: @scorer)
|
|
@@ -13,6 +13,7 @@ module Phronomy
|
|
|
13
13
|
include Enumerable
|
|
14
14
|
|
|
15
15
|
# @param cases [Array<EvalCase>]
|
|
16
|
+
# @api public
|
|
16
17
|
def initialize(cases = [])
|
|
17
18
|
@cases = cases.freeze
|
|
18
19
|
end
|
|
@@ -23,16 +24,19 @@ module Phronomy
|
|
|
23
24
|
#
|
|
24
25
|
# @param pairs [Array<Hash>]
|
|
25
26
|
# @return [Dataset]
|
|
27
|
+
# @api public
|
|
26
28
|
def self.from_array(pairs)
|
|
27
29
|
new(pairs.map { |h| EvalCase.new(**h) })
|
|
28
30
|
end
|
|
29
31
|
|
|
30
32
|
# @yield [EvalCase]
|
|
33
|
+
# @api public
|
|
31
34
|
def each(&block)
|
|
32
35
|
@cases.each(&block)
|
|
33
36
|
end
|
|
34
37
|
|
|
35
38
|
# @return [Integer]
|
|
39
|
+
# @api public
|
|
36
40
|
def size
|
|
37
41
|
@cases.size
|
|
38
42
|
end
|
|
@@ -11,12 +11,14 @@ module Phronomy
|
|
|
11
11
|
# puts metrics.to_h
|
|
12
12
|
class Metrics
|
|
13
13
|
# @param results [Array<EvalResult>]
|
|
14
|
+
# @api public
|
|
14
15
|
def initialize(results)
|
|
15
16
|
@results = results
|
|
16
17
|
end
|
|
17
18
|
|
|
18
19
|
# Fraction of results that passed (score == 1.0).
|
|
19
20
|
# @return [Float] in [0.0, 1.0]
|
|
21
|
+
# @api public
|
|
20
22
|
def pass_rate
|
|
21
23
|
return 0.0 if @results.empty?
|
|
22
24
|
@results.count(&:pass?).to_f / @results.size
|
|
@@ -24,6 +26,7 @@ module Phronomy
|
|
|
24
26
|
|
|
25
27
|
# Arithmetic mean of all scores.
|
|
26
28
|
# @return [Float]
|
|
29
|
+
# @api public
|
|
27
30
|
def average_score
|
|
28
31
|
return 0.0 if @results.empty?
|
|
29
32
|
@results.sum(&:score) / @results.size
|
|
@@ -32,12 +35,14 @@ module Phronomy
|
|
|
32
35
|
# Sum of all TokenUsage objects present in the results.
|
|
33
36
|
# Results without usage are skipped.
|
|
34
37
|
# @return [Phronomy::TokenUsage]
|
|
38
|
+
# @api public
|
|
35
39
|
def total_usage
|
|
36
40
|
@results.map(&:usage).compact.reduce(TokenUsage.zero, :+)
|
|
37
41
|
end
|
|
38
42
|
|
|
39
43
|
# Arithmetic mean of latency_ms across all results.
|
|
40
44
|
# @return [Float]
|
|
45
|
+
# @api public
|
|
41
46
|
def average_latency_ms
|
|
42
47
|
return 0.0 if @results.empty?
|
|
43
48
|
@results.sum(&:latency_ms).to_f / @results.size
|
|
@@ -45,6 +50,7 @@ module Phronomy
|
|
|
45
50
|
|
|
46
51
|
# Returns a plain Hash summary suitable for logging or serialisation.
|
|
47
52
|
# @return [Hash]
|
|
53
|
+
# @api public
|
|
48
54
|
def to_h
|
|
49
55
|
{
|
|
50
56
|
total: @results.size,
|
data/lib/phronomy/eval/runner.rb
CHANGED
|
@@ -18,6 +18,7 @@ module Phronomy
|
|
|
18
18
|
# results = runner.run(dataset, ->(input) { agent.invoke(input) })
|
|
19
19
|
class Runner
|
|
20
20
|
# @param scorer [Scorer::Base] scorer used to evaluate each result
|
|
21
|
+
# @api public
|
|
21
22
|
def initialize(scorer: Scorer::ExactMatch.new)
|
|
22
23
|
@scorer = scorer
|
|
23
24
|
end
|
|
@@ -26,6 +27,7 @@ module Phronomy
|
|
|
26
27
|
# @param callable [#call] accepts a single String argument
|
|
27
28
|
# @param concurrency [Integer] number of parallel threads (default: 1, sequential)
|
|
28
29
|
# @return [Array<EvalResult>]
|
|
30
|
+
# @api public
|
|
29
31
|
def run(dataset, callable, concurrency: 1)
|
|
30
32
|
cases = dataset.to_a
|
|
31
33
|
return cases.map { |eval_case| run_one(eval_case, callable) } if concurrency <= 1
|
|
@@ -12,6 +12,7 @@ module Phronomy
|
|
|
12
12
|
# @param expected [String] the ground-truth value from the EvalCase
|
|
13
13
|
# @param input [String, nil] the original input (used by LLM scorers)
|
|
14
14
|
# @return [Float] a value in [0.0, 1.0]
|
|
15
|
+
# @api public
|
|
15
16
|
def score(actual:, expected:, input: nil)
|
|
16
17
|
raise NotImplementedError, "#{self.class}#score is not implemented"
|
|
17
18
|
end
|
|
@@ -12,11 +12,13 @@ module Phronomy
|
|
|
12
12
|
# ExactMatch.new.score(actual: "paris", expected: "Paris") # => 0.0
|
|
13
13
|
class ExactMatch < Base
|
|
14
14
|
# @param case_sensitive [Boolean] default true
|
|
15
|
+
# @api public
|
|
15
16
|
def initialize(case_sensitive: true)
|
|
16
17
|
@case_sensitive = case_sensitive
|
|
17
18
|
end
|
|
18
19
|
|
|
19
20
|
# @return [Float] 1.0 on match, 0.0 otherwise
|
|
21
|
+
# @api public
|
|
20
22
|
def score(actual:, expected:, input: nil)
|
|
21
23
|
a = actual.to_s.strip
|
|
22
24
|
e = expected.to_s.strip
|
|
@@ -13,11 +13,13 @@ module Phronomy
|
|
|
13
13
|
# IncludesScorer.new.score(actual: "The answer is 42.", expected: "42") # => 1.0
|
|
14
14
|
class IncludesScorer < Base
|
|
15
15
|
# @param case_sensitive [Boolean] default false
|
|
16
|
+
# @api public
|
|
16
17
|
def initialize(case_sensitive: false)
|
|
17
18
|
@case_sensitive = case_sensitive
|
|
18
19
|
end
|
|
19
20
|
|
|
20
21
|
# @return [Float] 1.0 if actual contains expected, 0.0 otherwise
|
|
22
|
+
# @api public
|
|
21
23
|
def score(actual:, expected:, input: nil)
|
|
22
24
|
a = actual.to_s
|
|
23
25
|
e = expected.to_s
|
|
@@ -36,6 +36,7 @@ module Phronomy
|
|
|
36
36
|
# @param prompt_template [String] format string with %<input>s, %<expected>s, %<actual>s
|
|
37
37
|
# @param raise_on_error [Boolean] when true, re-raises scoring exceptions instead of
|
|
38
38
|
# returning 0.0. Use this in batch eval pipelines where silent failures are unacceptable.
|
|
39
|
+
# @api public
|
|
39
40
|
def initialize(model:, prompt_template: DEFAULT_PROMPT, raise_on_error: false)
|
|
40
41
|
@model = model
|
|
41
42
|
@prompt_template = prompt_template
|
|
@@ -43,6 +44,7 @@ module Phronomy
|
|
|
43
44
|
end
|
|
44
45
|
|
|
45
46
|
# @return [Float] score in [0.0, 1.0]; 0.0 on error when raise_on_error is false
|
|
47
|
+
# @api public
|
|
46
48
|
def score(actual:, expected:, input: nil)
|
|
47
49
|
prompt = format(@prompt_template, input: input.to_s, expected: expected.to_s, actual: actual.to_s)
|
|
48
50
|
response = RubyLLM.chat(model: @model).ask(prompt)
|