phronomy 0.6.0 → 0.7.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.mutant.yml +22 -0
- data/CHANGELOG.md +488 -0
- data/CONTRIBUTING.md +102 -0
- data/README.md +374 -36
- data/RELEASE_CHECKLIST.md +86 -0
- data/Rakefile +33 -0
- data/SECURITY.md +80 -0
- data/benchmark/baseline.json +9 -0
- data/benchmark/bench_agent_invoke.rb +105 -0
- data/benchmark/bench_context_assembler.rb +46 -0
- data/benchmark/bench_regression.rb +172 -0
- data/benchmark/bench_token_estimator.rb +44 -0
- data/benchmark/bench_tool_schema.rb +69 -0
- data/benchmark/bench_vector_store.rb +39 -0
- data/benchmark/bench_workflow.rb +55 -0
- data/benchmark/run_all.rb +118 -0
- data/docs/decisions/001-rubyllm-as-provider-layer.md +42 -0
- data/docs/decisions/002-workflow-context-immutability.md +42 -0
- data/docs/decisions/003-event-loop-singleton.md +48 -0
- data/docs/decisions/004-invoke-timeout-is-not-cancellation.md +75 -0
- data/docs/decisions/005-static-knowledge-class-level-cache.md +45 -0
- data/docs/decisions/006-no-built-in-guardrails.md +66 -0
- data/docs/decisions/007-mcp-is-beta-stability.md +51 -0
- data/docs/decisions/008-orchestrator-uses-os-threads.md +52 -0
- data/docs/decisions/009-state-store-abstraction.md +141 -0
- data/docs/decisions/010-cooperative-first-concurrency.md +248 -0
- data/lib/phronomy/agent/base.rb +416 -49
- data/lib/phronomy/agent/before_completion_context.rb +1 -0
- data/lib/phronomy/agent/checkpoint.rb +1 -0
- data/lib/phronomy/agent/concerns/before_completion.rb +6 -0
- data/lib/phronomy/agent/concerns/error_translation.rb +45 -0
- data/lib/phronomy/agent/concerns/guardrailable.rb +3 -0
- data/lib/phronomy/agent/concerns/retryable.rb +12 -1
- data/lib/phronomy/agent/concerns/suspendable.rb +19 -0
- data/lib/phronomy/agent/fsm.rb +44 -52
- data/lib/phronomy/agent/handoff.rb +3 -0
- data/lib/phronomy/agent/orchestrator.rb +191 -54
- data/lib/phronomy/agent/parallel_tool_chat.rb +87 -13
- data/lib/phronomy/agent/react_agent.rb +16 -6
- data/lib/phronomy/agent/runner.rb +2 -0
- data/lib/phronomy/agent/shared_state.rb +11 -0
- data/lib/phronomy/agent/suspend_signal.rb +2 -0
- data/lib/phronomy/agent/team_coordinator.rb +17 -5
- data/lib/phronomy/async_queue.rb +155 -0
- data/lib/phronomy/blocking_adapter_pool.rb +435 -0
- data/lib/phronomy/cancellation_scope.rb +123 -0
- data/lib/phronomy/cancellation_token.rb +133 -0
- data/lib/phronomy/concurrency_gate.rb +155 -0
- data/lib/phronomy/configuration.rb +168 -2
- data/lib/phronomy/context/assembler.rb +6 -0
- data/lib/phronomy/context/compaction_context.rb +2 -0
- data/lib/phronomy/context/context_version_cache.rb +2 -0
- data/lib/phronomy/context/token_budget.rb +3 -0
- data/lib/phronomy/context/token_estimator.rb +9 -2
- data/lib/phronomy/context/trigger_context.rb +1 -0
- data/lib/phronomy/context/trim_context.rb +4 -0
- data/lib/phronomy/deadline.rb +63 -0
- data/lib/phronomy/diagnostics.rb +62 -0
- data/lib/phronomy/embeddings/base.rb +22 -2
- data/lib/phronomy/embeddings/ruby_llm_embeddings.rb +6 -2
- data/lib/phronomy/eval/comparison.rb +2 -0
- data/lib/phronomy/eval/dataset.rb +4 -0
- data/lib/phronomy/eval/metrics.rb +6 -0
- data/lib/phronomy/eval/runner.rb +11 -9
- data/lib/phronomy/eval/scorer/base.rb +1 -0
- data/lib/phronomy/eval/scorer/exact_match.rb +2 -0
- data/lib/phronomy/eval/scorer/includes_scorer.rb +2 -0
- data/lib/phronomy/eval/scorer/llm_judge.rb +2 -0
- data/lib/phronomy/event_loop.rb +275 -30
- data/lib/phronomy/fsm_session.rb +57 -4
- data/lib/phronomy/generator_verifier.rb +2 -0
- data/lib/phronomy/guardrail/base.rb +3 -0
- data/lib/phronomy/guardrail/prompt_injection_guardrail.rb +58 -0
- data/lib/phronomy/invocation_context.rb +152 -0
- data/lib/phronomy/knowledge_source/base.rb +24 -2
- data/lib/phronomy/knowledge_source/entity_knowledge.rb +7 -2
- data/lib/phronomy/knowledge_source/rag_knowledge.rb +8 -4
- data/lib/phronomy/knowledge_source/static_knowledge.rb +7 -2
- data/lib/phronomy/llm_adapter/base.rb +104 -0
- data/lib/phronomy/llm_adapter/ruby_llm.rb +41 -0
- data/lib/phronomy/llm_adapter.rb +20 -0
- data/lib/phronomy/loader/base.rb +1 -0
- data/lib/phronomy/loader/csv_loader.rb +2 -0
- data/lib/phronomy/loader/markdown_loader.rb +2 -0
- data/lib/phronomy/loader/plain_text_loader.rb +1 -0
- data/lib/phronomy/metrics.rb +38 -0
- data/lib/phronomy/output_parser/base.rb +1 -0
- data/lib/phronomy/output_parser/json_parser.rb +22 -3
- data/lib/phronomy/output_parser/structured_parser.rb +2 -0
- data/lib/phronomy/prompt_template.rb +5 -0
- data/lib/phronomy/runnable.rb +20 -3
- data/lib/phronomy/runtime/deterministic_scheduler.rb +412 -0
- data/lib/phronomy/runtime/fake_scheduler.rb +165 -0
- data/lib/phronomy/runtime/gate_registry.rb +52 -0
- data/lib/phronomy/runtime/pool_registry.rb +57 -0
- data/lib/phronomy/runtime/runtime_metrics.rb +117 -0
- data/lib/phronomy/runtime/scheduler.rb +98 -0
- data/lib/phronomy/runtime/scheduler_timer_adapter.rb +79 -0
- data/lib/phronomy/runtime/task_registry.rb +48 -0
- data/lib/phronomy/runtime/thread_scheduler.rb +30 -0
- data/lib/phronomy/runtime/timer_queue.rb +106 -0
- data/lib/phronomy/runtime/timer_service.rb +42 -0
- data/lib/phronomy/runtime.rb +374 -0
- data/lib/phronomy/splitter/base.rb +2 -0
- data/lib/phronomy/splitter/fixed_size_splitter.rb +2 -0
- data/lib/phronomy/splitter/recursive_splitter.rb +2 -0
- data/lib/phronomy/state_store/base.rb +48 -0
- data/lib/phronomy/state_store/in_memory.rb +62 -0
- data/lib/phronomy/task/backend.rb +80 -0
- data/lib/phronomy/task/fiber_backend.rb +157 -0
- data/lib/phronomy/task/immediate_backend.rb +89 -0
- data/lib/phronomy/task/thread_backend.rb +84 -0
- data/lib/phronomy/task.rb +275 -0
- data/lib/phronomy/task_group.rb +265 -0
- data/lib/phronomy/testing/fake_clock.rb +109 -0
- data/lib/phronomy/testing/fake_scheduler.rb +104 -0
- data/lib/phronomy/testing/scheduler_helpers.rb +59 -0
- data/lib/phronomy/testing.rb +12 -0
- data/lib/phronomy/tool/agent_tool.rb +1 -0
- data/lib/phronomy/tool/base.rb +298 -28
- data/lib/phronomy/tool/mcp_tool.rb +103 -17
- data/lib/phronomy/tool/scope_policy.rb +50 -0
- data/lib/phronomy/tool_executor.rb +106 -0
- data/lib/phronomy/tracing/base.rb +3 -0
- data/lib/phronomy/tracing/langfuse_tracer.rb +2 -0
- data/lib/phronomy/tracing/open_telemetry_tracer.rb +36 -0
- data/lib/phronomy/vector_store/async_backend.rb +110 -0
- data/lib/phronomy/vector_store/base.rb +40 -7
- data/lib/phronomy/vector_store/in_memory.rb +16 -7
- data/lib/phronomy/vector_store/pgvector.rb +40 -9
- data/lib/phronomy/vector_store/redis_search.rb +29 -8
- data/lib/phronomy/version.rb +1 -1
- data/lib/phronomy/workflow.rb +147 -11
- data/lib/phronomy/workflow_context.rb +83 -6
- data/lib/phronomy/workflow_runner.rb +106 -7
- data/lib/phronomy.rb +112 -1
- data/scripts/api_snapshot.rb +91 -0
- data/scripts/check_api_annotations.rb +68 -0
- data/scripts/check_private_enforcement.rb +93 -0
- data/scripts/check_readme_runnable.rb +98 -0
- data/scripts/run_mutation.sh +46 -0
- metadata +83 -2
|
@@ -45,6 +45,7 @@ module Phronomy
|
|
|
45
45
|
# @param max_output_tokens [Integer, nil] explicit output reservation; when nil
|
|
46
46
|
# and model is given, uses max_output_tokens
|
|
47
47
|
# @param overhead [Integer] tokens reserved for instructions/tools
|
|
48
|
+
# @api private
|
|
48
49
|
def initialize(model: nil, context_window: nil, max_output_tokens: nil, overhead: 0)
|
|
49
50
|
@overhead = overhead.to_i
|
|
50
51
|
|
|
@@ -65,6 +66,7 @@ module Phronomy
|
|
|
65
66
|
# Always >= 0.
|
|
66
67
|
#
|
|
67
68
|
# @return [Integer]
|
|
69
|
+
# @api private
|
|
68
70
|
def effective_input_limit
|
|
69
71
|
[@context_window - @max_output_tokens - @overhead, 0].max
|
|
70
72
|
end
|
|
@@ -73,6 +75,7 @@ module Phronomy
|
|
|
73
75
|
#
|
|
74
76
|
# @param used [Integer] tokens already committed (e.g. from knowledge injection)
|
|
75
77
|
# @return [Integer] remaining tokens (always >= 0)
|
|
78
|
+
# @api private
|
|
76
79
|
def available(used: 0)
|
|
77
80
|
[effective_input_limit - used.to_i, 0].max
|
|
78
81
|
end
|
|
@@ -9,8 +9,12 @@ module Phronomy
|
|
|
9
9
|
# any other class.
|
|
10
10
|
#
|
|
11
11
|
# Default approximation: ceil(char_count / 4).
|
|
12
|
-
#
|
|
13
|
-
#
|
|
12
|
+
# This heuristic is calibrated for ASCII/Latin text (~4 chars/token).
|
|
13
|
+
# For CJK languages (Chinese, Japanese, Korean) the actual token count is
|
|
14
|
+
# approximately 4× higher than the estimate because CJK characters are
|
|
15
|
+
# typically 1 token each in GPT-4/Claude tokenizers (~1 char/token vs the
|
|
16
|
+
# 4 char/token assumed here). Use a tokenizer-backed callable via
|
|
17
|
+
# +.tokenizer=+ for accurate CJK token counting.
|
|
14
18
|
#
|
|
15
19
|
# Replace the built-in heuristic with any callable via .tokenizer=:
|
|
16
20
|
#
|
|
@@ -33,11 +37,13 @@ module Phronomy
|
|
|
33
37
|
# In tests, call +TokenEstimator.reset_tokenizer!+ after each test to
|
|
34
38
|
# prevent cross-test contamination.
|
|
35
39
|
# @param callable [#call, nil]
|
|
40
|
+
# @api private
|
|
36
41
|
def tokenizer=(callable)
|
|
37
42
|
@tokenizer_mutex.synchronize { @tokenizer = callable }
|
|
38
43
|
end
|
|
39
44
|
|
|
40
45
|
# @return [#call, nil]
|
|
46
|
+
# @api private
|
|
41
47
|
def tokenizer
|
|
42
48
|
@tokenizer_mutex.synchronize { @tokenizer }
|
|
43
49
|
end
|
|
@@ -52,6 +58,7 @@ module Phronomy
|
|
|
52
58
|
# @param input [String, Array, #content] a string, a message-like object,
|
|
53
59
|
# or an Array of message-like objects (each must respond to #content).
|
|
54
60
|
# @return [Integer] estimated token count (>= 0)
|
|
61
|
+
# @api private
|
|
55
62
|
def estimate(input)
|
|
56
63
|
tok = @tokenizer_mutex.synchronize { @tokenizer }
|
|
57
64
|
case input
|
|
@@ -28,6 +28,7 @@ module Phronomy
|
|
|
28
28
|
# @param message_elements [Array<Hash>]
|
|
29
29
|
# each element: { seq: Integer, message: Object, tokens: Integer, role: Symbol }
|
|
30
30
|
# @param budget [Phronomy::Context::TokenBudget, nil]
|
|
31
|
+
# @api private
|
|
31
32
|
def initialize(message_elements:, budget:)
|
|
32
33
|
@message_elements = message_elements.dup
|
|
33
34
|
@budget = budget
|
|
@@ -38,6 +39,7 @@ module Phronomy
|
|
|
38
39
|
# Each element is a Hash with +:seq+, +:message+, +:tokens+, and +:role+.
|
|
39
40
|
#
|
|
40
41
|
# @return [Array<Hash>]
|
|
42
|
+
# @api private
|
|
41
43
|
def message_elements
|
|
42
44
|
@message_elements.dup
|
|
43
45
|
end
|
|
@@ -47,6 +49,7 @@ module Phronomy
|
|
|
47
49
|
#
|
|
48
50
|
# @param seqs [Integer, Array<Integer>] seq number(s) to remove
|
|
49
51
|
# @return [self]
|
|
52
|
+
# @api private
|
|
50
53
|
def remove(seqs)
|
|
51
54
|
seqs_set = Array(seqs).to_set
|
|
52
55
|
@message_elements.reject! { |e| seqs_set.include?(e[:seq]) }
|
|
@@ -57,6 +60,7 @@ module Phronomy
|
|
|
57
60
|
# Convenience: returns the plain message objects (without element metadata).
|
|
58
61
|
#
|
|
59
62
|
# @return [Array]
|
|
63
|
+
# @api private
|
|
60
64
|
def messages
|
|
61
65
|
@message_elements.map { |e| e[:message] }
|
|
62
66
|
end
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Phronomy
|
|
4
|
+
# A point in time used as an upper bound for an operation.
|
|
5
|
+
#
|
|
6
|
+
# Uses the monotonic clock (+Process::CLOCK_MONOTONIC+) internally to avoid
|
|
7
|
+
# skew from NTP adjustments or DST transitions.
|
|
8
|
+
#
|
|
9
|
+
# @example Create a 30-second deadline and check remaining time
|
|
10
|
+
# deadline = Phronomy::Deadline.in(30)
|
|
11
|
+
# sleep 1
|
|
12
|
+
# deadline.remaining_seconds # => ~29.0
|
|
13
|
+
# deadline.expired? # => false
|
|
14
|
+
class Deadline
|
|
15
|
+
# Creates a deadline that expires +seconds+ from now.
|
|
16
|
+
#
|
|
17
|
+
# @param seconds [Numeric] seconds from now until expiry
|
|
18
|
+
# @return [Deadline]
|
|
19
|
+
# @api private
|
|
20
|
+
def self.in(seconds)
|
|
21
|
+
new(Process.clock_gettime(Process::CLOCK_MONOTONIC) + seconds)
|
|
22
|
+
end
|
|
23
|
+
|
|
24
|
+
# @param monotonic_at [Float] absolute monotonic timestamp of expiry
|
|
25
|
+
# @api private
|
|
26
|
+
def initialize(monotonic_at)
|
|
27
|
+
@monotonic_at = monotonic_at
|
|
28
|
+
end
|
|
29
|
+
|
|
30
|
+
# Returns +true+ when the deadline has passed.
|
|
31
|
+
# @return [Boolean]
|
|
32
|
+
# @api private
|
|
33
|
+
def expired?
|
|
34
|
+
Process.clock_gettime(Process::CLOCK_MONOTONIC) >= @monotonic_at
|
|
35
|
+
end
|
|
36
|
+
|
|
37
|
+
# Seconds remaining until expiry. Returns 0 when already expired.
|
|
38
|
+
# @return [Float]
|
|
39
|
+
# @api private
|
|
40
|
+
def remaining_seconds
|
|
41
|
+
remaining = @monotonic_at - Process.clock_gettime(Process::CLOCK_MONOTONIC)
|
|
42
|
+
[remaining, 0.0].max
|
|
43
|
+
end
|
|
44
|
+
|
|
45
|
+
# Attaches this deadline to a {CancellationToken} by cancelling the token
|
|
46
|
+
# when the deadline expires. Uses the Runtime timer queue (a single
|
|
47
|
+
# background thread shared by all deadlines) instead of spawning one thread
|
|
48
|
+
# per deadline.
|
|
49
|
+
#
|
|
50
|
+
# @param token [CancellationToken]
|
|
51
|
+
# @param timer_queue [Runtime::TimerQueue, nil] queue to register with;
|
|
52
|
+
# defaults to +Phronomy::Runtime.instance.timer_queue+
|
|
53
|
+
# @return [self]
|
|
54
|
+
# @api private
|
|
55
|
+
def attach_to(token, timer_queue: Phronomy::Runtime.instance.timer_queue)
|
|
56
|
+
seconds = remaining_seconds
|
|
57
|
+
return self if seconds <= 0
|
|
58
|
+
|
|
59
|
+
timer_queue.schedule(seconds: seconds) { token.cancel! }
|
|
60
|
+
self
|
|
61
|
+
end
|
|
62
|
+
end
|
|
63
|
+
end
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Phronomy
|
|
4
|
+
# Developer-facing diagnostics for blocking operation detection (Issue #279).
|
|
5
|
+
#
|
|
6
|
+
# Provides debug dump utilities that can be called from an IRB / Rails console
|
|
7
|
+
# or in test helpers to inspect the current state of the Runtime.
|
|
8
|
+
#
|
|
9
|
+
# @example Enable diagnostics and print a dump
|
|
10
|
+
# Phronomy.configure { |c| c.scheduler_debug = true }
|
|
11
|
+
# Phronomy::Diagnostics.dump
|
|
12
|
+
module Diagnostics
|
|
13
|
+
# Prints a formatted summary of the current Runtime state to +$stderr+
|
|
14
|
+
# (or the supplied IO).
|
|
15
|
+
#
|
|
16
|
+
# Includes:
|
|
17
|
+
# - BlockingAdapterPool: active workers, queue depth, abandoned count
|
|
18
|
+
# - EventLoop: last / max / average lag in milliseconds
|
|
19
|
+
#
|
|
20
|
+
# @param out [IO] output destination (default: $stderr)
|
|
21
|
+
# @return [void]
|
|
22
|
+
# @api public
|
|
23
|
+
def self.dump(out: $stderr)
|
|
24
|
+
snap = Phronomy::Metrics.snapshot
|
|
25
|
+
|
|
26
|
+
out.puts "[Phronomy::Diagnostics] Runtime state dump"
|
|
27
|
+
out.puts " BlockingAdapterPool:"
|
|
28
|
+
out.puts " pool_size : #{snap[:blocking_pool_size]}"
|
|
29
|
+
out.puts " active_count : #{snap[:blocking_pool_active]}"
|
|
30
|
+
out.puts " queue_depth : #{snap[:blocking_pool_queue_length]}"
|
|
31
|
+
out.puts " abandoned_total : #{snap[:blocking_pool_abandoned_total]}"
|
|
32
|
+
out.puts " EventLoop:"
|
|
33
|
+
out.puts " last_lag_ms : #{snap[:event_loop_lag_last_ms]}"
|
|
34
|
+
out.puts " max_lag_ms : #{snap[:event_loop_lag_max_ms]}"
|
|
35
|
+
out.puts " average_lag_ms : #{snap[:event_loop_lag_average_ms]}"
|
|
36
|
+
end
|
|
37
|
+
|
|
38
|
+
# Returns the diagnostics state as a plain Hash (useful for JSON export).
|
|
39
|
+
#
|
|
40
|
+
# @return [Hash]
|
|
41
|
+
# @api public
|
|
42
|
+
def self.snapshot
|
|
43
|
+
Phronomy::Metrics.snapshot
|
|
44
|
+
end
|
|
45
|
+
|
|
46
|
+
# Raises an error if +invoke+ (blocking) is called from inside an EventLoop
|
|
47
|
+
# action, preventing accidental scheduler stalls.
|
|
48
|
+
#
|
|
49
|
+
# Called by Agent::Base#invoke and Workflow#invoke before executing.
|
|
50
|
+
#
|
|
51
|
+
# @raise [Phronomy::SchedulerReentrancyError] when called from EventLoop thread
|
|
52
|
+
# @return [void]
|
|
53
|
+
# @api private
|
|
54
|
+
def self.assert_not_in_event_loop!
|
|
55
|
+
return unless Phronomy::EventLoop.current?
|
|
56
|
+
|
|
57
|
+
raise Phronomy::SchedulerReentrancyError,
|
|
58
|
+
"Blocking invoke called from inside an EventLoop action. " \
|
|
59
|
+
"Use invoke_async instead."
|
|
60
|
+
end
|
|
61
|
+
end
|
|
62
|
+
end
|
|
@@ -9,11 +9,31 @@ module Phronomy
|
|
|
9
9
|
class Base
|
|
10
10
|
# Embed the given text and return a vector representation.
|
|
11
11
|
#
|
|
12
|
-
# @param text
|
|
12
|
+
# @param text [String] the text to embed
|
|
13
|
+
# @param cancellation_token [Phronomy::CancellationToken, nil] optional; raises CancellationError when cancelled
|
|
13
14
|
# @return [Array<Float>] the embedding vector
|
|
14
|
-
|
|
15
|
+
# @api public
|
|
16
|
+
def embed(text, cancellation_token = nil)
|
|
17
|
+
cancellation_token&.raise_if_cancelled!
|
|
15
18
|
raise NotImplementedError, "#{self.class}#embed is not implemented"
|
|
16
19
|
end
|
|
20
|
+
|
|
21
|
+
# Submits an {#embed} call to {BlockingAdapterPool} and returns a
|
|
22
|
+
# {BlockingAdapterPool::PendingOperation}.
|
|
23
|
+
#
|
|
24
|
+
# @param text [String]
|
|
25
|
+
# @param cancellation_token [Phronomy::CancellationToken, nil]
|
|
26
|
+
# @param timeout [Numeric, nil] seconds before the operation is abandoned
|
|
27
|
+
# @return [BlockingAdapterPool::PendingOperation]
|
|
28
|
+
# @api public
|
|
29
|
+
def embed_async(text, cancellation_token = nil, timeout: nil)
|
|
30
|
+
Phronomy::Runtime.instance.blocking_io.submit(
|
|
31
|
+
timeout: timeout,
|
|
32
|
+
cancellation_token: cancellation_token
|
|
33
|
+
) do
|
|
34
|
+
embed(text, cancellation_token)
|
|
35
|
+
end
|
|
36
|
+
end
|
|
17
37
|
end
|
|
18
38
|
end
|
|
19
39
|
end
|
|
@@ -19,6 +19,7 @@ module Phronomy
|
|
|
19
19
|
# @param provider [Symbol, nil] provider override (e.g. :openai); nil uses the RubyLLM default
|
|
20
20
|
# @param assume_model_exists [Boolean] when true, skips RubyLLM model-registry validation
|
|
21
21
|
# (useful for locally hosted models not in the registry)
|
|
22
|
+
# @api public
|
|
22
23
|
def initialize(model: nil, provider: nil, assume_model_exists: false)
|
|
23
24
|
@model = model
|
|
24
25
|
@provider = provider
|
|
@@ -27,9 +28,12 @@ module Phronomy
|
|
|
27
28
|
|
|
28
29
|
# Embed text via RubyLLM.
|
|
29
30
|
#
|
|
30
|
-
# @param text
|
|
31
|
+
# @param text [String]
|
|
32
|
+
# @param cancellation_token [Phronomy::CancellationToken, nil] optional; raises CancellationError when cancelled
|
|
31
33
|
# @return [Array<Float>]
|
|
32
|
-
|
|
34
|
+
# @api public
|
|
35
|
+
def embed(text, cancellation_token = nil)
|
|
36
|
+
cancellation_token&.raise_if_cancelled!
|
|
33
37
|
opts = {}
|
|
34
38
|
opts[:model] = @model if @model
|
|
35
39
|
opts[:provider] = @provider if @provider
|
|
@@ -19,6 +19,7 @@ module Phronomy
|
|
|
19
19
|
ComparisonPair = Data.define(:eval_case, :result_a, :result_b)
|
|
20
20
|
|
|
21
21
|
# @param scorer [Scorer::Base]
|
|
22
|
+
# @api public
|
|
22
23
|
def initialize(scorer: Scorer::ExactMatch.new)
|
|
23
24
|
@scorer = scorer
|
|
24
25
|
end
|
|
@@ -29,6 +30,7 @@ module Phronomy
|
|
|
29
30
|
# @param callable_a [#call]
|
|
30
31
|
# @param callable_b [#call]
|
|
31
32
|
# @return [Array<ComparisonPair>]
|
|
33
|
+
# @api public
|
|
32
34
|
def compare(dataset, callable_a, callable_b)
|
|
33
35
|
runner_a = Runner.new(scorer: @scorer)
|
|
34
36
|
runner_b = Runner.new(scorer: @scorer)
|
|
@@ -13,6 +13,7 @@ module Phronomy
|
|
|
13
13
|
include Enumerable
|
|
14
14
|
|
|
15
15
|
# @param cases [Array<EvalCase>]
|
|
16
|
+
# @api public
|
|
16
17
|
def initialize(cases = [])
|
|
17
18
|
@cases = cases.freeze
|
|
18
19
|
end
|
|
@@ -23,16 +24,19 @@ module Phronomy
|
|
|
23
24
|
#
|
|
24
25
|
# @param pairs [Array<Hash>]
|
|
25
26
|
# @return [Dataset]
|
|
27
|
+
# @api public
|
|
26
28
|
def self.from_array(pairs)
|
|
27
29
|
new(pairs.map { |h| EvalCase.new(**h) })
|
|
28
30
|
end
|
|
29
31
|
|
|
30
32
|
# @yield [EvalCase]
|
|
33
|
+
# @api public
|
|
31
34
|
def each(&block)
|
|
32
35
|
@cases.each(&block)
|
|
33
36
|
end
|
|
34
37
|
|
|
35
38
|
# @return [Integer]
|
|
39
|
+
# @api public
|
|
36
40
|
def size
|
|
37
41
|
@cases.size
|
|
38
42
|
end
|
|
@@ -11,12 +11,14 @@ module Phronomy
|
|
|
11
11
|
# puts metrics.to_h
|
|
12
12
|
class Metrics
|
|
13
13
|
# @param results [Array<EvalResult>]
|
|
14
|
+
# @api public
|
|
14
15
|
def initialize(results)
|
|
15
16
|
@results = results
|
|
16
17
|
end
|
|
17
18
|
|
|
18
19
|
# Fraction of results that passed (score == 1.0).
|
|
19
20
|
# @return [Float] in [0.0, 1.0]
|
|
21
|
+
# @api public
|
|
20
22
|
def pass_rate
|
|
21
23
|
return 0.0 if @results.empty?
|
|
22
24
|
@results.count(&:pass?).to_f / @results.size
|
|
@@ -24,6 +26,7 @@ module Phronomy
|
|
|
24
26
|
|
|
25
27
|
# Arithmetic mean of all scores.
|
|
26
28
|
# @return [Float]
|
|
29
|
+
# @api public
|
|
27
30
|
def average_score
|
|
28
31
|
return 0.0 if @results.empty?
|
|
29
32
|
@results.sum(&:score) / @results.size
|
|
@@ -32,12 +35,14 @@ module Phronomy
|
|
|
32
35
|
# Sum of all TokenUsage objects present in the results.
|
|
33
36
|
# Results without usage are skipped.
|
|
34
37
|
# @return [Phronomy::TokenUsage]
|
|
38
|
+
# @api public
|
|
35
39
|
def total_usage
|
|
36
40
|
@results.map(&:usage).compact.reduce(TokenUsage.zero, :+)
|
|
37
41
|
end
|
|
38
42
|
|
|
39
43
|
# Arithmetic mean of latency_ms across all results.
|
|
40
44
|
# @return [Float]
|
|
45
|
+
# @api public
|
|
41
46
|
def average_latency_ms
|
|
42
47
|
return 0.0 if @results.empty?
|
|
43
48
|
@results.sum(&:latency_ms).to_f / @results.size
|
|
@@ -45,6 +50,7 @@ module Phronomy
|
|
|
45
50
|
|
|
46
51
|
# Returns a plain Hash summary suitable for logging or serialisation.
|
|
47
52
|
# @return [Hash]
|
|
53
|
+
# @api public
|
|
48
54
|
def to_h
|
|
49
55
|
{
|
|
50
56
|
total: @results.size,
|
data/lib/phronomy/eval/runner.rb
CHANGED
|
@@ -18,6 +18,7 @@ module Phronomy
|
|
|
18
18
|
# results = runner.run(dataset, ->(input) { agent.invoke(input) })
|
|
19
19
|
class Runner
|
|
20
20
|
# @param scorer [Scorer::Base] scorer used to evaluate each result
|
|
21
|
+
# @api public
|
|
21
22
|
def initialize(scorer: Scorer::ExactMatch.new)
|
|
22
23
|
@scorer = scorer
|
|
23
24
|
end
|
|
@@ -26,29 +27,30 @@ module Phronomy
|
|
|
26
27
|
# @param callable [#call] accepts a single String argument
|
|
27
28
|
# @param concurrency [Integer] number of parallel threads (default: 1, sequential)
|
|
28
29
|
# @return [Array<EvalResult>]
|
|
30
|
+
# @api public
|
|
29
31
|
def run(dataset, callable, concurrency: 1)
|
|
30
32
|
cases = dataset.to_a
|
|
31
33
|
return cases.map { |eval_case| run_one(eval_case, callable) } if concurrency <= 1
|
|
32
34
|
|
|
33
|
-
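# Run cases in slices of +concurrency+ threads. Each slice is joined
|
|
34
|
-
# before the next starts, bounding peak thread count to +concurrency+.
|
|
35
|
-
# Writing to pre-allocated slots (one per thread) is safe because each
|
|
36
|
-
# thread writes to a unique index and all threads in a slice are joined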
|
|
35
|
+
# Run cases in slices of +concurrency+ tasks. Each slice is joined
|
|
36
|
+
# before the next starts, bounding peak task count to +concurrency+.
|
|
37
|
+
# Writing to pre-allocated slots (one per task) is safe because each
|
|
38
|
+
# task writes to a unique index and all tasks in a slice are joined
|
|
37
39
|
# before the next slice begins.
|
|
38
|
-
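# Exceptions in worker threads are collected and re-raised after all
|
|
39
|
-
# threads in the slice are joined, preventing orphaned threads.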
|
|
40
|
+
# Exceptions in worker tasks are collected and re-raised after all
|
|
41
|
+
# tasks in the slice are joined, preventing orphaned tasks.
|
|
40
42
|
results = Array.new(cases.length)
|
|
41
43
|
cases.each_with_index.each_slice(concurrency) do |batch|
|
|
42
44
|
errors = []
|
|
43
45
|
errors_mu = Mutex.new
|
|
44
|
-
|
|
45
|
-
|
|
46
|
+
tasks = batch.map do |eval_case, i|
|
|
47
|
+
Phronomy::Runtime.instance.spawn(name: "eval-case-#{i}") do
|
|
46
48
|
results[i] = run_one(eval_case, callable)
|
|
47
49
|
rescue => e
|
|
48
50
|
errors_mu.synchronize { errors << e }
|
|
49
51
|
end
|
|
50
52
|
end
|
|
51
|
-
|
|
53
|
+
tasks.each(&:join)
|
|
52
54
|
raise errors.first if errors.any?
|
|
53
55
|
end
|
|
54
56
|
results
|
|
@@ -12,6 +12,7 @@ module Phronomy
|
|
|
12
12
|
# @param expected [String] the ground-truth value from the EvalCase
|
|
13
13
|
# @param input [String, nil] the original input (used by LLM scorers)
|
|
14
14
|
# @return [Float] a value in [0.0, 1.0]
|
|
15
|
+
# @api public
|
|
15
16
|
def score(actual:, expected:, input: nil)
|
|
16
17
|
raise NotImplementedError, "#{self.class}#score is not implemented"
|
|
17
18
|
end
|
|
@@ -12,11 +12,13 @@ module Phronomy
|
|
|
12
12
|
# ExactMatch.new.score(actual: "paris", expected: "Paris") # => 0.0
|
|
13
13
|
class ExactMatch < Base
|
|
14
14
|
# @param case_sensitive [Boolean] default true
|
|
15
|
+
# @api public
|
|
15
16
|
def initialize(case_sensitive: true)
|
|
16
17
|
@case_sensitive = case_sensitive
|
|
17
18
|
end
|
|
18
19
|
|
|
19
20
|
# @return [Float] 1.0 on match, 0.0 otherwise
|
|
21
|
+
# @api public
|
|
20
22
|
def score(actual:, expected:, input: nil)
|
|
21
23
|
a = actual.to_s.strip
|
|
22
24
|
e = expected.to_s.strip
|
|
@@ -13,11 +13,13 @@ module Phronomy
|
|
|
13
13
|
# IncludesScorer.new.score(actual: "The answer is 42.", expected: "42") # => 1.0
|
|
14
14
|
class IncludesScorer < Base
|
|
15
15
|
# @param case_sensitive [Boolean] default false
|
|
16
|
+
# @api public
|
|
16
17
|
def initialize(case_sensitive: false)
|
|
17
18
|
@case_sensitive = case_sensitive
|
|
18
19
|
end
|
|
19
20
|
|
|
20
21
|
# @return [Float] 1.0 if actual contains expected, 0.0 otherwise
|
|
22
|
+
# @api public
|
|
21
23
|
def score(actual:, expected:, input: nil)
|
|
22
24
|
a = actual.to_s
|
|
23
25
|
e = expected.to_s
|
|
@@ -36,6 +36,7 @@ module Phronomy
|
|
|
36
36
|
# @param prompt_template [String] format string with %<input>s, %<expected>s, %<actual>s
|
|
37
37
|
# @param raise_on_error [Boolean] when true, re-raises scoring exceptions instead of
|
|
38
38
|
# returning 0.0. Use this in batch eval pipelines where silent failures are unacceptable.
|
|
39
|
+
# @api public
|
|
39
40
|
def initialize(model:, prompt_template: DEFAULT_PROMPT, raise_on_error: false)
|
|
40
41
|
@model = model
|
|
41
42
|
@prompt_template = prompt_template
|
|
@@ -43,6 +44,7 @@ module Phronomy
|
|
|
43
44
|
end
|
|
44
45
|
|
|
45
46
|
# @return [Float] score in [0.0, 1.0]; 0.0 on error when raise_on_error is false
|
|
47
|
+
# @api public
|
|
46
48
|
def score(actual:, expected:, input: nil)
|
|
47
49
|
prompt = format(@prompt_template, input: input.to_s, expected: expected.to_s, actual: actual.to_s)
|
|
48
50
|
response = RubyLLM.chat(model: @model).ask(prompt)
|