phronomy 0.6.0 → 0.7.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (143)
  1. checksums.yaml +4 -4
  2. data/.mutant.yml +22 -0
  3. data/CHANGELOG.md +488 -0
  4. data/CONTRIBUTING.md +102 -0
  5. data/README.md +374 -36
  6. data/RELEASE_CHECKLIST.md +86 -0
  7. data/Rakefile +33 -0
  8. data/SECURITY.md +80 -0
  9. data/benchmark/baseline.json +9 -0
  10. data/benchmark/bench_agent_invoke.rb +105 -0
  11. data/benchmark/bench_context_assembler.rb +46 -0
  12. data/benchmark/bench_regression.rb +172 -0
  13. data/benchmark/bench_token_estimator.rb +44 -0
  14. data/benchmark/bench_tool_schema.rb +69 -0
  15. data/benchmark/bench_vector_store.rb +39 -0
  16. data/benchmark/bench_workflow.rb +55 -0
  17. data/benchmark/run_all.rb +118 -0
  18. data/docs/decisions/001-rubyllm-as-provider-layer.md +42 -0
  19. data/docs/decisions/002-workflow-context-immutability.md +42 -0
  20. data/docs/decisions/003-event-loop-singleton.md +48 -0
  21. data/docs/decisions/004-invoke-timeout-is-not-cancellation.md +75 -0
  22. data/docs/decisions/005-static-knowledge-class-level-cache.md +45 -0
  23. data/docs/decisions/006-no-built-in-guardrails.md +66 -0
  24. data/docs/decisions/007-mcp-is-beta-stability.md +51 -0
  25. data/docs/decisions/008-orchestrator-uses-os-threads.md +52 -0
  26. data/docs/decisions/009-state-store-abstraction.md +141 -0
  27. data/docs/decisions/010-cooperative-first-concurrency.md +248 -0
  28. data/lib/phronomy/agent/base.rb +416 -49
  29. data/lib/phronomy/agent/before_completion_context.rb +1 -0
  30. data/lib/phronomy/agent/checkpoint.rb +1 -0
  31. data/lib/phronomy/agent/concerns/before_completion.rb +6 -0
  32. data/lib/phronomy/agent/concerns/error_translation.rb +45 -0
  33. data/lib/phronomy/agent/concerns/guardrailable.rb +3 -0
  34. data/lib/phronomy/agent/concerns/retryable.rb +12 -1
  35. data/lib/phronomy/agent/concerns/suspendable.rb +19 -0
  36. data/lib/phronomy/agent/fsm.rb +44 -52
  37. data/lib/phronomy/agent/handoff.rb +3 -0
  38. data/lib/phronomy/agent/orchestrator.rb +191 -54
  39. data/lib/phronomy/agent/parallel_tool_chat.rb +87 -13
  40. data/lib/phronomy/agent/react_agent.rb +16 -6
  41. data/lib/phronomy/agent/runner.rb +2 -0
  42. data/lib/phronomy/agent/shared_state.rb +11 -0
  43. data/lib/phronomy/agent/suspend_signal.rb +2 -0
  44. data/lib/phronomy/agent/team_coordinator.rb +17 -5
  45. data/lib/phronomy/async_queue.rb +155 -0
  46. data/lib/phronomy/blocking_adapter_pool.rb +435 -0
  47. data/lib/phronomy/cancellation_scope.rb +123 -0
  48. data/lib/phronomy/cancellation_token.rb +133 -0
  49. data/lib/phronomy/concurrency_gate.rb +155 -0
  50. data/lib/phronomy/configuration.rb +168 -2
  51. data/lib/phronomy/context/assembler.rb +6 -0
  52. data/lib/phronomy/context/compaction_context.rb +2 -0
  53. data/lib/phronomy/context/context_version_cache.rb +2 -0
  54. data/lib/phronomy/context/token_budget.rb +3 -0
  55. data/lib/phronomy/context/token_estimator.rb +9 -2
  56. data/lib/phronomy/context/trigger_context.rb +1 -0
  57. data/lib/phronomy/context/trim_context.rb +4 -0
  58. data/lib/phronomy/deadline.rb +63 -0
  59. data/lib/phronomy/diagnostics.rb +62 -0
  60. data/lib/phronomy/embeddings/base.rb +22 -2
  61. data/lib/phronomy/embeddings/ruby_llm_embeddings.rb +6 -2
  62. data/lib/phronomy/eval/comparison.rb +2 -0
  63. data/lib/phronomy/eval/dataset.rb +4 -0
  64. data/lib/phronomy/eval/metrics.rb +6 -0
  65. data/lib/phronomy/eval/runner.rb +11 -9
  66. data/lib/phronomy/eval/scorer/base.rb +1 -0
  67. data/lib/phronomy/eval/scorer/exact_match.rb +2 -0
  68. data/lib/phronomy/eval/scorer/includes_scorer.rb +2 -0
  69. data/lib/phronomy/eval/scorer/llm_judge.rb +2 -0
  70. data/lib/phronomy/event_loop.rb +275 -30
  71. data/lib/phronomy/fsm_session.rb +57 -4
  72. data/lib/phronomy/generator_verifier.rb +2 -0
  73. data/lib/phronomy/guardrail/base.rb +3 -0
  74. data/lib/phronomy/guardrail/prompt_injection_guardrail.rb +58 -0
  75. data/lib/phronomy/invocation_context.rb +152 -0
  76. data/lib/phronomy/knowledge_source/base.rb +24 -2
  77. data/lib/phronomy/knowledge_source/entity_knowledge.rb +7 -2
  78. data/lib/phronomy/knowledge_source/rag_knowledge.rb +8 -4
  79. data/lib/phronomy/knowledge_source/static_knowledge.rb +7 -2
  80. data/lib/phronomy/llm_adapter/base.rb +104 -0
  81. data/lib/phronomy/llm_adapter/ruby_llm.rb +41 -0
  82. data/lib/phronomy/llm_adapter.rb +20 -0
  83. data/lib/phronomy/loader/base.rb +1 -0
  84. data/lib/phronomy/loader/csv_loader.rb +2 -0
  85. data/lib/phronomy/loader/markdown_loader.rb +2 -0
  86. data/lib/phronomy/loader/plain_text_loader.rb +1 -0
  87. data/lib/phronomy/metrics.rb +38 -0
  88. data/lib/phronomy/output_parser/base.rb +1 -0
  89. data/lib/phronomy/output_parser/json_parser.rb +22 -3
  90. data/lib/phronomy/output_parser/structured_parser.rb +2 -0
  91. data/lib/phronomy/prompt_template.rb +5 -0
  92. data/lib/phronomy/runnable.rb +20 -3
  93. data/lib/phronomy/runtime/deterministic_scheduler.rb +412 -0
  94. data/lib/phronomy/runtime/fake_scheduler.rb +165 -0
  95. data/lib/phronomy/runtime/gate_registry.rb +52 -0
  96. data/lib/phronomy/runtime/pool_registry.rb +57 -0
  97. data/lib/phronomy/runtime/runtime_metrics.rb +117 -0
  98. data/lib/phronomy/runtime/scheduler.rb +98 -0
  99. data/lib/phronomy/runtime/scheduler_timer_adapter.rb +79 -0
  100. data/lib/phronomy/runtime/task_registry.rb +48 -0
  101. data/lib/phronomy/runtime/thread_scheduler.rb +30 -0
  102. data/lib/phronomy/runtime/timer_queue.rb +106 -0
  103. data/lib/phronomy/runtime/timer_service.rb +42 -0
  104. data/lib/phronomy/runtime.rb +374 -0
  105. data/lib/phronomy/splitter/base.rb +2 -0
  106. data/lib/phronomy/splitter/fixed_size_splitter.rb +2 -0
  107. data/lib/phronomy/splitter/recursive_splitter.rb +2 -0
  108. data/lib/phronomy/state_store/base.rb +48 -0
  109. data/lib/phronomy/state_store/in_memory.rb +62 -0
  110. data/lib/phronomy/task/backend.rb +80 -0
  111. data/lib/phronomy/task/fiber_backend.rb +157 -0
  112. data/lib/phronomy/task/immediate_backend.rb +89 -0
  113. data/lib/phronomy/task/thread_backend.rb +84 -0
  114. data/lib/phronomy/task.rb +275 -0
  115. data/lib/phronomy/task_group.rb +265 -0
  116. data/lib/phronomy/testing/fake_clock.rb +109 -0
  117. data/lib/phronomy/testing/fake_scheduler.rb +104 -0
  118. data/lib/phronomy/testing/scheduler_helpers.rb +59 -0
  119. data/lib/phronomy/testing.rb +12 -0
  120. data/lib/phronomy/tool/agent_tool.rb +1 -0
  121. data/lib/phronomy/tool/base.rb +298 -28
  122. data/lib/phronomy/tool/mcp_tool.rb +103 -17
  123. data/lib/phronomy/tool/scope_policy.rb +50 -0
  124. data/lib/phronomy/tool_executor.rb +106 -0
  125. data/lib/phronomy/tracing/base.rb +3 -0
  126. data/lib/phronomy/tracing/langfuse_tracer.rb +2 -0
  127. data/lib/phronomy/tracing/open_telemetry_tracer.rb +36 -0
  128. data/lib/phronomy/vector_store/async_backend.rb +110 -0
  129. data/lib/phronomy/vector_store/base.rb +40 -7
  130. data/lib/phronomy/vector_store/in_memory.rb +16 -7
  131. data/lib/phronomy/vector_store/pgvector.rb +40 -9
  132. data/lib/phronomy/vector_store/redis_search.rb +29 -8
  133. data/lib/phronomy/version.rb +1 -1
  134. data/lib/phronomy/workflow.rb +147 -11
  135. data/lib/phronomy/workflow_context.rb +83 -6
  136. data/lib/phronomy/workflow_runner.rb +106 -7
  137. data/lib/phronomy.rb +112 -1
  138. data/scripts/api_snapshot.rb +91 -0
  139. data/scripts/check_api_annotations.rb +68 -0
  140. data/scripts/check_private_enforcement.rb +93 -0
  141. data/scripts/check_readme_runnable.rb +98 -0
  142. data/scripts/run_mutation.sh +46 -0
  143. metadata +83 -2
@@ -45,6 +45,7 @@ module Phronomy
45
45
  # @param max_output_tokens [Integer, nil] explicit output reservation; when nil
46
46
  # and model is given, uses max_output_tokens
47
47
  # @param overhead [Integer] tokens reserved for instructions/tools
48
+ # @api private
48
49
  def initialize(model: nil, context_window: nil, max_output_tokens: nil, overhead: 0)
49
50
  @overhead = overhead.to_i
50
51
 
@@ -65,6 +66,7 @@ module Phronomy
65
66
  # Always >= 0.
66
67
  #
67
68
  # @return [Integer]
69
+ # @api private
68
70
  def effective_input_limit
69
71
  [@context_window - @max_output_tokens - @overhead, 0].max
70
72
  end
@@ -73,6 +75,7 @@ module Phronomy
73
75
  #
74
76
  # @param used [Integer] tokens already committed (e.g. from knowledge injection)
75
77
  # @return [Integer] remaining tokens (always >= 0)
78
+ # @api private
76
79
  def available(used: 0)
77
80
  [effective_input_limit - used.to_i, 0].max
78
81
  end
@@ -9,8 +9,12 @@ module Phronomy
9
9
  # any other class.
10
10
  #
11
11
  # Default approximation: ceil(char_count / 4).
12
- # English text averages ~4 chars/token; Japanese text averages ~2 chars/token
13
- # so this is a slight underestimate for Japanese.
12
+ # This heuristic is calibrated for ASCII/Latin text (~4 chars/token).
13
+ # For CJK languages (Chinese, Japanese, Korean) the actual token count is
14
+ # approximately 4× higher than the estimate because CJK characters are
15
+ # typically 1 token each in GPT-4/Claude tokenizers (~1 char/token vs the
16
+ # 4 chars/token assumed here). Use a tokenizer-backed callable via
17
+ # +.tokenizer=+ for accurate CJK token counting.
14
18
  #
15
19
  # Replace the built-in heuristic with any callable via .tokenizer=:
16
20
  #
@@ -33,11 +37,13 @@ module Phronomy
33
37
  # In tests, call +TokenEstimator.reset_tokenizer!+ after each test to
34
38
  # prevent cross-test contamination.
35
39
  # @param callable [#call, nil]
40
+ # @api private
36
41
  def tokenizer=(callable)
37
42
  @tokenizer_mutex.synchronize { @tokenizer = callable }
38
43
  end
39
44
 
40
45
  # @return [#call, nil]
46
+ # @api private
41
47
  def tokenizer
42
48
  @tokenizer_mutex.synchronize { @tokenizer }
43
49
  end
@@ -52,6 +58,7 @@ module Phronomy
52
58
  # @param input [String, Array, #content] a string, a message-like object,
53
59
  # or an Array of message-like objects (each must respond to #content).
54
60
  # @return [Integer] estimated token count (>= 0)
61
+ # @api private
55
62
  def estimate(input)
56
63
  tok = @tokenizer_mutex.synchronize { @tokenizer }
57
64
  case input
@@ -28,6 +28,7 @@ module Phronomy
28
28
 
29
29
  # @param message_elements [Array<Hash>]
30
30
  # @param budget [Phronomy::Context::TokenBudget, nil]
31
+ # @api private
31
32
  def initialize(message_elements:, budget:)
32
33
  @message_elements = message_elements.dup.freeze
33
34
  @budget = budget
@@ -28,6 +28,7 @@ module Phronomy
28
28
  # @param message_elements [Array<Hash>]
29
29
  # each element: { seq: Integer, message: Object, tokens: Integer, role: Symbol }
30
30
  # @param budget [Phronomy::Context::TokenBudget, nil]
31
+ # @api private
31
32
  def initialize(message_elements:, budget:)
32
33
  @message_elements = message_elements.dup
33
34
  @budget = budget
@@ -38,6 +39,7 @@ module Phronomy
38
39
  # Each element is a Hash with +:seq+, +:message+, +:tokens+, and +:role+.
39
40
  #
40
41
  # @return [Array<Hash>]
42
+ # @api private
41
43
  def message_elements
42
44
  @message_elements.dup
43
45
  end
@@ -47,6 +49,7 @@ module Phronomy
47
49
  #
48
50
  # @param seqs [Integer, Array<Integer>] seq number(s) to remove
49
51
  # @return [self]
52
+ # @api private
50
53
  def remove(seqs)
51
54
  seqs_set = Array(seqs).to_set
52
55
  @message_elements.reject! { |e| seqs_set.include?(e[:seq]) }
@@ -57,6 +60,7 @@ module Phronomy
57
60
  # Convenience: returns the plain message objects (without element metadata).
58
61
  #
59
62
  # @return [Array]
63
+ # @api private
60
64
  def messages
61
65
  @message_elements.map { |e| e[:message] }
62
66
  end
@@ -0,0 +1,63 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Phronomy
4
+ # A point in time used as an upper bound for an operation.
5
+ #
6
+ # Uses the monotonic clock (+Process::CLOCK_MONOTONIC+) internally to avoid
7
+ # skew from NTP adjustments or DST transitions.
8
+ #
9
+ # @example Create a 30-second deadline and check remaining time
10
+ # deadline = Phronomy::Deadline.in(30)
11
+ # sleep 1
12
+ # deadline.remaining_seconds # => ~29.0
13
+ # deadline.expired? # => false
14
+ class Deadline
15
+ # Creates a deadline that expires +seconds+ from now.
16
+ #
17
+ # @param seconds [Numeric] seconds from now until expiry
18
+ # @return [Deadline]
19
+ # @api private
20
+ def self.in(seconds)
21
+ new(Process.clock_gettime(Process::CLOCK_MONOTONIC) + seconds)
22
+ end
23
+
24
+ # @param monotonic_at [Float] absolute monotonic timestamp of expiry
25
+ # @api private
26
+ def initialize(monotonic_at)
27
+ @monotonic_at = monotonic_at
28
+ end
29
+
30
+ # Returns +true+ when the deadline has passed.
31
+ # @return [Boolean]
32
+ # @api private
33
+ def expired?
34
+ Process.clock_gettime(Process::CLOCK_MONOTONIC) >= @monotonic_at
35
+ end
36
+
37
+ # Seconds remaining until expiry. Returns 0 when already expired.
38
+ # @return [Float]
39
+ # @api private
40
+ def remaining_seconds
41
+ remaining = @monotonic_at - Process.clock_gettime(Process::CLOCK_MONOTONIC)
42
+ [remaining, 0.0].max
43
+ end
44
+
45
+ # Attaches this deadline to a {CancellationToken} by cancelling the token
46
+ # when the deadline expires. Uses the Runtime timer queue (a single
47
+ # background thread shared by all deadlines) instead of spawning one thread
48
+ # per deadline. If the deadline has already expired, no timer is scheduled and the token is left uncancelled.
49
+ #
50
+ # @param token [CancellationToken]
51
+ # @param timer_queue [Runtime::TimerQueue, nil] queue to register with;
52
+ # defaults to +Phronomy::Runtime.instance.timer_queue+
53
+ # @return [self]
54
+ # @api private
55
+ def attach_to(token, timer_queue: Phronomy::Runtime.instance.timer_queue)
56
+ seconds = remaining_seconds
57
+ return self if seconds <= 0
58
+
59
+ timer_queue.schedule(seconds: seconds) { token.cancel! }
60
+ self
61
+ end
62
+ end
63
+ end
@@ -0,0 +1,62 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Phronomy
4
+ # Developer-facing diagnostics for blocking operation detection (Issue #279).
5
+ #
6
+ # Provides debug dump utilities that can be called from an IRB / Rails console
7
+ # or in test helpers to inspect the current state of the Runtime.
8
+ #
9
+ # @example Enable diagnostics and print a dump
10
+ # Phronomy.configure { |c| c.scheduler_debug = true }
11
+ # Phronomy::Diagnostics.dump
12
+ module Diagnostics
13
+ # Prints a formatted summary of the current Runtime state to +$stderr+
14
+ # (or the supplied IO).
15
+ #
16
+ # Includes:
17
+ # - BlockingAdapterPool: pool size, active workers, queue depth, abandoned count
18
+ # - EventLoop: last / max / average lag in milliseconds
19
+ #
20
+ # @param out [IO] output destination (default: $stderr)
21
+ # @return [void]
22
+ # @api public
23
+ def self.dump(out: $stderr)
24
+ snap = Phronomy::Metrics.snapshot
25
+
26
+ out.puts "[Phronomy::Diagnostics] Runtime state dump"
27
+ out.puts " BlockingAdapterPool:"
28
+ out.puts " pool_size : #{snap[:blocking_pool_size]}"
29
+ out.puts " active_count : #{snap[:blocking_pool_active]}"
30
+ out.puts " queue_depth : #{snap[:blocking_pool_queue_length]}"
31
+ out.puts " abandoned_total : #{snap[:blocking_pool_abandoned_total]}"
32
+ out.puts " EventLoop:"
33
+ out.puts " last_lag_ms : #{snap[:event_loop_lag_last_ms]}"
34
+ out.puts " max_lag_ms : #{snap[:event_loop_lag_max_ms]}"
35
+ out.puts " average_lag_ms : #{snap[:event_loop_lag_average_ms]}"
36
+ end
37
+
38
+ # Returns the diagnostics state as a plain Hash (useful for JSON export).
39
+ #
40
+ # @return [Hash]
41
+ # @api public
42
+ def self.snapshot
43
+ Phronomy::Metrics.snapshot
44
+ end
45
+
46
+ # Raises an error if +invoke+ (blocking) is called from inside an EventLoop
47
+ # action, preventing accidental scheduler stalls.
48
+ #
49
+ # Called by Agent::Base#invoke and Workflow#invoke before executing.
50
+ #
51
+ # @raise [Phronomy::SchedulerReentrancyError] when called from EventLoop thread
52
+ # @return [void]
53
+ # @api private
54
+ def self.assert_not_in_event_loop!
55
+ return unless Phronomy::EventLoop.current?
56
+
57
+ raise Phronomy::SchedulerReentrancyError,
58
+ "Blocking invoke called from inside an EventLoop action. " \
59
+ "Use invoke_async instead."
60
+ end
61
+ end
62
+ end
@@ -9,11 +9,31 @@ module Phronomy
9
9
  class Base
10
10
  # Embed the given text and return a vector representation.
11
11
  #
12
- # @param text [String] the text to embed
12
+ # @param text [String] the text to embed
13
+ # @param cancellation_token [Phronomy::CancellationToken, nil] optional; raises CancellationError when cancelled
13
14
  # @return [Array<Float>] the embedding vector
14
- def embed(text)
15
+ # @api public
16
+ def embed(text, cancellation_token = nil)
17
+ cancellation_token&.raise_if_cancelled!
15
18
  raise NotImplementedError, "#{self.class}#embed is not implemented"
16
19
  end
20
+
21
+ # Submits an {#embed} call to {BlockingAdapterPool} and returns a
22
+ # {BlockingAdapterPool::PendingOperation}.
23
+ #
24
+ # @param text [String]
25
+ # @param cancellation_token [Phronomy::CancellationToken, nil]
26
+ # @param timeout [Numeric, nil] seconds before the operation is abandoned
27
+ # @return [BlockingAdapterPool::PendingOperation]
28
+ # @api public
29
+ def embed_async(text, cancellation_token = nil, timeout: nil)
30
+ Phronomy::Runtime.instance.blocking_io.submit(
31
+ timeout: timeout,
32
+ cancellation_token: cancellation_token
33
+ ) do
34
+ embed(text, cancellation_token)
35
+ end
36
+ end
17
37
  end
18
38
  end
19
39
  end
@@ -19,6 +19,7 @@ module Phronomy
19
19
  # @param provider [Symbol, nil] provider override (e.g. :openai); nil uses the RubyLLM default
20
20
  # @param assume_model_exists [Boolean] when true, skips RubyLLM model-registry validation
21
21
  # (useful for locally hosted models not in the registry)
22
+ # @api public
22
23
  def initialize(model: nil, provider: nil, assume_model_exists: false)
23
24
  @model = model
24
25
  @provider = provider
@@ -27,9 +28,12 @@ module Phronomy
27
28
 
28
29
  # Embed text via RubyLLM.
29
30
  #
30
- # @param text [String]
31
+ # @param text [String]
32
+ # @param cancellation_token [Phronomy::CancellationToken, nil] optional; raises CancellationError when cancelled
31
33
  # @return [Array<Float>]
32
- def embed(text)
34
+ # @api public
35
+ def embed(text, cancellation_token = nil)
36
+ cancellation_token&.raise_if_cancelled!
33
37
  opts = {}
34
38
  opts[:model] = @model if @model
35
39
  opts[:provider] = @provider if @provider
@@ -19,6 +19,7 @@ module Phronomy
19
19
  ComparisonPair = Data.define(:eval_case, :result_a, :result_b)
20
20
 
21
21
  # @param scorer [Scorer::Base]
22
+ # @api public
22
23
  def initialize(scorer: Scorer::ExactMatch.new)
23
24
  @scorer = scorer
24
25
  end
@@ -29,6 +30,7 @@ module Phronomy
29
30
  # @param callable_a [#call]
30
31
  # @param callable_b [#call]
31
32
  # @return [Array<ComparisonPair>]
33
+ # @api public
32
34
  def compare(dataset, callable_a, callable_b)
33
35
  runner_a = Runner.new(scorer: @scorer)
34
36
  runner_b = Runner.new(scorer: @scorer)
@@ -13,6 +13,7 @@ module Phronomy
13
13
  include Enumerable
14
14
 
15
15
  # @param cases [Array<EvalCase>]
16
+ # @api public
16
17
  def initialize(cases = [])
17
18
  @cases = cases.freeze
18
19
  end
@@ -23,16 +24,19 @@ module Phronomy
23
24
  #
24
25
  # @param pairs [Array<Hash>]
25
26
  # @return [Dataset]
27
+ # @api public
26
28
  def self.from_array(pairs)
27
29
  new(pairs.map { |h| EvalCase.new(**h) })
28
30
  end
29
31
 
30
32
  # @yield [EvalCase]
33
+ # @api public
31
34
  def each(&block)
32
35
  @cases.each(&block)
33
36
  end
34
37
 
35
38
  # @return [Integer]
39
+ # @api public
36
40
  def size
37
41
  @cases.size
38
42
  end
@@ -11,12 +11,14 @@ module Phronomy
11
11
  # puts metrics.to_h
12
12
  class Metrics
13
13
  # @param results [Array<EvalResult>]
14
+ # @api public
14
15
  def initialize(results)
15
16
  @results = results
16
17
  end
17
18
 
18
19
  # Fraction of results that passed (score == 1.0).
19
20
  # @return [Float] in [0.0, 1.0]
21
+ # @api public
20
22
  def pass_rate
21
23
  return 0.0 if @results.empty?
22
24
  @results.count(&:pass?).to_f / @results.size
@@ -24,6 +26,7 @@ module Phronomy
24
26
 
25
27
  # Arithmetic mean of all scores.
26
28
  # @return [Float]
29
+ # @api public
27
30
  def average_score
28
31
  return 0.0 if @results.empty?
29
32
  @results.sum(&:score) / @results.size
@@ -32,12 +35,14 @@ module Phronomy
32
35
  # Sum of all TokenUsage objects present in the results.
33
36
  # Results without usage are skipped.
34
37
  # @return [Phronomy::TokenUsage]
38
+ # @api public
35
39
  def total_usage
36
40
  @results.map(&:usage).compact.reduce(TokenUsage.zero, :+)
37
41
  end
38
42
 
39
43
  # Arithmetic mean of latency_ms across all results.
40
44
  # @return [Float]
45
+ # @api public
41
46
  def average_latency_ms
42
47
  return 0.0 if @results.empty?
43
48
  @results.sum(&:latency_ms).to_f / @results.size
@@ -45,6 +50,7 @@ module Phronomy
45
50
 
46
51
  # Returns a plain Hash summary suitable for logging or serialisation.
47
52
  # @return [Hash]
53
+ # @api public
48
54
  def to_h
49
55
  {
50
56
  total: @results.size,
@@ -18,6 +18,7 @@ module Phronomy
18
18
  # results = runner.run(dataset, ->(input) { agent.invoke(input) })
19
19
  class Runner
20
20
  # @param scorer [Scorer::Base] scorer used to evaluate each result
21
+ # @api public
21
22
  def initialize(scorer: Scorer::ExactMatch.new)
22
23
  @scorer = scorer
23
24
  end
@@ -26,29 +27,30 @@ module Phronomy
26
27
  # @param callable [#call] accepts a single String argument
27
28
  # @param concurrency [Integer] maximum number of cases run in parallel per slice (default: 1, sequential)
28
29
  # @return [Array<EvalResult>]
30
+ # @api public
29
31
  def run(dataset, callable, concurrency: 1)
30
32
  cases = dataset.to_a
31
33
  return cases.map { |eval_case| run_one(eval_case, callable) } if concurrency <= 1
32
34
 
33
- # Run cases in slices of +concurrency+ threads. Each slice is joined
34
- # before the next starts, bounding peak thread count to +concurrency+.
35
- # Writing to pre-allocated slots (one per thread) is safe because each
36
- # thread writes to a unique index and all threads in a slice are joined
35
+ # Run cases in slices of +concurrency+ tasks. Each slice is joined
36
+ # before the next starts, bounding peak task count to +concurrency+.
37
+ # Writing to pre-allocated slots (one per task) is safe because each
38
+ # task writes to a unique index and all tasks in a slice are joined
37
39
  # before the next slice begins.
38
- # Exceptions in worker threads are collected and re-raised after all
39
- # threads in the slice are joined, preventing orphaned threads.
40
+ # Exceptions in worker tasks are collected and re-raised after all
41
+ # tasks in the slice are joined, preventing orphaned tasks.
40
42
  results = Array.new(cases.length)
41
43
  cases.each_with_index.each_slice(concurrency) do |batch|
42
44
  errors = []
43
45
  errors_mu = Mutex.new
44
- threads = batch.map do |eval_case, i|
45
- Thread.new do
46
+ tasks = batch.map do |eval_case, i|
47
+ Phronomy::Runtime.instance.spawn(name: "eval-case-#{i}") do
46
48
  results[i] = run_one(eval_case, callable)
47
49
  rescue => e
48
50
  errors_mu.synchronize { errors << e }
49
51
  end
50
52
  end
51
- threads.each(&:join)
53
+ tasks.each(&:join)
52
54
  raise errors.first if errors.any?
53
55
  end
54
56
  results
@@ -12,6 +12,7 @@ module Phronomy
12
12
  # @param expected [String] the ground-truth value from the EvalCase
13
13
  # @param input [String, nil] the original input (used by LLM scorers)
14
14
  # @return [Float] a value in [0.0, 1.0]
15
+ # @api public
15
16
  def score(actual:, expected:, input: nil)
16
17
  raise NotImplementedError, "#{self.class}#score is not implemented"
17
18
  end
@@ -12,11 +12,13 @@ module Phronomy
12
12
  # ExactMatch.new.score(actual: "paris", expected: "Paris") # => 0.0
13
13
  class ExactMatch < Base
14
14
  # @param case_sensitive [Boolean] default true
15
+ # @api public
15
16
  def initialize(case_sensitive: true)
16
17
  @case_sensitive = case_sensitive
17
18
  end
18
19
 
19
20
  # @return [Float] 1.0 on match, 0.0 otherwise
21
+ # @api public
20
22
  def score(actual:, expected:, input: nil)
21
23
  a = actual.to_s.strip
22
24
  e = expected.to_s.strip
@@ -13,11 +13,13 @@ module Phronomy
13
13
  # IncludesScorer.new.score(actual: "The answer is 42.", expected: "42") # => 1.0
14
14
  class IncludesScorer < Base
15
15
  # @param case_sensitive [Boolean] default false
16
+ # @api public
16
17
  def initialize(case_sensitive: false)
17
18
  @case_sensitive = case_sensitive
18
19
  end
19
20
 
20
21
  # @return [Float] 1.0 if actual contains expected, 0.0 otherwise
22
+ # @api public
21
23
  def score(actual:, expected:, input: nil)
22
24
  a = actual.to_s
23
25
  e = expected.to_s
@@ -36,6 +36,7 @@ module Phronomy
36
36
  # @param prompt_template [String] format string with %<input>s, %<expected>s, %<actual>s
37
37
  # @param raise_on_error [Boolean] when true, re-raises scoring exceptions instead of
38
38
  # returning 0.0. Use this in batch eval pipelines where silent failures are unacceptable.
39
+ # @api public
39
40
  def initialize(model:, prompt_template: DEFAULT_PROMPT, raise_on_error: false)
40
41
  @model = model
41
42
  @prompt_template = prompt_template
@@ -43,6 +44,7 @@ module Phronomy
43
44
  end
44
45
 
45
46
  # @return [Float] score in [0.0, 1.0]; 0.0 on error when raise_on_error is false
47
+ # @api public
46
48
  def score(actual:, expected:, input: nil)
47
49
  prompt = format(@prompt_template, input: input.to_s, expected: expected.to_s, actual: actual.to_s)
48
50
  response = RubyLLM.chat(model: @model).ask(prompt)