phronomy 0.6.0 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (104)
  1. checksums.yaml +4 -4
  2. data/.mutant.yml +21 -0
  3. data/CHANGELOG.md +338 -0
  4. data/CONTRIBUTING.md +102 -0
  5. data/README.md +242 -27
  6. data/RELEASE_CHECKLIST.md +86 -0
  7. data/SECURITY.md +80 -0
  8. data/benchmark/baseline.json +9 -0
  9. data/benchmark/bench_agent_invoke.rb +105 -0
  10. data/benchmark/bench_context_assembler.rb +46 -0
  11. data/benchmark/bench_regression.rb +171 -0
  12. data/benchmark/bench_token_estimator.rb +44 -0
  13. data/benchmark/bench_tool_schema.rb +69 -0
  14. data/benchmark/bench_vector_store.rb +39 -0
  15. data/benchmark/bench_workflow.rb +55 -0
  16. data/benchmark/run_all.rb +118 -0
  17. data/docs/decisions/001-rubyllm-as-provider-layer.md +42 -0
  18. data/docs/decisions/002-workflow-context-immutability.md +42 -0
  19. data/docs/decisions/003-event-loop-singleton.md +48 -0
  20. data/docs/decisions/004-invoke-timeout-is-not-cancellation.md +51 -0
  21. data/docs/decisions/005-static-knowledge-class-level-cache.md +45 -0
  22. data/docs/decisions/006-no-built-in-guardrails.md +48 -0
  23. data/docs/decisions/007-mcp-is-beta-stability.md +51 -0
  24. data/docs/decisions/008-orchestrator-uses-os-threads.md +52 -0
  25. data/docs/decisions/009-state-store-abstraction.md +141 -0
  26. data/lib/phronomy/agent/base.rb +194 -12
  27. data/lib/phronomy/agent/before_completion_context.rb +1 -0
  28. data/lib/phronomy/agent/checkpoint.rb +1 -0
  29. data/lib/phronomy/agent/concerns/before_completion.rb +6 -0
  30. data/lib/phronomy/agent/concerns/error_translation.rb +45 -0
  31. data/lib/phronomy/agent/concerns/guardrailable.rb +3 -0
  32. data/lib/phronomy/agent/concerns/retryable.rb +12 -1
  33. data/lib/phronomy/agent/concerns/suspendable.rb +4 -0
  34. data/lib/phronomy/agent/fsm.rb +15 -0
  35. data/lib/phronomy/agent/handoff.rb +3 -0
  36. data/lib/phronomy/agent/orchestrator.rb +123 -11
  37. data/lib/phronomy/agent/parallel_tool_chat.rb +21 -4
  38. data/lib/phronomy/agent/react_agent.rb +8 -6
  39. data/lib/phronomy/agent/runner.rb +2 -0
  40. data/lib/phronomy/agent/shared_state.rb +11 -0
  41. data/lib/phronomy/agent/suspend_signal.rb +2 -0
  42. data/lib/phronomy/agent/team_coordinator.rb +17 -5
  43. data/lib/phronomy/cancellation_token.rb +92 -0
  44. data/lib/phronomy/configuration.rb +26 -2
  45. data/lib/phronomy/context/assembler.rb +6 -0
  46. data/lib/phronomy/context/compaction_context.rb +2 -0
  47. data/lib/phronomy/context/context_version_cache.rb +2 -0
  48. data/lib/phronomy/context/token_budget.rb +3 -0
  49. data/lib/phronomy/context/token_estimator.rb +9 -2
  50. data/lib/phronomy/context/trigger_context.rb +1 -0
  51. data/lib/phronomy/context/trim_context.rb +4 -0
  52. data/lib/phronomy/embeddings/base.rb +5 -2
  53. data/lib/phronomy/embeddings/ruby_llm_embeddings.rb +6 -2
  54. data/lib/phronomy/eval/comparison.rb +2 -0
  55. data/lib/phronomy/eval/dataset.rb +4 -0
  56. data/lib/phronomy/eval/metrics.rb +6 -0
  57. data/lib/phronomy/eval/runner.rb +2 -0
  58. data/lib/phronomy/eval/scorer/base.rb +1 -0
  59. data/lib/phronomy/eval/scorer/exact_match.rb +2 -0
  60. data/lib/phronomy/eval/scorer/includes_scorer.rb +2 -0
  61. data/lib/phronomy/eval/scorer/llm_judge.rb +2 -0
  62. data/lib/phronomy/event_loop.rb +114 -7
  63. data/lib/phronomy/fsm_session.rb +8 -1
  64. data/lib/phronomy/generator_verifier.rb +2 -0
  65. data/lib/phronomy/guardrail/base.rb +3 -0
  66. data/lib/phronomy/knowledge_source/base.rb +6 -2
  67. data/lib/phronomy/knowledge_source/entity_knowledge.rb +7 -2
  68. data/lib/phronomy/knowledge_source/rag_knowledge.rb +8 -4
  69. data/lib/phronomy/knowledge_source/static_knowledge.rb +7 -2
  70. data/lib/phronomy/loader/base.rb +1 -0
  71. data/lib/phronomy/loader/csv_loader.rb +2 -0
  72. data/lib/phronomy/loader/markdown_loader.rb +2 -0
  73. data/lib/phronomy/loader/plain_text_loader.rb +1 -0
  74. data/lib/phronomy/output_parser/base.rb +1 -0
  75. data/lib/phronomy/output_parser/json_parser.rb +22 -3
  76. data/lib/phronomy/output_parser/structured_parser.rb +2 -0
  77. data/lib/phronomy/prompt_template.rb +5 -0
  78. data/lib/phronomy/runnable.rb +20 -3
  79. data/lib/phronomy/splitter/base.rb +2 -0
  80. data/lib/phronomy/splitter/fixed_size_splitter.rb +2 -0
  81. data/lib/phronomy/splitter/recursive_splitter.rb +2 -0
  82. data/lib/phronomy/state_store/base.rb +48 -0
  83. data/lib/phronomy/state_store/in_memory.rb +62 -0
  84. data/lib/phronomy/tool/agent_tool.rb +1 -0
  85. data/lib/phronomy/tool/base.rb +189 -27
  86. data/lib/phronomy/tool/mcp_tool.rb +68 -13
  87. data/lib/phronomy/tracing/base.rb +3 -0
  88. data/lib/phronomy/tracing/langfuse_tracer.rb +2 -0
  89. data/lib/phronomy/tracing/open_telemetry_tracer.rb +2 -0
  90. data/lib/phronomy/vector_store/base.rb +33 -7
  91. data/lib/phronomy/vector_store/in_memory.rb +16 -7
  92. data/lib/phronomy/vector_store/pgvector.rb +40 -9
  93. data/lib/phronomy/vector_store/redis_search.rb +29 -8
  94. data/lib/phronomy/version.rb +1 -1
  95. data/lib/phronomy/workflow.rb +96 -7
  96. data/lib/phronomy/workflow_context.rb +54 -4
  97. data/lib/phronomy/workflow_runner.rb +35 -7
  98. data/lib/phronomy.rb +70 -1
  99. data/scripts/api_snapshot.rb +91 -0
  100. data/scripts/check_api_annotations.rb +68 -0
  101. data/scripts/check_private_enforcement.rb +93 -0
  102. data/scripts/check_readme_runnable.rb +98 -0
  103. data/scripts/run_mutation.sh +46 -0
  104. metadata +45 -2
@@ -0,0 +1,92 @@
1
+ # frozen_string_literal: true
2
+
3
module Phronomy
  # Provides cooperative cancellation for agent invocations.
  #
  # Pass a token to an agent via +config: { cancellation_token: token }+.
  # The agent checks the token before each LLM call and raises
  # {Phronomy::CancellationError} when the token is cancelled or the
  # optional deadline has passed.
  #
  # A token may be shared across multiple agent invocations and across threads.
  # The cancelled flag is only read and written while holding a Mutex; the two
  # deadlines are assigned once in {#initialize} and never reassigned.
  #
  # Cancellation is sticky: once {#cancelled?} has returned +true+ (explicit
  # cancel or an observed deadline expiry) it keeps returning +true+, even if
  # the wall clock is later moved backwards.
  #
  # @example Explicit cancel from another thread
  #   token = Phronomy::CancellationToken.new
  #   Thread.new { sleep 5; token.cancel! }
  #   result = agent.invoke("...", config: { cancellation_token: token })
  #
  # @example Hard deadline via monotonic clock (recommended)
  #   token = Phronomy::CancellationToken.timeout_after(30)
  #   result = agent.invoke("...", config: { cancellation_token: token })
  #
  # @example Hard deadline via wall-clock (legacy)
  #   token = Phronomy::CancellationToken.new(deadline: Time.now + 30)
  #   result = agent.invoke("...", config: { cancellation_token: token })
  #
  # @example Propagate to parallel workers
  #   token = Phronomy::CancellationToken.new
  #   orchestrator.dispatch_parallel(task1, task2, cancellation_token: token)
  class CancellationToken
    # Returns a new token that will expire after +seconds+ seconds, measured
    # with the monotonic clock (+Process::CLOCK_MONOTONIC+). Unlike constructing
    # a token with +deadline: Time.now + seconds+, this factory is immune to NTP
    # adjustments and DST transitions.
    #
    # @param seconds [Numeric] duration in seconds until the token expires.
    # @return [CancellationToken]
    # @api public
    def self.timeout_after(seconds)
      monotonic_deadline = Process.clock_gettime(Process::CLOCK_MONOTONIC) + seconds
      new(monotonic_deadline: monotonic_deadline)
    end

    # @param deadline [Time, nil] optional wall-clock deadline; the token reports
    #   +cancelled?+ as +true+ once +Time.now >= deadline+. Prefer
    #   {.timeout_after} for duration-based cancellation.
    # @param monotonic_deadline [Float, nil] internal monotonic timestamp set by
    #   {.timeout_after}; prefer that factory method over passing this directly.
    # @api public
    def initialize(deadline: nil, monotonic_deadline: nil)
      @cancelled = false
      @deadline = deadline
      @monotonic_deadline = monotonic_deadline
      @mutex = Mutex.new
    end

    # @return [Time, nil] the wall-clock deadline passed to {#initialize}, or +nil+.
    attr_reader :deadline

    # Mark the token as cancelled. Thread-safe; may be called from any thread.
    # @return [self]
    # @api public
    def cancel!
      @mutex.synchronize { @cancelled = true }
      self
    end

    # Returns +true+ when the token has been explicitly cancelled via {#cancel!},
    # when the wall-clock deadline has passed, or when the monotonic deadline
    # (set by {.timeout_after}) has elapsed. Thread-safe.
    #
    # The first observed deadline expiry is latched into the cancelled flag, so
    # a later backwards adjustment of the wall clock cannot "un-cancel" a token
    # that callers have already seen as cancelled.
    #
    # @return [Boolean]
    # @api public
    def cancelled?
      @mutex.synchronize do
        # Only write on the false -> true transition; the flag never goes back.
        @cancelled = true if !@cancelled && deadline_passed?
        @cancelled
      end
    end

    # Raises {Phronomy::CancellationError} if the token is cancelled.
    # A convenience method for cooperative cancellation checks inside tools,
    # RAG loaders, and hooks, replacing the +if cancelled? then raise+ pattern.
    #
    # @param message [String] optional error message
    # @return [nil] when the token is not cancelled
    # @raise [Phronomy::CancellationError] when the token is cancelled
    # @api public
    def raise_if_cancelled!(message = "invocation cancelled")
      raise Phronomy::CancellationError, message if cancelled?
    end

    private

    # Whether either configured deadline has been reached right now.
    # Does not take the mutex itself; {#cancelled?} calls it while holding it.
    #
    # @return [Boolean]
    # @api private
    def deadline_passed?
      return true if !@deadline.nil? && Time.now >= @deadline

      !@monotonic_deadline.nil? &&
        Process.clock_gettime(Process::CLOCK_MONOTONIC) >= @monotonic_deadline
    end
  end
end
@@ -33,16 +33,40 @@ module Phronomy
33
33
  # @see Phronomy::EventLoop
34
34
  attr_accessor :event_loop
35
35
 
36
- # When true (default), user input and LLM output are recorded in trace spans.
36
+ # When true, user input and LLM output are recorded in trace spans.
37
+ # Defaults to false; set to true only in environments where PII capture is acceptable.
37
38
  # Keep this false in privacy-sensitive environments to prevent PII from reaching
38
39
  # the tracing backend (OTel, Langfuse, etc.).
39
40
  attr_accessor :trace_pii
40
41
 
42
+ # Optional logger for framework diagnostic messages (e.g. unreachable-state warnings).
43
+ # Must respond to +#warn(message)+. When nil (default), messages are written to +$stderr+
44
+ # via +Kernel#warn+.
45
+ # @example
46
+ # Phronomy.configure { |c| c.logger = Rails.logger }
47
+ attr_accessor :logger
48
+
49
+ # Grace period (in seconds) before the EventLoop background thread is force-killed
50
+ # after a cooperative stop request. Applies both to the overall thread join
51
+ # and to the drain-and-cancel phase when +stop(drain: true)+ is used.
52
+ # Default: 5 seconds.
53
+ # @see Phronomy::EventLoop#stop
54
+ attr_accessor :event_loop_stop_grace_seconds
55
+
56
+ # Global state store for workflow persistence.
57
+ # When set, WorkflowRunner routes all state reads and writes through this store.
58
+ # Must be an instance of a class that inherits from Phronomy::StateStore::Base.
59
+ # Defaults to +nil+ (no persistence — state lives only for the duration of invoke).
60
+ # @example
61
+ # Phronomy.configure { |c| c.state_store = Phronomy::StateStore::InMemory.new }
62
+ attr_accessor :state_store
63
+
41
64
  def initialize
42
65
  @recursion_limit = 25
43
66
  @tracer = Phronomy::Tracing::NullTracer.new
44
- @trace_pii = true
67
+ @trace_pii = false
45
68
  @event_loop = false
69
+ @event_loop_stop_grace_seconds = 5
46
70
  end
47
71
  end
48
72
  end
@@ -35,12 +35,14 @@ module Phronomy
35
35
  # @param type [Symbol, String]
36
36
  # @param trusted [Boolean]
37
37
  # @return [String]
38
+ # @api private
38
39
  def self.xml_tag(text, type:, trusted: false)
39
40
  "<context type=\"#{CGI.escapeHTML(type.to_s)}\" trusted=\"#{trusted}\">\n#{CGI.escapeHTML(text.to_s)}\n</context>"
40
41
  end
41
42
 
42
43
  # @param budget [Phronomy::Context::TokenBudget, nil]
43
44
  # when nil no token trimming is performed
45
+ # @api private
44
46
  def initialize(budget: nil)
45
47
  @budget = budget
46
48
  @instruction = nil
@@ -53,6 +55,7 @@ module Phronomy
53
55
  #
54
56
  # @param text [String]
55
57
  # @return [self]
58
+ # @api private
56
59
  def add_instruction(text)
57
60
  @instruction = text.to_s
58
61
  self
@@ -67,6 +70,7 @@ module Phronomy
67
70
  # @param source [String, nil] optional source label (e.g. filename); included in the
68
71
  # XML tag so the LLM can produce grounded citations. Omitted when nil.
69
72
  # @return [self]
73
+ # @api private
70
74
  def add_knowledge(text, type:, trusted: false, source: nil)
71
75
  @knowledge_chunks << {text: text.to_s, type: type.to_s, trusted: trusted, source: source}
72
76
  self
@@ -76,6 +80,7 @@ module Phronomy
76
80
  #
77
81
  # @param messages [Array] message-like objects with #role and #content
78
82
  # @return [self]
83
+ # @api private
79
84
  def add_messages(messages)
80
85
  @messages = Array(messages)
81
86
  self
@@ -86,6 +91,7 @@ module Phronomy
86
91
  # @return [Hash{Symbol => Object}]
87
92
  # :system [String, nil] combined system prompt (instruction + knowledge XML tags)
88
93
  # :messages [Array] conversation messages, trimmed to budget if set
94
+ # @api private
89
95
  def build
90
96
  knowledge_text = @knowledge_chunks.map { |c| xml_context_tag(c) }.join("\n\n")
91
97
  system_parts = [@instruction, knowledge_text.empty? ? nil : knowledge_text].compact
@@ -45,6 +45,7 @@ module Phronomy
45
45
  # @param thread_id [String, nil] used when saving compaction records
46
46
  # @param memory [Object, nil] memory object; must respond to #save_compaction
47
47
  # for compaction records to be persisted
48
+ # @api private
48
49
  def initialize(message_elements:, budget:, thread_id: nil, memory: nil)
49
50
  @message_elements = message_elements.dup
50
51
  @budget = budget
@@ -67,6 +68,7 @@ module Phronomy
67
68
  # @yieldparam elements [Array<Hash>] the selected message elements
68
69
  # @yieldreturn [String] summary text to replace the selected messages
69
70
  # @return [Array] the updated result_messages array
71
+ # @api private
70
72
  def compact(range)
71
73
  # Normalise: Integer index → single-element Array; Range → Array slice.
72
74
  raw = @message_elements[range]
@@ -25,6 +25,7 @@ module Phronomy
25
25
  #
26
26
  # @param fingerprint [String] SHA-256 hex digest to compare
27
27
  # @return [Boolean]
28
+ # @api private
28
29
  def valid?(fingerprint)
29
30
  !@fingerprint.nil? && !@system_text.nil? && @fingerprint == fingerprint
30
31
  end
@@ -33,6 +34,7 @@ module Phronomy
33
34
  #
34
35
  # @param fingerprint [String] new SHA-256 hex digest
35
36
  # @param system_text [String] fully assembled system prompt text
37
+ # @api private
36
38
  def update(fingerprint:, system_text:)
37
39
  @fingerprint = fingerprint
38
40
  @system_text = system_text.to_s
@@ -45,6 +45,7 @@ module Phronomy
45
45
  # @param max_output_tokens [Integer, nil] explicit output reservation; when nil
46
46
  # and model is given, uses max_output_tokens
47
47
  # @param overhead [Integer] tokens reserved for instructions/tools
48
+ # @api private
48
49
  def initialize(model: nil, context_window: nil, max_output_tokens: nil, overhead: 0)
49
50
  @overhead = overhead.to_i
50
51
 
@@ -65,6 +66,7 @@ module Phronomy
65
66
  # Always >= 0.
66
67
  #
67
68
  # @return [Integer]
69
+ # @api private
68
70
  def effective_input_limit
69
71
  [@context_window - @max_output_tokens - @overhead, 0].max
70
72
  end
@@ -73,6 +75,7 @@ module Phronomy
73
75
  #
74
76
  # @param used [Integer] tokens already committed (e.g. from knowledge injection)
75
77
  # @return [Integer] remaining tokens (always >= 0)
78
+ # @api private
76
79
  def available(used: 0)
77
80
  [effective_input_limit - used.to_i, 0].max
78
81
  end
@@ -9,8 +9,12 @@ module Phronomy
9
9
  # any other class.
10
10
  #
11
11
  # Default approximation: ceil(char_count / 4).
12
- # English text averages ~4 chars/token; Japanese text averages ~2 chars/token
13
- # so this is a slight underestimate for Japanese.
12
+ # This heuristic is calibrated for ASCII/Latin text (~4 chars/token).
13
+ # For CJK languages (Chinese, Japanese, Korean) the actual token count is
14
+ # approximately 4× higher than the estimate because CJK characters are
15
+ # typically 1 token each in GPT-4/Claude tokenizers (~1 char/token vs the
16
+ # 4 char/token assumed here). Use a tokenizer-backed callable via
17
+ # +.tokenizer=+ for accurate CJK token counting.
14
18
  #
15
19
  # Replace the built-in heuristic with any callable via .tokenizer=:
16
20
  #
@@ -33,11 +37,13 @@ module Phronomy
33
37
  # In tests, call +TokenEstimator.reset_tokenizer!+ after each test to
34
38
  # prevent cross-test contamination.
35
39
  # @param callable [#call, nil]
40
+ # @api private
36
41
  def tokenizer=(callable)
37
42
  @tokenizer_mutex.synchronize { @tokenizer = callable }
38
43
  end
39
44
 
40
45
  # @return [#call, nil]
46
+ # @api private
41
47
  def tokenizer
42
48
  @tokenizer_mutex.synchronize { @tokenizer }
43
49
  end
@@ -52,6 +58,7 @@ module Phronomy
52
58
  # @param input [String, Array, #content] a string, a message-like object,
53
59
  # or an Array of message-like objects (each must respond to #content).
54
60
  # @return [Integer] estimated token count (>= 0)
61
+ # @api private
55
62
  def estimate(input)
56
63
  tok = @tokenizer_mutex.synchronize { @tokenizer }
57
64
  case input
@@ -28,6 +28,7 @@ module Phronomy
28
28
 
29
29
  # @param message_elements [Array<Hash>]
30
30
  # @param budget [Phronomy::Context::TokenBudget, nil]
31
+ # @api private
31
32
  def initialize(message_elements:, budget:)
32
33
  @message_elements = message_elements.dup.freeze
33
34
  @budget = budget
@@ -28,6 +28,7 @@ module Phronomy
28
28
  # @param message_elements [Array<Hash>]
29
29
  # each element: { seq: Integer, message: Object, tokens: Integer, role: Symbol }
30
30
  # @param budget [Phronomy::Context::TokenBudget, nil]
31
+ # @api private
31
32
  def initialize(message_elements:, budget:)
32
33
  @message_elements = message_elements.dup
33
34
  @budget = budget
@@ -38,6 +39,7 @@ module Phronomy
38
39
  # Each element is a Hash with +:seq+, +:message+, +:tokens+, and +:role+.
39
40
  #
40
41
  # @return [Array<Hash>]
42
+ # @api private
41
43
  def message_elements
42
44
  @message_elements.dup
43
45
  end
@@ -47,6 +49,7 @@ module Phronomy
47
49
  #
48
50
  # @param seqs [Integer, Array<Integer>] seq number(s) to remove
49
51
  # @return [self]
52
+ # @api private
50
53
  def remove(seqs)
51
54
  seqs_set = Array(seqs).to_set
52
55
  @message_elements.reject! { |e| seqs_set.include?(e[:seq]) }
@@ -57,6 +60,7 @@ module Phronomy
57
60
  # Convenience: returns the plain message objects (without element metadata).
58
61
  #
59
62
  # @return [Array]
63
+ # @api private
60
64
  def messages
61
65
  @message_elements.map { |e| e[:message] }
62
66
  end
@@ -9,9 +9,12 @@ module Phronomy
9
9
  class Base
10
10
  # Embed the given text and return a vector representation.
11
11
  #
12
- # @param text [String] the text to embed
12
+ # @param text [String] the text to embed
13
+ # @param cancellation_token [Phronomy::CancellationToken, nil] optional; raises CancellationError when cancelled
13
14
  # @return [Array<Float>] the embedding vector
14
- def embed(text)
15
+ # @api public
16
+ def embed(text, cancellation_token = nil)
17
+ cancellation_token&.raise_if_cancelled!
15
18
  raise NotImplementedError, "#{self.class}#embed is not implemented"
16
19
  end
17
20
  end
@@ -19,6 +19,7 @@ module Phronomy
19
19
  # @param provider [Symbol, nil] provider override (e.g. :openai); nil uses the RubyLLM default
20
20
  # @param assume_model_exists [Boolean] when true, skips RubyLLM model-registry validation
21
21
  # (useful for locally hosted models not in the registry)
22
+ # @api public
22
23
  def initialize(model: nil, provider: nil, assume_model_exists: false)
23
24
  @model = model
24
25
  @provider = provider
@@ -27,9 +28,12 @@ module Phronomy
27
28
 
28
29
  # Embed text via RubyLLM.
29
30
  #
30
- # @param text [String]
31
+ # @param text [String]
32
+ # @param cancellation_token [Phronomy::CancellationToken, nil] optional; raises CancellationError when cancelled
31
33
  # @return [Array<Float>]
32
- def embed(text)
34
+ # @api public
35
+ def embed(text, cancellation_token = nil)
36
+ cancellation_token&.raise_if_cancelled!
33
37
  opts = {}
34
38
  opts[:model] = @model if @model
35
39
  opts[:provider] = @provider if @provider
@@ -19,6 +19,7 @@ module Phronomy
19
19
  ComparisonPair = Data.define(:eval_case, :result_a, :result_b)
20
20
 
21
21
  # @param scorer [Scorer::Base]
22
+ # @api public
22
23
  def initialize(scorer: Scorer::ExactMatch.new)
23
24
  @scorer = scorer
24
25
  end
@@ -29,6 +30,7 @@ module Phronomy
29
30
  # @param callable_a [#call]
30
31
  # @param callable_b [#call]
31
32
  # @return [Array<ComparisonPair>]
33
+ # @api public
32
34
  def compare(dataset, callable_a, callable_b)
33
35
  runner_a = Runner.new(scorer: @scorer)
34
36
  runner_b = Runner.new(scorer: @scorer)
@@ -13,6 +13,7 @@ module Phronomy
13
13
  include Enumerable
14
14
 
15
15
  # @param cases [Array<EvalCase>]
16
+ # @api public
16
17
  def initialize(cases = [])
17
18
  @cases = cases.freeze
18
19
  end
@@ -23,16 +24,19 @@ module Phronomy
23
24
  #
24
25
  # @param pairs [Array<Hash>]
25
26
  # @return [Dataset]
27
+ # @api public
26
28
  def self.from_array(pairs)
27
29
  new(pairs.map { |h| EvalCase.new(**h) })
28
30
  end
29
31
 
30
32
  # @yield [EvalCase]
33
+ # @api public
31
34
  def each(&block)
32
35
  @cases.each(&block)
33
36
  end
34
37
 
35
38
  # @return [Integer]
39
+ # @api public
36
40
  def size
37
41
  @cases.size
38
42
  end
@@ -11,12 +11,14 @@ module Phronomy
11
11
  # puts metrics.to_h
12
12
  class Metrics
13
13
  # @param results [Array<EvalResult>]
14
+ # @api public
14
15
  def initialize(results)
15
16
  @results = results
16
17
  end
17
18
 
18
19
  # Fraction of results that passed (score == 1.0).
19
20
  # @return [Float] in [0.0, 1.0]
21
+ # @api public
20
22
  def pass_rate
21
23
  return 0.0 if @results.empty?
22
24
  @results.count(&:pass?).to_f / @results.size
@@ -24,6 +26,7 @@ module Phronomy
24
26
 
25
27
  # Arithmetic mean of all scores.
26
28
  # @return [Float]
29
+ # @api public
27
30
  def average_score
28
31
  return 0.0 if @results.empty?
29
32
  @results.sum(&:score) / @results.size
@@ -32,12 +35,14 @@ module Phronomy
32
35
  # Sum of all TokenUsage objects present in the results.
33
36
  # Results without usage are skipped.
34
37
  # @return [Phronomy::TokenUsage]
38
+ # @api public
35
39
  def total_usage
36
40
  @results.map(&:usage).compact.reduce(TokenUsage.zero, :+)
37
41
  end
38
42
 
39
43
  # Arithmetic mean of latency_ms across all results.
40
44
  # @return [Float]
45
+ # @api public
41
46
  def average_latency_ms
42
47
  return 0.0 if @results.empty?
43
48
  @results.sum(&:latency_ms).to_f / @results.size
@@ -45,6 +50,7 @@ module Phronomy
45
50
 
46
51
  # Returns a plain Hash summary suitable for logging or serialisation.
47
52
  # @return [Hash]
53
+ # @api public
48
54
  def to_h
49
55
  {
50
56
  total: @results.size,
@@ -18,6 +18,7 @@ module Phronomy
18
18
  # results = runner.run(dataset, ->(input) { agent.invoke(input) })
19
19
  class Runner
20
20
  # @param scorer [Scorer::Base] scorer used to evaluate each result
21
+ # @api public
21
22
  def initialize(scorer: Scorer::ExactMatch.new)
22
23
  @scorer = scorer
23
24
  end
@@ -26,6 +27,7 @@ module Phronomy
26
27
  # @param callable [#call] accepts a single String argument
27
28
  # @param concurrency [Integer] number of parallel threads (default: 1, sequential)
28
29
  # @return [Array<EvalResult>]
30
+ # @api public
29
31
  def run(dataset, callable, concurrency: 1)
30
32
  cases = dataset.to_a
31
33
  return cases.map { |eval_case| run_one(eval_case, callable) } if concurrency <= 1
@@ -12,6 +12,7 @@ module Phronomy
12
12
  # @param expected [String] the ground-truth value from the EvalCase
13
13
  # @param input [String, nil] the original input (used by LLM scorers)
14
14
  # @return [Float] a value in [0.0, 1.0]
15
+ # @api public
15
16
  def score(actual:, expected:, input: nil)
16
17
  raise NotImplementedError, "#{self.class}#score is not implemented"
17
18
  end
@@ -12,11 +12,13 @@ module Phronomy
12
12
  # ExactMatch.new.score(actual: "paris", expected: "Paris") # => 0.0
13
13
  class ExactMatch < Base
14
14
  # @param case_sensitive [Boolean] default true
15
+ # @api public
15
16
  def initialize(case_sensitive: true)
16
17
  @case_sensitive = case_sensitive
17
18
  end
18
19
 
19
20
  # @return [Float] 1.0 on match, 0.0 otherwise
21
+ # @api public
20
22
  def score(actual:, expected:, input: nil)
21
23
  a = actual.to_s.strip
22
24
  e = expected.to_s.strip
@@ -13,11 +13,13 @@ module Phronomy
13
13
  # IncludesScorer.new.score(actual: "The answer is 42.", expected: "42") # => 1.0
14
14
  class IncludesScorer < Base
15
15
  # @param case_sensitive [Boolean] default false
16
+ # @api public
16
17
  def initialize(case_sensitive: false)
17
18
  @case_sensitive = case_sensitive
18
19
  end
19
20
 
20
21
  # @return [Float] 1.0 if actual contains expected, 0.0 otherwise
22
+ # @api public
21
23
  def score(actual:, expected:, input: nil)
22
24
  a = actual.to_s
23
25
  e = expected.to_s
@@ -36,6 +36,7 @@ module Phronomy
36
36
  # @param prompt_template [String] format string with %<input>s, %<expected>s, %<actual>s
37
37
  # @param raise_on_error [Boolean] when true, re-raises scoring exceptions instead of
38
38
  # returning 0.0. Use this in batch eval pipelines where silent failures are unacceptable.
39
+ # @api public
39
40
  def initialize(model:, prompt_template: DEFAULT_PROMPT, raise_on_error: false)
40
41
  @model = model
41
42
  @prompt_template = prompt_template
@@ -43,6 +44,7 @@ module Phronomy
43
44
  end
44
45
 
45
46
  # @return [Float] score in [0.0, 1.0]; 0.0 on error when raise_on_error is false
47
+ # @api public
46
48
  def score(actual:, expected:, input: nil)
47
49
  prompt = format(@prompt_template, input: input.to_s, expected: expected.to_s, actual: actual.to_s)
48
50
  response = RubyLLM.chat(model: @model).ask(prompt)