phronomy 0.5.4 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (111) hide show
  1. checksums.yaml +4 -4
  2. data/.mutant.yml +21 -0
  3. data/CHANGELOG.md +379 -0
  4. data/CONTRIBUTING.md +102 -0
  5. data/README.md +262 -48
  6. data/RELEASE_CHECKLIST.md +86 -0
  7. data/SECURITY.md +80 -0
  8. data/benchmark/baseline.json +9 -0
  9. data/benchmark/bench_agent_invoke.rb +105 -0
  10. data/benchmark/bench_context_assembler.rb +46 -0
  11. data/benchmark/bench_regression.rb +171 -0
  12. data/benchmark/bench_token_estimator.rb +44 -0
  13. data/benchmark/bench_tool_schema.rb +69 -0
  14. data/benchmark/bench_vector_store.rb +39 -0
  15. data/benchmark/bench_workflow.rb +55 -0
  16. data/benchmark/run_all.rb +118 -0
  17. data/docs/decisions/001-rubyllm-as-provider-layer.md +42 -0
  18. data/docs/decisions/002-workflow-context-immutability.md +42 -0
  19. data/docs/decisions/003-event-loop-singleton.md +48 -0
  20. data/docs/decisions/004-invoke-timeout-is-not-cancellation.md +51 -0
  21. data/docs/decisions/005-static-knowledge-class-level-cache.md +45 -0
  22. data/docs/decisions/006-no-built-in-guardrails.md +48 -0
  23. data/docs/decisions/007-mcp-is-beta-stability.md +51 -0
  24. data/docs/decisions/008-orchestrator-uses-os-threads.md +52 -0
  25. data/docs/decisions/009-state-store-abstraction.md +141 -0
  26. data/lib/phronomy/agent/base.rb +281 -13
  27. data/lib/phronomy/agent/before_completion_context.rb +1 -0
  28. data/lib/phronomy/agent/checkpoint.rb +1 -0
  29. data/lib/phronomy/agent/concerns/before_completion.rb +6 -0
  30. data/lib/phronomy/agent/concerns/error_translation.rb +45 -0
  31. data/lib/phronomy/agent/concerns/guardrailable.rb +3 -0
  32. data/lib/phronomy/agent/concerns/retryable.rb +12 -1
  33. data/lib/phronomy/agent/concerns/suspendable.rb +4 -0
  34. data/lib/phronomy/agent/fsm.rb +180 -0
  35. data/lib/phronomy/agent/handoff.rb +3 -0
  36. data/lib/phronomy/agent/orchestrator.rb +123 -11
  37. data/lib/phronomy/agent/parallel_tool_chat.rb +92 -0
  38. data/lib/phronomy/agent/react_agent.rb +8 -6
  39. data/lib/phronomy/agent/runner.rb +2 -0
  40. data/lib/phronomy/agent/shared_state.rb +11 -0
  41. data/lib/phronomy/agent/suspend_signal.rb +2 -0
  42. data/lib/phronomy/agent/team_coordinator.rb +17 -5
  43. data/lib/phronomy/cancellation_token.rb +92 -0
  44. data/lib/phronomy/configuration.rb +32 -2
  45. data/lib/phronomy/context/assembler.rb +6 -0
  46. data/lib/phronomy/context/compaction_context.rb +2 -0
  47. data/lib/phronomy/context/context_version_cache.rb +2 -0
  48. data/lib/phronomy/context/token_budget.rb +3 -0
  49. data/lib/phronomy/context/token_estimator.rb +9 -2
  50. data/lib/phronomy/context/trigger_context.rb +1 -0
  51. data/lib/phronomy/context/trim_context.rb +4 -0
  52. data/lib/phronomy/context.rb +0 -1
  53. data/lib/phronomy/embeddings/base.rb +5 -2
  54. data/lib/phronomy/embeddings/ruby_llm_embeddings.rb +6 -2
  55. data/lib/phronomy/eval/comparison.rb +2 -0
  56. data/lib/phronomy/eval/dataset.rb +4 -0
  57. data/lib/phronomy/eval/metrics.rb +6 -0
  58. data/lib/phronomy/eval/runner.rb +2 -0
  59. data/lib/phronomy/eval/scorer/base.rb +1 -0
  60. data/lib/phronomy/eval/scorer/exact_match.rb +2 -0
  61. data/lib/phronomy/eval/scorer/includes_scorer.rb +2 -0
  62. data/lib/phronomy/eval/scorer/llm_judge.rb +2 -0
  63. data/lib/phronomy/event.rb +14 -0
  64. data/lib/phronomy/event_loop.rb +254 -0
  65. data/lib/phronomy/fsm_session.rb +201 -0
  66. data/lib/phronomy/generator_verifier.rb +24 -22
  67. data/lib/phronomy/guardrail/base.rb +3 -0
  68. data/lib/phronomy/guardrail.rb +0 -1
  69. data/lib/phronomy/knowledge_source/base.rb +6 -2
  70. data/lib/phronomy/knowledge_source/entity_knowledge.rb +7 -2
  71. data/lib/phronomy/knowledge_source/rag_knowledge.rb +8 -4
  72. data/lib/phronomy/knowledge_source/static_knowledge.rb +7 -2
  73. data/lib/phronomy/loader/base.rb +1 -0
  74. data/lib/phronomy/loader/csv_loader.rb +2 -0
  75. data/lib/phronomy/loader/markdown_loader.rb +2 -0
  76. data/lib/phronomy/loader/plain_text_loader.rb +1 -0
  77. data/lib/phronomy/output_parser/base.rb +1 -0
  78. data/lib/phronomy/output_parser/json_parser.rb +22 -3
  79. data/lib/phronomy/output_parser/structured_parser.rb +2 -0
  80. data/lib/phronomy/prompt_template.rb +5 -0
  81. data/lib/phronomy/runnable.rb +20 -3
  82. data/lib/phronomy/splitter/base.rb +2 -0
  83. data/lib/phronomy/splitter/fixed_size_splitter.rb +2 -0
  84. data/lib/phronomy/splitter/recursive_splitter.rb +2 -0
  85. data/lib/phronomy/state_store/base.rb +48 -0
  86. data/lib/phronomy/state_store/in_memory.rb +62 -0
  87. data/lib/phronomy/tool/agent_tool.rb +1 -0
  88. data/lib/phronomy/tool/base.rb +189 -27
  89. data/lib/phronomy/tool/mcp_tool.rb +68 -13
  90. data/lib/phronomy/tracing/base.rb +3 -0
  91. data/lib/phronomy/tracing/langfuse_tracer.rb +2 -0
  92. data/lib/phronomy/tracing/open_telemetry_tracer.rb +2 -0
  93. data/lib/phronomy/vector_store/base.rb +33 -7
  94. data/lib/phronomy/vector_store/in_memory.rb +16 -7
  95. data/lib/phronomy/vector_store/pgvector.rb +40 -9
  96. data/lib/phronomy/vector_store/redis_search.rb +29 -8
  97. data/lib/phronomy/version.rb +1 -1
  98. data/lib/phronomy/workflow.rb +175 -74
  99. data/lib/phronomy/workflow_context.rb +55 -5
  100. data/lib/phronomy/workflow_runner.rb +197 -114
  101. data/lib/phronomy.rb +74 -1
  102. data/scripts/api_snapshot.rb +91 -0
  103. data/scripts/check_api_annotations.rb +68 -0
  104. data/scripts/check_private_enforcement.rb +93 -0
  105. data/scripts/check_readme_runnable.rb +98 -0
  106. data/scripts/run_mutation.sh +46 -0
  107. metadata +50 -6
  108. data/lib/phronomy/context/builder.rb +0 -92
  109. data/lib/phronomy/guardrail/builtin/pii_pattern_detector.rb +0 -100
  110. data/lib/phronomy/guardrail/builtin/prompt_injection_detector.rb +0 -67
  111. data/lib/phronomy/guardrail/builtin.rb +0 -16
@@ -0,0 +1,105 @@
1
+ # frozen_string_literal: true
2
+
3
+ # bench_agent_invoke.rb — Agent#invoke framework overhead benchmark.
4
+ #
5
+ # Measures the per-invoke cost of the Phronomy::Agent::Base framework path
6
+ # (context assembly, guardrail checks, before_completion hooks, response
7
+ # handling) with a fully stubbed LLM. No network calls are made.
8
+ #
9
+ # Scenarios:
10
+ # 1. Minimal agent (no tools, no knowledge) — baseline framework overhead.
11
+ # 2. Tool-aware agent with max_parallel_tools=4 (one stub tool registered; the stubbed reply carries no tool calls).
12
+ # 3. Agent#stream setup latency (first-chunk time with stubbed stream) — planned; not measured by this script yet.
13
+
14
+ require "benchmark"
15
+ require_relative "../lib/phronomy"
16
+
17
+ # ---------------------------------------------------------------------------
18
+ # Shared stubs
19
+ # ---------------------------------------------------------------------------
20
+
21
+ BenchAgentMessage = Struct.new(:role, :content, :tool_calls, :tokens) do
22
+ def self.assistant(content = "done")
23
+ new(:assistant, content, nil,
24
+ Struct.new(:input, :output, :cached, :cache_creation).new(5, 5, 0, 0))
25
+ end
26
+ end
27
+
28
+ # A minimal stub Chat that returns a pre-built response immediately.
29
+ class BenchStubChat
30
+ attr_reader :messages
31
+
32
+ def initialize(response)
33
+ @response = response
34
+ @messages = []
35
+ end
36
+
37
+ def with_instructions(_) = self
38
+ def with_tool(_) = self
39
+ def with_temperature(_) = self
40
+ def with_cache_instructions(_) = self
41
+ def with_output_schema(_) = self
42
+ def last_message = @response
43
+
44
+ def ask(_)
45
+ @messages << @response
46
+ @response
47
+ end
48
+
49
+ def stream(*)
50
+ yield @response.content if block_given?
51
+ @response
52
+ end
53
+ end
54
+
55
+ # A stub tool that does nothing but conforms to the Tool::Base interface.
56
+ class BenchNullTool < Phronomy::Tool::Base
57
+ description "No-op benchmark tool"
58
+ param :x, type: :string, desc: "input"
59
+
60
+ def execute(x:)
61
+ "result:#{x}"
62
+ end
63
+ end
64
+
65
+ # ---------------------------------------------------------------------------
66
+ # Agent classes
67
+ # ---------------------------------------------------------------------------
68
+
69
+ BENCH_RESP = BenchAgentMessage.assistant("benchmark complete")
70
+ BENCH_RESP_CHAT = BenchStubChat.new(BENCH_RESP)
71
+
72
+ bench_minimal_class = Class.new(Phronomy::Agent::Base) do
73
+ model "stub-model"
74
+
75
+ define_method(:build_chat) { |*| BenchStubChat.new(BENCH_RESP) }
76
+ end
77
+
78
+ bench_tool_class = Class.new(Phronomy::Agent::Base) do
79
+ model "stub-model"
80
+ tools BenchNullTool
81
+ max_parallel_tools 4
82
+
83
+ define_method(:build_chat) { |*| BenchStubChat.new(BENCH_RESP) }
84
+ end
85
+
86
+ BENCH_AGENT_MINIMAL = bench_minimal_class.new
87
+ BENCH_AGENT_TOOLS = bench_tool_class.new
88
+
89
+ AGENT_INVOKE_ITERATIONS = 200
90
+
91
+ puts "=== bench_agent_invoke ==="
92
+ Benchmark.bm(50) do |x|
93
+ x.report("Agent#invoke — minimal (no tools), #{AGENT_INVOKE_ITERATIONS} iters") do
94
+ AGENT_INVOKE_ITERATIONS.times do
95
+ BENCH_AGENT_MINIMAL.invoke("ping", thread_id: "bench-#{rand(1_000_000)}")
96
+ end
97
+ end
98
+
99
+ x.report("Agent#invoke — 4 parallel stub tools, #{AGENT_INVOKE_ITERATIONS} iters") do
100
+ AGENT_INVOKE_ITERATIONS.times do
101
+ BENCH_AGENT_TOOLS.invoke("ping", thread_id: "bench-#{rand(1_000_000)}")
102
+ end
103
+ end
104
+ end
105
+ puts
@@ -0,0 +1,46 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Benchmark: Context::Assembler#build
4
+ #
5
+ # Tests context assembly performance for varying numbers of messages and
6
+ # knowledge chunks. This path is exercised on every agent turn.
7
+
8
+ require "benchmark"
9
+ require_relative "../lib/phronomy"
10
+
11
+ BenchAsmMessage = Struct.new(:content)
12
+
13
+ def make_assembler(n_messages:, n_chunks:, with_budget: false)
14
+ budget = if with_budget
15
+ Phronomy::Context::TokenBudget.new(context_window: 4096, max_output_tokens: 512)
16
+ end
17
+ asm = Phronomy::Context::Assembler.new(budget: budget)
18
+ asm.add_instruction("You are a helpful assistant. Answer the user's question.")
19
+ n_chunks.times do |i|
20
+ asm.add_knowledge("Fact #{i}: The capital of country #{i} is City #{i}.", type: :entity, trusted: true)
21
+ end
22
+ msgs = Array.new(n_messages) { BenchAsmMessage.new("This is a conversation message.") }
23
+ asm.add_messages(msgs)
24
+ asm
25
+ end
26
+
27
+ BENCH_ASM_ITERATIONS = 1_000
28
+
29
+ puts "=== bench_context_assembler ==="
30
+ Benchmark.bm(40) do |x|
31
+ x.report("build(10 msgs, 0 chunks)") do
32
+ BENCH_ASM_ITERATIONS.times { make_assembler(n_messages: 10, n_chunks: 0).build }
33
+ end
34
+
35
+ x.report("build(100 msgs, 5 chunks)") do
36
+ BENCH_ASM_ITERATIONS.times { make_assembler(n_messages: 100, n_chunks: 5).build }
37
+ end
38
+
39
+ x.report("build(1000 msgs, 10 chunks, no budget)") do
40
+ (BENCH_ASM_ITERATIONS / 10).times { make_assembler(n_messages: 1000, n_chunks: 10).build }
41
+ end
42
+
43
+ x.report("build(1000 msgs, 10 chunks, budgeted)") do
44
+ (BENCH_ASM_ITERATIONS / 10).times { make_assembler(n_messages: 1000, n_chunks: 10, with_budget: true).build }
45
+ end
46
+ end
@@ -0,0 +1,171 @@
1
+ # frozen_string_literal: true
2
+
3
+ # bench_regression.rb — Targeted regression benchmarks.
4
+ #
5
+ # Measures the five minimum regression targets defined in Issue #232 (targets 6 and 7 further down are extras not listed here):
6
+ # 1. WorkflowContext#merge throughput
7
+ # 2. Workflow.define (graph build) time
8
+ # 3. Tool::Base#params_schema generation (10 params)
9
+ # 4. Orchestrator#dispatch_parallel overhead (10 stub agents, no LLM)
10
+ # 5. CancellationToken#cancelled? throughput (shared token, 8 threads)
11
+ #
12
+ # Results are stored in a global REGRESSION_RESULTS hash (keyed by metric name,
13
+ # value = iterations per second) for use by run_all.rb baseline comparison.
14
+
15
+ require "benchmark"
16
+ require_relative "../lib/phronomy"
17
+
18
+ REGRESSION_ITERATIONS = 5_000
19
+
20
+ # ---------------------------------------------------------------------------
21
+ # Target 1: WorkflowContext#merge throughput
22
+ # ---------------------------------------------------------------------------
23
+ context_class = Class.new do
24
+ include Phronomy::WorkflowContext
25
+
26
+ field :value, type: :replace, default: -> { 0 }
27
+ field :log, type: :append, default: -> { [] }
28
+ end
29
+
30
+ sample_ctx = context_class.new(value: 42, log: ["a"])
31
+
32
+ t1 = Benchmark.measure("WorkflowContext#merge") do
33
+ REGRESSION_ITERATIONS.times { sample_ctx.merge(value: 99, log: "b") }
34
+ end
35
+
36
+ # ---------------------------------------------------------------------------
37
+ # Target 2: Workflow.define graph build time
38
+ # ---------------------------------------------------------------------------
39
+ BUILD_ITERATIONS = 1_000
40
+
41
+ t2 = Benchmark.measure("Workflow.define (5 states)") do
42
+ BUILD_ITERATIONS.times do
43
+ build_ctx = Class.new do
44
+ include Phronomy::WorkflowContext
45
+
46
+ field :x, type: :replace, default: -> { 0 }
47
+ end
48
+ Phronomy::Workflow.define(build_ctx) do
49
+ initial :a
50
+ %i[a b c d].each_with_index do |state, i|
51
+ next_state = %i[a b c d e][i + 1]
52
+ action = ->(s) { s.merge(x: s.x + 1) }
53
+ self.state state, action: action
54
+ transition from: state, to: next_state
55
+ end
56
+ self.state :e, action: ->(s) { s }
57
+ transition from: :e, to: :__finish__
58
+ end
59
+ end
60
+ end
61
+
62
+ # ---------------------------------------------------------------------------
63
+ # Target 3: Tool::Base#params_schema generation (10 params)
64
+ # ---------------------------------------------------------------------------
65
+ tool_class = Class.new(Phronomy::Tool::Base) do
66
+ description "Test tool with 10 params"
67
+ param :p1, type: :string, desc: "param 1"
68
+ param :p2, type: :string, desc: "param 2"
69
+ param :p3, type: :string, desc: "param 3"
70
+ param :p4, type: :string, desc: "param 4"
71
+ param :p5, type: :string, desc: "param 5"
72
+ param :p6, type: :string, desc: "param 6"
73
+ param :p7, type: :string, desc: "param 7"
74
+ param :p8, type: :string, desc: "param 8"
75
+ param :p9, type: :string, desc: "param 9"
76
+ param :p10, type: :string, desc: "param 10"
77
+
78
+ def execute(**_kwargs)
79
+ "ok"
80
+ end
81
+ end
82
+
83
+ t3 = Benchmark.measure("Tool::Base#params_schema_definition (10 params)") do
84
+ REGRESSION_ITERATIONS.times { tool_class.params_schema_definition }
85
+ end
86
+
87
+ # ---------------------------------------------------------------------------
88
+ # Target 4: Orchestrator#dispatch_parallel overhead (10 stub agents, no LLM)
89
+ # ---------------------------------------------------------------------------
90
+ stub_agent_class = Class.new(Phronomy::Agent::Base) do
91
+ define_method(:invoke) do |_input, messages: [], thread_id: nil, config: {}|
92
+ {output: "stub", messages: []}
93
+ end
94
+ end
95
+
96
+ orchestrator_class = Class.new(Phronomy::Agent::Orchestrator)
97
+ orchestrator = orchestrator_class.new
98
+
99
+ PARALLEL_ITERATIONS = 200
100
+
101
+ t4 = Benchmark.measure("Orchestrator#dispatch_parallel (10 agents)") do
102
+ PARALLEL_ITERATIONS.times do
103
+ tasks = Array.new(10) { {agent: stub_agent_class, input: "x"} }
104
+ orchestrator.dispatch_parallel(*tasks)
105
+ end
106
+ end
107
+
108
+ # ---------------------------------------------------------------------------
109
+ # Target 5: CancellationToken#cancelled? throughput (8 threads)
110
+ # ---------------------------------------------------------------------------
111
+ CANCEL_TOKEN = Phronomy::CancellationToken.new
112
+ CANCEL_ITERATIONS = 10_000
113
+
114
+ t5 = Benchmark.measure("CancellationToken#cancelled? (8 threads)") do
115
+ threads = 8.times.map do
116
+ Thread.new { CANCEL_ITERATIONS.times { CANCEL_TOKEN.cancelled? } }
117
+ end
118
+ threads.each(&:join)
119
+ end
120
+
121
+ # ---------------------------------------------------------------------------
122
+ # Target 6: CancellationToken#raise_if_cancelled! hot path (no-op, single thread)
123
+ # ---------------------------------------------------------------------------
124
+ RAISE_TOKEN = Phronomy::CancellationToken.new # not cancelled — no-op path
125
+ RAISE_ITERATIONS = 200_000
126
+
127
+ t6 = Benchmark.measure("CancellationToken#raise_if_cancelled! (no-op)") do
128
+ RAISE_ITERATIONS.times { RAISE_TOKEN.raise_if_cancelled! }
129
+ end
130
+
131
+ # ---------------------------------------------------------------------------
132
+ # Target 7: Context::TrimContext#remove on a 2000-element history
133
+ # ---------------------------------------------------------------------------
134
+ BenchMsg = Struct.new(:content) unless defined?(BenchMsg)
135
+
136
+ TRIM_ELEMENTS = Array.new(2_000) { |i| {seq: i, message: BenchMsg.new("msg #{i}"), tokens: 10, role: :user} }
137
+ TRIM_BUDGET = Phronomy::Context::TokenBudget.new(context_window: 4096, max_output_tokens: 512)
138
+ TRIM_ITERATIONS = 500
139
+
140
+ t7 = Benchmark.measure("TrimContext#remove (2000-element history)") do
141
+ TRIM_ITERATIONS.times do
142
+ tc = Phronomy::Context::TrimContext.new(message_elements: TRIM_ELEMENTS, budget: TRIM_BUDGET)
143
+ tc.remove((0...200).to_a) # remove 200 oldest messages
144
+ end
145
+ end
146
+
147
+ # ---------------------------------------------------------------------------
148
+ # Print results and store in REGRESSION_RESULTS
149
+ # ---------------------------------------------------------------------------
150
+ puts "=== bench_regression ==="
151
+ printf("%-46s %8s %12s\n", "Metric", "Real (s)", "Iter/s")
152
+ puts "-" * 70
153
+
154
+ metrics = {
155
+ "workflow_context_merge" => [t1, REGRESSION_ITERATIONS],
156
+ "workflow_define" => [t2, BUILD_ITERATIONS],
157
+ "tool_params_schema_definition" => [t3, REGRESSION_ITERATIONS],
158
+ "dispatch_parallel_10" => [t4, PARALLEL_ITERATIONS],
159
+ "cancellation_token_cancelled" => [t5, 8 * CANCEL_ITERATIONS],
160
+ "cancellation_token_raise_if_cancelled_noop" => [t6, RAISE_ITERATIONS],
161
+ "trim_context_remove_2000" => [t7, TRIM_ITERATIONS]
162
+ }
163
+
164
+ REGRESSION_RESULTS = {} # rubocop:disable Style/MutableConstant
165
+
166
+ metrics.each do |key, (measure, iters)|
167
+ ips = iters / measure.real
168
+ REGRESSION_RESULTS[key] = ips
169
+ printf("%-46s %8.3f %12.0f\n", key, measure.real, ips)
170
+ end
171
+ puts
@@ -0,0 +1,44 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Benchmark: Context::TokenEstimator.estimate
4
+ #
5
+ # Tests estimation speed for short, medium, and long text inputs, and for
6
+ # Arrays of message-like objects. This method is called on every message in
7
+ # every agent turn, so it must be consistently fast.
8
+
9
+ require "benchmark"
10
+ require_relative "../lib/phronomy"
11
+
12
+ SHORT_TEXT = "Hello, how are you today?"
13
+ MEDIUM_TEXT = "A" * 500
14
+ LONG_TEXT = "A" * 10_000
15
+
16
+ BenchMessage = Struct.new(:content)
17
+
18
+ MESSAGES_100 = Array.new(100) { BenchMessage.new("A" * 100) }
19
+ MESSAGES_1000 = Array.new(1000) { BenchMessage.new("A" * 100) }
20
+
21
+ BENCH_TOKEN_ITERATIONS = 10_000
22
+
23
+ puts "=== bench_token_estimator ==="
24
+ Benchmark.bm(30) do |x|
25
+ x.report("estimate(short text)") do
26
+ BENCH_TOKEN_ITERATIONS.times { Phronomy::Context::TokenEstimator.estimate(SHORT_TEXT) }
27
+ end
28
+
29
+ x.report("estimate(medium text 500c)") do
30
+ BENCH_TOKEN_ITERATIONS.times { Phronomy::Context::TokenEstimator.estimate(MEDIUM_TEXT) }
31
+ end
32
+
33
+ x.report("estimate(long text 10k c)") do
34
+ BENCH_TOKEN_ITERATIONS.times { Phronomy::Context::TokenEstimator.estimate(LONG_TEXT) }
35
+ end
36
+
37
+ x.report("estimate(100 messages)") do
38
+ BENCH_TOKEN_ITERATIONS.times { Phronomy::Context::TokenEstimator.estimate(MESSAGES_100) }
39
+ end
40
+
41
+ x.report("estimate(1000 messages)") do
42
+ (BENCH_TOKEN_ITERATIONS / 10).times { Phronomy::Context::TokenEstimator.estimate(MESSAGES_1000) }
43
+ end
44
+ end
@@ -0,0 +1,69 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Benchmark: Tool::Base params_schema generation and static_knowledge_chunks cache
4
+ #
5
+ # Tool schema generation happens once per tool class (lazily memoised).
6
+ # static_knowledge_chunks is cached at the class level; cache-hit overhead
7
+ # should be negligible compared to cache-miss (which calls the knowledge source).
8
+
9
+ require "benchmark"
10
+ require_relative "../lib/phronomy"
11
+
12
+ # --- Tool schema ---
13
+
14
+ class BenchTool10Params < Phronomy::Tool::Base
15
+ description "A tool with 10 parameters for benchmarking purposes"
16
+ param :param1, type: :string, desc: "First parameter"
17
+ param :param2, type: :integer, desc: "Second parameter"
18
+ param :param3, type: :number, desc: "Third parameter"
19
+ param :param4, type: :boolean, desc: "Fourth parameter"
20
+ param :param5, type: :string, desc: "Fifth parameter"
21
+ param :param6, type: :string, desc: "Sixth parameter", required: false
22
+ param :param7, type: :integer, desc: "Seventh parameter", required: false
23
+ param :param8, type: :string, desc: "Eighth parameter", required: false
24
+ param :param9, type: :string, desc: "Ninth parameter", required: false
25
+ param :param10, type: :string, desc: "Tenth parameter", required: false
26
+
27
+ def execute(**_)
28
+ "ok"
29
+ end
30
+ end
31
+
32
+ # Warm up memoisation
33
+ BenchTool10Params.params_schema_definition
34
+
35
+ BENCH_TOOL_ITERATIONS = 50_000
36
+
37
+ puts "=== bench_tool_schema ==="
38
+ Benchmark.bm(35) do |x|
39
+ x.report("params_schema_definition (memoised, 10p)") do
40
+ BENCH_TOOL_ITERATIONS.times { BenchTool10Params.params_schema_definition }
41
+ end
42
+ end
43
+
44
+ # --- static_knowledge_chunks cache ---
45
+
46
+ class BenchKnowledgeSource < Phronomy::KnowledgeSource::Base
47
+ def fetch(query: nil)
48
+ [{content: "Cached knowledge fact.", type: :static}]
49
+ end
50
+
51
+ def static?
52
+ true
53
+ end
54
+ end
55
+
56
+ class BenchAgentWithKnowledge < Phronomy::Agent::Base
57
+ model "gpt-4o-mini"
58
+ static_knowledge BenchKnowledgeSource.new
59
+ end
60
+
61
+ # Warm up cache
62
+ BenchAgentWithKnowledge.static_knowledge_chunks
63
+
64
+ puts "\n=== bench_static_knowledge_cache ==="
65
+ Benchmark.bm(35) do |x|
66
+ x.report("static_knowledge_chunks (hit)") do
67
+ BENCH_TOOL_ITERATIONS.times { BenchAgentWithKnowledge.static_knowledge_chunks }
68
+ end
69
+ end
@@ -0,0 +1,39 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Benchmark: VectorStore::InMemory#search
4
+ #
5
+ # Tests search performance at different corpus sizes (100, 1000, 10_000 docs).
6
+ # Linear scan is expected; this benchmark establishes the scaling baseline.
7
+
8
+ require "benchmark"
9
+ require_relative "../lib/phronomy"
10
+
11
+ DIM = 64
12
+
13
+ def random_embedding(dim)
14
+ Array.new(dim) { rand(-1.0..1.0) }
15
+ end
16
+
17
+ def populate(store, n)
18
+ n.times do |i|
19
+ store.add(id: "doc#{i}", embedding: random_embedding(DIM), metadata: {text: "Document #{i}"})
20
+ end
21
+ end
22
+
23
+ QUERY = random_embedding(DIM)
24
+
25
+ # Use fewer iterations for larger corpora to keep total run time reasonable.
26
+ BENCH_VS_ITERS = {100 => 100, 1_000 => 20, 10_000 => 5}.freeze
27
+
28
+ puts "=== bench_vector_store_inmemory ==="
29
+ Benchmark.bm(35) do |x|
30
+ [100, 1_000, 10_000].each do |n|
31
+ store = Phronomy::VectorStore::InMemory.new(dimension: DIM)
32
+ populate(store, n)
33
+ iters = BENCH_VS_ITERS[n]
34
+
35
+ x.report("search(k=5, corpus=#{n}, iters=#{iters})") do
36
+ iters.times { store.search(query_embedding: QUERY, k: 5) }
37
+ end
38
+ end
39
+ end
@@ -0,0 +1,55 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Benchmark: Workflow transition loop
4
+ #
5
+ # Builds a linear chain of N states and measures how long it takes to run
6
+ # the full workflow to completion. 100 transitions should average <10ms (a warning is printed otherwise).
7
+
8
+ require "benchmark"
9
+ require_relative "../lib/phronomy"
10
+
11
+ # Build a linear workflow: state_0 -> state_1 -> ... -> state_(N-1) -> __finish__
12
+ def build_linear_workflow(n)
13
+ context_class = Class.new do
14
+ include Phronomy::WorkflowContext
15
+
16
+ field :count, type: :replace, default: -> { 0 }
17
+ end
18
+
19
+ Phronomy::Workflow.define(context_class) do
20
+ initial :state_0
21
+
22
+ n.times do |i|
23
+ state :"state_#{i}", action: ->(s) { s.merge(count: s.count + 1) }
24
+ transition from: :"state_#{i}", to: (i + 1 < n) ? :"state_#{i + 1}" : :__finish__
25
+ end
26
+ end
27
+ end
28
+
29
+ BENCH_WF_ITERATIONS = 50
30
+
31
+ puts "=== bench_workflow_transition ==="
32
+ Benchmark.bm(30) do |x|
33
+ [10, 50, 100].each do |n|
34
+ app = build_linear_workflow(n)
35
+ cfg = {recursion_limit: n + 5}
36
+
37
+ x.report("#{n} transitions") do
38
+ BENCH_WF_ITERATIONS.times { app.invoke({}, config: cfg) }
39
+ end
40
+ end
41
+ end
42
+
43
+ # Threshold assertion: 100 transitions should complete in <10ms on average
44
+ puts "\nThreshold check: 100 transitions < 10ms average..."
45
+ app100 = build_linear_workflow(100)
46
+ cfg100 = {recursion_limit: 110}
47
+ samples = 20
48
+ elapsed = Benchmark.realtime { samples.times { app100.invoke({}, config: cfg100) } }
49
+ avg_ms = (elapsed / samples) * 1000.0
50
+ puts " Average: #{"%.2f" % avg_ms}ms per run"
51
+ if avg_ms < 10.0
52
+ puts " PASS (< 10ms)"
53
+ else
54
+ warn " WARN: #{avg_ms.round(2)}ms exceeds 10ms threshold (environment may be slow)"
55
+ end
@@ -0,0 +1,118 @@
1
+ # frozen_string_literal: true
2
+
3
+ # run_all.rb — Runs all Phronomy benchmarks in sequence.
4
+ #
5
+ # Usage:
6
+ # ruby benchmark/run_all.rb
7
+ #
8
+ # In CI this script must complete within the BENCHMARK_MAX_SECONDS limit (default 60 s; smoke check only).
9
+ #
10
+ # Baseline management (nightly regression tracking):
11
+ # BENCHMARK_WRITE_BASELINE=path/to/baseline.json — write current throughput
12
+ # results from bench_regression.rb to a JSON baseline file.
13
+ # BENCHMARK_BASELINE=path/to/baseline.json — compare current results against
14
+ # the stored baseline; exit 1 if any metric regresses beyond the threshold.
15
+ # BENCHMARK_REGRESSION_THRESHOLD — percentage allowed before failing (default 20).
16
+
17
+ require "benchmark"
18
+ require "json"
19
+
20
+ BENCH_DIR = __dir__
21
+ SCRIPTS = %w[
22
+ bench_token_estimator.rb
23
+ bench_context_assembler.rb
24
+ bench_vector_store.rb
25
+ bench_workflow.rb
26
+ bench_tool_schema.rb
27
+ bench_agent_invoke.rb
28
+ bench_regression.rb
29
+ ].freeze
30
+
31
+ puts "Phronomy benchmark suite"
32
+ puts "Ruby #{RUBY_VERSION} on #{RUBY_PLATFORM}"
33
+ puts "=" * 60
34
+
35
+ overall_start = Process.clock_gettime(Process::CLOCK_MONOTONIC)
36
+
37
+ SCRIPTS.each do |script|
38
+ path = File.join(BENCH_DIR, script)
39
+ puts
40
+ load path
41
+ end
42
+
43
+ overall_elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - overall_start
44
+ puts
45
+ puts "=" * 60
46
+ puts "Total elapsed: #{"%.2f" % overall_elapsed}s"
47
+
48
+ # CI smoke check: fail if total exceeds the allowed limit.
49
+ max_seconds = ENV.fetch("BENCHMARK_MAX_SECONDS", "60").to_i
50
+ if overall_elapsed > max_seconds
51
+ warn "FAIL: benchmark suite exceeded #{max_seconds}s limit (took #{"%.1f" % overall_elapsed}s)"
52
+ exit 1
53
+ end
54
+
55
+ puts "OK: completed in #{"%.1f" % overall_elapsed}s (limit: #{max_seconds}s)"
56
+
57
+ # ---------------------------------------------------------------------------
58
+ # Baseline management — only active when the relevant env vars are set.
59
+ # REGRESSION_RESULTS is defined in bench_regression.rb (loaded above).
60
+ # ---------------------------------------------------------------------------
61
+
62
+ write_path = ENV["BENCHMARK_WRITE_BASELINE"]
63
+ compare_path = ENV["BENCHMARK_BASELINE"]
64
+ threshold = ENV.fetch("BENCHMARK_REGRESSION_THRESHOLD", "20").to_f / 100.0
65
+
66
+ if write_path
67
+ File.write(write_path, JSON.pretty_generate(REGRESSION_RESULTS))
68
+ puts "\nBaseline written to #{write_path}"
69
+ end
70
+
71
+ if compare_path
72
+ unless File.exist?(compare_path)
73
+ warn "FAIL: baseline file not found: #{compare_path}"
74
+ exit 1
75
+ end
76
+
77
+ baseline = JSON.parse(File.read(compare_path))
78
+ regressions = []
79
+
80
+ puts "\n#{"=" * 60}"
81
+ puts "Regression comparison (threshold: #{(threshold * 100).to_i}%)"
82
+ printf("%-46s %10s %10s %8s\n", "Metric", "Baseline", "Current", "Change")
83
+ puts "-" * 78
84
+
85
+ REGRESSION_RESULTS.each do |key, current_ips|
86
+ unless baseline.key?(key)
87
+ printf("%-46s %10s %10.0f %8s\n", key, "N/A", current_ips, "new")
88
+ next
89
+ end
90
+
91
+ baseline_ips = baseline[key].to_f
92
+ change = (baseline_ips - current_ips) / baseline_ips # positive = slower
93
+
94
+ status = if change > threshold
95
+ regressions << {key:, baseline: baseline_ips, current: current_ips, change:}
96
+ "FAIL"
97
+ elsif change > threshold * 0.5
98
+ "WARN"
99
+ else
100
+ "OK"
101
+ end
102
+
103
+ printf("%-46s %10.0f %10.0f %+7.1f%% %s\n",
104
+ key, baseline_ips, current_ips, -change * 100, status)
105
+ end
106
+
107
+ if regressions.any?
108
+ puts
109
+ warn "FAIL: #{regressions.size} benchmark(s) regressed beyond #{(threshold * 100).to_i}%:"
110
+ regressions.each do |r|
111
+ warn " #{r[:key]}: #{r[:baseline].round} → #{r[:current].round} iter/s " \
112
+ "(#{format("%+.1f%%", -r[:change] * 100)})"
113
+ end
114
+ exit 1
115
+ else
116
+ puts "\nAll benchmarks within threshold."
117
+ end
118
+ end
@@ -0,0 +1,42 @@
1
+ # ADR-001: Use RubyLLM as the LLM Provider Layer
2
+
3
+ ## Status
4
+
5
+ Accepted
6
+
7
+ ## Context
8
+
9
+ Phronomy needs to send prompts to large language models and receive structured
10
+ responses. The options were:
11
+
12
+ 1. Implement provider clients directly (OpenAI, Anthropic, Google, etc.)
13
+ 2. Vendor an existing Ruby abstraction library
14
+ 3. Treat providers as a pluggable adapter with a thin wrapper
15
+
16
+ Implementing provider clients directly would require maintaining authentication,
17
+ retry logic, streaming, and model versioning for each provider — significant
18
+ ongoing maintenance cost. The Ruby ecosystem has a maturing option in RubyLLM,
19
+ which provides a unified interface for multiple providers and handles streaming,
20
+ tool call serialization, and response parsing.
21
+
22
+ ## Decision
23
+
24
+ Phronomy delegates all LLM provider communication to the `ruby_llm` gem.
25
+ `Phronomy::Agent::Base` and `Phronomy::Chain::LLMChain` call `RubyLLM.chat`
26
+ (or equivalent) rather than provider SDKs directly.
27
+
28
+ ## Consequences
29
+
30
+ **Positive:**
31
+ - Provider switching is a configuration change, not a code change.
32
+ - Streaming, tool call parsing, and multi-modal input handling are inherited
33
+ from RubyLLM without re-implementation.
34
+ - The phronomy codebase stays focused on agent/workflow orchestration.
35
+
36
+ **Negative / Tradeoffs:**
37
+ - Phronomy's LLM feature surface is bounded by what RubyLLM exposes. Provider
38
+ capabilities not yet supported by RubyLLM are unavailable without a custom
39
+ adapter.
40
+ - Bugs or breaking changes in RubyLLM require downstream fixes in phronomy.
41
+ - Error types from providers are wrapped in RubyLLM errors; phronomy wraps
42
+ them again (see `Agent::Concerns::ErrorTranslation`).