RubyGems - phronomy - Versions diffs - 0.6.0 → 0.7.1 - Mend

phronomy 0.6.0 → 0.7.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (143) hide show

checksums.yaml +4 -4
data/.mutant.yml +22 -0
data/CHANGELOG.md +488 -0
data/CONTRIBUTING.md +102 -0
data/README.md +374 -36
data/RELEASE_CHECKLIST.md +86 -0
data/Rakefile +33 -0
data/SECURITY.md +80 -0
data/benchmark/baseline.json +9 -0
data/benchmark/bench_agent_invoke.rb +105 -0
data/benchmark/bench_context_assembler.rb +46 -0
data/benchmark/bench_regression.rb +172 -0
data/benchmark/bench_token_estimator.rb +44 -0
data/benchmark/bench_tool_schema.rb +69 -0
data/benchmark/bench_vector_store.rb +39 -0
data/benchmark/bench_workflow.rb +55 -0
data/benchmark/run_all.rb +118 -0
data/docs/decisions/001-rubyllm-as-provider-layer.md +42 -0
data/docs/decisions/002-workflow-context-immutability.md +42 -0
data/docs/decisions/003-event-loop-singleton.md +48 -0
data/docs/decisions/004-invoke-timeout-is-not-cancellation.md +75 -0
data/docs/decisions/005-static-knowledge-class-level-cache.md +45 -0
data/docs/decisions/006-no-built-in-guardrails.md +66 -0
data/docs/decisions/007-mcp-is-beta-stability.md +51 -0
data/docs/decisions/008-orchestrator-uses-os-threads.md +52 -0
data/docs/decisions/009-state-store-abstraction.md +141 -0
data/docs/decisions/010-cooperative-first-concurrency.md +248 -0
data/lib/phronomy/agent/base.rb +416 -49
data/lib/phronomy/agent/before_completion_context.rb +1 -0
data/lib/phronomy/agent/checkpoint.rb +1 -0
data/lib/phronomy/agent/concerns/before_completion.rb +6 -0
data/lib/phronomy/agent/concerns/error_translation.rb +45 -0
data/lib/phronomy/agent/concerns/guardrailable.rb +3 -0
data/lib/phronomy/agent/concerns/retryable.rb +12 -1
data/lib/phronomy/agent/concerns/suspendable.rb +19 -0
data/lib/phronomy/agent/fsm.rb +44 -52
data/lib/phronomy/agent/handoff.rb +3 -0
data/lib/phronomy/agent/orchestrator.rb +191 -54
data/lib/phronomy/agent/parallel_tool_chat.rb +87 -13
data/lib/phronomy/agent/react_agent.rb +16 -6
data/lib/phronomy/agent/runner.rb +2 -0
data/lib/phronomy/agent/shared_state.rb +11 -0
data/lib/phronomy/agent/suspend_signal.rb +2 -0
data/lib/phronomy/agent/team_coordinator.rb +17 -5
data/lib/phronomy/async_queue.rb +155 -0
data/lib/phronomy/blocking_adapter_pool.rb +435 -0
data/lib/phronomy/cancellation_scope.rb +123 -0
data/lib/phronomy/cancellation_token.rb +133 -0
data/lib/phronomy/concurrency_gate.rb +155 -0
data/lib/phronomy/configuration.rb +168 -2
data/lib/phronomy/context/assembler.rb +6 -0
data/lib/phronomy/context/compaction_context.rb +2 -0
data/lib/phronomy/context/context_version_cache.rb +2 -0
data/lib/phronomy/context/token_budget.rb +3 -0
data/lib/phronomy/context/token_estimator.rb +9 -2
data/lib/phronomy/context/trigger_context.rb +1 -0
data/lib/phronomy/context/trim_context.rb +4 -0
data/lib/phronomy/deadline.rb +63 -0
data/lib/phronomy/diagnostics.rb +62 -0
data/lib/phronomy/embeddings/base.rb +22 -2
data/lib/phronomy/embeddings/ruby_llm_embeddings.rb +6 -2
data/lib/phronomy/eval/comparison.rb +2 -0
data/lib/phronomy/eval/dataset.rb +4 -0
data/lib/phronomy/eval/metrics.rb +6 -0
data/lib/phronomy/eval/runner.rb +11 -9
data/lib/phronomy/eval/scorer/base.rb +1 -0
data/lib/phronomy/eval/scorer/exact_match.rb +2 -0
data/lib/phronomy/eval/scorer/includes_scorer.rb +2 -0
data/lib/phronomy/eval/scorer/llm_judge.rb +2 -0
data/lib/phronomy/event_loop.rb +275 -30
data/lib/phronomy/fsm_session.rb +57 -4
data/lib/phronomy/generator_verifier.rb +2 -0
data/lib/phronomy/guardrail/base.rb +3 -0
data/lib/phronomy/guardrail/prompt_injection_guardrail.rb +58 -0
data/lib/phronomy/invocation_context.rb +152 -0
data/lib/phronomy/knowledge_source/base.rb +24 -2
data/lib/phronomy/knowledge_source/entity_knowledge.rb +7 -2
data/lib/phronomy/knowledge_source/rag_knowledge.rb +8 -4
data/lib/phronomy/knowledge_source/static_knowledge.rb +7 -2
data/lib/phronomy/llm_adapter/base.rb +104 -0
data/lib/phronomy/llm_adapter/ruby_llm.rb +41 -0
data/lib/phronomy/llm_adapter.rb +20 -0
data/lib/phronomy/loader/base.rb +1 -0
data/lib/phronomy/loader/csv_loader.rb +2 -0
data/lib/phronomy/loader/markdown_loader.rb +2 -0
data/lib/phronomy/loader/plain_text_loader.rb +1 -0
data/lib/phronomy/metrics.rb +38 -0
data/lib/phronomy/output_parser/base.rb +1 -0
data/lib/phronomy/output_parser/json_parser.rb +22 -3
data/lib/phronomy/output_parser/structured_parser.rb +2 -0
data/lib/phronomy/prompt_template.rb +5 -0
data/lib/phronomy/runnable.rb +20 -3
data/lib/phronomy/runtime/deterministic_scheduler.rb +412 -0
data/lib/phronomy/runtime/fake_scheduler.rb +165 -0
data/lib/phronomy/runtime/gate_registry.rb +52 -0
data/lib/phronomy/runtime/pool_registry.rb +57 -0
data/lib/phronomy/runtime/runtime_metrics.rb +117 -0
data/lib/phronomy/runtime/scheduler.rb +98 -0
data/lib/phronomy/runtime/scheduler_timer_adapter.rb +79 -0
data/lib/phronomy/runtime/task_registry.rb +48 -0
data/lib/phronomy/runtime/thread_scheduler.rb +30 -0
data/lib/phronomy/runtime/timer_queue.rb +106 -0
data/lib/phronomy/runtime/timer_service.rb +42 -0
data/lib/phronomy/runtime.rb +374 -0
data/lib/phronomy/splitter/base.rb +2 -0
data/lib/phronomy/splitter/fixed_size_splitter.rb +2 -0
data/lib/phronomy/splitter/recursive_splitter.rb +2 -0
data/lib/phronomy/state_store/base.rb +48 -0
data/lib/phronomy/state_store/in_memory.rb +62 -0
data/lib/phronomy/task/backend.rb +80 -0
data/lib/phronomy/task/fiber_backend.rb +157 -0
data/lib/phronomy/task/immediate_backend.rb +89 -0
data/lib/phronomy/task/thread_backend.rb +84 -0
data/lib/phronomy/task.rb +275 -0
data/lib/phronomy/task_group.rb +265 -0
data/lib/phronomy/testing/fake_clock.rb +109 -0
data/lib/phronomy/testing/fake_scheduler.rb +104 -0
data/lib/phronomy/testing/scheduler_helpers.rb +59 -0
data/lib/phronomy/testing.rb +12 -0
data/lib/phronomy/tool/agent_tool.rb +1 -0
data/lib/phronomy/tool/base.rb +298 -28
data/lib/phronomy/tool/mcp_tool.rb +103 -17
data/lib/phronomy/tool/scope_policy.rb +50 -0
data/lib/phronomy/tool_executor.rb +106 -0
data/lib/phronomy/tracing/base.rb +3 -0
data/lib/phronomy/tracing/langfuse_tracer.rb +2 -0
data/lib/phronomy/tracing/open_telemetry_tracer.rb +36 -0
data/lib/phronomy/vector_store/async_backend.rb +110 -0
data/lib/phronomy/vector_store/base.rb +40 -7
data/lib/phronomy/vector_store/in_memory.rb +16 -7
data/lib/phronomy/vector_store/pgvector.rb +40 -9
data/lib/phronomy/vector_store/redis_search.rb +29 -8
data/lib/phronomy/version.rb +1 -1
data/lib/phronomy/workflow.rb +147 -11
data/lib/phronomy/workflow_context.rb +83 -6
data/lib/phronomy/workflow_runner.rb +106 -7
data/lib/phronomy.rb +112 -1
data/scripts/api_snapshot.rb +91 -0
data/scripts/check_api_annotations.rb +68 -0
data/scripts/check_private_enforcement.rb +93 -0
data/scripts/check_readme_runnable.rb +98 -0
data/scripts/run_mutation.sh +46 -0
metadata +83 -2

data/Rakefile CHANGED Viewed

@@ -7,4 +7,37 @@ RSpec::Core::RakeTask.new(:spec)
 require "standard/rake"
+# Verify that @api private classes do not leak into the public YARD output.
+# Any class or module without @api private that ends up in the public doc must
+# have a corresponding entry in the Features table in README.md.
+#
+# Usage: bundle exec rake yard_check
+desc "Build YARD docs excluding @api private items and check for undocumented public APIs"
+task :yard_check do
+  require "yard"
+  YARD::Registry.clear
+  YARD.parse(Dir["lib/**/*.rb"])
+  undocumented = []
+  YARD::Registry.all(:class, :module).each do |obj|
+    next if obj.visibility == :private
+    next if obj.tag(:api)&.name == "private"
+    next if obj.docstring.blank?
+    # Classes/modules with no docstring that are not @api private are worth
+    # noting, but only raise on truly undocumented public objects.
+    if obj.docstring.empty?
+      undocumented << obj.path
+    end
+  end
+  unless undocumented.empty?
+    warn "The following public classes/modules have no YARD documentation:\n" \
+         "  #{undocumented.join("\n  ")}\n" \
+         "Either add a docstring or mark them @api private."
+    exit 1
+  end
+  puts "yard_check passed — no undocumented public classes/modules found."
+end
 task default: %i[spec standard]

data/SECURITY.md ADDED Viewed

@@ -0,0 +1,80 @@
+# Security Policy
+## Supported Versions
+| Version | Supported |
+|---------|-----------|
+| Latest release (main branch) | ✅ |
+| Older versions | ❌ — please upgrade |
+Only the latest released version of `phronomy` receives security patches. If you
+are running an older version, please upgrade before filing a report.
+---
+## Reporting a Vulnerability
+**Please do NOT open a public GitHub Issue for security vulnerabilities.**
+Use [GitHub's private vulnerability reporting](https://docs.github.com/en/code-security/security-advisories/guidance-on-reporting-and-writing/privately-reporting-a-security-vulnerability)
+instead:
+1. Navigate to the [Security tab](https://github.com/Raizo-TCS/phronomy/security)
+   of this repository.
+2. Click **"Report a vulnerability"**.
+3. Fill in the advisory form with as much detail as possible.
+This creates a private draft advisory visible only to maintainers, keeping the
+details confidential until a fix is prepared and released.
+---
+## Response Timeline
+| Milestone | Target |
+|-----------|--------|
+| Acknowledgement of report | Within **7 days** |
+| Triage and initial assessment | Within **14 days** |
+| Patch release (critical / high severity) | Within **30 days** |
+| Patch release (medium / low severity) | Best effort; typically within **60 days** |
+If you do not receive an acknowledgement within 7 days, please follow up by
+opening a **public** Issue with the subject "Security report follow-up (no
+response)" — do **not** include vulnerability details in the public Issue.
+---
+## Scope
+**In scope:**
+- Vulnerabilities in the `phronomy` gem source code (`lib/`, `spec/`).
+- Dependency vulnerabilities that affect gem consumers when `phronomy` is used as intended.
+- Information disclosure via tracing/logging APIs (e.g. `trace_pii: false` bypass).
+- Approval gate bypasses (tool execution without the registered approval handler).
+**Out of scope:**
+- Security of consumer applications built on top of `phronomy`.
+- Vulnerabilities in the LLM provider (OpenAI, Anthropic, etc.) or in `ruby_llm`.
+- Attacks that require an attacker to already have write access to the host system.
+- Prompt injection via LLM output — the gem forwards LLM output faithfully; prompt
+  injection resistance is the responsibility of the LLM provider and the application.
+---
+## Disclosure Policy
+- Maintainers will coordinate with you on the release date and credit you in the
+  `CHANGELOG.md` entry and GitHub release notes.
+- If you wish to remain anonymous, let us know in the advisory.
+- We follow a **coordinated disclosure** model: the advisory will be made public
+  after a patch is released (or after 90 days, whichever comes first).
+---
+## Credit
+Security reporters are credited in the `CHANGELOG.md` entry for the patch release,
+in the GitHub Security Advisory, and in the release notes — unless they request
+anonymity.

data/benchmark/baseline.json ADDED Viewed

@@ -0,0 +1,9 @@
+{
+  "workflow_context_merge": 124364.81010472385,
+  "workflow_define": 2179.945274115319,
+  "tool_params_schema_definition": 19534379.159046534,
+  "dispatch_parallel_10": 886.0,
+  "cancellation_token_cancelled": 4335060.97443425,
+  "cancellation_token_raise_if_cancelled_noop": 3566903.189098373,
+  "trim_context_remove_2000": 1761.5700678986254
+}

data/benchmark/bench_agent_invoke.rb ADDED Viewed

@@ -0,0 +1,105 @@
+# frozen_string_literal: true
+# bench_agent_invoke.rb — Agent#invoke framework overhead benchmark.
+#
+# Measures the per-invoke cost of the Phronomy::Agent::Base framework path
+# (context assembly, guardrail checks, before_completion hooks, response
+# handling) with a fully stubbed LLM.  No network calls are made.
+#
+# Scenarios:
+#   1. Minimal agent (no tools, no knowledge) — baseline framework overhead.
+#   2. Tool-aware agent with max_parallel_tools=4 (4 stub tools per turn).
+#   3. Agent#stream setup latency (first-chunk time with stubbed stream) —
+#      listed for completeness; this script does not contain a report for it yet.
+require "benchmark"
+require_relative "../lib/phronomy"
+# ---------------------------------------------------------------------------
+# Shared stubs
+# ---------------------------------------------------------------------------
+BenchAgentMessage = Struct.new(:role, :content, :tool_calls, :tokens) do # Stand-in message value used as the canned chat response.
+  def self.assistant(content = "done") # Builds an :assistant message with nil tool_calls.
+    new(:assistant, content, nil,
+      Struct.new(:input, :output, :cached, :cache_creation).new(5, 5, 0, 0)) # fixed token counts: input=5, output=5, cached=0, cache_creation=0
+  end
+end
+# A minimal stub Chat that returns a pre-built response immediately.
+class BenchStubChat # Chat double: every call answers with the response given to the constructor.
+  attr_reader :messages # responses recorded by #ask, in call order
+  def initialize(response) # response: the object returned by #ask, #stream and #last_message
+    @response = response
+    @messages = []
+  end
+  def with_instructions(_) = self # the with_* methods ignore their argument and return self so calls can be chained
+  def with_tool(_) = self
+  def with_temperature(_) = self
+  def with_cache_instructions(_) = self
+  def with_output_schema(_) = self
+  def last_message = @response # always the canned response, even before #ask is called
+  def ask(_) # ignores the prompt; records and returns the canned response
+    @messages << @response
+    @response
+  end
+  def stream(*) # yields the response content once when a block is given, then returns the response
+    yield @response.content if block_given?
+    @response
+  end
+end
+# A stub tool that does nothing but conforms to the Tool::Base interface.
+class BenchNullTool < Phronomy::Tool::Base
+  description "No-op benchmark tool"
+  param :x, type: :string, desc: "input"
+  def execute(x:)
+    "result:#{x}"
+  end
+end
+# ---------------------------------------------------------------------------
+# Agent classes
+# ---------------------------------------------------------------------------
+BENCH_RESP = BenchAgentMessage.assistant("benchmark complete")
+BENCH_RESP_CHAT = BenchStubChat.new(BENCH_RESP)
+bench_minimal_class = Class.new(Phronomy::Agent::Base) do
+  model "stub-model"
+  define_method(:build_chat) { |*| BenchStubChat.new(BENCH_RESP) }
+end
+bench_tool_class = Class.new(Phronomy::Agent::Base) do
+  model "stub-model"
+  tools BenchNullTool
+  max_parallel_tools 4
+  define_method(:build_chat) { |*| BenchStubChat.new(BENCH_RESP) }
+end
+BENCH_AGENT_MINIMAL = bench_minimal_class.new
+BENCH_AGENT_TOOLS = bench_tool_class.new
+AGENT_INVOKE_ITERATIONS = 200
+puts "=== bench_agent_invoke ==="
+Benchmark.bm(50) do |x|
+  x.report("Agent#invoke — minimal (no tools), #{AGENT_INVOKE_ITERATIONS} iters") do
+    AGENT_INVOKE_ITERATIONS.times do
+      BENCH_AGENT_MINIMAL.invoke("ping", thread_id: "bench-#{rand(1_000_000)}")
+    end
+  end
+  x.report("Agent#invoke — 4 parallel stub tools, #{AGENT_INVOKE_ITERATIONS} iters") do
+    AGENT_INVOKE_ITERATIONS.times do
+      BENCH_AGENT_TOOLS.invoke("ping", thread_id: "bench-#{rand(1_000_000)}")
+    end
+  end
+end
+puts

data/benchmark/bench_context_assembler.rb ADDED Viewed

@@ -0,0 +1,46 @@
+# frozen_string_literal: true
+# Benchmark: Context::Assembler#build
+#
+# Tests context assembly performance for varying numbers of messages and
+# knowledge chunks. This path is exercised on every agent turn.
+require "benchmark"
+require_relative "../lib/phronomy"
+BenchAsmMessage = Struct.new(:content)
+def make_assembler(n_messages:, n_chunks:, with_budget: false) # Returns an Assembler preloaded with 1 instruction, n_chunks knowledge facts and n_messages messages.
+  budget = if with_budget # budget stays nil unless with_budget is truthy
+    Phronomy::Context::TokenBudget.new(context_window: 4096, max_output_tokens: 512)
+  end
+  asm = Phronomy::Context::Assembler.new(budget: budget)
+  asm.add_instruction("You are a helpful assistant. Answer the user's question.")
+  n_chunks.times do |i| # one synthetic knowledge fact per requested chunk
+    asm.add_knowledge("Fact #{i}: The capital of country #{i} is City #{i}.", type: :entity, trusted: true)
+  end
+  msgs = Array.new(n_messages) { BenchAsmMessage.new("This is a conversation message.") } # identical content for every message
+  asm.add_messages(msgs)
+  asm # not built here; the reports below call #build on the result, so this setup is part of the timed work
+end
+BENCH_ASM_ITERATIONS = 1_000
+puts "=== bench_context_assembler ==="
+Benchmark.bm(40) do |x|
+  x.report("build(10 msgs, 0 chunks)") do
+    BENCH_ASM_ITERATIONS.times { make_assembler(n_messages: 10, n_chunks: 0).build }
+  end
+  x.report("build(100 msgs, 5 chunks)") do
+    BENCH_ASM_ITERATIONS.times { make_assembler(n_messages: 100, n_chunks: 5).build }
+  end
+  x.report("build(1000 msgs, 10 chunks, no budget)") do
+    (BENCH_ASM_ITERATIONS / 10).times { make_assembler(n_messages: 1000, n_chunks: 10).build }
+  end
+  x.report("build(1000 msgs, 10 chunks, budgeted)") do
+    (BENCH_ASM_ITERATIONS / 10).times { make_assembler(n_messages: 1000, n_chunks: 10, with_budget: true).build }
+  end
+end

data/benchmark/bench_regression.rb ADDED Viewed

@@ -0,0 +1,172 @@
+# frozen_string_literal: true
+# bench_regression.rb — Targeted regression benchmarks.
+#
+# Measures the following regression targets (Issue #232 defines the minimum set):
+#   1. WorkflowContext#merge throughput
+#   2. Workflow.define (graph build) time
+#   3. Tool::Base.params_schema_definition (10 params)
+#   4. Orchestrator#dispatch_parallel overhead (10 stub agents, no LLM)
+#   5. CancellationToken#cancelled? throughput (shared token, 8 threads)
+#   6. CancellationToken#raise_if_cancelled! no-op path (single thread)
+#   7. Context::TrimContext#remove on a 2000-element history
+#
+# Results are stored in a global REGRESSION_RESULTS hash (keyed by metric name,
+# value = iterations per second) for use by run_all.rb baseline comparison.
+require "benchmark"
+require_relative "../lib/phronomy"
+REGRESSION_ITERATIONS = 5_000
+# ---------------------------------------------------------------------------
+# Target 1: WorkflowContext#merge throughput
+# ---------------------------------------------------------------------------
+context_class = Class.new do
+  include Phronomy::WorkflowContext
+  field :value, type: :replace, default: -> { 0 }
+  field :log, type: :append, default: -> { [] }
+end
+sample_ctx = context_class.new(value: 42, log: ["a"])
+t1 = Benchmark.measure("WorkflowContext#merge") do
+  REGRESSION_ITERATIONS.times { sample_ctx.merge(value: 99, log: "b") }
+end
+# ---------------------------------------------------------------------------
+# Target 2: Workflow.define graph build time
+# ---------------------------------------------------------------------------
+BUILD_ITERATIONS = 1_000
+t2 = Benchmark.measure("Workflow.define (5 states)") do
+  BUILD_ITERATIONS.times do
+    build_ctx = Class.new do
+      include Phronomy::WorkflowContext
+      field :x, type: :replace, default: -> { 0 }
+    end
+    Phronomy::Workflow.define(build_ctx) do
+      initial :a
+      %i[a b c d].each_with_index do |state, i|
+        next_state = %i[a b c d e][i + 1]
+        action = ->(s) { s.merge(x: s.x + 1) }
+        self.state state, action: action
+        transition from: state, to: next_state
+      end
+      self.state :e, action: ->(s) { s }
+      transition from: :e, to: :__finish__
+    end
+  end
+end
+# ---------------------------------------------------------------------------
+# Target 3: Tool::Base#params_schema generation (10 params)
+# ---------------------------------------------------------------------------
+tool_class = Class.new(Phronomy::Tool::Base) do
+  description "Test tool with 10 params"
+  param :p1, type: :string, desc: "param 1"
+  param :p2, type: :string, desc: "param 2"
+  param :p3, type: :string, desc: "param 3"
+  param :p4, type: :string, desc: "param 4"
+  param :p5, type: :string, desc: "param 5"
+  param :p6, type: :string, desc: "param 6"
+  param :p7, type: :string, desc: "param 7"
+  param :p8, type: :string, desc: "param 8"
+  param :p9, type: :string, desc: "param 9"
+  param :p10, type: :string, desc: "param 10"
+  def execute(**_kwargs)
+    "ok"
+  end
+end
+t3 = Benchmark.measure("Tool::Base#params_schema_definition (10 params)") do
+  REGRESSION_ITERATIONS.times { tool_class.params_schema_definition }
+end
+# ---------------------------------------------------------------------------
+# Target 4: Orchestrator#dispatch_parallel overhead (10 stub agents, no LLM)
+# ---------------------------------------------------------------------------
+stub_agent_class = Class.new(Phronomy::Agent::Base) do
+  define_method(:invoke) do |_input, messages: [], thread_id: nil, config: {}|
+    {output: "stub", messages: []}
+  end
+  define_method(:invoke_async) { |input, **_kw| Phronomy::Runtime.instance.spawn(name: "bench-stub") { invoke(input) } }
+end
+orchestrator_class = Class.new(Phronomy::Agent::Orchestrator)
+orchestrator = orchestrator_class.new
+PARALLEL_ITERATIONS = 200
+t4 = Benchmark.measure("Orchestrator#dispatch_parallel (10 agents)") do
+  PARALLEL_ITERATIONS.times do
+    tasks = Array.new(10) { {agent: stub_agent_class, input: "x"} }
+    orchestrator.dispatch_parallel(*tasks)
+  end
+end
+# ---------------------------------------------------------------------------
+# Target 5: CancellationToken#cancelled? throughput (8 threads)
+# ---------------------------------------------------------------------------
+CANCEL_TOKEN = Phronomy::CancellationToken.new
+CANCEL_ITERATIONS = 10_000
+t5 = Benchmark.measure("CancellationToken#cancelled? (8 threads)") do
+  threads = 8.times.map do
+    Thread.new { CANCEL_ITERATIONS.times { CANCEL_TOKEN.cancelled? } }
+  end
+  threads.each(&:join)
+end
+# ---------------------------------------------------------------------------
+# Target 6: CancellationToken#raise_if_cancelled! hot path (no-op, single thread)
+# ---------------------------------------------------------------------------
+RAISE_TOKEN = Phronomy::CancellationToken.new  # not cancelled — no-op path
+RAISE_ITERATIONS = 200_000
+t6 = Benchmark.measure("CancellationToken#raise_if_cancelled! (no-op)") do
+  RAISE_ITERATIONS.times { RAISE_TOKEN.raise_if_cancelled! }
+end
+# ---------------------------------------------------------------------------
+# Target 7: Context::TrimContext#remove on a 2000-element history
+# ---------------------------------------------------------------------------
+BenchMsg = Struct.new(:content) unless defined?(BenchMsg)
+TRIM_ELEMENTS = Array.new(2_000) { |i| {seq: i, message: BenchMsg.new("msg #{i}"), tokens: 10, role: :user} }
+TRIM_BUDGET = Phronomy::Context::TokenBudget.new(context_window: 4096, max_output_tokens: 512)
+TRIM_ITERATIONS = 500
+t7 = Benchmark.measure("TrimContext#remove (2000-element history)") do
+  TRIM_ITERATIONS.times do
+    tc = Phronomy::Context::TrimContext.new(message_elements: TRIM_ELEMENTS, budget: TRIM_BUDGET)
+    tc.remove((0...200).to_a)  # remove 200 oldest messages
+  end
+end
+# ---------------------------------------------------------------------------
+# Print results and store in REGRESSION_RESULTS
+# ---------------------------------------------------------------------------
+puts "=== bench_regression ==="
+printf("%-46s  %8s  %12s\n", "Metric", "Real (s)", "Iter/s")
+puts "-" * 70
+metrics = {
+  "workflow_context_merge" => [t1, REGRESSION_ITERATIONS],
+  "workflow_define" => [t2, BUILD_ITERATIONS],
+  "tool_params_schema_definition" => [t3, REGRESSION_ITERATIONS],
+  "dispatch_parallel_10" => [t4, PARALLEL_ITERATIONS],
+  "cancellation_token_cancelled" => [t5, 8 * CANCEL_ITERATIONS],
+  "cancellation_token_raise_if_cancelled_noop" => [t6, RAISE_ITERATIONS],
+  "trim_context_remove_2000" => [t7, TRIM_ITERATIONS]
+}
+REGRESSION_RESULTS = {} # rubocop:disable Style/MutableConstant
+metrics.each do |key, (measure, iters)|
+  ips = iters / measure.real
+  REGRESSION_RESULTS[key] = ips
+  printf("%-46s  %8.3f  %12.0f\n", key, measure.real, ips)
+end
+puts

data/benchmark/bench_token_estimator.rb ADDED Viewed

@@ -0,0 +1,44 @@
+# frozen_string_literal: true
+# Benchmark: Context::TokenEstimator.estimate
+#
+# Tests estimation speed for short, medium, and long text inputs, and for
+# Arrays of message-like objects. This method is called on every message in
+# every agent turn, so it must be consistently fast.
+require "benchmark"
+require_relative "../lib/phronomy"
+SHORT_TEXT = "Hello, how are you today?"
+MEDIUM_TEXT = "A" * 500
+LONG_TEXT = "A" * 10_000
+BenchMessage = Struct.new(:content)
+MESSAGES_100 = Array.new(100) { BenchMessage.new("A" * 100) }
+MESSAGES_1000 = Array.new(1000) { BenchMessage.new("A" * 100) }
+BENCH_TOKEN_ITERATIONS = 10_000
+puts "=== bench_token_estimator ==="
+Benchmark.bm(30) do |x|
+  x.report("estimate(short text)") do
+    BENCH_TOKEN_ITERATIONS.times { Phronomy::Context::TokenEstimator.estimate(SHORT_TEXT) }
+  end
+  x.report("estimate(medium text 500c)") do
+    BENCH_TOKEN_ITERATIONS.times { Phronomy::Context::TokenEstimator.estimate(MEDIUM_TEXT) }
+  end
+  x.report("estimate(long text 10k c)") do
+    BENCH_TOKEN_ITERATIONS.times { Phronomy::Context::TokenEstimator.estimate(LONG_TEXT) }
+  end
+  x.report("estimate(100 messages)") do
+    BENCH_TOKEN_ITERATIONS.times { Phronomy::Context::TokenEstimator.estimate(MESSAGES_100) }
+  end
+  x.report("estimate(1000 messages)") do
+    (BENCH_TOKEN_ITERATIONS / 10).times { Phronomy::Context::TokenEstimator.estimate(MESSAGES_1000) }
+  end
+end

data/benchmark/bench_tool_schema.rb ADDED Viewed

@@ -0,0 +1,69 @@
+# frozen_string_literal: true
+# Benchmark: Tool::Base params_schema generation and static_knowledge_chunks cache
+#
+# Tool schema generation happens once per tool class (lazily memoised).
+# static_knowledge_chunks is cached at the class level; cache-hit overhead
+# should be negligible compared to cache-miss (which calls the knowledge source).
+require "benchmark"
+require_relative "../lib/phronomy"
+# --- Tool schema ---
+class BenchTool10Params < Phronomy::Tool::Base
+  description "A tool with 10 parameters for benchmarking purposes"
+  param :param1, type: :string, desc: "First parameter"
+  param :param2, type: :integer, desc: "Second parameter"
+  param :param3, type: :number, desc: "Third parameter"
+  param :param4, type: :boolean, desc: "Fourth parameter"
+  param :param5, type: :string, desc: "Fifth parameter"
+  param :param6, type: :string, desc: "Sixth parameter", required: false
+  param :param7, type: :integer, desc: "Seventh parameter", required: false
+  param :param8, type: :string, desc: "Eighth parameter", required: false
+  param :param9, type: :string, desc: "Ninth parameter", required: false
+  param :param10, type: :string, desc: "Tenth parameter", required: false
+  def execute(**_)
+    "ok"
+  end
+end
+# Warm up memoisation
+BenchTool10Params.params_schema_definition
+BENCH_TOOL_ITERATIONS = 50_000
+puts "=== bench_tool_schema ==="
+Benchmark.bm(35) do |x|
+  x.report("params_schema_definition (memoised, 10p)") do
+    BENCH_TOOL_ITERATIONS.times { BenchTool10Params.params_schema_definition }
+  end
+end
+# --- static_knowledge_chunks cache ---
+class BenchKnowledgeSource < Phronomy::KnowledgeSource::Base # Knowledge source stub that always returns the same single chunk.
+  def fetch(query: nil) # query is accepted but ignored
+    [{content: "Cached knowledge fact.", type: :static}]
+  end
+  def static? # always reports itself as static
+    true
+  end
+end
+class BenchAgentWithKnowledge < Phronomy::Agent::Base
+  model "gpt-4o-mini"
+  static_knowledge BenchKnowledgeSource.new
+end
+# Warm up cache
+BenchAgentWithKnowledge.static_knowledge_chunks
+puts "\n=== bench_static_knowledge_cache ==="
+Benchmark.bm(35) do |x|
+  x.report("static_knowledge_chunks (hit)") do
+    BENCH_TOOL_ITERATIONS.times { BenchAgentWithKnowledge.static_knowledge_chunks }
+  end
+end

data/benchmark/bench_vector_store.rb ADDED Viewed

@@ -0,0 +1,39 @@
+# frozen_string_literal: true
+# Benchmark: VectorStore::InMemory#search
+#
+# Tests search performance at different corpus sizes (100, 1000, 10_000 docs).
+# Linear scan is expected; this benchmark establishes the scaling baseline.
+require "benchmark"
+require_relative "../lib/phronomy"
+DIM = 64
+def random_embedding(dim) # Returns an Array of dim Floats, each drawn by rand from -1.0..1.0 (unseeded, so values differ per run).
+  Array.new(dim) { rand(-1.0..1.0) }
+end
+def populate(store, n) # Adds n documents with ids "doc0".."doc<n-1>" to store, each with a fresh random DIM-sized embedding.
+  n.times do |i|
+    store.add(id: "doc#{i}", embedding: random_embedding(DIM), metadata: {text: "Document #{i}"})
+  end
+end
+QUERY = random_embedding(DIM)
+# Use fewer iterations for larger corpora to keep total run time reasonable.
+BENCH_VS_ITERS = {100 => 100, 1_000 => 20, 10_000 => 5}.freeze
+puts "=== bench_vector_store_inmemory ==="
+Benchmark.bm(35) do |x|
+  [100, 1_000, 10_000].each do |n|
+    store = Phronomy::VectorStore::InMemory.new(dimension: DIM)
+    populate(store, n)
+    iters = BENCH_VS_ITERS[n]
+    x.report("search(k=5, corpus=#{n}, iters=#{iters})") do
+      iters.times { store.search(query_embedding: QUERY, k: 5) }
+    end
+  end
+end

data/benchmark/bench_workflow.rb ADDED Viewed

@@ -0,0 +1,55 @@
+# frozen_string_literal: true
+# Benchmark: Workflow transition loop
+#
+# Builds a linear chain of N states and measures how long it takes to run
+# the full workflow to completion. Target: 100 transitions in <10ms on
+# average; missing the target only prints a warning and does not fail the run.
+require "benchmark"
+require_relative "../lib/phronomy"
+# Build a linear workflow: state_0 -> state_1 -> ... -> state_(N-1) -> __finish__
+def build_linear_workflow(n) # Returns the result of Phronomy::Workflow.define for an n-state linear chain.
+  context_class = Class.new do # fresh anonymous context class per workflow, holding a single counter
+    include Phronomy::WorkflowContext
+    field :count, type: :replace, default: -> { 0 }
+  end
+  Phronomy::Workflow.define(context_class) do
+    initial :state_0
+    n.times do |i|
+      state :"state_#{i}", action: ->(s) { s.merge(count: s.count + 1) } # each state's action merges count + 1
+      transition from: :"state_#{i}", to: (i + 1 < n) ? :"state_#{i + 1}" : :__finish__ # the last state transitions to :__finish__
+    end
+  end
+end
+BENCH_WF_ITERATIONS = 50
+puts "=== bench_workflow_transition ==="
+Benchmark.bm(30) do |x|
+  [10, 50, 100].each do |n|
+    app = build_linear_workflow(n)
+    cfg = {recursion_limit: n + 5}
+    x.report("#{n} transitions") do
+      BENCH_WF_ITERATIONS.times { app.invoke({}, config: cfg) }
+    end
+  end
+end
+# Threshold check (warning only): 100 transitions should complete in <10ms on average
+puts "\nThreshold check: 100 transitions < 10ms average..."
+app100 = build_linear_workflow(100)
+cfg100 = {recursion_limit: 110}
+samples = 20
+elapsed = Benchmark.realtime { samples.times { app100.invoke({}, config: cfg100) } }
+avg_ms = (elapsed / samples) * 1000.0
+puts "  Average: #{"%.2f" % avg_ms}ms per run"
+if avg_ms < 10.0
+  puts "  PASS (< 10ms)"
+else
+  warn "  WARN: #{avg_ms.round(2)}ms exceeds 10ms threshold (environment may be slow)"
+end