RubyGems - phronomy - Versions diffs - 0.8.0 → 0.9.0 - Mend

phronomy 0.8.0 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (77)

checksums.yaml +4 -4
data/README.md +31 -41
data/benchmark/baseline.json +1 -1
data/benchmark/bench_agent_invoke.rb +1 -1
data/benchmark/bench_context_assembler.rb +9 -1
data/benchmark/bench_regression.rb +8 -8
data/benchmark/bench_tool_schema.rb +2 -2
data/benchmark/bench_vector_store.rb +1 -1
data/docs/decisions/011-build-context-as-single-llm-input-authority.md +224 -0
data/lib/phronomy/agent/base.rb +253 -351
data/lib/phronomy/agent/concerns/suspendable.rb +6 -6
data/lib/phronomy/agent/context/capability/base.rb +689 -0
data/lib/phronomy/agent/context/capability/scope_policy.rb +54 -0
data/lib/phronomy/agent/context/knowledge/base.rb +58 -0
data/lib/phronomy/agent/context/knowledge/entity_knowledge.rb +102 -0
data/lib/phronomy/agent/context/knowledge/static_knowledge.rb +58 -0
data/lib/phronomy/agent/invocation_pipeline.rb +10 -1
data/lib/phronomy/agent/react_agent.rb +24 -23
data/lib/phronomy/agent/shared_state.rb +2 -2
data/lib/phronomy/agent/tool_executor.rb +1 -1
data/lib/phronomy/concurrency/gate_registry.rb +0 -1
data/lib/phronomy/configuration.rb +0 -6
data/lib/phronomy/llm_context_window/assembler.rb +77 -44
data/lib/phronomy/multi_agent/handoff.rb +4 -4
data/lib/phronomy/multi_agent/orchestrator.rb +1 -1
data/lib/phronomy/multi_agent/team_coordinator.rb +2 -2
data/lib/phronomy/runtime/runtime_metrics.rb +0 -1
data/lib/phronomy/runtime.rb +1 -2
data/lib/phronomy/tool.rb +3 -4
data/lib/phronomy/{tool/agent_tool.rb → tools/agent.rb} +6 -6
data/lib/phronomy/{tool/mcp_tool.rb → tools/mcp.rb} +9 -9
data/lib/phronomy/tools/vector_search.rb +70 -0
data/lib/phronomy/vector_store/async_backend.rb +110 -0
data/lib/phronomy/vector_store/base.rb +89 -0
data/lib/phronomy/vector_store/embeddings/base.rb +41 -0
data/lib/phronomy/vector_store/embeddings/ruby_llm_embeddings.rb +47 -0
data/lib/phronomy/vector_store/in_memory.rb +103 -0
data/lib/phronomy/vector_store/loader/base.rb +27 -0
data/lib/phronomy/vector_store/loader/csv_loader.rb +58 -0
data/lib/phronomy/vector_store/loader/markdown_loader.rb +78 -0
data/lib/phronomy/vector_store/loader/plain_text_loader.rb +24 -0
data/lib/phronomy/vector_store/pgvector.rb +127 -0
data/lib/phronomy/vector_store/redis_search.rb +192 -0
data/lib/phronomy/vector_store/splitter/base.rb +49 -0
data/lib/phronomy/vector_store/splitter/fixed_size_splitter.rb +53 -0
data/lib/phronomy/vector_store/splitter/recursive_splitter.rb +107 -0
data/lib/phronomy/vector_store.rb +16 -4
data/lib/phronomy/version.rb +1 -1
data/lib/phronomy.rb +2 -1
data/scripts/api_snapshot.rb +11 -9
metadata +28 -32
data/lib/phronomy/agent/context/conversation/compaction_context.rb +0 -117
data/lib/phronomy/agent/context/conversation/trigger_context.rb +0 -43
data/lib/phronomy/agent/context/conversation/trim_context.rb +0 -82
data/lib/phronomy/agent/context/knowledge/embeddings/base.rb +0 -45
data/lib/phronomy/agent/context/knowledge/embeddings/ruby_llm_embeddings.rb +0 -51
data/lib/phronomy/agent/context/knowledge/loader/base.rb +0 -31
data/lib/phronomy/agent/context/knowledge/loader/csv_loader.rb +0 -62
data/lib/phronomy/agent/context/knowledge/loader/markdown_loader.rb +0 -82
data/lib/phronomy/agent/context/knowledge/loader/plain_text_loader.rb +0 -28
data/lib/phronomy/agent/context/knowledge/source/base.rb +0 -60
data/lib/phronomy/agent/context/knowledge/source/entity_knowledge.rb +0 -102
data/lib/phronomy/agent/context/knowledge/source/rag_knowledge.rb +0 -63
data/lib/phronomy/agent/context/knowledge/source/static_knowledge.rb +0 -58
data/lib/phronomy/agent/context/knowledge/splitter/base.rb +0 -53
data/lib/phronomy/agent/context/knowledge/splitter/fixed_size_splitter.rb +0 -57
data/lib/phronomy/agent/context/knowledge/splitter/recursive_splitter.rb +0 -111
data/lib/phronomy/agent/context/knowledge/vector_store/async_backend.rb +0 -116
data/lib/phronomy/agent/context/knowledge/vector_store/base.rb +0 -95
data/lib/phronomy/agent/context/knowledge/vector_store/in_memory.rb +0 -109
data/lib/phronomy/agent/context/knowledge/vector_store/pgvector.rb +0 -133
data/lib/phronomy/agent/context/knowledge/vector_store/redis_search.rb +0 -198
data/lib/phronomy/embeddings.rb +0 -11
data/lib/phronomy/loader.rb +0 -13
data/lib/phronomy/splitter.rb +0 -12
data/lib/phronomy/tool/base.rb +0 -685
data/lib/phronomy/tool/scope_policy.rb +0 -50

data/lib/phronomy/agent/base.rb CHANGED

@@ -1,6 +1,5 @@
 # frozen_string_literal: true
-require "digest"
 require "securerandom"
 require_relative "concerns/retryable"
 require_relative "concerns/guardrailable"
@@ -255,10 +254,10 @@ module Phronomy
         # the first time +invoke+ is called. The cache persists for the lifetime
         # of the process; call {.static_knowledge_refresh!} to force a reload.
         #
-        # @param sources [Array<Phronomy::Agent::Context::Knowledge::Source::Base>]
+        # @param sources [Array<Phronomy::Agent::Context::Knowledge::Base>]
         # @example
         #   class PolicyAgent < Phronomy::Agent::Base
-        #     static_knowledge Phronomy::Agent::Context::Knowledge::Source::StaticKnowledge.new(POLICY_TEXT)
+        #     static_knowledge Phronomy::Agent::Context::Knowledge::StaticKnowledge.new(POLICY_TEXT)
         #   end
         # @api public
         def static_knowledge(*sources)
@@ -269,7 +268,7 @@ module Phronomy
         end
         # Returns the registered static knowledge sources.
-        # @return [Array<Phronomy::Agent::Context::Knowledge::Source::Base>]
+        # @return [Array<Phronomy::Agent::Context::Knowledge::Base>]
         # @api public
         def static_knowledge_sources
           @static_knowledge_sources || []
@@ -302,80 +301,6 @@ module Phronomy
           @static_knowledge_chunks = nil
         end
-        # Registers a callback that is invoked before every LLM call so the
-        # application can remove stale or irrelevant messages from the
-        # conversation history.
-        #
-        # The block receives a {Phronomy::Agent::Context::Conversation::TrimContext} and may call
-        # +ctx.remove(seqs)+ to drop messages by seq number. Changes affect
-        # only the current invocation; the underlying memory store is unchanged.
-        #
-        # @yield [ctx] Phronomy::Agent::Context::Conversation::TrimContext
-        # @example Drop the oldest message when over 80% of budget is used
-        #   on_trim do |ctx|
-        #     limit = ctx.budget&.available(used: 0) || Float::INFINITY
-        #     ctx.remove(ctx.message_elements.first[:seq]) if ctx.total_tokens > limit * 0.8
-        #   end
-        # @api public
-        def on_trim(&block)
-          @on_trim_callback = block
-        end
-        # @return [Proc, nil]
-        # @api private
-        def _on_trim_callback
-          @on_trim_callback
-        end
-        # Registers a callback that decides whether compaction should run.
-        # Evaluated before every LLM call (after on_trim). If the block returns
-        # truthy AND an +on_compact+ callback is also registered, the compact
-        # pipeline is executed.
-        #
-        # The block receives a read-only {Phronomy::Agent::Context::Conversation::TriggerContext}.
-        #
-        # @yield [ctx] Phronomy::Agent::Context::Conversation::TriggerContext
-        # @return [Boolean] truthy → run on_compact; falsy → skip
-        # @example Trigger when messages exceed 70% of token budget
-        #   on_compaction_trigger do |ctx|
-        #     limit = ctx.budget&.available(used: 0) || Float::INFINITY
-        #     ctx.total_tokens > limit * 0.7
-        #   end
-        # @api public
-        def on_compaction_trigger(&block)
-          @on_compaction_trigger_callback = block
-        end
-        # @return [Proc, nil]
-        # @api private
-        def _on_compaction_trigger_callback
-          @on_compaction_trigger_callback
-        end
-        # Registers a callback that performs the actual compaction when the
-        # +on_compaction_trigger+ callback fires. The block receives a
-        # {Phronomy::Agent::Context::Conversation::CompactionContext} and should call +ctx.compact+
-        # to specify which messages to summarise.
-        #
-        # @yield [ctx] Phronomy::Agent::Context::Conversation::CompactionContext
-        # @example Replace the first 4 messages with a short summary
-        #   on_compact do |ctx|
-        #     ctx.compact(0..3) do |elements|
-        #       texts = elements.map { |e| e[:message].content }.join(" | ")
-        #       "Earlier conversation summary: #{texts}"
-        #     end
-        #   end
-        # @api public
-        def on_compact(&block)
-          @on_compact_callback = block
-        end
-        # @return [Proc, nil]
-        # @api private
-        def _on_compact_callback
-          @on_compact_callback
-        end
         # When enabled, attaches Anthropic prompt-cache markers to the system
         # message so that the fixed instructions are served from cache on
         # subsequent turns, reducing input-token costs.
@@ -453,7 +378,7 @@ module Phronomy
       # Registers an anonymous handoff tool class on this agent instance.
       # Called by Runner during construction when routes are configured.
-      # @param tool_class [Class<Phronomy::Tool::Base>]
+      # @param tool_class [Class<Phronomy::Agent::Context::Capability::Base>]
       # @return [self]
       # @api private
       def _add_handoff_tool(tool_class)
@@ -482,7 +407,6 @@ module Phronomy
       # @param thread_id [String, nil] conversation thread identifier, forwarded
       #   to the compaction context when on_compact is configured.
       # @param config    [Hash] additional runtime options:
-      #   +:knowledge_sources+ (Array) — dynamic knowledge sources for this turn
       #   +:user_id+    (+String+, optional) — caller identity forwarded to the tracer
       #   +:session_id+ (+String+, optional) — session identity forwarded to the tracer
       # @param invocation_context [Phronomy::InvocationContext, nil] optional first-class context
@@ -519,60 +443,9 @@ module Phronomy
           thread_id, config = _apply_invocation_context(thread_id, config, invocation_context)
         end
         if Phronomy.configuration.event_loop
-          # Protect against blocking the EventLoop thread itself.
-          if Phronomy::EventLoop.current?
-            raise Phronomy::Error,
-              "Cannot call Agent#invoke (EventLoop mode) from within an EventLoop " \
-              "entry action. Use agent.run_as_child(input, ctx: ctx) instead."
-          end
-          # Build an effective config that includes the invoke_timeout scope's
-          # CancellationToken before constructing the FSM.  This ensures that
-          # every LLM, tool, and RAG call made inside _invoke_impl observes
-          # cancellation when the deadline fires.
-          timeout_sec = self.class.invoke_timeout
-          effective_config, scope = if timeout_sec
-            s = Phronomy::Concurrency::CancellationScope.new(parent_token: config[:cancellation_token])
-            s.deadline_in(timeout_sec)
-            [config.merge(cancellation_token: s.token), s]
-          else
-            [config, nil]
-          end
-          fsm = Agent::FSM.new(
-            agent: self,
-            input: input,
-            messages: messages,
-            thread_id: thread_id || SecureRandom.uuid,
-            config: effective_config
-          )
-          completion_queue = Phronomy::EventLoop.instance.register(fsm)
-          result = if scope
-            scope.pop_queue(completion_queue) do
-              raise Phronomy::TimeoutError,
-                "Agent #{self.class.name} invoke timed out after #{timeout_sec}s"
-            end
-          else
-            completion_queue.pop
-          end
-          raise result if result.is_a?(Exception)
-          result
+          _invoke_via_event_loop(input, messages: messages, thread_id: thread_id, config: config)
         else
-          # Guard: calling invoke from inside a scheduler task would block the task
-          # against itself when using a cooperative backend.  Use invoke_async
-          # instead to compose agents without introducing a blocking wait.
-          if Phronomy::Task.current
-            msg = "#{self.class.name}#invoke called from inside a scheduler task. " \
-              "This blocks the scheduler until the inner invocation completes, preventing " \
-              "other tasks from making progress. Use invoke_async + await instead."
-            if Phronomy.configuration.strict_runtime_guards
-              raise Phronomy::SchedulerReentrancyError, msg
-            elsif Phronomy.configuration.logger
-              Phronomy.configuration.logger.warn(msg)
-            else
-              Kernel.warn("[phronomy] WARNING: #{msg}")
-            end
-          end
+          _check_scheduler_reentrancy
           invoke_async(input, messages: messages, thread_id: thread_id, config: config).await
         end
       end
@@ -687,19 +560,11 @@ module Phronomy
         raise
       end
-      # Returns the {LlmContextWindow::ContextVersionCache} built during the most recent
-      # {#invoke} call on this agent instance.  The thread-local cache entry is
-      # cleaned up in the +ensure+ block of {#invoke}, but a reference is kept
-      # in +@last_context_version_cache+ so callers can inspect it after invoke
-      # returns.
-      #
-      # NOTE: Not thread-safe.  When the same Agent instance is used concurrently,
-      # +@last_context_version_cache+ reflects the most recent +invoke+ on *any*
-      # thread.  For per-invocation isolation, use a separate Agent instance per
-      # thread.
+      # @deprecated The context version cache has been removed. Returns nil.
+      #   Retained for backward compatibility with callers using safe navigation (+&.reset+).
       # @api private
       def context_version_cache
-        @last_context_version_cache
+        nil
       end
       private
@@ -722,29 +587,75 @@ module Phronomy
         [effective_thread_id, effective_config]
       end
-      # Streaming implementation for #stream.
-      def _stream_impl(input, messages: [], thread_id: nil, config: {}, &block)
-        caller_meta = {}
-        caller_meta[:user_id] = config[:user_id] if config[:user_id]
-        caller_meta[:session_id] = config[:session_id] if config[:session_id]
-        if (ic = config[:invocation_context])
-          caller_meta[:task_id] = ic.task_id if ic.task_id
-          caller_meta[:parent_task_id] = ic.parent_task_id if ic.parent_task_id
+      def _invoke_via_event_loop(input, messages:, thread_id:, config:)
+        if Phronomy::EventLoop.current?
+          raise Phronomy::Error,
+            "Cannot call Agent#invoke (EventLoop mode) from within an EventLoop " \
+            "entry action. Use agent.run_as_child(input, ctx: ctx) instead."
+        end
+        timeout_sec = self.class.invoke_timeout
+        effective_config, scope = if timeout_sec
+          s = Phronomy::Concurrency::CancellationScope.new(parent_token: config[:cancellation_token])
+          s.deadline_in(timeout_sec)
+          [config.merge(cancellation_token: s.token), s]
+        else
+          [config, nil]
+        end
+        fsm = Agent::FSM.new(
+          agent: self,
+          input: input,
+          messages: messages,
+          thread_id: thread_id || SecureRandom.uuid,
+          config: effective_config
+        )
+        completion_queue = Phronomy::EventLoop.instance.register(fsm)
+        result = if scope
+          scope.pop_queue(completion_queue) do
+            raise Phronomy::TimeoutError,
+              "Agent #{self.class.name} invoke timed out after #{timeout_sec}s"
+          end
+        else
+          completion_queue.pop
         end
+        raise result if result.is_a?(Exception)
+        result
+      end
-        trace("agent.invoke", input: input, **caller_meta) do |_span|
+      def _check_scheduler_reentrancy
+        return unless Phronomy::Task.current
+        msg = "#{self.class.name}#invoke called from inside a scheduler task. " \
+          "This blocks the scheduler until the inner invocation completes, preventing " \
+          "other tasks from making progress. Use invoke_async + await instead."
+        if Phronomy.configuration.strict_runtime_guards
+          raise Phronomy::SchedulerReentrancyError, msg
+        elsif Phronomy.configuration.logger
+          Phronomy.configuration.logger.warn(msg)
+        else
+          Kernel.warn("[phronomy] WARNING: #{msg}")
+        end
+      end
+      # Streaming implementation for #stream.
+      def _stream_impl(input, messages: [], thread_id: nil, config: {}, &block)
+        trace("agent.invoke", input: input, **_build_caller_meta(config)) do |_span|
           run_input_guardrails!(input)
           chat = build_chat
           user_message = extract_message(input)
+          context = build_context(
+            input,
+            messages: messages,
+            thread_id: thread_id,
+            config: config,
+            budget: build_token_budget,
+            instruction: build_instructions(input),
+            tools: self.class.tools + _handoff_tools
+          )
+          _apply_context_to_chat(chat, context)
-          # Assemble context (system prompt + history). Override #build_context to
-          # inject custom context editing logic at the Agent subclass level.
-          context = build_context(input, messages: messages, thread_id: thread_id, config: config)
-          apply_instructions(chat, context[:system]) if context[:system]
-          context[:messages].each { |msg| chat.messages << msg }
-          # Wire per-event callbacks to yield StreamEvents.
           current_tool_call = nil
           chat.on_tool_call do |tool_call|
             current_tool_call = tool_call
@@ -758,32 +669,9 @@ module Phronomy
             }))
           end
-          # Run before_completion hooks (global → class → instance) before the LLM call.
           run_before_completion_hooks!(chat, config)
-          # Route the LLM streaming call through the configured LLMAdapter.
-          # Chunks are pushed into a token queue by the pool worker thread and
-          # drained here (on the caller's side) so that the user block is never
-          # executed on a BlockingAdapterPool worker thread.
-          # The queue capacity is bounded by Configuration#stream_queue_max_size
-          # (nil = unbounded) to provide backpressure against a fast LLM producer.
-          adapter = Phronomy.configuration.llm_adapter
-          chunk_queue = Phronomy::Concurrency::AsyncQueue.new(max_size: Phronomy.configuration.stream_queue_max_size)
-          pending = adapter.stream_async(chat, user_message, config: config, enqueue_to: chunk_queue)
-          # Drain the chunk queue on this side (scheduler task / caller thread).
-          loop do
-            chunk = chunk_queue.pop
-            break if chunk.nil? # queue closed — LLM streaming complete
-            block.call(StreamEvent.new(type: :token, payload: {content: chunk.content}))
-            check_cancellation!(config, "invocation cancelled during streaming")
-          end
-          response = pending.await
-          output = response.content
-          usage = Phronomy::TokenUsage.from_tokens(response.tokens)
+          output, usage = _drain_stream(chat, user_message, config, &block)
           run_output_guardrails!(output)
           result = {output: output, messages: chat.messages, usage: usage}
@@ -797,139 +685,172 @@ module Phronomy
       # inject custom context editing logic without having to override
       # the full #invoke_once pipeline.
       #
-      # @param input     [String, Hash] the user's input for this turn
-      # @param messages  [Array<RubyLLM::Message>] raw conversation history
-      # @param thread_id [String, nil] conversation thread identifier
-      # @param config    [Hash] the invocation config (see #invoke)
-      # @return [Hash] { system: String|nil, messages: Array }
+      # The keyword arguments +budget+, +instruction+, +tools+, and +knowledge+
+      # carry pre-computed values. Override them in a subclass call to +super+
+      # to inject custom context without recomputing the defaults.
+      #
+      # @param input       [String, Hash] the user's input for this turn
+      # @param messages    [Array<RubyLLM::Message>] raw conversation history
+      # @param thread_id   [String, nil] conversation thread identifier
+      # @param config      [Hash] the invocation config (see #invoke)
+      # @param budget      [LlmContextWindow::TokenBudget, nil] pre-computed token budget
+      # @param instruction [String, nil] pre-computed system instruction
+      # @param tools       [Array<Class>] tool classes to expose
+      # @param knowledge   [Array<Hash>] knowledge chunks ({ content:, type:, source: })
+      # @return [Hash] { system: String|nil, messages: Array, tool_classes: Array }
       # @api public
-      def build_context(input, messages: [], thread_id: nil, config: {})
-        history = prepare_history(messages: messages, thread_id: thread_id, config: config)
-        budget = build_token_budget
-        system_text = build_cached_system_text(input)
-        user_message = extract_message(input)
+      def build_context(
+        input,
+        messages: [],
+        thread_id: nil,
+        config: {},
+        budget: build_token_budget,
+        instruction: build_instructions(input),
+        tools: self.class.tools + _handoff_tools,
+        knowledge: self.class.static_knowledge_chunks + instance_knowledge_chunks
+      )
         assembler = LlmContextWindow::Assembler.new(budget: budget)
-        assembler.add_instruction(system_text) if system_text
-        fetch_knowledge_chunks(user_message, config).each do |chunk|
-          assembler.add_knowledge(chunk[:content], type: chunk[:type], source: chunk[:source])
+        assembler.add_instruction(instruction) if instruction
+        assembler.add_capability(tools)
+        knowledge.each { |chunk| assembler.add_knowledge(chunk[:content], type: chunk[:type] || :static, trusted: true, source: chunk[:source]) }
+        msgs = Array(messages)
+        if budget && budget_exceeded?(msgs)
+          # Default strategy when the token budget is tight:
+          # 1. Compact: keep the most recent half of the messages verbatim and
+          #    replace the older half with a brief omission marker.
+          # 2. Trim: if the compacted history still exceeds the budget, call
+          #    trim_to_budget with the :safe strategy, which discards the oldest
+          #    message one at a time until the history fits.
+          # Subclasses can override build_context to apply a different strategy
+          # (e.g. LLM-based summarisation) before calling super.
+          keep = [msgs.size / 2, 2].max
+          msgs = compact_messages(msgs, keep_tail: keep) do |dropped|
+            "[#{dropped.size} earlier messages omitted]"
+          end
+          remaining = assembler.available_for_messages
+          msgs = trim_to_budget(msgs, remaining: remaining, strategy: :safe)
         end
-        assembler.add_messages(history)
-        assembler.build
+        assembler.add_messages(msgs)
+        @last_context = assembler.build
       end
       protected :build_context
-      # Fetches knowledge chunks from all registered sources concurrently.
-      #
-      # Each source is spawned as a separate task within a {Phronomy::TaskGroup};
-      # the RAG concurrency gate enforces the +max_concurrent_rag_fetches+ cap.
-      # Results are returned in registration order (spawn order) as a flat array.
+      # Keeps the last +keep+ messages from +messages+, discarding older ones.
+      # Use this inside a +build_context+ override to trim conversation history.
       #
-      # This method is available to subclasses as a building block when
-      # overriding {#build_context}. Pass a custom +query+ to implement
-      # multi-hop RAG or other retrieval strategies.
-      #
-      # @param query  [String] RAG query string (typically the current user message)
-      # @param config [Hash]   invocation config; relevant keys:
-      #   +:knowledge_sources+, +:rag_failure_policy+, +:cancellation_token+, +:rag_timeout+
-      # @return [Array<Hash>] flat list of chunk hashes with +:content+, +:type+, +:source+
-      # @api private
-      def fetch_knowledge_chunks(query, config)
-        sources = Array(config[:knowledge_sources])
-        return [] if sources.empty?
-        check_cancellation!(config, "invocation cancelled before RAG fetch")
-        # :skip (default) — ignore per-source failures so the agent can still
-        # answer with partial context. :fail surfaces the first error immediately.
-        failure_policy =
-          case config[:rag_failure_policy]
-          when :fail then :fail_fast
-          else :skip_failed
-          end
+      # @param messages [Array<RubyLLM::Message>] conversation history
+      # @param keep     [Integer] number of messages to retain (from the tail)
+      # @return [Array<RubyLLM::Message>]
+      # @api public
+      def trim_messages(messages, keep:)
+        Array(messages).last(keep)
+      end
+      protected :trim_messages
-        group = Phronomy::Runtime.instance.task_group(failure_policy: failure_policy)
-        bp = Phronomy.configuration.backpressure
-        rag_on_full = (bp == :raise) ? :reject : (bp || :wait)
-        rag_bp_timeout = Phronomy.configuration.backpressure_timeout
-        # Spawn all fetches concurrently. Results are returned in spawn order
-        # (i.e. registration order of knowledge sources) by TaskGroup#await_all.
-        sources.each do |ks|
-          group.spawn do
-            Phronomy::Runtime.instance.gate(:rag).acquire(on_full: rag_on_full, timeout: rag_bp_timeout) do
-              result, elapsed_ms = Phronomy::Runtime.measure_ms do
-                ks.fetch_async(
-                  query: query,
-                  cancellation_token: config[:cancellation_token],
-                  timeout: config[:rag_timeout]
-                ).await
-              end
-              Phronomy.configuration.logger&.debug { "RAG fetch from #{ks.class.name} completed in #{elapsed_ms}ms" }
-              result
-            end
-          end
-        end
+      # Removes the oldest messages one at a time until the count is within +limit+.
+      #
+      # @param messages [Array<RubyLLM::Message>] conversation history
+      # @param limit    [Integer] maximum number of messages to retain
+      # @return [Array<RubyLLM::Message>]
+      # @api public
+      def drop_messages_over(messages, limit:)
+        msgs = Array(messages).dup
+        msgs.shift while msgs.size > limit
+        msgs
+      end
+      protected :drop_messages_over
-        # await_all returns results in spawn order; nil entries indicate
-        # skipped failures when using :skip_failed.
-        group.await_all.flat_map { |chunks| Array(chunks) }
+      # Replaces all but the last +keep_tail+ messages with a single system summary.
+      # The block receives the dropped messages and must return a summary String.
+      #
+      # @param messages  [Array<RubyLLM::Message>] conversation history
+      # @param keep_tail [Integer] number of recent messages to preserve verbatim
+      # @yield  [Array<RubyLLM::Message>] the messages being summarised
+      # @yieldreturn [String] summary text
+      # @return [Array<RubyLLM::Message>]
+      # @api public
+      def compact_messages(messages, keep_tail:, &summariser)
+        msgs = Array(messages)
+        return msgs if msgs.size <= keep_tail
+        tail = msgs.last(keep_tail)
+        dropped = msgs.first(msgs.size - keep_tail)
+        summary_text = summariser.call(dropped)
+        [RubyLLM::Message.new(role: :system, content: summary_text)] + tail
       end
-      protected :fetch_knowledge_chunks
+      protected :compact_messages
-      # Runs the on_trim / on_compaction_trigger / on_compact pipeline on the
-      # supplied message array and returns the final Array of message objects
-      # ready to pass to the Assembler.
+      # Trims +messages+ to fit within +remaining+ tokens using the given
+      # +strategy+. Returns the trimmed message array without touching the
+      # assembler. The caller is responsible for passing the result to
+      # +assembler.add_messages+ and calling +assembler.build+.
       #
-      # Override this method in a subclass to customize how conversation
-      # history is filtered or compressed before context assembly.
+      # Supported strategies:
+      #   +:safe+ — discard the oldest message one at a time (default)
       #
-      # @param messages  [Array<RubyLLM::Message>] raw conversation history
-      # @param thread_id [String, nil] conversation thread identifier
-      # @param config    [Hash] additional invocation options
-      # @return [Array] filtered and/or compacted message objects
+      # @param messages  [Array<RubyLLM::Message>] conversation history
+      # @param remaining [Integer, nil] token allowance for messages; when +nil+
+      #   the messages are returned unchanged
+      # @param strategy  [Symbol] trim strategy (default +:safe+)
+      # @return [Array<RubyLLM::Message>]
       # @api public
-      def prepare_history(messages: [], thread_id: nil, config: {})
-        budget = build_token_budget
-        elements = build_message_elements(Array(messages))
-        if (trim_cb = self.class._on_trim_callback)
-          trim_ctx = Context::Conversation::TrimContext.new(message_elements: elements, budget: budget)
-          trim_cb.call(trim_ctx)
-          elements = trim_ctx.message_elements
+      def trim_to_budget(messages, remaining:, strategy: :safe)
+        return Array(messages) unless remaining
+        msgs = Array(messages)
+        loop do
+          used = msgs.sum { |m| LlmContextWindow::TokenEstimator.estimate(m.content.to_s) }
+          return msgs if used <= remaining
+          break if msgs.empty?
+          msgs = trim_messages(msgs, keep: msgs.size - 1)
         end
+        msgs
+      end
+      protected :trim_to_budget
-        if (trigger_cb = self.class._on_compaction_trigger_callback)
-          trigger_ctx = Context::Conversation::TriggerContext.new(message_elements: elements, budget: budget)
-          if trigger_cb.call(trigger_ctx)
-            if (compact_cb = self.class._on_compact_callback)
-              compact_ctx = Context::Conversation::CompactionContext.new(
-                message_elements: elements,
-                budget: budget,
-                thread_id: thread_id
-              )
-              compact_cb.call(compact_ctx)
-              elements = build_message_elements(compact_ctx.result_messages)
-            end
-          end
-        end
+      # Returns +true+ when the estimated token usage of +messages+ exceeds
+      # +threshold+ times the available context budget.
+      # Always returns +false+ when no token budget is available.
+      #
+      # @param messages  [Array<RubyLLM::Message>] conversation history
+      # @param threshold [Float] fraction of the available budget (default 0.8)
+      # @return [Boolean]
+      # @api public
+      def budget_exceeded?(messages, threshold: 0.8)
+        return false unless (b = build_token_budget)
+        total = Array(messages).sum { |m| LlmContextWindow::TokenEstimator.estimate(m.content.to_s) }
+        limit = b.available(used: 0)
+        total > limit * threshold
+      end
+      protected :budget_exceeded?
-        elements.map { |e| e[:message] }
+      # Registers a per-instance knowledge source. Knowledge chunks from all
+      # registered sources are included in every LLM call via +build_context+.
+      #
+      # @param source [#fetch] any object responding to +fetch(query:)+
+      # @return [void]
+      # @api public
+      def add_knowledge_source(source)
+        @instance_knowledge_sources ||= []
+        @instance_knowledge_sources << source
       end
-      protected :prepare_history
+      protected :add_knowledge_source
+      # Returns knowledge chunks fetched from all instance-level knowledge sources.
+      #
+      # @return [Array<Hash>]
+      # @api private
+      def instance_knowledge_chunks
+        return [] unless @instance_knowledge_sources
+        @instance_knowledge_sources.flat_map { |ks| ks.fetch(query: nil) }
+      end
+      protected :instance_knowledge_chunks
       # Performs a single (non-retried) invocation. Extracted so that #invoke can
       # wrap it in a retry loop without duplicating the LLM interaction logic.
       def invoke_once(input, messages: [], thread_id: nil, config: {})
-        caller_meta = {}
-        caller_meta[:user_id] = config[:user_id] if config[:user_id]
-        caller_meta[:session_id] = config[:session_id] if config[:session_id]
-        if (ic = config[:invocation_context])
-          caller_meta[:task_id] = ic.task_id if ic.task_id
-          caller_meta[:parent_task_id] = ic.parent_task_id if ic.parent_task_id
-        end
-        trace("agent.invoke", input: input, **caller_meta) do |_span|
+        trace("agent.invoke", input: input, **_build_caller_meta(config)) do |_span|
           Agent::InvocationPipeline.new(self).run(
             input,
             messages: messages,
@@ -939,6 +860,39 @@ module Phronomy
         end
       end
+      # Builds the caller metadata that is splatted into the "agent.invoke" trace.
+      #
+      # Reads +:user_id+ and +:session_id+ from +config+, plus +task_id+ and
+      # +parent_task_id+ from +config[:invocation_context]+ when one is given.
+      # Only entries with a truthy value are kept.
+      #
+      # @param config [Hash] invocation config
+      # @return [Hash{Symbol => Object}] subset of :user_id, :session_id,
+      #   :task_id, :parent_task_id
+      def _build_caller_meta(config)
+        invocation = config[:invocation_context]
+        candidates = {
+          user_id: config[:user_id],
+          session_id: config[:session_id],
+          task_id: (invocation.task_id if invocation),
+          parent_task_id: (invocation.parent_task_id if invocation)
+        }
+        # Drop nil/false entries so absent metadata never reaches the trace.
+        candidates.select { |_key, value| value }
+      end
+      # Transfers an assembled context onto +chat+.
+      #
+      # Applies +context[:system]+ through +apply_instructions+ when present,
+      # registers every entry of +context[:tool_classes]+ (after passing it
+      # through +prepare_tool_class+), and appends each of +context[:messages]+
+      # to +chat.messages+.
+      #
+      # @param chat [Object] chat responding to +with_tool+ and +messages+
+      # @param context [Hash] reads the :system, :tool_classes and :messages keys
+      def _apply_context_to_chat(chat, context)
+        system_text = context[:system]
+        apply_instructions(chat, system_text) if system_text
+        tool_classes = context[:tool_classes] || []
+        tool_classes.each do |tool_class|
+          chat.with_tool(prepare_tool_class(tool_class))
+        end
+        context[:messages].each { |message| chat.messages << message }
+      end
+      # Streams one LLM response and relays it to +block+ chunk by chunk.
+      #
+      # Starts +stream_async+ on the configured LLM adapter with a queue sized by
+      # +stream_queue_max_size+, emits a +:token+ StreamEvent for every chunk
+      # popped from that queue, and checks for cancellation after each one.
+      # A +nil+ chunk ends the loop; the pending result is then awaited.
+      #
+      # @param chat [Object] chat handed to the adapter
+      # @param user_message [Object] message handed to the adapter
+      # @param config [Object] invocation config, forwarded to the adapter and
+      #   to +check_cancellation!+
+      # @yieldparam event [StreamEvent] +:token+ event whose payload holds the chunk content
+      # @return [Array] response content followed by a TokenUsage built from
+      #   the response's tokens
+      def _drain_stream(chat, user_message, config, &block)
+        llm_adapter = Phronomy.configuration.llm_adapter
+        queue = Phronomy::Concurrency::AsyncQueue.new(
+          max_size: Phronomy.configuration.stream_queue_max_size
+        )
+        in_flight = llm_adapter.stream_async(chat, user_message, config: config, enqueue_to: queue)
+        until (piece = queue.pop).nil?
+          event = StreamEvent.new(type: :token, payload: {content: piece.content})
+          block.call(event)
+          check_cancellation!(config, "invocation cancelled during streaming")
+        end
+        final = in_flight.await
+        text = final.content
+        usage = Phronomy::TokenUsage.from_tokens(final.tokens)
+        [text, usage]
+      end
       # Builds a TokenBudget for this agent's model if possible.
       # When context_window is set at the class level, that value is used directly
       # (bypassing the RubyLLM catalogue) — useful for locally-hosted models where
@@ -965,54 +919,6 @@ module Phronomy
         nil
       end
-      # Converts a flat Array of message objects into the internal message_elements
-      # format used by TrimContext, TriggerContext, and CompactionContext.
-      # Each element receives a 0-based synthetic seq number.
-      #
-      # @param messages [Array] message-like objects with #role and #content
-      # @return [Array<Hash>]
-      # @api public
-      def build_message_elements(messages)
-        Array(messages).each_with_index.map do |msg, idx|
-          tokens = LlmContextWindow::TokenEstimator.estimate(msg.content.to_s)
-          {seq: idx, message: msg, tokens: tokens, role: msg.role}
-        end
-      end
-      # Builds (or returns a cached) system prompt text.
-      # The fingerprint is a SHA-256 digest of the instruction text concatenated
-      # with the content of every registered static knowledge source.
-      # When the fingerprint is unchanged the ContextVersionCache returns the
-      # previously assembled text without re-fetching any sources.
-      #
-      # @param input [String, Hash] the agent's current input (used for template evaluation)
-      # @return [String, nil] assembled system text, or nil when empty
-      # @api public
-      def build_cached_system_text(input)
-        instruction = build_instructions(input)
-        static_chunks = self.class.static_knowledge_chunks
-        fingerprint = Digest::SHA256.hexdigest(
-          [instruction.to_s, *static_chunks.map { |c| c[:content] }].join("\0")
-        )
-        cache = (@context_version_cache ||= LlmContextWindow::ContextVersionCache.new)
-        unless cache.valid?(fingerprint)
-          parts = [instruction]
-          static_chunks.each do |chunk|
-            parts << LlmContextWindow::Assembler.xml_tag(chunk[:content], type: chunk[:type], trusted: true)
-          end
-          cache.update(fingerprint: fingerprint, system_text: parts.compact.join("\n\n"))
-        end
-        # Persist a reference on the instance so that context_version_cache
-        # remains accessible after invoke completes.
-        @last_context_version_cache = cache
-        cache.system_text.empty? ? nil : cache.system_text
-      end
       # Returns the chat class to instantiate for this invocation.
       # When EventLoop mode is enabled ({Phronomy.configuration.event_loop}),
       # returns {ParallelToolChat} so that concurrent tool dispatch is enabled.
@@ -1039,10 +945,6 @@ module Phronomy
           RubyLLM.chat(**opts)
         end
         chat.with_temperature(t) if t
-        self.class.tools.each do |tool_class|
-          chat.with_tool(prepare_tool_class(tool_class))
-        end
-        _handoff_tools.each { |tc| chat.with_tool(tc) }
         chat
       end
@@ -1102,7 +1004,7 @@ module Phronomy
       # Builds the final tool class to register with the chat.
       #
       # When an already-instantiated tool object is passed (e.g. a
-      # {Phronomy::Tool::McpTool} returned by +McpTool.from_server+), it is
+      # {Phronomy::Tools::Mcp} returned by +Phronomy::Tools::Mcp.from_server+), it is
       # returned as-is.  RubyLLM's +with_tool+ accepts both classes and
       # instances, so no wrapping is needed.
       #
@@ -1110,7 +1012,7 @@ module Phronomy
       #   1. Alias override — when the Hash form of .tools maps this class to an
       #      explicit name, an anonymous subclass with that tool_name is returned.
       #   2. Scope policy   — when a scope is declared on the tool, the configured
-      #      {Phronomy::Tool::ScopePolicy} (or the default) is evaluated.
+      #      {Phronomy::Agent::Context::Capability::ScopePolicy} (or the default) is evaluated.
       #      +:reject+ wraps the tool to return a denial message without executing.
       #      +:approve+ behaves like requiring approval (same as step 3 when the
       #      tool does not already have +requires_approval+).
@@ -1120,7 +1022,7 @@ module Phronomy
       #      (tool_name, args) and, if it returns falsy, the tool returns a denial
       #      message instead of executing.
       def prepare_tool_class(tool_class)
-        # When an instantiated tool object is passed (e.g. McpTool.from_server
+        # When an instantiated tool object is passed (e.g. Phronomy::Tools::Mcp.from_server
         # returns an instance, not a class), skip class-level processing and
         # return it directly. RubyLLM#with_tool handles both forms.
         return tool_class unless tool_class.is_a?(Class)
@@ -1139,7 +1041,7 @@ module Phronomy
         # Step 2: evaluate scope policy.
         scope = resolved.scope
         if scope
-          policy = @scope_policy || Phronomy::Tool::ScopePolicy::DEFAULT
+          policy = @scope_policy || Phronomy::Agent::Context::Capability::ScopePolicy::DEFAULT
           decision = policy.call(resolved, scope, self)
           case decision
           when :reject