legion-llm 0.8.32 → 0.8.49

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (120)
  1. checksums.yaml +4 -4
  2. data/.gitignore +4 -0
  3. data/CHANGELOG.md +135 -8
  4. data/CLAUDE.md +27 -36
  5. data/Gemfile +25 -0
  6. data/README.md +125 -135
  7. data/legion-llm.gemspec +6 -9
  8. data/lib/legion/llm/api/anthropic/messages.rb +8 -10
  9. data/lib/legion/llm/api/auth.rb +2 -2
  10. data/lib/legion/llm/api/native/chat.rb +6 -2
  11. data/lib/legion/llm/api/native/helpers.rb +57 -77
  12. data/lib/legion/llm/api/native/inference.rb +3 -9
  13. data/lib/legion/llm/api/native/instances.rb +87 -0
  14. data/lib/legion/llm/api/native/models.rb +117 -0
  15. data/lib/legion/llm/api/native/offerings.rb +75 -0
  16. data/lib/legion/llm/api/native/providers.rb +57 -43
  17. data/lib/legion/llm/api/native/routing.rb +55 -0
  18. data/lib/legion/llm/api/openai/chat_completions.rb +13 -12
  19. data/lib/legion/llm/api/openai/embeddings.rb +1 -1
  20. data/lib/legion/llm/api/openai/models.rb +6 -40
  21. data/lib/legion/llm/api.rb +8 -0
  22. data/lib/legion/llm/audit.rb +1 -2
  23. data/lib/legion/llm/cache/response.rb +48 -23
  24. data/lib/legion/llm/cache.rb +47 -22
  25. data/lib/legion/llm/call/daemon_client.rb +6 -6
  26. data/lib/legion/llm/call/dispatch.rb +181 -58
  27. data/lib/legion/llm/call/embeddings.rb +112 -438
  28. data/lib/legion/llm/call/lex_llm_adapter.rb +230 -0
  29. data/lib/legion/llm/call/providers.rb +24 -500
  30. data/lib/legion/llm/call/registry.rb +94 -10
  31. data/lib/legion/llm/call/structured_output.rb +4 -4
  32. data/lib/legion/llm/call.rb +0 -3
  33. data/lib/legion/llm/compat.rb +8 -11
  34. data/lib/legion/llm/config.rb +18 -12
  35. data/lib/legion/llm/context/compressor.rb +5 -1
  36. data/lib/legion/llm/context/curator.rb +8 -5
  37. data/lib/legion/llm/discovery/memory_gate.rb +53 -0
  38. data/lib/legion/llm/discovery/rule_generator.rb +147 -0
  39. data/lib/legion/llm/discovery/system.rb +1 -1
  40. data/lib/legion/llm/discovery.rb +227 -46
  41. data/lib/legion/llm/fleet/dispatcher.rb +129 -33
  42. data/lib/legion/llm/fleet/handler.rb +54 -21
  43. data/lib/legion/llm/fleet/lane.rb +132 -0
  44. data/lib/legion/llm/fleet/reply_dispatcher.rb +51 -39
  45. data/lib/legion/llm/fleet.rb +1 -0
  46. data/lib/legion/llm/helper.rb +3 -9
  47. data/lib/legion/llm/hooks/budget_guard.rb +1 -3
  48. data/lib/legion/llm/hooks/rag_guard.rb +2 -4
  49. data/lib/legion/llm/hooks/reflection.rb +1 -2
  50. data/lib/legion/llm/hooks.rb +2 -2
  51. data/lib/legion/llm/inference/audit_publisher.rb +36 -12
  52. data/lib/legion/llm/inference/conversation.rb +3 -2
  53. data/lib/legion/llm/inference/enrichment_injector.rb +1 -3
  54. data/lib/legion/llm/inference/executor.rb +474 -255
  55. data/lib/legion/llm/inference/prompt.rb +13 -2
  56. data/lib/legion/llm/inference/response.rb +1 -1
  57. data/lib/legion/llm/inference/steps/classification.rb +68 -26
  58. data/lib/legion/llm/inference/steps/debate.rb +41 -23
  59. data/lib/legion/llm/inference/steps/gaia_advisory.rb +2 -2
  60. data/lib/legion/llm/inference/steps/metering.rb +9 -7
  61. data/lib/legion/llm/inference/steps/prompt_cache.rb +15 -12
  62. data/lib/legion/llm/inference/steps/rag_context.rb +30 -21
  63. data/lib/legion/llm/inference/steps/rbac.rb +1 -1
  64. data/lib/legion/llm/inference/steps/skill_injector.rb +16 -6
  65. data/lib/legion/llm/inference/steps/sticky_helpers.rb +21 -6
  66. data/lib/legion/llm/inference/steps/sticky_persist.rb +29 -8
  67. data/lib/legion/llm/inference/steps/sticky_runners.rb +6 -6
  68. data/lib/legion/llm/inference/steps/tier_assigner.rb +28 -9
  69. data/lib/legion/llm/inference/steps/tool_calls.rb +16 -3
  70. data/lib/legion/llm/inference/steps/tool_discovery.rb +27 -13
  71. data/lib/legion/llm/inference/steps/tool_history.rb +5 -1
  72. data/lib/legion/llm/inference/steps/trigger_match.rb +15 -5
  73. data/lib/legion/llm/inference.rb +139 -108
  74. data/lib/legion/llm/inventory.rb +403 -0
  75. data/lib/legion/llm/metering/tokens.rb +2 -6
  76. data/lib/legion/llm/metering/tracker.rb +10 -3
  77. data/lib/legion/llm/metering.rb +31 -9
  78. data/lib/legion/llm/quality/checker.rb +2 -2
  79. data/lib/legion/llm/quality/confidence/scorer.rb +6 -11
  80. data/lib/legion/llm/quality/shadow_eval.rb +7 -3
  81. data/lib/legion/llm/router/arbitrage.rb +1 -4
  82. data/lib/legion/llm/router/gateway_interceptor.rb +4 -5
  83. data/lib/legion/llm/router/health_tracker.rb +154 -46
  84. data/lib/legion/llm/router/resolution.rb +33 -10
  85. data/lib/legion/llm/router/rule.rb +9 -5
  86. data/lib/legion/llm/router.rb +210 -70
  87. data/lib/legion/llm/scheduling/batch.rb +1 -4
  88. data/lib/legion/llm/scheduling.rb +3 -6
  89. data/lib/legion/llm/settings.rb +121 -53
  90. data/lib/legion/llm/skills/base.rb +4 -1
  91. data/lib/legion/llm/skills/external_discovery.rb +2 -2
  92. data/lib/legion/llm/skills.rb +1 -1
  93. data/lib/legion/llm/tools/confidence.rb +44 -10
  94. data/lib/legion/llm/tools/dispatcher.rb +80 -39
  95. data/lib/legion/llm/tools.rb +0 -1
  96. data/lib/legion/llm/transport/exchanges/fleet.rb +1 -1
  97. data/lib/legion/llm/transport/message.rb +83 -9
  98. data/lib/legion/llm/transport/messages/fleet_error.rb +25 -16
  99. data/lib/legion/llm/transport/messages/fleet_request.rb +45 -1
  100. data/lib/legion/llm/transport/messages/fleet_response.rb +22 -15
  101. data/lib/legion/llm/transport/messages/prompt_event.rb +1 -1
  102. data/lib/legion/llm/transport/messages/skill_event.rb +1 -1
  103. data/lib/legion/llm/transport/messages/tool_event.rb +1 -1
  104. data/lib/legion/llm/types/tool_definition.rb +71 -0
  105. data/lib/legion/llm/types.rb +1 -0
  106. data/lib/legion/llm/version.rb +1 -1
  107. data/lib/legion/llm.rb +21 -18
  108. metadata +30 -74
  109. data/lib/legion/llm/bedrock_bearer_auth.rb +0 -4
  110. data/lib/legion/llm/call/bedrock_auth.rb +0 -53
  111. data/lib/legion/llm/call/bedrock_embeddings.rb +0 -270
  112. data/lib/legion/llm/call/claude_config_loader.rb +0 -172
  113. data/lib/legion/llm/call/codex_config_loader.rb +0 -132
  114. data/lib/legion/llm/discovery/ollama.rb +0 -96
  115. data/lib/legion/llm/discovery/vllm.rb +0 -114
  116. data/lib/legion/llm/inference/mcp_tool_adapter.rb +0 -5
  117. data/lib/legion/llm/inference/tool_adapter.rb +0 -13
  118. data/lib/legion/llm/patches/ruby_llm_parallel_tools.rb +0 -102
  119. data/lib/legion/llm/patches/ruby_llm_vllm.rb +0 -78
  120. data/lib/legion/llm/tools/adapter.rb +0 -101
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: f73cca457b602ab79cd3964a15333afe338a7ed5ef18a556f0e502da7817b4ef
- data.tar.gz: 706498c91924640dc31bf43e814a04f2880acb2930cf30abdf144c28cb02fd01
+ metadata.gz: 88524bf330ed22e6ac366d35ddccdbad4557b115c447d87d238ab2670ca8da7e
+ data.tar.gz: 68c0470f4c10ee885bd28d0486f487777ae3c9dc35ab0fea3cc4ab860c68a0c6
  SHA512:
- metadata.gz: 3b499fde4636085676157fdad1576ba60ed0149f2e9d126289b38399b4232ce64ed47e66250c7bf0dd01c87e15afff675c0d3f0ddc5d6d152ec17640af615fd2
- data.tar.gz: 5b26a4426f630a3e36dedd6bdeacae7b6a2283696f4a9771d5458bdd2c8735380984ee983a10c9483ea2cb373eaa5a4e54c0cc92cac5d1a0b04f5835a7920850
+ metadata.gz: ba9ad2293c9e65db838aca3921ba291ef3bfd71798fc044bfdcdb90f3a04e0536fa5c562f00ea3e06eda88c41ee1659d23ef46382c69424940b4ff058f5cd978
+ data.tar.gz: c7feaf54a2a618eb5d45741269ba65d855bb478142a3b90702d29de9d23f3bbceadca26b5eb765bedf6c2476b97901dc59cc597242051d9500df9733174532f4
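
The checksums above are the digests RubyGems records inside the package itself: a `.gem` file is a plain tar archive containing `metadata.gz`, `data.tar.gz`, and this `checksums.yaml` (gzipped). A minimal Ruby sketch for recomputing the SHA256 values, assuming the 0.8.49 gem has already been fetched and unpacked into the current directory:

```ruby
# Recompute the SHA256 digests listed in checksums.yaml.
# Assumes `gem fetch legion-llm -v 0.8.49` followed by
# `tar -xf legion-llm-0.8.49.gem` has left metadata.gz and
# data.tar.gz in the working directory.
require 'digest'

%w[metadata.gz data.tar.gz].each do |member|
  puts "#{member}: #{Digest::SHA256.file(member).hexdigest}"
end
```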
data/.gitignore CHANGED
@@ -1,6 +1,7 @@
  /.bundle/
  /.yardoc
  /Gemfile.lock
+ Gemfile.lock
  *.gem
  /_yardoc/
  /coverage/
@@ -20,3 +21,6 @@ legion.log
  .worktrees/
  .claude/
  docs/
+ bin/apollo-setup-postreboot.sh
+ bin/apollo-setup-prereboot.sh
+ legionio-bootstrap-uhg-v3.json
data/CHANGELOG.md CHANGED
@@ -1,5 +1,132 @@
  # Legion LLM Changelog

+ ## [0.8.49] - 2026-04-29
+
+ ### Changed
+ - `Settings.register_defaults!` now calls `Legion::Settings.register_library` instead of `merge_settings`, using the idempotent legion-settings 1.4.0 API that prevents double-registration.
+ - Bumped `legion-settings` dependency floor to `>= 1.4.0`.
+ - Test stub `Legion::Settings` now exposes `register_library` matching the real 1.4.0 API.
+
+ ## [0.8.48] - 2026-04-29
+
+ ### Added
+ - `ToolDefinition.from_registry_entry` builds tool definitions from `Legion::Settings::Extensions` registry entries.
+ - `Dispatcher` checks `Settings::Extensions` for tool override resolution; when no matching entry is found it falls back to settings-based MCP overrides (no `Tools::Registry` or `Catalog::Registry` fallback).
+ - `Executor#add_registry_tool_definitions` reads from `Settings::Extensions` when available, falling back to `Legion::Tools::Registry` for backward compatibility.
+ - `Steps::ToolDiscovery` discovers tools from `Settings::Extensions` when available, falling back to `Legion::Tools::Registry`.
+
+ ### Changed
+ - Bumped `legion-settings` dependency floor to `>= 1.4.0` (requires `Settings::Extensions` module).
+
+ ## [0.8.47] - 2026-04-29
+
+ ### Fixed
+ - Tool-bearing native inference now forwards native tool definitions and runs a bounded native tool loop through `Call::Dispatch` and `Inference::ToolDispatcher`.
+ - OpenAI-compatible, Anthropic-compatible, and native API tool declarations now use provider-neutral native tool definitions.
+ - Embedding generation and discovery health checks now route through native `Call::Dispatch.dispatch_embed`.
+
+ ## [0.8.46] - 2026-04-29
+
+ ### Fixed
+ - Pipeline native dispatch now rejects tool-bearing requests that lack native tool-loop support instead of silently dropping tools.
+ - Escalation exhaustion now raises `EscalationExhausted` consistently when RubyLLM is unavailable, preserving the RubyLLM-absent native failure details in the error message.
+
+ ## [0.8.45] - 2026-04-29
+
+ ### Added
+ - Native LLM API now exposes model-offering inventory at `/api/llm/offerings` and provider-instance inventory at `/api/llm/instances`, matching the routing redesign metadata surface.
+
+ ## [0.8.44] - 2026-04-28
+
+ ### Fixed
+ - Native LLM API caller metadata now uses the unified `Legion::Identity::Request`/`Process` identity path instead of ad-hoc request waterfalls.
+ - Identity broker credential lookups now include `purpose:` and `context:` metadata so credential grants are auditable.
+ - Fleet request expiration no longer logs warnings for omitted or blank TTL values, and unified identity caller metadata accepts string-keyed `requested_by` hashes.
+
+ ## [0.8.43] - 2026-04-28
+
+ ### Changed
+ - RubyLLM is now loaded as an optional compatibility layer instead of a router runtime dependency, and native provider dispatch no longer falls back to RubyLLM by default.
+ - Non-pipeline chat, escalation, provider probes, startup defaults, discovery checks, and compatibility API tool builders now degrade cleanly when RubyLLM is unavailable, routing native direct calls where a registered native provider exists and raising `ProviderError` instead of `NameError` for RubyLLM-only paths.
+
+ ## [0.8.42] - 2026-04-28
+
+ ### Fixed
+ - LLM provider settings snapshots now initialize their fallback source before deep-copying settings, preventing an uninitialized local fallback when snapshot duplication raises.
+
+ ## [0.8.41] - 2026-04-28
+
+ ### Fixed
+ - The router gemspec now depends on the provider-neutral `lex-llm` base instead of installing legacy provider gems (`lex-bedrock`, `lex-claude`, `lex-gemini`, and `lex-openai`) as runtime dependencies.
+
+ ## [0.8.40] - 2026-04-28
+
+ ### Fixed
+ - Streaming inference now uses native lex-llm provider dispatch when the provider layer selects native mode.
+ - Startup discovery now refreshes local system facts independently of Ollama model refresh, and metering publish now loads its transport message class when connected instead of depending on prior boot order.
+
+ ## [0.8.39] - 2026-04-28
+
+ ### Fixed
+ - OpenAI-compatible, Anthropic-compatible, and native chat API routes now use the same server-resolved caller identity metadata as native inference, preserving audit and metering identity fields across compatibility routes.
+
+ ## [0.8.38] - 2026-04-28
+
+ ### Fixed
+ - `require 'legion/llm'` now loads `legion-settings` when the host has not already loaded `Legion::Settings`, preserving standalone settings defaults and override behavior during LegionIO load-phase initialization.
+
+ ## [0.8.37] - 2026-04-28
+
+ ### Fixed
+ - Router resolutions, health tracking, inventory, native dispatch, inference responses, audit, and metering now preserve optional lex-llm model offering metadata while keeping provider/model fallback behavior compatible with older callers.
+ - Inventory now consumes lex-llm 0.1.5 `ModelOffering`-style fields from configured settings or native provider adapters when available, exposing offering IDs, model families, canonical aliases, provider instances, and routing metadata without credentials.
+
+ ## [0.8.36] - 2026-04-28
+
+ ### Fixed
+ - Fleet transport now publishes requests through the `llm.fleet` exchange, keeps provider/model in fleet message bodies for workers, and publishes handler replies through the mandatory confirmed fleet response message path.
+ - Inventory now exposes exact offering lanes and non-secret offering metadata so provider instances can opt into offering-level routing without losing shared fleet lane compatibility.
+
+ ## [0.8.35] - 2026-04-28
+
+ ### Fixed
+ - Fleet lane routing now rejects sensitive or oversized public boundary, eligibility, and offering-instance segments before they can enter routing keys.
+ - Lex-llm native provider adapters now memoize provider instances and cover streaming, token-count estimation, provider failures, missing namespaces, and non-hash message inputs.
+ - Local Ollama/vLLM health probes now distinguish malformed base URL configuration from ordinary unreachable services.
+ - Inventory failures now re-raise programmer/config-shape errors instead of silently returning an empty model list.
+ - Fleet reply dispatch logging now includes operation tags, JSON parse failures are logged at warning level, and unwired broker return/nack helpers were removed so pending replies rely on the documented timeout path.
+ - The vLLM thinking patch now rescues only expected settings-shape errors instead of swallowing all runtime failures.
+
+ ## [0.8.34] - 2026-04-28
+
+ ### Fixed
+ - Native lex-llm provider dispatch now preserves injected system instructions when routing through `LexLLMAdapter`.
+ - Lex-llm bridge configuration now normalizes OpenAI-compatible `/v1` base URLs for `vllm` and `openai` providers while preserving versioned non-OpenAI-compatible endpoints.
+
+ ## [0.8.33] - 2026-04-27
+
+ ### Fixed
+ - `legion-llm` can now bridge loaded `lex-llm-*` provider classes into native dispatch through a `LexLLMAdapter`, allowing the new provider-gem split to participate without duplicating old `lex-*` runner constants.
+ - Provider-layer defaults now prefer `auto` dispatch and include the new `ollama`, `vllm`, `anthropic`, `openai`, `gemini`, and `mlx` native provider names.
+ - Inventory now recognizes `mlx` as a local HTTP provider.
+ - Fleet dispatch now registers reply futures before publishing requests, consumes structured publish results, fails fast on unroutable/nacked/confirm-timeout publishes, and validates reply metadata before fulfilling pending requests.
+ - `FleetRequest` now opts out of live-request spooling, requires mandatory publish and publisher confirms by default, and includes reply routing fields in the worker payload.
+ - `FleetResponse` and `FleetError` now publish live replies with mandatory routing, publisher confirms, and no spool/replay.
+ - LLM transport message IDs are memoized per message instance so AMQP return/confirm correlation sees the same `message_id` that was published.
+ - Embedding provider/model resolution, provider disable gates, prefix injection, fallback chains, Azure settings, Ollama base URLs, and metering caller context now honor JSON/string-keyed settings in addition to symbol-keyed runtime settings.
+ - Discovery for Ollama, vLLM, and embedding fallback chains now honors JSON/string-keyed provider, embedding, base URL, model metadata, and refresh TTL settings.
+ - Inference executor routing defaults, conversation compaction, pipeline escalation, native dispatch, async post-step, telemetry span, tool-loop, and fallback-provider settings now honor JSON/string-keyed settings.
+ - Module-level inference, prompt dispatch, prompt-cache, debate, and skill-injector settings now honor JSON/string-keyed settings.
+ - Sticky tool history, trigger matching, and RAG context settings now honor JSON/string-keyed settings.
+ - Shared settings helpers now normalize string and symbol keys across router, fleet, scheduling, response cache, API auth/defaults, metering, quality, guards, skills, discovery, inventory, daemon, config loaders, and audit checks.
+ - Shared settings helpers now register defaults through `Legion::Settings.merge_settings(:llm, ...)` and read directly from the canonical `Legion::Settings[:llm]` store so JSON-loaded settings files and runtime overrides remain authoritative.
+ - LLM cache, response-cache, and tool-confidence paths now prefer connected local cache backends while preserving shared cache fallback behavior.
+ - Boot, compatibility, Bedrock embedding, transport-connected, identity, RBAC, and API helper paths now use shared LLM settings/logging helpers instead of direct `Legion::Settings`, `Legion::Logging`, and `Legion::Cache` calls.
+ - LLM transport messages now promote tracing metadata into W3C `traceparent`, `baggage`, and Legion trace headers for fleet/audit/metering correlation.
+ - Fleet dispatch replies now avoid request-side metadata gates by default and expose both `model` and `model_id` so downstream metering and metadata readers can resolve the model consistently.
+ - Fleet lane sanitization and vLLM health URL normalization now avoid regex patterns flagged by CodeQL for uncontrolled input.
+ - The lex-llm provider bridge now loads only the Legion-native `Legion::Extensions::Llm` namespace and no longer probes removed fork-era entrypoints.
+
  ## [0.8.32] - 2026-04-27

  ### Fixed
@@ -37,7 +164,7 @@
  ## [0.8.28] - 2026-04-24

  ### Fixed
- - Model/provider mismatch when clients send a model name (e.g., `qwen3.5:latest`) without an explicit provider. The fallback paths blindly paired it with `default_provider` (typically `bedrock`), causing `RubyLLM::ModelNotFoundError`. Now infers the correct provider from model naming patterns before falling back to the global default.
+ - Model/provider mismatch when clients send a model name (e.g., `qwen3.5:latest`) without an explicit provider. The routing paths blindly paired it with `default_provider` (typically `bedrock`), causing provider model lookup failures. Now infers the correct provider from model naming patterns before using the global default.
  - `arbitrage_fallback` hardcoded `:cloud` tier and `:bedrock` provider when inference failed. Now uses `PROVIDER_TIER` to resolve the correct tier for the inferred provider.

  ### Added
@@ -82,7 +209,7 @@
  ## [0.8.23] - 2026-04-23

  ### Fixed
- - `Call::StructuredOutput` prompt-fallback path passed `messages:` (plural) to `chat_single` which only accepts `message:` (singular), leaking the unknown kwarg into `RubyLLM::Chat.new`. Visible as repeated "unknown keyword: :messages" warnings during dream cycle contradiction detection. Flattened instruction + messages into a single string via `extract_user_content`.
+ - `Call::StructuredOutput` parse-retry path passed `messages:` (plural) to `chat_single` which only accepts `message:` (singular), leaking the unknown kwarg into the provider chat call. Visible as repeated "unknown keyword: :messages" warnings during dream cycle contradiction detection. Flattened instruction + messages into a single string via `extract_user_content`.

  ## [0.8.22] - 2026-04-22

@@ -132,7 +259,7 @@
  ## [0.8.16] - 2026-04-22

  ### Fixed
- - `RubyLLM::BadRequestError` (HTTP 400) and `RubyLLM::ContextLengthExceededError` now trigger the provider fallback-retry chain instead of bubbling up as unhandled 500s. Both `run_provider_call_single` and `step_provider_call_stream` retry on the next available provider before giving up.
+ - Provider bad-request and context-length errors now trigger the provider retry chain instead of bubbling up as unhandled 500s. Both `run_provider_call_single` and `step_provider_call_stream` retry on the next available provider before giving up.
  - Resolved provider/model is now logged (`log.info`) in `step_routing` so provider errors can be diagnosed from daemon logs without relying on SSE done events.

  ### Changed
@@ -448,7 +575,7 @@
  - `started_at` timestamp stored in `Thread.current[:legion_current_tool_started_at]` for accurate per-call wall-clock duration even across parallel threads

  ### Changed
- - `MAX_RUBY_LLM_TOOL_ROUNDS` constant raised from `25` to `200` (now serves as a fallback default for the configurable `max_tool_rounds` setting)
+ - Tool-loop round cap raised from `25` to `200` for the configurable `max_tool_rounds` setting.

  ### Fixed
  - `ConversationStore#db_append_message` now serializes non-String `content` values (e.g., tool-call arrays) to JSON before writing to the database, preventing Sequel type errors when tool-use messages are persisted
@@ -493,7 +620,7 @@
  ### Added
  - Per-step pipeline timing diagnostics: `[pipeline][timing]` log line with duration per step
  - Pre-pipeline timing in inference route: `gaia_ingest`, `pre_pipeline_setup`, `executor_call` durations
- - `MAX_RUBY_LLM_TOOL_ROUNDS` (25) — caps RubyLLM's unbounded tool-use loop to prevent infinite cycling
+ - Tool-loop round cap (25) to prevent infinite cycling
  - `install_tool_loop_guard` applied to both streaming and non-streaming provider paths

  ### Fixed
@@ -639,9 +766,9 @@
  - `Legion::LLM::ProviderRegistry` — thread-safe registry for native lex-* provider extensions: `register(name, ext)`, `for(name)`, `available`, `registered?(name)`, `reset!`; cleared automatically on `Legion::LLM.shutdown` (closes #37)
  - `Legion::LLM::NativeDispatch` — native provider dispatch layer: `dispatch_chat`, `dispatch_embed`, `dispatch_stream`, `dispatch_count_tokens` route calls to registered lex-* extension modules and return standardized `{ result:, usage: Usage }` hashes; raises `ProviderError` when provider is not registered (closes #37)
  - `Legion::LLM::NativeResponseAdapter` — adapter wrapping native dispatch result hash to expose the same `.content`, `.input_tokens`, `.output_tokens`, `.usage` interface as a RubyLLM response object (closes #37)
- - `provider_layer` settings section: `mode` (`'ruby_llm'` default / `'native'` / `'auto'`), `native_providers` (default `['claude', 'bedrock']`), `fallback_to_ruby_llm` (default `true`); `ruby_llm` mode preserves all existing behavior unchanged (closes #37)
+ - `provider_layer` settings section: `mode` (`'native'` / `'auto'`) and `native_providers` (default `['claude', 'bedrock']`) for native provider dispatch (closes #37)
  - Auto-registration in `Legion::LLM.start`: detects loaded lex-* extensions via `Object.const_defined?` and registers them — `lex-claude` → `:claude`/`:anthropic`, `lex-bedrock` → `:bedrock`, `lex-openai` → `:openai`, `lex-gemini` → `:gemini`; no hard dependencies added (closes #37)
- - `Pipeline::Executor` provider layer integration: `use_native_dispatch?` checks `provider_layer.mode`; `execute_provider_request_native` calls `NativeDispatch.dispatch_chat` and wraps result in `NativeResponseAdapter`, falls back to RubyLLM when `fallback_to_ruby_llm: true`; `execute_provider_request_ruby_llm` is the extracted RubyLLM path (default, no behavior change) (closes #37)
+ - `Pipeline::Executor` provider layer integration: `use_native_dispatch?` checks `provider_layer.mode`; `execute_provider_request_native` calls `NativeDispatch.dispatch_chat` and wraps result in `NativeResponseAdapter` (closes #37)
  - Optional adversarial debate pipeline step for high-stakes decisions (closes #28): `Pipeline::Steps::Debate` runs a multi-round advocate/challenger/judge debate after `provider_call`; the initial response is the advocate, a challenger model critiques it, the advocate rebuts, and a judge model synthesizes all sides into the final response; activation via `debate: true` in `chat()` kwargs, or `Legion::Settings[:llm][:debate][:enabled]`, or GAIA auto-trigger when `gaia_auto_trigger: true` and `high_stakes`/`debate_recommended` are set in the advisory enrichment; debate is disabled by default; GAIA auto-trigger defaults to false in v0.6.0; different models are required for each role (advocate, challenger, judge) to avoid training bias — model rotation picks from enabled providers automatically when not explicitly configured; model strings use `provider:model` format; all LLM calls use `chat_direct` to avoid pipeline recursion; configurable via `debate.default_rounds` (default 1), `debate.max_rounds` (cap, default 3), `debate.advocate_model`, `debate.challenger_model`, `debate.judge_model`, `debate.model_selection_strategy` (default `'rotate'`); debate metadata (`enabled`, `rounds`, `advocate_model`, `challenger_model`, `judge_model`, `advocate_summary`, `challenger_summary`, `judge_confidence`) stored in `enrichments['debate:result']`; gracefully degrades to single-model mode with a warning when fewer than 2 models are available
  - Async context curation (`Legion::LLM::ContextCurator`): keeps LLM context lean without compaction (closes #38). Heuristic curation runs async in `Thread.new` after each `step_context_store` — zero latency impact. Curated messages are used in `step_context_load` when available, falling back to raw history. Heuristic pipeline: `strip_thinking` removes `<thinking>` blocks; `distill_tool_result` summarizes large tool outputs by tool type (`read_file` → line count + first/last, `search`/`grep` → match counts, `bash` → exit code + last lines, default → char count + preview); `fold_resolved_exchanges` detects multi-turn clarification reaching agreement and folds to a system note; `evict_superseded` keeps only the latest read of each file path; `dedup_similar` removes near-duplicate messages via Jaccard similarity (delegates to `Compressor.deduplicate_messages`). LLM-assisted mode is built but off by default (`llm_assisted: false`); when enabled with `mode: 'llm_assisted'`, a configurable small/fast model produces better summaries with automatic fallback to heuristic on any error. All behavior gated by `Legion::Settings[:llm][:context_curation]`: `enabled` (default `true`), `mode` (`'heuristic'`), `llm_assisted` (`false`), `llm_model` (`nil`), `tool_result_max_chars` (2000), `thinking_eviction` (`true`), `exchange_folding` (`true`), `superseded_eviction` (`true`), `dedup_enabled` (`true`), `dedup_threshold` (0.85), `target_context_tokens` (40000).
  - Message chain architecture with parent links and sidechain support in `ConversationStore` (closes #39): every message now carries `id` (UUID), `parent_id`, `sidechain` (default `false`), `message_group_id`, and `agent_id` fields; `build_chain(conversation_id, include_sidechains: false)` reconstructs ordered message history from parent links with rooted-leaf selection, parallel sibling recovery via `message_group_id`, and orphan appending; `sidechain_messages(conversation_id, agent_id: nil)` queries background/subagent messages with optional agent filter; `branch(conversation_id, from_message_id:)` creates a new conversation by copying history up to the given message; `store_metadata` / `read_metadata` provide tail-window session metadata storage; `migrate_parent_links!` backfills parent links on pre-migration sequential data; `messages()` backward-compatible flat array uses chain reconstruction when parent links are present, seq ordering otherwise; DB persistence adds `message_id`, `parent_id`, `sidechain`, `message_group_id`, `agent_id` columns when present (graceful degradation without migration)
@@ -1188,7 +1315,7 @@
  ### Added
  - `ResponseCache` module for async response delivery via memcached with spool overflow at 8MB
  - `DaemonClient` module for HTTP routing to LegionIO daemon with health caching (30s TTL)
- - `Legion::LLM.ask` one-shot method: daemon-first routing with direct RubyLLM fallback
+ - `Legion::LLM.ask` one-shot method: daemon-first routing with direct provider execution
  - `DaemonDeniedError` and `DaemonRateLimitedError` error classes
  - Daemon settings: `daemon.url` and `daemon.enabled` in defaults
  - HTTP status code contract: 200 (cached), 201 (sync), 202 (async poll), 403, 429, 503
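
To make the 0.8.49 entry concrete, here is a minimal sketch of the idempotent registration pattern it describes. This is not the gem's source; the `register_library(key, defaults)` signature and the in-memory store are assumptions modeled on the changelog wording and its note about the test stub:

```ruby
# Hypothetical stand-in for the legion-settings >= 1.4.0 API, mirroring the
# changelog's note that specs stub Legion::Settings with a register_library method.
module Legion
  module Settings
    @store = {}

    # Idempotent: the first registration of a key wins and repeat calls are
    # no-ops, which is what prevents double-registration when defaults are
    # registered at both gem load and Legion::LLM.start.
    def self.register_library(key, defaults)
      @store[key] ||= defaults
    end

    def self.[](key)
      @store[key]
    end
  end
end

llm_defaults = { providers: {}, daemon: { enabled: false } } # illustrative subset

Legion::Settings.register_library(:llm, llm_defaults)
Legion::Settings.register_library(:llm, { daemon: { enabled: true } }) # no-op
p Legion::Settings[:llm] # => {:providers=>{}, :daemon=>{:enabled=>false}}
```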
data/CLAUDE.md CHANGED
@@ -5,10 +5,10 @@

  ## Purpose

- Core LegionIO gem providing LLM capabilities to all extensions. Wraps ruby_llm to provide a consistent interface for chat, embeddings, tool use, and agents across multiple providers (Bedrock, Anthropic, OpenAI, Gemini, Ollama). Includes a dynamic weighted routing engine that dispatches requests across local, fleet, and cloud tiers based on caller intent, priority rules, time schedules, cost multipliers, and real-time provider health.
+ Core LegionIO gem providing LLM capabilities to all extensions through Legion-native provider dispatch. Includes a dynamic weighted routing engine that dispatches requests across local, fleet, and cloud tiers based on caller intent, priority rules, time schedules, cost multipliers, and real-time provider health.

  **GitHub**: https://github.com/LegionIO/legion-llm
- **Version**: 0.8.0
+ **Version**: 0.8.49
  **License**: Apache-2.0

  ## Architecture
@@ -37,8 +37,6 @@ Legion::LLM.start

  ```
  Legion::LLM (lib/legion/llm.rb) # Thin facade — delegates to Inference, Call, Discovery
- ├── Patches # Monkey-patches for upstream gems
- │ └── RubyLLMParallelTools # Parallel tool execution patch for RubyLLM
  ├── Errors # Typed error hierarchy (LLMError base + subtypes, retryable?)
  │ └── EscalationExhausted / DaemonDeniedError / DaemonRateLimitedError / AuthError /
  │ RateLimitError / ContextOverflow / ProviderError / ProviderDown /
@@ -57,14 +55,16 @@ Legion::LLM (lib/legion/llm.rb) # Thin facade — delegates to Inferenc
  │ ├── Embeddings # generate, generate_batch, default_model, fallback chain
  │ ├── StructuredOutput # JSON schema enforcement with native response_format and prompt fallback
  │ ├── DaemonClient # HTTP routing to LegionIO daemon with 30s health cache
- │ ├── BedrockAuth # Monkey-patch for Bedrock Bearer Token auth (required lazily)
  │ ├── ClaudeConfigLoader # Import Claude CLI config from ~/.claude/settings.json
  │ └── CodexConfigLoader # Import OpenAI bearer token from ~/.codex/auth.json
  ├── Context # Prompt and conversation context management
  │ ├── Compressor # Deterministic prompt compression (3 levels, code-block-aware)
  │ └── Curator # Async conversation curation: strip thinking, distill tools, fold resolved exchanges
  ├── Discovery # Runtime introspection
- │ ├── Ollama # Queries Ollama /api/tags for pulled models (TTL-cached)
+ │ ├── Ollama # Multi-instance Ollama /api/tags + /api/show discovery (TTL-cached)
+ │ ├── Vllm # Multi-instance vLLM /v1/models + /health discovery (TTL-cached)
+ │ ├── RuleGenerator # Auto-generates routing rules from discovered instances/models
+ │ ├── MemoryGate # Checks available RAM before routing to local models
  │ └── System # Queries OS memory: macOS (vm_stat/sysctl), Linux (/proc/meminfo)
  ├── Quality # Response quality evaluation
  │ ├── Checker # Quality heuristics (empty, too_short, repetition, json_parse) + pluggable (was QualityChecker)
@@ -86,12 +86,10 @@ Legion::LLM (lib/legion/llm.rb) # Thin facade — delegates to Inferenc
  │ ├── Executor # 18-step skeleton with profile-aware execution and call_stream
  │ ├── Conversation # In-memory LRU (256 slots) + optional Sequel DB persistence (was ConversationStore)
  │ ├── Prompt # Clean dispatch API: dispatch, request, summarize, extract, decide
- │ ├── ToolAdapter # Wraps Tools::Base for RubyLLM sessions (McpToolAdapter kept as alias)
- │ ├── ToolDispatcher # Routes tool calls: MCP client / LEX runner / RubyLLM builtin
+ │ ├── ToolDispatcher # Routes tool calls: MCP client / LEX runner / native execution
  │ ├── AuditPublisher # Publishes audit events to llm.audit exchange
  │ ├── EnrichmentInjector # Converts RAG/GAIA enrichments into system prompt
  │ ├── GaiaCaller # Gaia-specific chat dispatch with phase/tick tracing
- │ ├── McpToolAdapter # Backward-compat alias for ToolAdapter
  │ └── Steps/ # All 18+ pipeline step modules
  │ ├── Metering, Billing, TokenBudget, PromptCache, Classification, Rbac
  │ ├── GaiaAdvisory, TierAssigner, TriggerMatch, ToolDiscovery, McpDiscovery, RagContext
@@ -148,9 +146,8 @@ Legion::LLM (lib/legion/llm.rb) # Thin facade — delegates to Inferenc
  │ └── OffPeak # Peak-hour deferral (delegates to Scheduling)
  ├── Tools # Tool call layer
  │ ├── Confidence # 4-tier degrading confidence storage (was OverrideConfidence)
- │ ├── Dispatcher # Routes tool calls to MCP/LEX/RubyLLM
+ │ ├── Dispatcher # Routes tool calls to MCP/LEX/native execution
  │ ├── Interceptor # Extensible pre-dispatch intercept registry
- │ ├── Adapter # Wraps lex-* extension tool as RubyLLM::Tool
  │ └── Interceptors/
  │ └── PythonVenv # Redirects python3/pip3 tool calls to isolated venv
  ├── Hooks # before/after chat interceptor registry
@@ -286,7 +283,6 @@ All compatibility routes normalize requests through `API::Translators` (OpenAIRe

  | Gem | Purpose |
  |-----|---------|
- | `ruby_llm` (>= 1.0) | Multi-provider LLM client |
  | `tzinfo` (>= 2.0) | IANA timezone conversion for schedule windows |
  | `legion-logging` | Logging |
  | `legion-settings` | Configuration |
@@ -371,24 +367,21 @@ Settings read from `Legion::Settings[:llm]`:

  ### Provider Settings

- Each provider has: `enabled`, `api_key`, `vault_path`, plus provider-specific keys.
+ Provider defaults now live in each `lex-llm-*` provider extension's `default_settings`. The `providers:` key in `Settings.default` ships as an empty hash; settings files and extension registrations populate it at runtime. Each provider has: `enabled`, `api_key`, plus provider-specific keys.

- Vault credential resolution: When `vault_path` is set and Legion::Crypt::Vault is connected, credentials are fetched from Vault at startup. Keys map to provider-specific fields automatically.
+ Local/fleet providers (Ollama, vLLM, MLX) support multi-instance configs via an `instances:` hash. Discovery scans all instances in parallel, enriches models with real capability metadata, and generates per-instance routing rules.

- Bedrock supports two auth modes:
- - **SigV4** (default): `api_key` + `secret_key` (+ optional `session_token`)
- - **Bearer token**: `bearer_token` for AWS Identity Center/SSO. When set, `bedrock_bearer_auth.rb` is required lazily to monkey-patch RubyLLM's Bedrock provider.
+ ### Capability-Aware Routing

- ### Auto-Detection Priority
+ Routing rules carry `model_capabilities`, `context_length`, and `parameter_count` from provider-supplied `Model::Info`. The `RuleGenerator` creates rules from discovered instances without a static capability map -- each provider supplies real metadata.
+
+ ### Memory Gate

- When no defaults are configured, the first enabled provider is used:
+ `Discovery::MemoryGate` checks available system memory before routing to local models. Models that exceed available RAM minus `discovery.memory_floor_mb` are silently skipped.
+
+ ### Auto-Detection Priority

- 1. Bedrock -> `us.anthropic.claude-sonnet-4-6-v1`
- 2. Anthropic -> `claude-sonnet-4-6`
- 3. OpenAI -> `gpt-4o`
- 4. Gemini -> `gemini-2.0-flash`
- 5. Azure -> (endpoint-specific, from `api_base`)
- 6. Ollama -> `llama3`
+ When no defaults are configured, the first enabled provider is used. Detection order and default models are defined by each `lex-llm-*` provider extension.

  ### Routing Settings

@@ -485,7 +478,6 @@ In-memory signal consumer with pluggable handlers. Adjusts effective priorities
  | Path | Purpose |
  |------|---------|
  | `lib/legion/llm.rb` | Thin facade: start, shutdown, delegates to Inference/Call/Discovery |
- | `lib/legion/llm/patches/ruby_llm_parallel_tools.rb` | Monkey-patch for RubyLLM parallel tool execution |
  | `lib/legion/llm/compat.rb` | Backward-compat aliases via const_missing with deprecation warnings |
  | `lib/legion/llm/errors.rb` | Typed error hierarchy: LLMError base + all subtypes, retryable? predicate |
  | `lib/legion/llm/version.rb` | Version constant |
@@ -503,14 +495,16 @@ In-memory signal consumer with pluggable handlers. Adjusts effective priorities
  | `lib/legion/llm/call/embeddings.rb` | generate, generate_batch, fallback chain, dimension enforcement |
  | `lib/legion/llm/call/structured_output.rb` | JSON schema enforcement with native response_format and prompt fallback |
  | `lib/legion/llm/call/daemon_client.rb` | HTTP routing to LegionIO daemon with 30s health cache |
- | `lib/legion/llm/call/bedrock_auth.rb` | Monkey-patch for Bedrock Bearer Token auth — required lazily |
  | `lib/legion/llm/call/claude_config_loader.rb` | Import Claude CLI config from ~/.claude/settings.json |
  | `lib/legion/llm/call/codex_config_loader.rb` | Import OpenAI bearer token from ~/.codex/auth.json |
  | `lib/legion/llm/context.rb` | Context entry point |
  | `lib/legion/llm/context/compressor.rb` | Deterministic prompt compression: 3 levels, code-block-aware, stopword removal |
  | `lib/legion/llm/context/curator.rb` | Async heuristic conversation curation (was ContextCurator) |
  | `lib/legion/llm/discovery.rb` | Discovery entry point: run, detect_embedding_capability, can_embed? |
- | `lib/legion/llm/discovery/ollama.rb` | Ollama /api/tags discovery with TTL cache |
+ | `lib/legion/llm/discovery/ollama.rb` | Multi-instance Ollama /api/tags + /api/show discovery with TTL cache |
+ | `lib/legion/llm/discovery/vllm.rb` | Multi-instance vLLM /v1/models + /health discovery with TTL cache |
+ | `lib/legion/llm/discovery/rule_generator.rb` | Auto-generates routing rules from discovered instances/models |
+ | `lib/legion/llm/discovery/memory_gate.rb` | Checks available RAM vs model size before routing to local models |
  | `lib/legion/llm/discovery/system.rb` | OS memory introspection (macOS + Linux) with TTL cache |
  | `lib/legion/llm/quality.rb` | Quality entry point |
  | `lib/legion/llm/quality/checker.rb` | Quality heuristics + pluggable callable (was QualityChecker) |
@@ -524,19 +518,17 @@ In-memory signal consumer with pluggable handlers. Adjusts effective priorities
  | `lib/legion/llm/metering/tokens.rb` | Thread-safe per-session token budget accumulator (was TokenTracker) |
  | `lib/legion/llm/inference.rb` | Inference entry point: requires all pipeline components |
  | `lib/legion/llm/inference/request.rb` | Inference::Request Data.define struct with .build and .from_chat_args |
- | `lib/legion/llm/inference/response.rb` | Inference::Response Data.define struct with .build, .from_ruby_llm, #with |
+ | `lib/legion/llm/inference/response.rb` | Inference::Response Data.define struct with .build, .from_provider_message, #with |
  | `lib/legion/llm/inference/profile.rb` | Inference::Profile: caller-derived profiles for step skipping |
  | `lib/legion/llm/inference/tracing.rb` | Inference::Tracing: trace_id, span_id, exchange_id generation |
  | `lib/legion/llm/inference/timeline.rb` | Inference::Timeline: ordered event recording with participant tracking |
  | `lib/legion/llm/inference/executor.rb` | Inference::Executor: 18-step skeleton with profile-aware execution and call_stream |
  | `lib/legion/llm/inference/conversation.rb` | In-memory LRU (256 slots) + optional Sequel DB persistence (was ConversationStore) |
  | `lib/legion/llm/inference/prompt.rb` | Prompt dispatch API: dispatch, request, summarize, extract, decide |
- | `lib/legion/llm/inference/tool_adapter.rb` | Wraps Tools::Base for RubyLLM sessions (McpToolAdapter kept as alias) |
- | `lib/legion/llm/inference/tool_dispatcher.rb` | Routes tool calls to MCP client / LEX runner / RubyLLM builtin |
+ | `lib/legion/llm/inference/tool_dispatcher.rb` | Routes tool calls to MCP client / LEX runner / native execution |
  | `lib/legion/llm/inference/audit_publisher.rb` | Publishes audit events to llm.audit exchange |
  | `lib/legion/llm/inference/enrichment_injector.rb` | Converts RAG/GAIA enrichments into system prompt |
  | `lib/legion/llm/inference/gaia_caller.rb` | Gaia-specific chat dispatch with phase/tick tracing |
- | `lib/legion/llm/inference/mcp_tool_adapter.rb` | Backward-compat alias for ToolAdapter |
  | `lib/legion/llm/inference/steps.rb` | Steps aggregator: requires all step modules |
  | `lib/legion/llm/inference/steps/*.rb` | All 18+ pipeline step modules (metering, billing, rbac, classification, etc.) |
  | `lib/legion/llm/router.rb` | Router: resolve, health_tracker, resolve_chain, select_candidates |
@@ -586,9 +578,8 @@ In-memory signal consumer with pluggable handlers. Adjusts effective priorities
  | `lib/legion/llm/scheduling/batch.rb` | Non-urgent request batching with priority queue and auto-flush |
  | `lib/legion/llm/scheduling/off_peak.rb` | Peak-hour deferral (delegates to Scheduling) |
  | `lib/legion/llm/tools/confidence.rb` | 4-tier degrading confidence storage (was OverrideConfidence) |
- | `lib/legion/llm/tools/dispatcher.rb` | Routes tool calls: MCP client / LEX runner / RubyLLM builtin |
+ | `lib/legion/llm/tools/dispatcher.rb` | Routes tool calls: MCP client / LEX runner / native execution |
  | `lib/legion/llm/tools/interceptor.rb` | Extensible pre-dispatch intercept registry |
- | `lib/legion/llm/tools/adapter.rb` | Wraps lex-* extension tool as RubyLLM::Tool (McpToolAdapter kept as alias) |
  | `lib/legion/llm/tools/interceptors/python_venv.rb` | Redirects python3/pip3 tool calls to isolated venv |
  | `lib/legion/llm/hooks.rb` | Hooks: before/after chat registry, run_before, run_after, install_defaults |
  | `lib/legion/llm/hooks/rag_guard.rb` | Post-generation RAG faithfulness check via lex-eval |
@@ -643,7 +634,7 @@ In-memory signal consumer with pluggable handlers. Adjusts effective priorities
  | `spec/legion/llm/gateway_integration_spec.rb` | Tests: gateway teardown — verifies no delegation |
  | `spec/legion/llm/metering/estimator_spec.rb` | Tests: cost estimation, fuzzy matching, pricing table (was cost_estimator_spec.rb) |
  | `spec/legion/llm/inference/request_spec.rb` | Tests: Request struct builder, legacy adapter |
- | `spec/legion/llm/inference/response_spec.rb` | Tests: Response struct builder, RubyLLM adapter, #with |
+ | `spec/legion/llm/inference/response_spec.rb` | Tests: Response struct builder, provider message adapter, #with |
  | `spec/legion/llm/inference/profile_spec.rb` | Tests: Profile derivation and step skipping |
  | `spec/legion/llm/inference/tracing_spec.rb` | Tests: Tracing init, exchange_id generation |
  | `spec/legion/llm/inference/timeline_spec.rb` | Tests: Timeline event recording, participants |
@@ -727,7 +718,7 @@ The legacy `vault_path` per-provider setting was removed in v0.3.1.
  Tests run without the full LegionIO stack. `spec/spec_helper.rb` uses real `Legion::Logging` and `Legion::Settings` (no stubs — hard dependencies are always present). Each test resets settings to defaults via `before(:each)`.

  ```bash
- bundle exec rspec # 1661 examples, 0 failures
+ bundle exec rspec # 2379 examples, 0 failures
  bundle exec rubocop # 0 offenses
  ```

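Two of the CLAUDE.md additions above lend themselves to small sketches. The first shows a plausible shape for a multi-instance provider entry (`enabled`, `instances:`, and `base_url` appear in the document; the instance names are invented). The second spells out the memory-gate rule as arithmetic; the method name and the 2048 MB floor are assumptions, and only the `discovery.memory_floor_mb` setting name comes from the text:

```ruby
# Hypothetical multi-instance entry for Legion::Settings[:llm][:providers][:ollama].
# Discovery is described as scanning every instance in parallel and generating
# per-instance routing rules from the models each instance reports.
ollama_settings = {
  enabled: true,
  instances: {
    workstation: { base_url: 'http://127.0.0.1:11434' },
    homelab:     { base_url: 'http://192.168.1.40:11434' }
  }
}

# Memory-gate rule: a local model is routable only when it fits in available
# RAM minus the configured floor (discovery.memory_floor_mb). Hypothetical method.
def memory_gate_allows?(model_size_mb, available_mb:, memory_floor_mb: 2048)
  model_size_mb <= available_mb - memory_floor_mb
end

puts memory_gate_allows?(4_600, available_mb: 16_384) # => true  (4.6 GB model fits)
puts memory_gate_allows?(4_600, available_mb: 6_144)  # => false (exceeds 6 GB minus floor)
```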
data/Gemfile CHANGED
@@ -4,7 +4,32 @@ source 'https://rubygems.org'

  gemspec

+ legion_settings_path = File.expand_path('../legion-settings', __dir__)
+ gem 'legion-settings', path: legion_settings_path if Dir.exist?(legion_settings_path)
+
  group :test do
+   lex_llm_path = File.expand_path('../extensions-ai/lex-llm', __dir__)
+   if Dir.exist?(lex_llm_path)
+     gem 'lex-llm', path: lex_llm_path
+   else
+     gem 'lex-llm'
+   end
+
+   %w[
+     lex-llm-ollama
+     lex-llm-vllm
+     lex-llm-anthropic
+     lex-llm-openai
+     lex-llm-gemini
+     lex-llm-mlx
+     lex-llm-bedrock
+     lex-llm-azure-foundry
+     lex-llm-vertex
+   ].each do |provider_gem|
+     provider_path = File.expand_path("../extensions-ai/#{provider_gem}", __dir__)
+     gem provider_gem, path: provider_path if Dir.exist?(provider_path)
+   end
+
    gem 'rake'
    gem 'rspec'
    gem 'rspec_junit_formatter'
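
One design note on the Gemfile block above: every `path:` override is guarded by `Dir.exist?`, so the same Gemfile serves two environments. Inside a monorepo checkout with sibling `../legion-settings` and `../extensions-ai/*` directories, Bundler uses the local sources; anywhere else, the guards evaluate false and only `lex-llm` (the one gem given an explicit registry fallback) is still installed in the `:test` group, while the optional `lex-llm-*` provider gems are skipped entirely.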