legion-llm 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.github/workflows/ci.yml +16 -0
- data/.gitignore +18 -0
- data/.rubocop.yml +56 -0
- data/CHANGELOG.md +71 -0
- data/CLAUDE.md +388 -0
- data/Gemfile +14 -0
- data/LICENSE +20 -0
- data/README.md +615 -0
- data/docs/plans/2026-03-15-ollama-discovery-design.md +164 -0
- data/docs/plans/2026-03-15-ollama-discovery-implementation.md +1147 -0
- data/legion-llm.gemspec +32 -0
- data/lib/legion/llm/bedrock_bearer_auth.rb +53 -0
- data/lib/legion/llm/compressor.rb +75 -0
- data/lib/legion/llm/discovery/ollama.rb +88 -0
- data/lib/legion/llm/discovery/system.rb +139 -0
- data/lib/legion/llm/escalation_history.rb +28 -0
- data/lib/legion/llm/helpers/llm.rb +59 -0
- data/lib/legion/llm/providers.rb +88 -0
- data/lib/legion/llm/quality_checker.rb +56 -0
- data/lib/legion/llm/router/escalation_chain.rb +49 -0
- data/lib/legion/llm/router/health_tracker.rb +160 -0
- data/lib/legion/llm/router/resolution.rb +43 -0
- data/lib/legion/llm/router/rule.rb +103 -0
- data/lib/legion/llm/router.rb +279 -0
- data/lib/legion/llm/settings.rb +97 -0
- data/lib/legion/llm/transport/exchanges/escalation.rb +14 -0
- data/lib/legion/llm/transport/messages/escalation_event.rb +13 -0
- data/lib/legion/llm/version.rb +7 -0
- data/lib/legion/llm.rb +264 -0
- metadata +136 -0
checksums.yaml
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
---
|
|
2
|
+
SHA256:
|
|
3
|
+
metadata.gz: 9a39a5fe8483ddcd715dafd4d65dfe1f4457b90e5a39e62cfa2a32b6c68c8e0c
|
|
4
|
+
data.tar.gz: 37ed9c3b024a1cb9cce7eed1e287512c91269ef99fbb7b54342de57ceae9a668
|
|
5
|
+
SHA512:
|
|
6
|
+
metadata.gz: a19943d8d25665e16ae55dfe6c0e32bad0e834a3eed3c5e028c0c0db672d531ea21e6015e6137b1f7c0b57bb38e2677091cab7d48dc3a3169cf9273fe6e468e7
|
|
7
|
+
data.tar.gz: b9bd3d4586e64b9f1866d7e276ecdb3969e2204c8428b37e08fd262ddaef77846c5d28f192174b2ed5787d1576431aae4ebe29e2087499f06e2d0f9393e293ff
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
on:
|
|
3
|
+
push:
|
|
4
|
+
branches: [main]
|
|
5
|
+
pull_request:
|
|
6
|
+
|
|
7
|
+
jobs:
|
|
8
|
+
ci:
|
|
9
|
+
uses: LegionIO/.github/.github/workflows/ci.yml@main
|
|
10
|
+
|
|
11
|
+
release:
|
|
12
|
+
needs: ci
|
|
13
|
+
if: github.event_name == 'push' && github.ref == 'refs/heads/main'
|
|
14
|
+
uses: LegionIO/.github/.github/workflows/release.yml@main
|
|
15
|
+
secrets:
|
|
16
|
+
rubygems-api-key: ${{ secrets.RUBYGEMS_API_KEY }}
|
data/.gitignore
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
/.bundle/
|
|
2
|
+
/.yardoc
|
|
3
|
+
/Gemfile.lock
|
|
4
|
+
/_yardoc/
|
|
5
|
+
/coverage/
|
|
6
|
+
/doc/
|
|
7
|
+
/pkg/
|
|
8
|
+
/spec/reports/
|
|
9
|
+
/tmp/
|
|
10
|
+
/legion/.idea/
|
|
11
|
+
/.idea/
|
|
12
|
+
*.key
|
|
13
|
+
# rspec failure tracking
|
|
14
|
+
.rspec_status
|
|
15
|
+
legionio.key
|
|
16
|
+
# logs and OS artifacts
|
|
17
|
+
legion.log
|
|
18
|
+
.DS_Store
|
data/.rubocop.yml
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
AllCops:
|
|
2
|
+
TargetRubyVersion: 3.4
|
|
3
|
+
NewCops: enable
|
|
4
|
+
SuggestExtensions: false
|
|
5
|
+
|
|
6
|
+
Layout/LineLength:
|
|
7
|
+
Max: 160
|
|
8
|
+
|
|
9
|
+
Layout/SpaceAroundEqualsInParameterDefault:
|
|
10
|
+
EnforcedStyle: space
|
|
11
|
+
|
|
12
|
+
Layout/HashAlignment:
|
|
13
|
+
EnforcedHashRocketStyle: table
|
|
14
|
+
EnforcedColonStyle: table
|
|
15
|
+
|
|
16
|
+
Metrics/MethodLength:
|
|
17
|
+
Max: 50
|
|
18
|
+
|
|
19
|
+
Metrics/ClassLength:
|
|
20
|
+
Max: 1500
|
|
21
|
+
|
|
22
|
+
Metrics/ModuleLength:
|
|
23
|
+
Max: 1500
|
|
24
|
+
|
|
25
|
+
Metrics/BlockLength:
|
|
26
|
+
Max: 40
|
|
27
|
+
Exclude:
|
|
28
|
+
- 'spec/**/*'
|
|
29
|
+
|
|
30
|
+
Metrics/AbcSize:
|
|
31
|
+
Max: 60
|
|
32
|
+
|
|
33
|
+
Metrics/CyclomaticComplexity:
|
|
34
|
+
Max: 15
|
|
35
|
+
|
|
36
|
+
Metrics/PerceivedComplexity:
|
|
37
|
+
Max: 17
|
|
38
|
+
|
|
39
|
+
Style/Documentation:
|
|
40
|
+
Enabled: false
|
|
41
|
+
|
|
42
|
+
Style/SymbolArray:
|
|
43
|
+
Enabled: true
|
|
44
|
+
|
|
45
|
+
Style/FrozenStringLiteralComment:
|
|
46
|
+
Enabled: true
|
|
47
|
+
EnforcedStyle: always
|
|
48
|
+
|
|
49
|
+
Naming/FileName:
|
|
50
|
+
Enabled: false
|
|
51
|
+
|
|
52
|
+
Naming/PredicateMethod:
|
|
53
|
+
Enabled: false
|
|
54
|
+
|
|
55
|
+
Metrics/ParameterLists:
|
|
56
|
+
Max: 9
|
data/CHANGELOG.md
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
# Legion LLM Changelog
|
|
2
|
+
|
|
3
|
+
## [0.3.1] - 2026-03-16
|
|
4
|
+
|
|
5
|
+
### Removed
|
|
6
|
+
- `vault_path` provider setting (superseded by universal `vault://` resolver in legion-settings 1.3.0)
|
|
7
|
+
- `resolve_credentials` and related methods from Providers module
|
|
8
|
+
|
|
9
|
+
## [0.3.0] - 2026-03-16
|
|
10
|
+
|
|
11
|
+
### Added
|
|
12
|
+
- Model escalation on retry: automatic fallback to more capable models on hard or quality failures
|
|
13
|
+
- `Router.resolve_chain` returns ordered `EscalationChain` of fallback resolutions
|
|
14
|
+
- `QualityChecker` module with built-in heuristics (empty, too_short, repetition, json_parse) and pluggable checks
|
|
15
|
+
- `EscalationHistory` mixin tracks attempts on response objects (`escalated?`, `escalation_history`, `final_resolution`)
|
|
16
|
+
- `chat(escalate: true, message:)` retry loop with configurable `max_escalations:` and `quality_check:`
|
|
17
|
+
- HealthTracker `:quality_failure` signal with half-weight failure counting (6 quality failures to trip circuit)
|
|
18
|
+
- AMQP transport: `llm.escalation` exchange + `EscalationEvent` message for fleet-wide observability
|
|
19
|
+
- Settings: `routing.escalation.enabled`, `max_attempts`, `quality_threshold`
|
|
20
|
+
- Helper passthrough: `llm_chat` accepts `escalate:`, `max_escalations:`, `quality_check:`
|
|
21
|
+
|
|
22
|
+
## [0.2.3]
|
|
23
|
+
|
|
24
|
+
### Added
|
|
25
|
+
- Timezone support for routing schedule windows via TZInfo
|
|
26
|
+
- `within_schedule?` converts `now` to the schedule's IANA timezone before evaluating hours and days
|
|
27
|
+
- `tzinfo` (>= 2.0) runtime dependency
|
|
28
|
+
|
|
29
|
+
## [0.2.2]
|
|
30
|
+
|
|
31
|
+
### Added
|
|
32
|
+
- `Legion::LLM::Discovery::Ollama` module — queries Ollama `/api/tags` for pulled models with TTL cache
|
|
33
|
+
- `Legion::LLM::Discovery::System` module — queries OS memory (macOS `vm_stat`/`sysctl`, Linux `/proc/meminfo`) with TTL cache
|
|
34
|
+
- Router step 4.5: rejects Ollama rules where model is not pulled or exceeds available memory
|
|
35
|
+
- Discovery settings: `enabled`, `refresh_seconds`, `memory_floor_mb` under `Legion::Settings[:llm][:discovery]`
|
|
36
|
+
- Startup discovery: logs available Ollama models and system memory when Ollama provider is enabled
|
|
37
|
+
|
|
38
|
+
### Changed
|
|
39
|
+
- Added SimpleCov for test coverage reporting
|
|
40
|
+
|
|
41
|
+
## [0.2.1]
|
|
42
|
+
|
|
43
|
+
### Added
|
|
44
|
+
- `Legion::LLM::Compressor` module for deterministic prompt compression
|
|
45
|
+
- Three compression levels: light (articles/filler), moderate (+connectives), aggressive (+low-signal words, whitespace collapse)
|
|
46
|
+
- Code block protection (fenced and inline code preserved)
|
|
47
|
+
- `compress_level` field on `Router::Resolution` for routing-driven compression
|
|
48
|
+
- `compress:` parameter on `llm_chat` helper for opt-in compression
|
|
49
|
+
- Routing rules can specify `compress_level` in target to auto-compress for cost-sensitive tiers
|
|
50
|
+
|
|
51
|
+
## [0.2.0]
|
|
52
|
+
|
|
53
|
+
### Added
|
|
54
|
+
- Dynamic weighted routing engine (`Legion::LLM::Router`)
|
|
55
|
+
- Intent-based dispatch with privacy, capability, and cost dimensions
|
|
56
|
+
- Priority-based rule matching with time-based schedule windows
|
|
57
|
+
- Cost multipliers for economic routing (e.g., provider promotions)
|
|
58
|
+
- HealthTracker with circuit breaker pattern and latency rolling window
|
|
59
|
+
- Pluggable signal handlers for extensible health monitoring
|
|
60
|
+
- `intent:` and `tier:` parameters on `chat`, `llm_chat`, and `llm_session`
|
|
61
|
+
- Routing rules configurable via `Legion::Settings[:llm][:routing]`
|
|
62
|
+
- Three-tier routing: local (Ollama), fleet (Transport/AMQP), cloud (API providers)
|
|
63
|
+
|
|
64
|
+
## [0.1.0]
|
|
65
|
+
* Initial release
|
|
66
|
+
* Core module with start/shutdown lifecycle
|
|
67
|
+
* Provider configuration (Bedrock, Anthropic, OpenAI, Gemini, Ollama)
|
|
68
|
+
* Vault credential resolution for all providers
|
|
69
|
+
* Chat, embed, and agent convenience methods
|
|
70
|
+
* Extension helper mixin for LEX extensions
|
|
71
|
+
* Auto-detection of default model from enabled providers
|
data/CLAUDE.md
ADDED
|
@@ -0,0 +1,388 @@
|
|
|
1
|
+
# legion-llm
|
|
2
|
+
|
|
3
|
+
**Repository Level 3 Documentation**
|
|
4
|
+
- **Parent**: `/Users/miverso2/rubymine/legion/CLAUDE.md`
|
|
5
|
+
|
|
6
|
+
## Purpose
|
|
7
|
+
|
|
8
|
+
Core LegionIO gem providing LLM capabilities to all extensions. Wraps ruby_llm to provide a consistent interface for chat, embeddings, tool use, and agents across multiple providers (Bedrock, Anthropic, OpenAI, Gemini, Ollama). Includes a dynamic weighted routing engine that dispatches requests across local, fleet, and cloud tiers based on caller intent, priority rules, time schedules, cost multipliers, and real-time provider health.
|
|
9
|
+
|
|
10
|
+
**GitHub**: https://github.com/LegionIO/legion-llm
|
|
11
|
+
**License**: Apache-2.0
|
|
12
|
+
|
|
13
|
+
## Architecture
|
|
14
|
+
|
|
15
|
+
### Startup Sequence
|
|
16
|
+
|
|
17
|
+
```
|
|
18
|
+
Legion::LLM.start
|
|
19
|
+
├── 1. Read settings from Legion::Settings[:llm]
|
|
20
|
+
├── 2. For each enabled provider:
|
|
21
|
+
│ ├── Read credentials (vault:// and env:// references already resolved by Legion::Settings)
|
|
22
|
+
│ └── Configure RubyLLM provider
|
|
23
|
+
├── 3. Run discovery (if Ollama enabled): warm model + system memory caches
|
|
24
|
+
├── 4. Auto-detect default model from first enabled provider
|
|
25
|
+
└── 5. Ping provider (if default_model + default_provider set): send test request, log latency
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
### Module Structure
|
|
29
|
+
|
|
30
|
+
```
|
|
31
|
+
Legion::LLM (lib/legion/llm.rb)
|
|
32
|
+
├── EscalationExhausted # Raised when all escalation attempts are exhausted
|
|
33
|
+
├── Settings # Default config, provider settings, routing defaults, discovery defaults
|
|
34
|
+
├── Providers # Provider configuration (credentials are resolved upstream by Legion::Settings)
|
|
35
|
+
├── Compressor # Deterministic prompt compression (3 levels, code-block-aware)
|
|
36
|
+
├── Discovery # Runtime introspection for local model availability and system resources
|
|
37
|
+
│ ├── Ollama # Queries Ollama /api/tags for pulled models (TTL-cached)
|
|
38
|
+
│ └── System # Queries OS memory: macOS (vm_stat/sysctl), Linux (/proc/meminfo)
|
|
39
|
+
├── QualityChecker # Response quality heuristics (empty, too_short, repetition, json_parse, json_expected) + pluggable callable
|
|
40
|
+
├── EscalationHistory # Mixin for response objects: escalation_history, escalated?, final_resolution, escalation_chain
|
|
41
|
+
├── Router # Dynamic weighted routing engine
|
|
42
|
+
│ ├── Resolution # Value object: tier, provider, model, rule name, metadata, compress_level
|
|
43
|
+
│ ├── Rule # Routing rule: intent matching, schedule windows, constraints
|
|
44
|
+
│ ├── HealthTracker # Circuit breaker, latency rolling window, pluggable signal handlers
|
|
45
|
+
│ └── EscalationChain # Ordered fallback resolution chain with max_attempts cap (pads last resolution if chain is short)
|
|
46
|
+
└── Helpers::LLM # Extension helper mixin (llm_chat, llm_embed, llm_session, compress:)
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
### Routing Architecture
|
|
50
|
+
|
|
51
|
+
Three-tier dispatch model. Local-first avoids unnecessary network hops; fleet offloads to shared hardware via Transport; cloud is the fallback for frontier models.
|
|
52
|
+
|
|
53
|
+
```
|
|
54
|
+
┌─────────────────────────────────────────────────────────┐
|
|
55
|
+
│ Legion::LLM Router (per-node) │
|
|
56
|
+
│ │
|
|
57
|
+
│ Tier 1: LOCAL → Ollama on this machine (direct HTTP) │
|
|
58
|
+
│ Zero network overhead, no Transport │
|
|
59
|
+
│ │
|
|
60
|
+
│ Tier 2: FLEET → Ollama on Mac Studios / GPU servers │
|
|
61
|
+
│ Via Legion::Transport (AMQP) when local can't │
|
|
62
|
+
│ serve the model (Phase 2, not yet built) │
|
|
63
|
+
│ │
|
|
64
|
+
│ Tier 3: CLOUD → Bedrock / Anthropic / OpenAI / Gemini │
|
|
65
|
+
│ Existing provider API calls │
|
|
66
|
+
└─────────────────────────────────────────────────────────┘
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
### Routing Resolution Flow
|
|
70
|
+
|
|
71
|
+
```
|
|
72
|
+
1. Caller passes intent: { privacy: :strict, capability: :basic }
|
|
73
|
+
2. Router merges with default_intent (fills missing dimensions)
|
|
74
|
+
3. Load rules from settings, filter by:
|
|
75
|
+
a. Intent match (all `when` conditions must match)
|
|
76
|
+
b. Schedule window (valid_from/valid_until, hours, days)
|
|
77
|
+
c. Constraints (e.g., never_cloud strips cloud-tier rules)
|
|
78
|
+
d. Discovery (Ollama model pulled? Model fits in available RAM?)
|
|
79
|
+
e. Tier availability (is Ollama running? is Transport loaded?)
|
|
80
|
+
4. Score remaining candidates:
|
|
81
|
+
effective_priority = rule.priority
|
|
82
|
+
+ health_tracker.adjustment(provider)
|
|
83
|
+
+ (1.0 - cost_multiplier) * 10
|
|
84
|
+
5. Return Resolution for highest-scoring candidate
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
### Integration with LegionIO
|
|
88
|
+
|
|
89
|
+
- **Service**: `setup_llm` called between data and supervision in startup sequence
|
|
90
|
+
- **Extensions**: `llm_required?` method on extension module, checked at load time
|
|
91
|
+
- **Helpers**: `Legion::Extensions::Helpers::LLM` auto-loaded when gem is present
|
|
92
|
+
- **Readiness**: Registers as `:llm` in `Legion::Readiness`
|
|
93
|
+
- **Shutdown**: `Legion::LLM.shutdown` called during service shutdown
|
|
94
|
+
|
|
95
|
+
## Dependencies
|
|
96
|
+
|
|
97
|
+
| Gem | Purpose |
|
|
98
|
+
|-----|---------|
|
|
99
|
+
| `ruby_llm` (>= 1.0) | Multi-provider LLM client |
|
|
100
|
+
| `tzinfo` (>= 2.0) | IANA timezone conversion for schedule windows |
|
|
101
|
+
| `legion-logging` | Logging |
|
|
102
|
+
| `legion-settings` | Configuration |
|
|
103
|
+
|
|
104
|
+
## Key Interfaces
|
|
105
|
+
|
|
106
|
+
```ruby
|
|
107
|
+
# Core
|
|
108
|
+
Legion::LLM.start # Configure providers, set defaults
|
|
109
|
+
Legion::LLM.shutdown # Cleanup
|
|
110
|
+
Legion::LLM.started? # -> Boolean
|
|
111
|
+
Legion::LLM.settings # -> Hash
|
|
112
|
+
|
|
113
|
+
# Chat (with optional routing)
|
|
114
|
+
Legion::LLM.chat(model:, provider:) # Direct (no routing)
|
|
115
|
+
Legion::LLM.chat(intent: { privacy: :strict }) # Intent-based routing
|
|
116
|
+
Legion::LLM.chat(tier: :cloud, model: 'claude-sonnet-4-6') # Explicit tier override
|
|
117
|
+
Legion::LLM.embed(text, model:) # Embeddings (no routing)
|
|
118
|
+
Legion::LLM.agent(AgentClass) # Agent instance
|
|
119
|
+
|
|
120
|
+
# Compressor
|
|
121
|
+
Legion::LLM::Compressor.compress(text, level: 1) # -> String (deterministic)
|
|
122
|
+
Legion::LLM::Compressor.stopwords_for_level(2) # -> Array of words
|
|
123
|
+
|
|
124
|
+
# Router
|
|
125
|
+
Legion::LLM::Router.resolve(intent:, tier:, model:, provider:) # -> Resolution or nil
|
|
126
|
+
Legion::LLM::Router.health_tracker # -> HealthTracker
|
|
127
|
+
Legion::LLM::Router.routing_enabled? # -> Boolean
|
|
128
|
+
Legion::LLM::Router.reset! # Clear cached state
|
|
129
|
+
|
|
130
|
+
# HealthTracker
|
|
131
|
+
tracker = Legion::LLM::Router.health_tracker
|
|
132
|
+
tracker.report(provider: :anthropic, signal: :error, value: 1) # Feed signal
|
|
133
|
+
tracker.report(provider: :ollama, signal: :latency, value: 1200) # Feed latency
|
|
134
|
+
tracker.adjustment(:anthropic) # -> Integer (priority offset)
|
|
135
|
+
tracker.circuit_state(:anthropic) # -> :closed/:open/:half_open
|
|
136
|
+
tracker.register_handler(:gpu_utilization) { |data| ... } # Extend with new signals
|
|
137
|
+
|
|
138
|
+
# Escalation
|
|
139
|
+
Legion::LLM.chat(message:, escalate: true, max_escalations: 3, quality_check:) # Escalating chat — raises EscalationExhausted if all attempts fail
|
|
140
|
+
Legion::LLM::EscalationExhausted # raised when all escalation attempts are exhausted
|
|
141
|
+
Legion::LLM::Router.resolve_chain(intent:, tier:, max_escalations:) # -> EscalationChain
|
|
142
|
+
Legion::LLM::QualityChecker.check(response, quality_threshold: 50, json_expected: false, quality_check: nil) # -> QualityResult
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
## Settings
|
|
146
|
+
|
|
147
|
+
Settings read from `Legion::Settings[:llm]`:
|
|
148
|
+
|
|
149
|
+
| Key | Type | Default | Description |
|
|
150
|
+
|-----|------|---------|-------------|
|
|
151
|
+
| `enabled` | Boolean | `true` | Enable LLM support |
|
|
152
|
+
| `connected` | Boolean | `false` | Set to true after successful start |
|
|
153
|
+
| `default_model` | String | `nil` | Default model ID (auto-detected if nil) |
|
|
154
|
+
| `default_provider` | Symbol | `nil` | Default provider (auto-detected if nil) |
|
|
155
|
+
| `providers` | Hash | See below | Per-provider configuration |
|
|
156
|
+
| `routing` | Hash | See below | Dynamic routing engine configuration |
|
|
157
|
+
| `discovery` | Hash | See below | Ollama model discovery and system memory settings |
|
|
158
|
+
|
|
159
|
+
### Provider Settings
|
|
160
|
+
|
|
161
|
+
Each provider has: `enabled`, `api_key`, plus provider-specific keys. Credential values may be `vault://` or `env://` URI references.
|
|
162
|
+
|
|
163
|
+
Credential resolution: `vault://` and `env://` references in provider settings are resolved to plain strings by `Legion::Settings` before `Legion::LLM.start` runs (see "Vault Integration" below). The legacy per-provider `vault_path` setting was removed in v0.3.1.
|
|
164
|
+
|
|
165
|
+
Bedrock supports two auth modes:
|
|
166
|
+
- **SigV4** (default): `api_key` + `secret_key` (+ optional `session_token`)
|
|
167
|
+
- **Bearer token**: `bearer_token` for AWS Identity Center/SSO. When set, `bedrock_bearer_auth.rb` is required lazily to monkey-patch RubyLLM's Bedrock provider.
|
|
168
|
+
|
|
169
|
+
### Auto-Detection Priority
|
|
170
|
+
|
|
171
|
+
When no defaults are configured, the first enabled provider is used:
|
|
172
|
+
|
|
173
|
+
1. Bedrock -> `us.anthropic.claude-sonnet-4-6-v1`
|
|
174
|
+
2. Anthropic -> `claude-sonnet-4-6`
|
|
175
|
+
3. OpenAI -> `gpt-4o`
|
|
176
|
+
4. Gemini -> `gemini-2.0-flash`
|
|
177
|
+
5. Ollama -> `llama3`
|
|
178
|
+
|
|
179
|
+
### Routing Settings
|
|
180
|
+
|
|
181
|
+
Nested under `Legion::Settings[:llm][:routing]`:
|
|
182
|
+
|
|
183
|
+
| Key | Type | Default | Description |
|
|
184
|
+
|-----|------|---------|-------------|
|
|
185
|
+
| `enabled` | Boolean | `false` | Enable routing (opt-in) |
|
|
186
|
+
| `default_intent` | Hash | `{ privacy: 'normal', capability: 'moderate', cost: 'normal' }` | Defaults merged into every intent |
|
|
187
|
+
| `tiers.local` | Hash | `{ provider: 'ollama' }` | Local tier config |
|
|
188
|
+
| `tiers.fleet` | Hash | `{ queue: 'llm.inference', timeout_seconds: 30 }` | Fleet tier config |
|
|
189
|
+
| `tiers.cloud` | Hash | `{ providers: ['bedrock', 'anthropic'] }` | Cloud tier config |
|
|
190
|
+
| `health.window_seconds` | Integer | `300` | Rolling window for latency tracking |
|
|
191
|
+
| `health.circuit_breaker.failure_threshold` | Integer | `3` | Consecutive failures before circuit opens |
|
|
192
|
+
| `health.circuit_breaker.cooldown_seconds` | Integer | `60` | Seconds before circuit transitions to half_open |
|
|
193
|
+
| `health.latency_penalty_threshold_ms` | Integer | `5000` | Latency above this triggers priority penalty |
|
|
194
|
+
| `health.budget.daily_limit_usd` | Float | `nil` | Daily cloud spend limit (future) |
|
|
195
|
+
| `health.budget.monthly_limit_usd` | Float | `nil` | Monthly cloud spend limit (future) |
|
|
196
|
+
| `rules` | Array | `[]` | Routing rules (see below) |
|
|
197
|
+
| `escalation.enabled` | Boolean | `false` | Enable model escalation on retry |
|
|
198
|
+
| `escalation.max_attempts` | Integer | `3` | Max escalation attempts per call |
|
|
199
|
+
| `escalation.quality_threshold` | Integer | `50` | Min response character length |
|
|
200
|
+
|
|
201
|
+
### Routing Rules
|
|
202
|
+
|
|
203
|
+
Each rule is a hash with:
|
|
204
|
+
|
|
205
|
+
| Field | Type | Required | Description |
|
|
206
|
+
|-------|------|----------|-------------|
|
|
207
|
+
| `name` | String | Yes | Unique rule identifier |
|
|
208
|
+
| `when` | Hash | Yes | Intent conditions to match (`privacy`, `capability`, `cost`) |
|
|
209
|
+
| `then` | Hash | No | Target: `{ tier:, provider:, model: }` |
|
|
210
|
+
| `priority` | Integer | No (default 0) | Higher wins when multiple rules match |
|
|
211
|
+
| `constraint` | String | No | Hard constraint (e.g., `never_cloud`) |
|
|
212
|
+
| `fallback` | String | No | Fallback tier if primary is unavailable |
|
|
213
|
+
| `cost_multiplier` | Float | No (default 1.0) | Lower = cheaper = routing bonus |
|
|
214
|
+
| `schedule` | Hash | No | Time-based activation window |
|
|
215
|
+
| `note` | String | No | Human-readable note |
|
|
216
|
+
|
|
217
|
+
### Intent Dimensions
|
|
218
|
+
|
|
219
|
+
| Dimension | Values | Default | Effect |
|
|
220
|
+
|-----------|--------|---------|--------|
|
|
221
|
+
| `privacy` | `:strict`, `:normal` | `:normal` | `:strict` -> never cloud (via `never_cloud` constraint rules) |
|
|
222
|
+
| `capability` | `:basic`, `:moderate`, `:reasoning` | `:moderate` | Higher prefers larger/cloud models |
|
|
223
|
+
| `cost` | `:minimize`, `:normal` | `:normal` | `:minimize` prefers local/fleet |
|
|
224
|
+
|
|
225
|
+
### Schedule Windows
|
|
226
|
+
|
|
227
|
+
Rules can include a `schedule` hash for time-based activation:
|
|
228
|
+
|
|
229
|
+
| Field | Format | Example |
|
|
230
|
+
|-------|--------|---------|
|
|
231
|
+
| `valid_from` | ISO 8601 | `"2026-03-15T00:00:00"` |
|
|
232
|
+
| `valid_until` | ISO 8601 | `"2026-03-29T23:59:59"` |
|
|
233
|
+
| `hours` | Array of "HH:MM-HH:MM" | `["00:00-06:00", "18:00-23:59"]` |
|
|
234
|
+
| `days` | Array of day names | `["monday", "tuesday"]` |
|
|
235
|
+
| `timezone` | IANA timezone | `"America/Chicago"` (converts `now` via TZInfo before evaluating hours/days) |
|
|
236
|
+
|
|
237
|
+
All fields optional. Omit any to mean "always active."
|
|
238
|
+
|
|
239
|
+
### Discovery Settings
|
|
240
|
+
|
|
241
|
+
Nested under `Legion::Settings[:llm][:discovery]`:
|
|
242
|
+
|
|
243
|
+
| Key | Type | Default | Description |
|
|
244
|
+
|-----|------|---------|-------------|
|
|
245
|
+
| `enabled` | Boolean | `true` | Master switch for discovery checks |
|
|
246
|
+
| `refresh_seconds` | Integer | `60` | TTL for both discovery caches |
|
|
247
|
+
| `memory_floor_mb` | Integer | `2048` | Minimum free MB to reserve for OS |
|
|
248
|
+
|
|
249
|
+
Discovery is lazy TTL-cached: data refreshes on the next `Router.resolve` call after TTL expires. At startup, caches are warmed if Ollama is enabled. When disabled, all discovery checks are bypassed (permissive).
|
|
250
|
+
|
|
251
|
+
### HealthTracker
|
|
252
|
+
|
|
253
|
+
In-memory signal consumer with pluggable handlers. Adjusts effective priorities at runtime.
|
|
254
|
+
|
|
255
|
+
**Built-in signals:** `:error` (circuit breaker), `:success` (circuit recovery), `:latency` (rolling window penalty), `:quality_failure` (half-weight circuit breaker, 6 failures to trip vs 3 for hard errors)
|
|
256
|
+
|
|
257
|
+
**Circuit breaker states:**
|
|
258
|
+
- `:closed` (normal, adjustment = 0)
|
|
259
|
+
- `:open` (after `failure_threshold` consecutive errors, adjustment = -50)
|
|
260
|
+
- `:half_open` (after `cooldown_seconds`, tries one request, adjustment = -25)
|
|
261
|
+
|
|
262
|
+
**Latency penalty:** `-10` per multiple above `LATENCY_THRESHOLD_MS` (5000ms), capped at `-50`
|
|
263
|
+
|
|
264
|
+
**Extensible:** Call `register_handler(:signal_name) { |data| ... }` to add new signal types. Signal providers (like lex-metering) call `report()` with `defined?(Legion::LLM::Router)` guard.
|
|
265
|
+
|
|
266
|
+
## File Map
|
|
267
|
+
|
|
268
|
+
| Path | Purpose |
|
|
269
|
+
|------|---------|
|
|
270
|
+
| `lib/legion/llm.rb` | Entry point: start, shutdown, chat (with routing), embed, agent |
|
|
271
|
+
| `lib/legion/llm/settings.rb` | Default settings including routing_defaults, auto-merge into Legion::Settings |
|
|
272
|
+
| `lib/legion/llm/providers.rb` | Provider config, RubyLLM configuration |
|
|
273
|
+
| `lib/legion/llm/bedrock_bearer_auth.rb` | Monkey-patch for Bedrock Bearer Token auth — required lazily |
|
|
274
|
+
| `lib/legion/llm/compressor.rb` | Deterministic prompt compression: 3 levels, code-block-aware, stopword removal |
|
|
275
|
+
| `lib/legion/llm/router.rb` | Router module: resolve, health_tracker, select_candidates pipeline |
|
|
276
|
+
| `lib/legion/llm/router/resolution.rb` | Value object: tier, provider, model, rule, metadata, compress_level |
|
|
277
|
+
| `lib/legion/llm/router/rule.rb` | Rule class: from_hash, matches_intent?, within_schedule?, to_resolution |
|
|
278
|
+
| `lib/legion/llm/router/health_tracker.rb` | HealthTracker: circuit breaker, latency window, pluggable signal handlers |
|
|
279
|
+
| `lib/legion/llm/discovery/ollama.rb` | Ollama /api/tags discovery with TTL cache |
|
|
280
|
+
| `lib/legion/llm/discovery/system.rb` | OS memory introspection (macOS + Linux) with TTL cache |
|
|
281
|
+
| `lib/legion/llm/version.rb` | Version constant (0.3.1) |
|
|
282
|
+
| `lib/legion/llm/quality_checker.rb` | QualityChecker module with QualityResult struct |
|
|
283
|
+
| `lib/legion/llm/escalation_history.rb` | EscalationHistory mixin: `escalation_history`, `escalated?`, `final_resolution`, `escalation_chain` |
|
|
284
|
+
| `lib/legion/llm/router/escalation_chain.rb` | EscalationChain value object |
|
|
285
|
+
| `lib/legion/llm/transport/exchanges/escalation.rb` | AMQP exchange for escalation events |
|
|
286
|
+
| `lib/legion/llm/transport/messages/escalation_event.rb` | AMQP message for escalation events |
|
|
287
|
+
| `lib/legion/llm/helpers/llm.rb` | Extension helper mixin: llm_chat (with compress:, escalate:, max_escalations:, quality_check:), llm_embed, llm_session |
|
|
288
|
+
| `spec/legion/llm_spec.rb` | Tests: settings, lifecycle, providers, auto-config |
|
|
289
|
+
| `spec/legion/llm/integration_spec.rb` | Tests: routing integration with chat() |
|
|
290
|
+
| `spec/legion/llm/router_spec.rb` | Tests: Router.resolve, priority selection, constraints, health |
|
|
291
|
+
| `spec/legion/llm/router/resolution_spec.rb` | Tests: Resolution value object |
|
|
292
|
+
| `spec/legion/llm/router/rule_spec.rb` | Tests: Rule intent matching, from_hash, to_resolution |
|
|
293
|
+
| `spec/legion/llm/router/rule_schedule_spec.rb` | Tests: Rule schedule evaluation |
|
|
294
|
+
| `spec/legion/llm/router/health_tracker_spec.rb` | Tests: circuit breaker, latency, signal handlers |
|
|
295
|
+
| `spec/legion/llm/router/settings_spec.rb` | Tests: routing defaults in settings |
|
|
296
|
+
| `spec/legion/llm/compressor_spec.rb` | Tests: compression levels, code-block protection, determinism |
|
|
297
|
+
| `spec/legion/llm/helpers/llm_spec.rb` | Tests: helper mixin with compress integration |
|
|
298
|
+
| `spec/legion/llm/discovery/ollama_spec.rb` | Tests: Ollama model discovery, TTL, error handling |
|
|
299
|
+
| `spec/legion/llm/discovery/system_spec.rb` | Tests: System memory introspection |
|
|
300
|
+
| `spec/legion/llm/discovery/router_integration_spec.rb` | Tests: Router discovery filtering |
|
|
301
|
+
| `spec/legion/llm/discovery/startup_spec.rb` | Tests: Startup discovery warmup |
|
|
302
|
+
| `spec/legion/llm/discovery/settings_spec.rb` | Tests: Discovery settings defaults |
|
|
303
|
+
| `spec/legion/llm/quality_checker_spec.rb` | QualityChecker tests |
|
|
304
|
+
| `spec/legion/llm/escalation_history_spec.rb` | EscalationHistory tests |
|
|
305
|
+
| `spec/legion/llm/escalation_integration_spec.rb` | chat() escalation loop tests |
|
|
306
|
+
| `spec/legion/llm/router/escalation_chain_spec.rb` | EscalationChain tests |
|
|
307
|
+
| `spec/legion/llm/router/resolve_chain_spec.rb` | Router.resolve_chain tests |
|
|
308
|
+
| `spec/legion/llm/transport/escalation_spec.rb` | Transport tests |
|
|
309
|
+
| `spec/spec_helper.rb` | Stubbed Legion::Logging and Legion::Settings for testing |
|
|
310
|
+
|
|
311
|
+
## Extension Integration
|
|
312
|
+
|
|
313
|
+
Extensions declare LLM dependency via `llm_required?`:
|
|
314
|
+
|
|
315
|
+
```ruby
|
|
316
|
+
module Legion::Extensions::MyLex
|
|
317
|
+
def self.llm_required?
|
|
318
|
+
true
|
|
319
|
+
end
|
|
320
|
+
end
|
|
321
|
+
```
|
|
322
|
+
|
|
323
|
+
Helper methods available in runners when gem is loaded:
|
|
324
|
+
|
|
325
|
+
```ruby
|
|
326
|
+
# Direct (no routing)
|
|
327
|
+
llm_chat(message, model:, provider:, tools:, instructions:)
|
|
328
|
+
llm_embed(text, model:)
|
|
329
|
+
llm_session(model:, provider:)
|
|
330
|
+
|
|
331
|
+
# With routing
|
|
332
|
+
llm_chat(message, intent: { privacy: :strict, capability: :basic })
|
|
333
|
+
llm_chat(message, tier: :cloud, model: 'claude-sonnet-4-6')
|
|
334
|
+
llm_session(intent: { capability: :reasoning })
|
|
335
|
+
```
|
|
336
|
+
|
|
337
|
+
## Vault Integration
|
|
338
|
+
|
|
339
|
+
Provider credentials are resolved by the universal `Legion::Settings::Resolver` (in `legion-settings`), not by legion-llm itself. Use `vault://` and `env://` URI references directly in settings values:
|
|
340
|
+
|
|
341
|
+
```json
|
|
342
|
+
{
|
|
343
|
+
"llm": {
|
|
344
|
+
"providers": {
|
|
345
|
+
"bedrock": {
|
|
346
|
+
"enabled": true,
|
|
347
|
+
"bearer_token": ["vault://secret/data/llm/bedrock#bearer_token", "env://AWS_BEARER_TOKEN"],
|
|
348
|
+
"region": "us-east-2"
|
|
349
|
+
},
|
|
350
|
+
"anthropic": {
|
|
351
|
+
"enabled": true,
|
|
352
|
+
"api_key": "env://ANTHROPIC_API_KEY"
|
|
353
|
+
}
|
|
354
|
+
}
|
|
355
|
+
}
|
|
356
|
+
}
|
|
357
|
+
```
|
|
358
|
+
|
|
359
|
+
By the time `Legion::LLM.start` runs, all `vault://` and `env://` references have already been resolved to plain strings by `Legion::Settings.resolve_secrets!` (called in the boot sequence after `Legion::Crypt.start`).
|
|
360
|
+
|
|
361
|
+
The legacy `vault_path` per-provider setting was removed in v0.3.1.
|
|
362
|
+
|
|
363
|
+
## Testing
|
|
364
|
+
|
|
365
|
+
Tests run without the full LegionIO stack. `spec/spec_helper.rb` stubs `Legion::Logging` and `Legion::Settings` with in-memory implementations. Each test resets settings to defaults via `before(:each)`.
|
|
366
|
+
|
|
367
|
+
```bash
|
|
368
|
+
bundle exec rspec # 269 examples, 0 failures
|
|
369
|
+
bundle exec rubocop # 31 files, 0 offenses
|
|
370
|
+
```
|
|
371
|
+
|
|
372
|
+
## Design Documents
|
|
373
|
+
|
|
374
|
+
- `docs/plans/2026-03-14-llm-dynamic-routing-design.md` — Full design (approved)
|
|
375
|
+
- `docs/plans/2026-03-14-llm-dynamic-routing-implementation.md` — Implementation plan
|
|
376
|
+
- `docs/plans/2026-03-15-ollama-discovery-design.md` — Ollama discovery design (approved)
|
|
377
|
+
- `docs/plans/2026-03-15-ollama-discovery-implementation.md` — Discovery implementation plan
|
|
378
|
+
- `docs/plans/2026-03-16-llm-escalation-design.md` — Model escalation design (approved)
|
|
379
|
+
- `docs/plans/2026-03-16-llm-escalation-implementation.md` — Escalation implementation plan
|
|
380
|
+
|
|
381
|
+
## Future (Not Yet Built)
|
|
382
|
+
|
|
383
|
+
- **Fleet tier (Phase 2)**: `lex-llm-fleet` extension — inference workers on Mac Studios / NVIDIA servers, dispatched via Legion::Transport AMQP queues
|
|
384
|
+
- **Advanced signals (Phase 3)**: Budget tracking, lex-metering integration, GPU utilization monitoring
|
|
385
|
+
|
|
386
|
+
---
|
|
387
|
+
|
|
388
|
+
**Maintained By**: Matthew Iverson (@Esity)
|
data/Gemfile
ADDED
data/LICENSE
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
|
|
2
|
+
Apache License
|
|
3
|
+
Version 2.0, January 2004
|
|
4
|
+
http://www.apache.org/licenses/
|
|
5
|
+
|
|
6
|
+
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
|
7
|
+
|
|
8
|
+
Copyright 2026 Esity / Matthew Iverson
|
|
9
|
+
|
|
10
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
11
|
+
you may not use this file except in compliance with the License.
|
|
12
|
+
You may obtain a copy of the License at
|
|
13
|
+
|
|
14
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
|
15
|
+
|
|
16
|
+
Unless required by applicable law or agreed to in writing, software
|
|
17
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
18
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
19
|
+
See the License for the specific language governing permissions and
|
|
20
|
+
limitations under the License.
|