legion-llm 0.3.23 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/CHANGELOG.md +92 -0
- data/CLAUDE.md +42 -3
- data/legion-llm-0.3.20.gem +0 -0
- data/lib/legion/llm/batch.rb +14 -0
- data/lib/legion/llm/compressor.rb +89 -0
- data/lib/legion/llm/cost_estimator.rb +51 -0
- data/lib/legion/llm/escalation_tracker.rb +73 -0
- data/lib/legion/llm/fleet/dispatcher.rb +90 -0
- data/lib/legion/llm/fleet/handler.rb +110 -0
- data/lib/legion/llm/fleet/reply_dispatcher.rb +98 -0
- data/lib/legion/llm/fleet.rb +12 -0
- data/lib/legion/llm/hooks/budget_guard.rb +81 -0
- data/lib/legion/llm/hooks/cost_tracking.rb +54 -0
- data/lib/legion/llm/hooks/reflection.rb +238 -0
- data/lib/legion/llm/hooks.rb +3 -0
- data/lib/legion/llm/pipeline/executor.rb +226 -0
- data/lib/legion/llm/pipeline/profile.rb +42 -0
- data/lib/legion/llm/pipeline/request.rb +93 -0
- data/lib/legion/llm/pipeline/response.rb +75 -0
- data/lib/legion/llm/pipeline/steps/metering.rb +86 -0
- data/lib/legion/llm/pipeline/timeline.rb +51 -0
- data/lib/legion/llm/pipeline/tracing.rb +35 -0
- data/lib/legion/llm/pipeline.rb +16 -0
- data/lib/legion/llm/quality_checker.rb +23 -0
- data/lib/legion/llm/scheduling.rb +12 -0
- data/lib/legion/llm/settings.rb +1 -0
- data/lib/legion/llm/shadow_eval.rb +76 -2
- data/lib/legion/llm/version.rb +1 -1
- data/lib/legion/llm.rb +26 -0
- metadata +19 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 2963ec3f995c8bab5b80af82d724155eb3590473bbf04e6eec7fe22c8625435a
|
|
4
|
+
data.tar.gz: 2fb50f72dfbe6867a388a50317eea6403e10c117e80f2f83fcb259e5601e9419
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: f6009ed5907f5cfc642cac46f0b626b3e8c888549c6de480ff4c39a569fe23901a281cc2ccd249fe605f88862ffc1392a684afc9ab4a1439520edb5f83cf6734
|
|
7
|
+
data.tar.gz: 0d5b8fcd87a585cc3da2c81ff737cabe25bcadf171ee58304f2544b4a601fc519666e6a54ff402923b39b597b41c527019fbc421eefbb6bbdf8afaab0ce75f7e
|
data/.gitignore
CHANGED
data/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,97 @@
|
|
|
1
1
|
# Legion LLM Changelog
|
|
2
2
|
|
|
3
|
+
## [0.4.0] - 2026-03-23
|
|
4
|
+
|
|
5
|
+
### Added
|
|
6
|
+
- `Pipeline::Request`: Data.define struct with `.build` and `.from_chat_args` for unified request representation
|
|
7
|
+
- `Pipeline::Response`: Data.define struct with `.build`, `.from_ruby_llm`, and `#with` for immutable responses
|
|
8
|
+
- `Pipeline::Profile`: Caller-derived profiles (external/gaia/system) with step skip logic
|
|
9
|
+
- `Pipeline::Tracing`: Distributed tracing with trace_id, span_id, and exchange_id generation
|
|
10
|
+
- `Pipeline::Timeline`: Ordered event recording with participant tracking
|
|
11
|
+
- `Pipeline::Executor`: 18-step pipeline skeleton with profile-aware step execution
|
|
12
|
+
- `Pipeline::Steps::Metering`: Metering event builder absorbed from lex-llm-gateway
|
|
13
|
+
- `CostEstimator`: Model cost estimation with fuzzy matching, absorbed from lex-llm-gateway
|
|
14
|
+
- `Fleet::Dispatcher`: Fleet RPC dispatch absorbed from lex-llm-gateway
|
|
15
|
+
- `Fleet::Handler`: Fleet request handler absorbed from lex-llm-gateway
|
|
16
|
+
- `Fleet::ReplyDispatcher`: Correlation-based reply routing for fleet RPC
|
|
17
|
+
- Feature-flagged `pipeline_enabled` setting (default: false) for incremental rollout
|
|
18
|
+
- Pipeline path in `_dispatch_chat` activated by `pipeline_enabled: true`
|
|
19
|
+
|
|
20
|
+
## [0.3.32] - 2026-03-23
|
|
21
|
+
|
|
22
|
+
### Added
|
|
23
|
+
- `Hooks::Reflection`: after_chat hook that extracts knowledge from conversations
|
|
24
|
+
- Detects decisions, patterns, and facts using regex markers
|
|
25
|
+
- Publishes extracted entries to Apollo via AMQP or direct ingest
|
|
26
|
+
- Cooldown-based dedup (5 min) and async extraction to avoid blocking
|
|
27
|
+
- `summary` method for introspection of extraction history
|
|
28
|
+
|
|
29
|
+
## [0.3.31] - 2026-03-23
|
|
30
|
+
|
|
31
|
+
### Added
|
|
32
|
+
- `Compressor.deduplicate_messages`: removes near-duplicate messages from conversation history using Jaccard similarity on word sets
|
|
33
|
+
- Configurable similarity threshold (default 0.85), keeps last occurrence, same-role-only comparison
|
|
34
|
+
- Skips short messages (< 20 chars) to avoid false positives
|
|
35
|
+
|
|
36
|
+
## [0.3.30] - 2026-03-23
|
|
37
|
+
|
|
38
|
+
### Added
|
|
39
|
+
- `Scheduling.status`: returns hash with current scheduling state (peak hours, defer intents, next off-peak)
|
|
40
|
+
- `Batch.status`: returns hash with queue size, priority breakdown, oldest entry, config
|
|
41
|
+
|
|
42
|
+
## [0.3.29] - 2026-03-23
|
|
43
|
+
|
|
44
|
+
### Added
|
|
45
|
+
- `EscalationTracker`: global escalation history with summary analytics
|
|
46
|
+
- Tracks model escalations (from_model, to_model, reason, tier changes)
|
|
47
|
+
- `summary` aggregates by reason, source model, and target model
|
|
48
|
+
- `escalation_rate` reports escalation frequency within configurable time windows
|
|
49
|
+
- Capped at 200 entries with automatic eviction
|
|
50
|
+
|
|
51
|
+
## [0.3.28] - 2026-03-23
|
|
52
|
+
|
|
53
|
+
### Added
|
|
54
|
+
- QualityChecker: truncation detection for responses cut off mid-sentence
|
|
55
|
+
- QualityChecker: refusal detection for model refusal patterns ("I can't", "as an AI")
|
|
56
|
+
- REFUSAL_PATTERNS constant with configurable regex patterns
|
|
57
|
+
- 6 new specs covering truncation and refusal detection
|
|
58
|
+
|
|
59
|
+
## [0.3.27] - 2026-03-23
|
|
60
|
+
|
|
61
|
+
### Added
|
|
62
|
+
- `Compressor.summarize_messages` for LLM-based conversation summarization
|
|
63
|
+
- Uses configurable model (default: gpt-4o-mini) for context window compression
|
|
64
|
+
- Falls back to aggressive stopword compression when LLM unavailable
|
|
65
|
+
- Short conversations returned uncompressed to avoid unnecessary API calls
|
|
66
|
+
|
|
67
|
+
## [0.3.26] - 2026-03-23
|
|
68
|
+
|
|
69
|
+
### Changed
|
|
70
|
+
- Enhanced ShadowEval with result history, cost comparison, and summary analytics
|
|
71
|
+
- `compare` now includes primary_cost, shadow_cost, and cost_savings ratio
|
|
72
|
+
- Added `history`, `clear_history`, and `summary` class methods
|
|
73
|
+
- History capped at 100 entries with automatic eviction
|
|
74
|
+
- Cost estimation uses CostTracker pricing when available
|
|
75
|
+
|
|
76
|
+
## [0.3.25] - 2026-03-23
|
|
77
|
+
|
|
78
|
+
### Added
|
|
79
|
+
- `Hooks::BudgetGuard` before_chat hook: blocks LLM calls when session cost budget is exceeded
|
|
80
|
+
- `BudgetGuard.status` returns enforcing state, spent, remaining, and ratio
|
|
81
|
+
- `BudgetGuard.remaining` returns remaining budget in USD
|
|
82
|
+
- Configurable via `llm.budget.session_usd` in settings (disabled when 0 or unset)
|
|
83
|
+
- Auto-installed during `LLM.start` only when budget is configured
|
|
84
|
+
- 10 specs covering blocking, passthrough, remaining, status, and enforcing checks
|
|
85
|
+
|
|
86
|
+
## [0.3.24] - 2026-03-23
|
|
87
|
+
|
|
88
|
+
### Added
|
|
89
|
+
- Auto cost-tracking hook: records per-request cost via `CostTracker` after every LLM call
|
|
90
|
+
- `Hooks::CostTracking.install` registers an `after_chat` hook during `LLM.start`
|
|
91
|
+
- Extracts usage tokens and model from response, feeds into in-memory `CostTracker.record`
|
|
92
|
+
- Opt-out via `llm.cost_tracking.auto: false` in settings
|
|
93
|
+
- 9 specs covering hook installation, token extraction, model fallback, and edge cases
|
|
94
|
+
|
|
3
95
|
## [0.3.23] - 2026-03-23
|
|
4
96
|
|
|
5
97
|
### Added
|
data/CLAUDE.md
CHANGED
|
@@ -8,7 +8,7 @@
|
|
|
8
8
|
Core LegionIO gem providing LLM capabilities to all extensions. Wraps ruby_llm to provide a consistent interface for chat, embeddings, tool use, and agents across multiple providers (Bedrock, Anthropic, OpenAI, Gemini, Ollama). Includes a dynamic weighted routing engine that dispatches requests across local, fleet, and cloud tiers based on caller intent, priority rules, time schedules, cost multipliers, and real-time provider health.
|
|
9
9
|
|
|
10
10
|
**GitHub**: https://github.com/LegionIO/legion-llm
|
|
11
|
-
**Version**: 0.
|
|
11
|
+
**Version**: 0.4.0
|
|
12
12
|
**License**: Apache-2.0
|
|
13
13
|
|
|
14
14
|
## Architecture
|
|
@@ -51,6 +51,20 @@ Legion::LLM (lib/legion/llm.rb)
|
|
|
51
51
|
│ ├── Rule # Routing rule: intent matching, schedule windows, constraints
|
|
52
52
|
│ ├── HealthTracker # Circuit breaker, latency rolling window, pluggable signal handlers
|
|
53
53
|
│ └── EscalationChain # Ordered fallback resolution chain with max_attempts cap (pads last resolution if chain is short)
|
|
54
|
+
├── Pipeline # 18-step request/response pipeline (feature-flagged)
|
|
55
|
+
│ ├── Request # Data.define struct for unified request representation
|
|
56
|
+
│ ├── Response # Data.define struct for unified response representation
|
|
57
|
+
│ ├── Profile # Caller-derived profiles (external/gaia/system) for step skipping
|
|
58
|
+
│ ├── Tracing # Distributed trace_id, span_id, exchange_id generation
|
|
59
|
+
│ ├── Timeline # Ordered event recording with participant tracking
|
|
60
|
+
│ ├── Executor # 18-step pipeline skeleton with profile-aware execution
|
|
61
|
+
│ └── Steps/
|
|
62
|
+
│ └── Metering # Metering event builder (absorbed from lex-llm-gateway)
|
|
63
|
+
├── CostEstimator # Model cost estimation with fuzzy pricing (absorbed from lex-llm-gateway)
|
|
64
|
+
├── Fleet # Fleet RPC dispatch (absorbed from lex-llm-gateway)
|
|
65
|
+
│ ├── Dispatcher # Fleet dispatch with timeout and availability checks
|
|
66
|
+
│ ├── Handler # Fleet request handler for GPU worker nodes
|
|
67
|
+
│ └── ReplyDispatcher # Correlation-based reply routing for fleet RPC
|
|
54
68
|
└── Helpers::LLM # Extension helper mixin (llm_chat, llm_embed, llm_session, compress:)
|
|
55
69
|
```
|
|
56
70
|
|
|
@@ -179,6 +193,7 @@ Settings read from `Legion::Settings[:llm]`:
|
|
|
179
193
|
|-----|------|---------|-------------|
|
|
180
194
|
| `enabled` | Boolean | `true` | Enable LLM support |
|
|
181
195
|
| `connected` | Boolean | `false` | Set to true after successful start |
|
|
196
|
+
| `pipeline_enabled` | Boolean | `false` | Enable 18-step pipeline for chat() dispatch |
|
|
182
197
|
| `default_model` | String | `nil` | Default model ID (auto-detected if nil) |
|
|
183
198
|
| `default_provider` | Symbol | `nil` | Default provider (auto-detected if nil) |
|
|
184
199
|
| `providers` | Hash | See below | Per-provider configuration |
|
|
@@ -320,6 +335,19 @@ In-memory signal consumer with pluggable handlers. Adjusts effective priorities
|
|
|
320
335
|
| `lib/legion/llm/router/escalation_chain.rb` | EscalationChain value object |
|
|
321
336
|
| `lib/legion/llm/transport/exchanges/escalation.rb` | AMQP exchange for escalation events |
|
|
322
337
|
| `lib/legion/llm/transport/messages/escalation_event.rb` | AMQP message for escalation events |
|
|
338
|
+
| `lib/legion/llm/pipeline.rb` | Pipeline module: requires all pipeline components |
|
|
339
|
+
| `lib/legion/llm/pipeline/request.rb` | Pipeline::Request Data.define struct with .build and .from_chat_args |
|
|
340
|
+
| `lib/legion/llm/pipeline/response.rb` | Pipeline::Response Data.define struct with .build, .from_ruby_llm, #with |
|
|
341
|
+
| `lib/legion/llm/pipeline/profile.rb` | Pipeline::Profile: caller-derived profiles for step skipping |
|
|
342
|
+
| `lib/legion/llm/pipeline/tracing.rb` | Pipeline::Tracing: trace_id, span_id, exchange_id generation |
|
|
343
|
+
| `lib/legion/llm/pipeline/timeline.rb` | Pipeline::Timeline: ordered event recording |
|
|
344
|
+
| `lib/legion/llm/pipeline/executor.rb` | Pipeline::Executor: 18-step skeleton with profile-aware execution |
|
|
345
|
+
| `lib/legion/llm/pipeline/steps/metering.rb` | Pipeline::Steps::Metering: metering event builder |
|
|
346
|
+
| `lib/legion/llm/cost_estimator.rb` | CostEstimator: model cost estimation with fuzzy pricing |
|
|
347
|
+
| `lib/legion/llm/fleet.rb` | Fleet module: requires dispatcher, handler, reply_dispatcher |
|
|
348
|
+
| `lib/legion/llm/fleet/dispatcher.rb` | Fleet::Dispatcher: fleet RPC dispatch |
|
|
349
|
+
| `lib/legion/llm/fleet/handler.rb` | Fleet::Handler: fleet request handler |
|
|
350
|
+
| `lib/legion/llm/fleet/reply_dispatcher.rb` | Fleet::ReplyDispatcher: correlation-based reply routing |
|
|
323
351
|
| `lib/legion/llm/helpers/llm.rb` | Extension helper mixin: llm_chat (with compress:, escalate:, max_escalations:, quality_check:), llm_embed, llm_session |
|
|
324
352
|
| `spec/legion/llm_spec.rb` | Tests: settings, lifecycle, providers, auto-config |
|
|
325
353
|
| `spec/legion/llm/integration_spec.rb` | Tests: routing integration with chat() |
|
|
@@ -346,6 +374,17 @@ In-memory signal consumer with pluggable handlers. Adjusts effective priorities
|
|
|
346
374
|
| `spec/legion/llm/shadow_eval_spec.rb` | ShadowEval tests |
|
|
347
375
|
| `spec/legion/llm/structured_output_spec.rb` | StructuredOutput tests |
|
|
348
376
|
| `spec/legion/llm/gateway_integration_spec.rb` | Tests: gateway delegation and _direct bypass |
|
|
377
|
+
| `spec/legion/llm/cost_estimator_spec.rb` | Tests: cost estimation, fuzzy matching, pricing table |
|
|
378
|
+
| `spec/legion/llm/pipeline/request_spec.rb` | Tests: Request struct builder, legacy adapter |
|
|
379
|
+
| `spec/legion/llm/pipeline/response_spec.rb` | Tests: Response struct builder, RubyLLM adapter, #with |
|
|
380
|
+
| `spec/legion/llm/pipeline/profile_spec.rb` | Tests: Profile derivation and step skipping |
|
|
381
|
+
| `spec/legion/llm/pipeline/tracing_spec.rb` | Tests: Tracing init, exchange_id generation |
|
|
382
|
+
| `spec/legion/llm/pipeline/timeline_spec.rb` | Tests: Timeline event recording, participants |
|
|
383
|
+
| `spec/legion/llm/pipeline/executor_spec.rb` | Tests: Executor pipeline execution, profile skipping |
|
|
384
|
+
| `spec/legion/llm/pipeline/integration_spec.rb` | Tests: Pipeline integration with chat() dispatch |
|
|
385
|
+
| `spec/legion/llm/pipeline/steps/metering_spec.rb` | Tests: Metering event building |
|
|
386
|
+
| `spec/legion/llm/fleet/dispatcher_spec.rb` | Tests: Fleet dispatch, availability, timeout |
|
|
387
|
+
| `spec/legion/llm/fleet/handler_spec.rb` | Tests: Fleet handler, auth, response building |
|
|
349
388
|
| `spec/spec_helper.rb` | Stubbed Legion::Logging and Legion::Settings for testing |
|
|
350
389
|
|
|
351
390
|
## Extension Integration
|
|
@@ -405,8 +444,8 @@ The legacy `vault_path` per-provider setting was removed in v0.3.1.
|
|
|
405
444
|
Tests run without the full LegionIO stack. `spec/spec_helper.rb` stubs `Legion::Logging` and `Legion::Settings` with in-memory implementations. Each test resets settings to defaults via `before(:each)`.
|
|
406
445
|
|
|
407
446
|
```bash
|
|
408
|
-
bundle exec rspec #
|
|
409
|
-
bundle exec rubocop #
|
|
447
|
+
bundle exec rspec # 712 examples, 0 failures
|
|
448
|
+
bundle exec rubocop # 113 files, 0 offenses
|
|
410
449
|
```
|
|
411
450
|
|
|
412
451
|
## Design Documents
|
|
Binary file
|
data/lib/legion/llm/batch.rb
CHANGED
|
@@ -78,6 +78,20 @@ module Legion
|
|
|
78
78
|
queue.size
|
|
79
79
|
end
|
|
80
80
|
|
|
81
|
+
# Returns a summary of current batch queue state.
|
|
82
|
+
def status
|
|
83
|
+
entries = queue.dup
|
|
84
|
+
oldest = entries.min_by { |e| e[:queued_at] }
|
|
85
|
+
{
|
|
86
|
+
enabled: enabled?,
|
|
87
|
+
queue_size: entries.size,
|
|
88
|
+
max_batch_size: settings.fetch(:max_batch_size, 100),
|
|
89
|
+
window_seconds: settings.fetch(:window_seconds, 300),
|
|
90
|
+
oldest_queued: oldest ? oldest[:queued_at].iso8601 : nil,
|
|
91
|
+
by_priority: entries.group_by { |e| e[:priority] }.transform_values(&:size)
|
|
92
|
+
}
|
|
93
|
+
end
|
|
94
|
+
|
|
81
95
|
# Clears the queue (useful for testing).
|
|
82
96
|
def reset!
|
|
83
97
|
@queue = []
|
|
@@ -15,6 +15,17 @@ module Legion
|
|
|
15
15
|
3 => %w[also then still even already yet again please note that]
|
|
16
16
|
}.freeze
|
|
17
17
|
|
|
18
|
+
SUMMARIZE_PROMPT = <<~PROMPT
|
|
19
|
+
Summarize this conversation concisely. Preserve:
|
|
20
|
+
- Key decisions and conclusions
|
|
21
|
+
- Code snippets and file paths
|
|
22
|
+
- Action items and next steps
|
|
23
|
+
- Technical details that would be needed to continue the conversation
|
|
24
|
+
|
|
25
|
+
Omit pleasantries, repetition, and verbose explanations.
|
|
26
|
+
Return only the summary, no preamble.
|
|
27
|
+
PROMPT
|
|
28
|
+
|
|
18
29
|
class << self
|
|
19
30
|
def compress(text, level: LIGHT)
|
|
20
31
|
return text if text.nil? || text.empty? || level <= NONE
|
|
@@ -28,6 +39,55 @@ module Legion
|
|
|
28
39
|
result
|
|
29
40
|
end
|
|
30
41
|
|
|
42
|
+
def summarize_messages(messages, max_tokens: 2000)
|
|
43
|
+
return { summary: '', original_count: 0 } if messages.nil? || messages.empty?
|
|
44
|
+
|
|
45
|
+
text = messages.map { |m| "#{m[:role]}: #{m[:content]}" }.join("\n\n")
|
|
46
|
+
return { summary: text, original_count: messages.size, compressed: false } if text.length < max_tokens * 4
|
|
47
|
+
|
|
48
|
+
summary = llm_summarize(text, max_tokens)
|
|
49
|
+
if summary
|
|
50
|
+
log_debug("summarize_messages: #{messages.size} messages -> #{summary.length} chars")
|
|
51
|
+
{ summary: summary, original_count: messages.size, compressed: true }
|
|
52
|
+
else
|
|
53
|
+
fallback = compress(text, level: AGGRESSIVE)
|
|
54
|
+
{ summary: fallback, original_count: messages.size, compressed: true, method: :stopword }
|
|
55
|
+
end
|
|
56
|
+
end
|
|
57
|
+
|
|
58
|
+
# Removes near-duplicate messages from a conversation history.
|
|
59
|
+
# Uses Jaccard similarity on word sets to detect duplicates.
|
|
60
|
+
# Keeps the last occurrence of similar messages.
|
|
61
|
+
#
|
|
62
|
+
# @param messages [Array<Hash>] messages with :role and :content keys
|
|
63
|
+
# @param threshold [Float] similarity threshold (0.0-1.0) above which messages are considered duplicates
|
|
64
|
+
# @return [Hash] { messages: Array, removed: Integer, original_count: Integer }
|
|
65
|
+
def deduplicate_messages(messages, threshold: 0.85)
|
|
66
|
+
return { messages: [], removed: 0, original_count: 0 } if messages.nil? || messages.empty?
|
|
67
|
+
|
|
68
|
+
kept = []
|
|
69
|
+
removed = 0
|
|
70
|
+
|
|
71
|
+
messages.reverse_each do |msg|
|
|
72
|
+
content = msg[:content].to_s
|
|
73
|
+
next kept.unshift(msg) if content.length < 20
|
|
74
|
+
|
|
75
|
+
duplicate = kept.any? do |existing|
|
|
76
|
+
next false unless existing[:role] == msg[:role]
|
|
77
|
+
|
|
78
|
+
jaccard_similarity(content, existing[:content].to_s) >= threshold
|
|
79
|
+
end
|
|
80
|
+
|
|
81
|
+
if duplicate
|
|
82
|
+
removed += 1
|
|
83
|
+
else
|
|
84
|
+
kept.unshift(msg)
|
|
85
|
+
end
|
|
86
|
+
end
|
|
87
|
+
|
|
88
|
+
{ messages: kept, removed: removed, original_count: messages.size }
|
|
89
|
+
end
|
|
90
|
+
|
|
31
91
|
def stopwords_for_level(level)
|
|
32
92
|
return [] if level <= NONE
|
|
33
93
|
|
|
@@ -71,6 +131,35 @@ module Legion
|
|
|
71
131
|
def collapse_whitespace(text)
|
|
72
132
|
text.gsub(/\n{3,}/, "\n\n")
|
|
73
133
|
end
|
|
134
|
+
|
|
135
|
+
def llm_summarize(text, max_tokens)
|
|
136
|
+
return nil unless defined?(Legion::LLM) && Legion::LLM.respond_to?(:chat_direct)
|
|
137
|
+
|
|
138
|
+
session = Legion::LLM.chat_direct(model: summarize_model)
|
|
139
|
+
response = session.ask("#{SUMMARIZE_PROMPT}\n\n#{text[0, max_tokens * 8]}")
|
|
140
|
+
response.content
|
|
141
|
+
rescue StandardError => e
|
|
142
|
+
log_debug("llm_summarize failed: #{e.message}")
|
|
143
|
+
nil
|
|
144
|
+
end
|
|
145
|
+
|
|
146
|
+
def summarize_model
|
|
147
|
+
(defined?(Legion::Settings) && Legion::Settings.dig(:llm, :compressor, :model)) || 'gpt-4o-mini'
|
|
148
|
+
end
|
|
149
|
+
|
|
150
|
+
def jaccard_similarity(text_a, text_b)
|
|
151
|
+
words_a = text_a.downcase.scan(/\w+/).to_set
|
|
152
|
+
words_b = text_b.downcase.scan(/\w+/).to_set
|
|
153
|
+
return 0.0 if words_a.empty? && words_b.empty?
|
|
154
|
+
|
|
155
|
+
intersection = (words_a & words_b).size.to_f
|
|
156
|
+
union = (words_a | words_b).size.to_f
|
|
157
|
+
union.zero? ? 0.0 : intersection / union
|
|
158
|
+
end
|
|
159
|
+
|
|
160
|
+
def log_debug(msg)
|
|
161
|
+
Legion::Logging.debug("Compressor: #{msg}") if defined?(Legion::Logging)
|
|
162
|
+
end
|
|
74
163
|
end
|
|
75
164
|
end
|
|
76
165
|
end
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
# frozen_string_literal: true

module Legion
  module LLM
    # Estimates request cost in USD from token counts and a static pricing
    # table, with fuzzy matching for dated/variant model ids.
    # Absorbed from lex-llm-gateway.
    module CostEstimator
      # Prices per 1M tokens [input, output] in USD
      # Source: published API pricing as of 2026-03
      PRICING = {
        'claude-opus-4-6' => [15.0, 75.0],
        'claude-sonnet-4-6' => [3.0, 15.0],
        'claude-haiku-4-5' => [0.80, 4.0],
        'claude-3-5-sonnet' => [3.0, 15.0],
        'claude-3-haiku' => [0.25, 1.25],
        'gpt-4o' => [2.50, 10.0],
        'gpt-4o-mini' => [0.15, 0.60],
        'gpt-4-turbo' => [10.0, 30.0],
        'o3' => [10.0, 40.0],
        'o3-mini' => [1.10, 4.40],
        'o4-mini' => [1.10, 4.40],
        'gemini-2.5-pro' => [1.25, 10.0],
        'gemini-2.5-flash' => [0.15, 0.60],
        'gemini-2.0-flash' => [0.10, 0.40]
      }.freeze

      # Fallback [input, output] price when the model is unknown.
      DEFAULT_PRICE = [1.0, 3.0].freeze

      module_function

      # Estimates the USD cost of a request.
      #
      # @param model_id [String, nil] model identifier (fuzzy-matched against PRICING)
      # @param input_tokens [Integer] prompt tokens
      # @param output_tokens [Integer] completion tokens
      # @return [Float] cost in USD, rounded to 6 decimal places
      def estimate(model_id:, input_tokens: 0, output_tokens: 0, **)
        price = resolve_price(model_id)
        input_cost = (input_tokens.to_i / 1_000_000.0) * price[0]
        output_cost = (output_tokens.to_i / 1_000_000.0) * price[1]
        (input_cost + output_cost).round(6)
      end

      # Resolves an [input, output] price pair: exact match first, then fuzzy,
      # then DEFAULT_PRICE.
      def resolve_price(model_id)
        return DEFAULT_PRICE unless model_id

        normalized = model_id.to_s.downcase
        PRICING[normalized] || fuzzy_match(normalized) || DEFAULT_PRICE
      end

      # Substring match against the pricing table. Longest keys are tried first
      # so specific variants win over their prefixes: a dated id like
      # "gpt-4o-mini-2024-07-18" matches "gpt-4o-mini", not "gpt-4o".
      # (Plain hash-insertion order would hit "gpt-4o" first — a ~17x price error.)
      def fuzzy_match(normalized)
        PRICING.keys.sort_by { |key| -key.length }.each do |key|
          return PRICING[key] if normalized.include?(key) || key.include?(normalized)
        end
        nil
      end
    end
  end
end
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
# frozen_string_literal: true

module Legion
  module LLM
    # Global in-memory record of model escalations with summary analytics.
    # History is capped at MAX_HISTORY entries; the oldest are evicted first.
    module EscalationTracker
      MAX_HISTORY = 200

      class << self
        # Records one escalation event and returns the stored entry.
        #
        # @param from_model [#to_s] model being escalated away from
        # @param to_model [#to_s] model being escalated to
        # @param reason [#to_s] why the escalation happened
        # @param tier_from [Object, nil] originating tier, if known
        # @param tier_to [Object, nil] destination tier, if known
        # @return [Hash] the recorded entry
        def record(from_model:, to_model:, reason:, tier_from: nil, tier_to: nil)
          event = {
            from_model: from_model.to_s,
            to_model: to_model.to_s,
            reason: reason.to_s,
            tier_from: tier_from,
            tier_to: tier_to,
            recorded_at: Time.now.utc
          }
          history.push(event)
          # Evict oldest entries beyond the cap.
          history.shift while history.size > MAX_HISTORY
          log_debug("escalation: #{from_model} -> #{to_model} reason=#{reason}")
          event
        end

        # Full escalation history (oldest first), lazily initialized.
        def history
          @history ||= []
        end

        # Drops all recorded history.
        def clear
          @history = []
        end

        # Aggregated view: total count, per-reason and per-model breakdowns,
        # and the 5 most recent entries (newest first).
        def summary
          snapshot = history.dup
          return empty_summary if snapshot.empty?

          {
            total_escalations: snapshot.size,
            by_reason: count_by(snapshot, :reason),
            by_target_model: count_by(snapshot, :to_model),
            by_source_model: count_by(snapshot, :from_model),
            recent: snapshot.last(5).reverse
          }
        end

        # Number of escalations recorded within the trailing window.
        #
        # @param window_seconds [Integer] lookback window
        # @return [Hash] { count:, window_seconds: }
        def escalation_rate(window_seconds: 3600)
          since = Time.now.utc - window_seconds
          in_window = history.count { |event| event[:recorded_at] >= since }
          { count: in_window, window_seconds: window_seconds }
        end

        private

        # Frequency of +key+ values across entries.
        def count_by(entries, key)
          entries.group_by { |event| event[key] }.transform_values(&:size)
        end

        # Shape-compatible summary for the empty-history case.
        def empty_summary
          {
            total_escalations: 0,
            by_reason: {},
            by_target_model: {},
            by_source_model: {},
            recent: []
          }
        end

        # Debug logging, skipped when Legion::Logging is not loaded.
        def log_debug(msg)
          Legion::Logging.debug("[EscalationTracker] #{msg}") if defined?(Legion::Logging)
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
# frozen_string_literal: true

module Legion
  module LLM
    module Fleet
      # Dispatches inference requests to fleet GPU workers over AMQP RPC and
      # blocks for a correlated reply. Absorbed from lex-llm-gateway.
      module Dispatcher
        DEFAULT_TIMEOUT = 30

        module_function

        # Publishes a fleet inference request and waits for its reply.
        # Returns an error hash when the fleet is unavailable or times out.
        #
        # @param model [String] target model id
        # @param messages [Array<Hash>] chat messages
        # @return [Hash] fleet response, or { success: false, error: ... }
        def dispatch(model:, messages:, **opts)
          return error_result('fleet_unavailable') unless fleet_available?

          correlation_id = "fleet_#{SecureRandom.hex(12)}"
          publish_request(model: model, messages: messages, intent: opts[:intent],
                          correlation_id: correlation_id, **opts.except(:intent, :timeout))

          wait_for_response(correlation_id, timeout: resolve_timeout(opts[:timeout]))
        end

        # Fleet is usable only when transport is connected AND routing allows it.
        def fleet_available?
          transport_ready? && fleet_enabled?
        end

        # True when Legion::Transport is loaded and reports a live connection.
        def transport_ready?
          return false unless defined?(Legion::Transport)
          return false unless Legion::Transport.respond_to?(:connected?)

          !!Legion::Transport.connected?
        end

        # Honors llm.routing.use_fleet; defaults to enabled when settings are
        # absent, unreadable, or not shaped as expected.
        def fleet_enabled?
          return true unless defined?(Legion::Settings)

          llm_settings = begin
            Legion::Settings[:llm]
          rescue StandardError
            nil
          end
          return true unless llm_settings.is_a?(Hash)

          routing = llm_settings[:routing]
          routing.is_a?(Hash) ? routing.fetch(:use_fleet, true) : true
        end

        # Explicit override wins; otherwise llm.routing.fleet.timeout_seconds,
        # falling back to DEFAULT_TIMEOUT.
        def resolve_timeout(override)
          return override if override
          return DEFAULT_TIMEOUT unless defined?(Legion::Settings)

          llm_settings = begin
            Legion::Settings[:llm]
          rescue StandardError
            nil
          end
          return DEFAULT_TIMEOUT unless llm_settings.is_a?(Hash)

          llm_settings.dig(:routing, :fleet, :timeout_seconds) || DEFAULT_TIMEOUT
        end

        # Publishes the AMQP inference request; no-op unless the gateway's
        # InferenceRequest message class is loaded.
        def publish_request(**)
          return unless defined?(Legion::Extensions::LLM::Gateway::Transport::Messages::InferenceRequest)

          Legion::Extensions::LLM::Gateway::Transport::Messages::InferenceRequest.new(
            reply_to: ReplyDispatcher.agent_queue_name, **
          ).publish
        end

        # Blocks on the reply future. Cancellation or a nil result both map to
        # a timeout hash; the correlation id is always deregistered.
        def wait_for_response(correlation_id, timeout:)
          future = ReplyDispatcher.register(correlation_id)
          reply = future.value!(timeout)
          reply || timeout_result(correlation_id, timeout)
        rescue Concurrent::CancelledOperationError
          timeout_result(correlation_id, timeout)
        ensure
          ReplyDispatcher.deregister(correlation_id)
        end

        # Error hash for a timed-out fleet request.
        def timeout_result(correlation_id, timeout)
          { success: false, error: 'fleet_timeout', correlation_id: correlation_id, timeout: timeout }
        end

        # Generic error hash.
        def error_result(reason)
          { success: false, error: reason }
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
# frozen_string_literal: true

module Legion
  module LLM
    module Fleet
      # Handles inbound fleet inference requests on GPU worker nodes:
      # optional JWT auth, local LLM invocation, and AMQP reply publishing.
      module Handler
        module_function

        # Entry point for a fleet request payload. Returns the response hash,
        # publishing it to payload[:reply_to] when a reply queue is given.
        # Auth is only enforced while the fleet is enabled.
        def handle_fleet_request(payload)
          reply_to = payload[:reply_to]

          if Dispatcher.fleet_enabled? && !valid_token?(payload[:signed_token])
            rejection = { success: false, error: 'invalid_token' }
            publish_reply(reply_to, payload[:correlation_id], rejection) if reply_to
            return rejection
          end

          result = build_response(payload[:correlation_id], call_local_llm(payload))
          publish_reply(reply_to, payload[:correlation_id], result) if reply_to
          result
        end

        # Validates the signed JWT. Open (true) when auth is not required or
        # Legion::Crypt is not loaded; validation errors fail closed.
        def valid_token?(token)
          return true unless require_auth?
          return false if token.nil?
          return true unless defined?(Legion::Crypt)

          !Legion::Crypt.validate_jwt(token).nil?
        rescue StandardError
          false
        end

        # Reads llm.routing.fleet.require_auth; defaults to false when settings
        # are absent, unreadable, or not shaped as expected.
        def require_auth?
          return false unless defined?(Legion::Settings)

          llm_settings = begin
            Legion::Settings[:llm]
          rescue StandardError
            nil
          end
          return false unless llm_settings.is_a?(Hash)

          fleet_config = llm_settings.dig(:routing, :fleet)
          fleet_config.is_a?(Hash) ? fleet_config.fetch(:require_auth, false) : false
        end

        # Routes the payload to the local LLM by request_type
        # (structured / embed / default chat).
        def call_local_llm(payload)
          return { error: 'llm_not_available' } unless defined?(Legion::LLM)

          case payload[:request_type]&.to_s
          when 'structured'
            Legion::LLM.structured_direct(messages: payload[:messages], schema: payload[:schema])
          when 'embed'
            input = payload[:text] || payload.dig(:messages, 0, :content)
            Legion::LLM.embed_direct(input, model: payload[:model])
          else
            Legion::LLM.chat_direct(model: payload[:model], message: payload.dig(:messages, 0, :content))
          end
        end

        # Wraps the raw LLM response with correlation id and usage metadata.
        def build_response(correlation_id, response)
          {
            correlation_id: correlation_id,
            response: response,
            input_tokens: extract_token(response, :input_tokens),
            output_tokens: extract_token(response, :output_tokens),
            thinking_tokens: extract_token(response, :thinking_tokens),
            provider: extract_field(response, :provider),
            model_id: extract_field(response, :model)
          }
        end

        # Publishes the JSON-encoded response to the requester's reply queue.
        # Best-effort: any failure is logged (when possible) and swallowed.
        def publish_reply(reply_to, correlation_id, response_hash)
          return unless defined?(Legion::Transport)

          body = if defined?(Legion::JSON)
                   Legion::JSON.dump(response_hash)
                 else
                   require 'json'
                   ::JSON.generate(response_hash)
                 end

          channel = Legion::Transport.connection.create_channel
          channel.default_exchange.publish(
            body,
            routing_key: reply_to,
            correlation_id: correlation_id,
            content_type: 'application/json'
          )
          channel.close
        rescue StandardError => e
          Legion::Logging.warn("Fleet::Handler: publish_reply failed: #{e.message}") if defined?(Legion::Logging)
        end

        # Usage counter from the response; 0 when the reader is absent.
        def extract_token(response, field)
          response.respond_to?(field) ? response.public_send(field).to_i : 0
        end

        # Metadata field from the response; nil when the reader is absent.
        def extract_field(response, field)
          response.respond_to?(field) ? response.public_send(field) : nil
        end
      end
    end
  end
end
|