llm.rb 6.0.0 → 7.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +82 -0
- data/README.md +15 -5
- data/lib/llm/agent.rb +34 -6
- data/lib/llm/buffer.rb +8 -0
- data/lib/llm/compactor.rb +27 -9
- data/lib/llm/context/deserializer.rb +1 -0
- data/lib/llm/context.rb +49 -23
- data/lib/llm/loop_guard.rb +1 -10
- data/lib/llm/provider/transport/http/execution.rb +1 -1
- data/lib/llm/provider/transport/http/interruptible.rb +99 -94
- data/lib/llm/provider/transport/http.rb +4 -3
- data/lib/llm/provider.rb +8 -0
- data/lib/llm/skill.rb +2 -0
- data/lib/llm/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
```diff
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 6c923952039095a2234eb1bd5c058a951b0d797d27577cdf7f679df59b49060b
+  data.tar.gz: 3667e0d79e44634f769dfced198dd07c1039f173cb43b72aab7d3204aa3638f8
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 655d450b2ffeb71ed9564b7c5c23a2a86e9e385de9dc1abdac18588e460cffdecd1b2da1d5ef9fc162dc3f3286b7d2c979baec3953cd1ddbdab74d1ef5b87112
+  data.tar.gz: a044fedb675c4d92eff55c210d588b68b80c7e3967188674c2de4d8f6bc69d76e8f15c18f49fb54e09a8c93dff89074304d231609337bfa3bc79c96e1f3f576b
```
data/CHANGELOG.md
CHANGED
```diff
@@ -1,5 +1,87 @@
 # Changelog
 
+## Unreleased
+
+## v7.0.0
+
+Changes since `v6.1.0`.
+
+This release turns agent tool-loop limit errors into in-band advisory
+returns so the LLM can react to rate limits and continue the loop. It
+adds `tool_attempts: nil` as a way to opt out of advisory tool-limit
+returns entirely, and fixes the default provider HTTP path to keep
+`net-http-persistent` optional when not explicitly enabled.
+
+### Breaking
+
+* **Return in-band tool-loop limit errors from agents** <br>
+  Stop raising `LLM::ToolLoopError` when an agent exhausts its tool loop
+  attempt budget, and instead send advisory `LLM::Function::Return`
+  errors back through the model so the LLM can react to the rate limit
+  in-band and continue the loop.
+
+* **Allow `tool_attempts: nil` to disable advisory tool-limit returns** <br>
+  Keep the default `tool_attempts` budget at `25`, but treat an explicit
+  `tool_attempts: nil` as an opt-out that disables advisory tool-limit
+  returns entirely.
+
+### Fix
+
+* **Keep `net-http-persistent` optional on normal HTTP requests** <br>
+  Stop the default provider HTTP path from loading `net/http/persistent`
+  unless persistent transport support is explicitly enabled.
+
+## v6.1.0
+
+Changes since `v6.0.0`.
+
+This release tightens interrupt and compaction behavior for long-running
+contexts. It adds `LLM::Buffer#rindex`, supports percentage-based token
+thresholds in `LLM::Compactor`, tracks persisted compaction state through
+context serialization, reliably interrupts Async-backed requests, preserves
+valid tool-call history on cancellation, keeps concurrent skill tool loops
+running on streamed agents, and returns zero-valued usage objects when no
+provider usage has been recorded yet.
+
+### Change
+
+* **Add `LLM::Buffer#rindex`** <br>
+  Add `LLM::Buffer#rindex` as a direct forward to the underlying message
+  array so callers can find the last matching message index through the
+  buffer API.
+
+* **Support percentage compaction token thresholds** <br>
+  Let `LLM::Compactor` accept `token_threshold:` values like `"90%"` so
+  compaction can trigger at a percentage of the active model context
+  window.
+
+### Fix
+
+* **Interrupt Async-backed requests reliably** <br>
+  Track request ownership through the provider transport so contexts use
+  the active Async task when available, letting `ctx.interrupt!`
+  reliably cancel streamed requests under Async runtimes and surface
+  them as `LLM::Interrupt`.
+
+* **Preserve valid tool-call history on cancellation** <br>
+  Append cancelled tool-return messages for unresolved tool calls during
+  `ctx.interrupt!` so follow-up provider requests do not fail with
+  invalid tool-call history after pending tool work is cancelled.
+
+* **Preserve concurrent skill tool loops on streamed agents** <br>
+  Propagate the active agent concurrency through the effective request
+  stream so nested skill agents keep using queued `wait(...)` tool
+  execution instead of falling back to direct `:call` execution.
+
+* **Track persisted compaction state on contexts** <br>
+  Mark contexts as compacted after `LLM::Compactor#compact!`, persist and
+  restore that state through context serialization, and clear it after the
+  next successful model response.
+
+* **Return zero-valued usage objects from contexts** <br>
+  Make `LLM::Context#usage` consistently return an `LLM::Object`, using a
+  zero-valued usage object when no provider usage has been recorded yet.
+
 ## v6.0.0
 
 Changes since `v5.4.0`.
```
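To make the v7.0.0 `tool_attempts:` semantics above concrete, here is a minimal usage sketch; `agent` and `prompt` are placeholders, and the agent setup itself is not shown in this diff:

```ruby
# Sketch only: assumes an LLM::Agent instance responding to #talk.
agent.talk(prompt)                     # default budget of 25 tool-call iterations
agent.talk(prompt, tool_attempts: 5)   # smaller budget; advisory returns after 5
agent.talk(prompt, tool_attempts: nil) # opt out of advisory tool-limit returns
```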
data/README.md
CHANGED
````diff
@@ -4,7 +4,7 @@
 <p align="center">
 <a href="https://0x1eef.github.io/x/llm.rb?rebuild=1"><img src="https://img.shields.io/badge/docs-0x1eef.github.io-blue.svg" alt="RubyDoc"></a>
 <a href="https://opensource.org/license/0bsd"><img src="https://img.shields.io/badge/License-0BSD-orange.svg?" alt="License"></a>
-<a href="https://github.com/llmrb/llm.rb/tags"><img src="https://img.shields.io/badge/version-
+<a href="https://github.com/llmrb/llm.rb/tags"><img src="https://img.shields.io/badge/version-7.0.0-green.svg?" alt="Version"></a>
 </p>
 
 ## About
@@ -163,12 +163,15 @@ and when a stream is present it emits `on_compaction` and
 `on_compaction_finish` through [`LLM::Stream`](https://0x1eef.github.io/x/llm.rb/LLM/Stream.html).
 The compactor can also use a different model from the main context, which is
 useful when you want summarization to run on a cheaper or faster model.
+`token_threshold:` accepts either a fixed token count or a percentage string
+like `"90%"`, which resolves against the active model context window and
+triggers compaction once total token usage goes over that percentage.
 
 ```ruby
 ctx = LLM::Context.new(
   llm,
   compactor: {
-
+    token_threshold: "90%",
     retention_window: 8,
     model: "gpt-5.4-mini"
   }
@@ -367,6 +370,10 @@ worker.join
   or experimental `:ractor` support for class-based tools. MCP tools are not
   supported by the current `:ractor` mode, but mixed tool sets can still
   route MCP tools and local tools through different strategies at runtime.
+  By default, the tool attempt budget is `25`. When an agent exhausts that
+  budget, it sends advisory tool errors back through the model instead of
+  raising out of the runtime. Set `tool_attempts: nil` to disable that
+  advisory behavior.
 - **Tool calls have an explicit lifecycle** <br>
   A tool call can be executed, cancelled through
   [`LLM::Function#cancel`](https://0x1eef.github.io/x/llm.rb/LLM/Function.html#cancel-instance_method),
@@ -622,9 +629,12 @@ This example uses [`LLM::Context`](https://0x1eef.github.io/x/llm.rb/LLM/Context
 [`LLM::Stream`](https://0x1eef.github.io/x/llm.rb/LLM/Stream.html) together so
 long-lived contexts can summarize older history and expose the lifecycle
 through stream hooks. This approach is inspired by General Intelligence
-Systems
+Systems. The
 compactor can also use its own `model:` if you want summarization to run on a
-different model from the main context.
+different model from the main context. `token_threshold:` accepts either a
+fixed token count or a percentage string like `"90%"`, which resolves
+against the active model context window and triggers compaction once total
+token usage goes over that percentage. <br> See the [deepdive (web)](https://0x1eef.github.io/x/llm.rb/file.deepdive.html) or [deepdive (markdown)](resources/deepdive.md) for more examples.
 
 ```ruby
 require "llm"
@@ -644,7 +654,7 @@ ctx = LLM::Context.new(
   llm,
   stream: Stream.new,
   compactor: {
-
+    token_threshold: "90%",
     retention_window: 8,
     model: "gpt-5.4-mini"
   }
````
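The compaction lifecycle hooks named above can be observed through a stream object. A sketch of a hypothetical stream subclass: `on_compaction_finish(ctx, compactor)` mirrors the call site visible in `llm/compactor.rb` below, while the `on_compaction` signature is assumed to match it:

```ruby
# Hypothetical stream that logs the compaction lifecycle.
class Stream < LLM::Stream
  def on_compaction(ctx, compactor)
    puts "compaction started"
  end

  def on_compaction_finish(ctx, compactor)
    puts "compaction finished: #{ctx.messages.to_a.size} messages remain"
  end
end
```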
data/lib/llm/agent.rb
CHANGED
```diff
@@ -19,6 +19,9 @@ module LLM
 # * The automatic tool loop enables the wrapped context's `guard` by default.
 #   The built-in {LLM::LoopGuard LLM::LoopGuard} detects repeated tool-call
 #   patterns and blocks stuck execution before more tool work is queued.
+# * The default tool attempt budget is `25`. After that, the agent sends
+#   advisory tool errors back through the model and keeps the loop in-band.
+#   Set `tool_attempts: nil` to disable that advisory behavior.
 # * Tool loop execution can be configured with `concurrency :call`,
 #   `:thread`, `:task`, `:fiber`, `:ractor`, or a list of queued task
 #   types such as `[:thread, :ractor]`.
@@ -161,7 +164,10 @@ module LLM
 #
 # @param prompt (see LLM::Provider#complete)
 # @param [Hash] params The params passed to the provider, including optional :stream, :tools, :schema etc.
-# @option params [Integer] :tool_attempts
+# @option params [Integer] :tool_attempts
+#   The maximum number of tool call iterations before the agent sends
+#   in-band advisory tool errors back through the model (default 25).
+#   Set to `nil` to disable advisory tool-limit returns.
 # @return [LLM::Response] Returns the LLM's response for this turn.
 # @example
 #   llm = LLM.openai(key: ENV["KEY"])
@@ -180,7 +186,10 @@ module LLM
 # @note Not all LLM providers support this API
 # @param prompt (see LLM::Provider#complete)
 # @param [Hash] params The params passed to the provider, including optional :stream, :tools, :schema etc.
-# @option params [Integer] :tool_attempts
+# @option params [Integer] :tool_attempts
+#   The maximum number of tool call iterations before the agent sends
+#   in-band advisory tool errors back through the model (default 25).
+#   Set to `nil` to disable advisory tool-limit returns.
 # @return [LLM::Response] Returns the LLM's response for this turn.
 # @example
 #   llm = LLM.openai(key: ENV["KEY"])
@@ -393,18 +402,37 @@ module LLM
 
 def run_loop(method, prompt, params)
   loop = proc do
-    max =
+    max = params.key?(:tool_attempts) ? params.delete(:tool_attempts) : 25
+    max = Integer(max) if max
+    stream = params[:stream] || @ctx.params[:stream]
+    stream.extra[:concurrency] = concurrency if LLM::Stream === stream
     res = @ctx.public_send(method, apply_instructions(prompt), params)
-
+    loop do
       break if @ctx.functions.empty?
-
+      if max
+        max.times do
+          break if @ctx.functions.empty?
+          res = @ctx.public_send(method, call_functions, params)
+        end
+        break if @ctx.functions.empty?
+        res = @ctx.public_send(method, @ctx.functions.map { rate_limit(_1) }, params)
+      else
+        res = @ctx.public_send(method, call_functions, params)
+      end
     end
-    raise LLM::ToolLoopError, "pending tool calls remain" unless @ctx.functions.empty?
     res
   end
   @tracer ? @llm.with_tracer(@tracer, &loop) : loop.call
 end
 
+def rate_limit(function)
+  LLM::Function::Return.new(function.id, function.name, {
+    error: true,
+    type: LLM::ToolLoopError.name,
+    message: "tool loop rate limit reached"
+  })
+end
+
 def resolve_option(option)
   Proc === option ? instance_exec(&option) : option
 end
```
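For illustration, the advisory payload that `rate_limit` builds for each pending call looks like this; the id and name below are hypothetical values:

```ruby
# Sent back through the model in place of a raised LLM::ToolLoopError.
LLM::Function::Return.new("call_123", "search", {
  error: true,
  type: "LLM::ToolLoopError", # LLM::ToolLoopError.name
  message: "tool loop rate limit reached"
})
```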
data/lib/llm/buffer.rb
CHANGED
```diff
@@ -52,6 +52,14 @@ module LLM
   reverse_each.find(...)
 end
 
+##
+# Returns the index of the last message matching the given block.
+# @yield [LLM::Message]
+# @return [Integer, nil]
+def rindex(...)
+  @messages.rindex(...)
+end
+
 ##
 # Returns the last message(s) in the buffer
 # @param [Integer, nil] n
```
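Since `rindex` forwards directly to `Array#rindex`, usage is the familiar block form. A small sketch against a context's buffer:

```ruby
# Index of the last assistant message, or nil when none matches.
i = ctx.messages.rindex { |message| message.assistant? }
```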
data/lib/llm/compactor.rb
CHANGED
```diff
@@ -5,13 +5,14 @@
 # smaller replacement message when a context grows too large.
 #
 # This work is directly inspired by the compaction approach developed by
-# General Intelligence Systems
-# [Brute](https://github.com/general-intelligence-systems/brute).
+# General Intelligence Systems.
 #
 # The compactor can also use a different model from the main context by
 # setting `model:` in the compactor config. Compaction thresholds are opt-in:
 # provide `message_threshold:` and/or `token_threshold:` to enable policy-
-# driven compaction.
+# driven compaction. `token_threshold:` accepts either an integer token count
+# or a percentage string like `"90%"`, which resolves against the current
+# model context window.
 class LLM::Compactor
   DEFAULTS = {
     retention_window: 8,
@@ -25,8 +26,11 @@ class LLM::Compactor
   ##
   # @param [LLM::Context] ctx
   # @param [Hash] config
-  # @option config [Integer, nil] :token_threshold
-  #   Enables token-based compaction.
+  # @option config [Integer, String, nil] :token_threshold
+  #   Enables token-based compaction. Integer values are treated as a fixed
+  #   token count. Percentage strings like `"90%"` are resolved against
+  #   {LLM::Context#context_window}; if the context window is unknown, the
+  #   percentage threshold is treated as disabled.
   # @option config [Integer, nil] :message_threshold
   #   Enables message-count-based compaction.
   # @option config [Integer] :retention_window
@@ -39,18 +43,22 @@ class LLM::Compactor
   end
 
   ##
-  # Returns true when the context should be compacted
+  # Returns true when the context should be compacted.
+  #
+  # When `token_threshold:` is a percentage string such as `"90%"`, the
+  # threshold is resolved against the current context window and compared to
+  # the current total token usage.
   # @param [Object] prompt
   #   The next prompt or turn input
   # @return [Boolean]
-  def
+  def compactable?(prompt = nil)
     return false if ctx.functions.any? || [*prompt].grep(LLM::Function::Return).any?
     messages = ctx.messages.reject(&:system?)
     return true if config[:message_threshold] && messages.size > config[:message_threshold]
-
-    return true if config[:token_threshold] && usage && usage.total_tokens > config[:token_threshold]
+    return true if token_threshold and ctx.usage.total_tokens > token_threshold
     false
   end
+  alias_method :compact?, :compactable?
 
   ##
   # Summarize older messages and replace them with a compact summary.
@@ -68,6 +76,7 @@ class LLM::Compactor
   older = messages[0...(messages.size - recent.size)]
   summary = LLM::Message.new(ctx.llm.user_role, "[Previous conversation summary]\n\n#{summarize(older)}", {compaction: true})
   ctx.messages.replace([*ctx.messages.take_while(&:system?), summary, *recent])
+  ctx.compacted = true
   stream.on_compaction_finish(ctx, self) if LLM::Stream === stream
   summary
 end
@@ -84,6 +93,15 @@ class LLM::Compactor
   messages[start..] || []
 end
 
+def token_threshold
+  @token_threshold ||= begin
+    threshold = config[:token_threshold]
+    return threshold unless threshold.to_s.end_with?("%")
+    return if ctx.context_window <= 0
+    (ctx.context_window * threshold.delete_suffix("%").to_f / 100).floor
+  end
+end
+
 def summarize(messages)
   model = config[:model] || ctx.params[:model] || ctx.llm.default_model
   ctx.llm.complete(summary_prompt(messages), model:).content
```
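The percentage arithmetic in `token_threshold` above is plain Ruby; a worked example, assuming a hypothetical 200,000-token context window:

```ruby
threshold = "90%"
(200_000 * threshold.delete_suffix("%").to_f / 100).floor # => 180_000
```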
data/lib/llm/context.rb
CHANGED
```diff
@@ -40,6 +40,14 @@ module LLM
 include Serializer
 include Deserializer
 
+ZERO_USAGE = LLM::Object.from(
+  input_tokens: 0,
+  output_tokens: 0,
+  reasoning_tokens: 0,
+  total_tokens: 0
+)
+private_constant :ZERO_USAGE
+
 ##
 # Returns the accumulated message history for this context
 # @return [LLM::Buffer<LLM::Message>]
@@ -88,8 +96,7 @@ module LLM
 ##
 # Returns a context compactor
 # This feature is inspired by the compaction approach developed by
-# General Intelligence Systems
-# [Brute](https://github.com/general-intelligence-systems/brute).
+# General Intelligence Systems.
 # @return [LLM::Compactor]
 def compactor
   @compactor = LLM::Compactor.new(self, @compactor || {}) unless LLM::Compactor === @compactor
@@ -104,6 +111,14 @@ module LLM
   @compactor = compactor
 end
 
+##
+# Returns whether the context has been compacted and no later model
+# response has cleared that state.
+# @return [Boolean]
+# @api private
+attr_accessor :compacted
+alias_method :compacted?, :compacted
+
 ##
 # Returns a guard, if configured.
 #
@@ -172,13 +187,14 @@ module LLM
 #   puts res.messages[0].content
 def talk(prompt, params = {})
   return respond(prompt, params) if mode == :responses
-  @owner =
+  @owner = @llm.request_owner
   compactor.compact!(prompt) if compactor.compact?(prompt)
   params = params.merge(messages: @messages.to_a)
   params = @params.merge(params)
   prompt, params = transform(prompt, params)
   bind!(params[:stream], params[:model], params[:tools])
   res = @llm.complete(prompt, params)
+  self.compacted = false
   role = params[:role] || @llm.user_role
   role = @llm.tool_role if params[:role].nil? && [*prompt].grep(LLM::Function::Return).any?
   @messages.concat LLM::Prompt === prompt ? prompt.to_a : [LLM::Message.new(role, prompt)]
@@ -201,7 +217,7 @@ module LLM
 #   res = ctx.respond("What is the capital of France?")
 #   puts res.output_text
 def respond(prompt, params = {})
-  @owner =
+  @owner = @llm.request_owner
   compactor.compact!(prompt) if compactor.compact?(prompt)
   params = @params.merge(params)
   prompt, params = transform(prompt, params)
@@ -209,6 +225,7 @@ module LLM
   res_id = params[:store] == false ? nil : @messages.find(&:assistant?)&.response&.response_id
   params = params.merge(previous_response_id: res_id, input: @messages.to_a).compact
   res = @llm.responses.create(prompt, params)
+  self.compacted = false
   role = params[:role] || @llm.user_role
   @messages.concat LLM::Prompt === prompt ? prompt.to_a : [LLM::Message.new(role, prompt)]
   @messages.concat [res.choices[-1]]
@@ -313,27 +330,31 @@ module LLM
 # This is inspired by Go's context cancellation model.
 # @return [nil]
 def interrupt!
+  pending = functions.to_a
   llm.interrupt!(@owner)
   queue&.interrupt!
+  return if pending.empty?
+  pending.each(&:interrupt!)
+  returns = pending.map { _1.cancel(reason: "function call cancelled") }
+  @messages << LLM::Message.new(@llm.tool_role, returns)
+  nil
 end
 alias_method :cancel!, :interrupt!
 
 ##
 # Returns token usage accumulated in this context
-# @
-# This method returns token usage for the latest
-# assistant message, and it returns nil for non-assistant
-# messages.
-# @return [LLM::Object, nil]
+# @return [LLM::Object]
 def usage
-  usage = @messages.find(&:assistant?)&.usage
-
-
-
-
-
-
-
+  if usage = @messages.find(&:assistant?)&.usage
+    LLM::Object.from(
+      input_tokens: usage.input_tokens || 0,
+      output_tokens: usage.output_tokens || 0,
+      reasoning_tokens: usage.reasoning_tokens || 0,
+      total_tokens: usage.total_tokens || 0
+    )
+  else
+    ZERO_USAGE
+  end
 end
 
 ##
@@ -403,7 +424,12 @@ module LLM
 ##
 # @return [Hash]
 def to_h
-  {
+  {
+    schema_version: 1,
+    model:,
+    compacted:,
+    messages: @messages.map { serialize_message(_1) }
+  }
 end
 
 ##
@@ -432,12 +458,12 @@ module LLM
 # Returns an _approximate_ cost for a given context
 # based on both the provider, and model
 def cost
-  return LLM::Cost.new(0, 0) unless usage
   cost = LLM.registry_for(llm).cost(model:)
-
-
-
-
+  input_cost = (cost.input.to_f / 1_000_000.0) * usage.input_tokens
+  output_cost = (cost.output.to_f / 1_000_000.0) * usage.output_tokens
+  LLM::Cost.new(input_cost, output_cost)
+rescue LLM::NoSuchModelError, LLM::NoSuchRegistryError
+  LLM::Cost.new(0, 0)
 end
 
 ##
```
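The `cost` arithmetic above prices tokens per million. A worked example with hypothetical prices and usage counts:

```ruby
# $3.00 per million input tokens, $15.00 per million output tokens.
input_cost  = (3.0 / 1_000_000.0) * 12_000 # => 0.036
output_cost = (15.0 / 1_000_000.0) * 2_000 # => 0.03
```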
data/lib/llm/loop_guard.rb
CHANGED
```diff
@@ -10,8 +10,7 @@
 #
 # {LLM::LoopGuard LLM::LoopGuard} detects when a context is repeating the same
 # tool-call pattern instead of making progress. It is directly inspired by
-# General Intelligence Systems
-# approach.
+# General Intelligence Systems and its doom-loop detection approach.
 #
 # The public interface is intentionally small:
 # - `call(ctx)` returns `nil` when no intervention is needed
@@ -22,14 +21,6 @@
 # {LLM::Agent LLM::Agent} enables this guard by default through its wrapped
 # context.
 #
-# Brute is MIT licensed. The relevant license grant is:
-#
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so.
 class LLM::LoopGuard
   ##
   # The default number of repeated tool-call patterns required before
```
data/lib/llm/provider/transport/http/execution.rb
CHANGED
```diff
@@ -38,7 +38,7 @@ module LLM::Provider::Transport
     perform_request(http, request, stream, stream_parser, &b)
   end
   [handle_response(res, tracer, span), span, tracer]
-rescue *
+rescue *transport.interrupt_errors
   raise LLM::Interrupt, "request interrupted" if transport.interrupted?(owner)
   raise
 end
```
data/lib/llm/provider/transport/http/interruptible.rb
CHANGED
```diff
@@ -1,109 +1,114 @@
 # frozen_string_literal: true
 
 class LLM::Provider
-
-
-
-
-
-
-
-
-
-
-
-
-  INTERRUPT_ERRORS = [::IOError, ::EOFError, Errno::EBADF].freeze
-  Request = Struct.new(:http, :connection, keyword_init: true)
+  ##
+  # Internal request interruption methods for
+  # {LLM::Provider::Transport::HTTP}.
+  #
+  # This module tracks active requests by execution owner and provides
+  # the logic used to interrupt an in-flight request by closing the
+  # active HTTP connection.
+  #
+  # @api private
+  module Transport::HTTP::Interruptible
+    INTERRUPT_ERRORS = [::IOError, ::EOFError, Errno::EBADF].freeze
+    Request = Struct.new(:http, :connection, keyword_init: true)
 
-
-
-
-    # The execution owner whose request should be interrupted
-    # @return [nil]
-    def interrupt!(owner)
-      req = request_for(owner) or return
-      lock { (@interrupts ||= {})[owner] = true }
-      if persistent_http?(req.http)
-        close_socket(req.connection&.http)
-        req.http.finish(req.connection)
-      elsif transient_http?(req.http)
-        close_socket(req.http)
-        req.http.finish if req.http.active?
-      end
-    rescue *INTERRUPT_ERRORS
-      nil
-    end
-
-    private
+    def interrupt_errors
+      [*INTERRUPT_ERRORS, *optional_interrupt_errors]
+    end
 
-
-
-
-
-
-
-
-
-
-
+    ##
+    # Interrupt an active request, if any.
+    # @param [Fiber] owner
+    #   The execution owner whose request should be interrupted
+    # @return [nil]
+    def interrupt!(owner)
+      req = request_for(owner) or return
+      lock { (@interrupts ||= {})[owner] = true }
+      if persistent_http?(req.http)
+        close_socket(req.connection&.http)
+        req.http.finish(req.connection)
+      elsif transient_http?(req.http)
+        close_socket(req.http)
+        req.http.finish if req.http.active?
+      end
+      owner.stop if owner.respond_to?(:stop)
+    rescue *interrupt_errors
+      nil
+    end
 
-
-    # Returns whether the active request is using a transient HTTP client.
-    # @param [Object, nil] http
-    # @return [Boolean]
-    def transient_http?(http)
-      Net::HTTP === http
-    end
+    private
 
-
-
-
-
-
-
-
+    ##
+    # Closes the active socket for a request, if present.
+    # @param [Net::HTTP, nil] http
+    # @return [nil]
+    def close_socket(http)
+      socket = http&.instance_variable_get(:@socket) or return
+      socket = socket.io if socket.respond_to?(:io)
+      socket.close
+    rescue *interrupt_errors
+      nil
+    end
 
-
-
-
-
-
-
-
-      @requests[owner]
-    end
-  end
+    ##
+    # Returns whether the active request is using a transient HTTP client.
+    # @param [Object, nil] http
+    # @return [Boolean]
+    def transient_http?(http)
+      Net::HTTP === http
+    end
 
-
-
-
-
-
-
-
-      @requests ||= {}
-      @requests[owner] = req
-    end
-  end
+    ##
+    # Returns whether the active request is using a persistent HTTP client.
+    # @param [Object, nil] http
+    # @return [Boolean]
+    def persistent_http?(http)
+      defined?(Net::HTTP::Persistent) && Net::HTTP::Persistent === http
+    end
 
-
-
-
-
-
-
-
+    ##
+    # Returns the active request for an execution owner.
+    # @param [Fiber] owner
+    # @return [Request, nil]
+    def request_for(owner)
+      lock do
+        @requests ||= {}
+        @requests[owner]
+      end
+    end
 
-
-
-
-
-
-
-
+    ##
+    # Records an active request for an execution owner.
+    # @param [Request] req
+    # @param [Fiber] owner
+    # @return [Request]
+    def set_request(req, owner)
+      lock do
+        @requests ||= {}
+        @requests[owner] = req
       end
     end
+
+    ##
+    # Clears the active request for an execution owner.
+    # @param [Fiber] owner
+    # @return [Request, nil]
+    def clear_request(owner)
+      lock { @requests&.delete(owner) }
+    end
+
+    ##
+    # Returns whether an execution owner was interrupted.
+    # @param [Fiber] owner
+    # @return [Boolean, nil]
+    def interrupted?(owner)
+      lock { @interrupts&.delete(owner) }
+    end
+
+    def optional_interrupt_errors
+      defined?(::Async::Stop) ? [Async::Stop] : []
+    end
   end
 end
```
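At the API surface, the machinery above backs `LLM::Context#interrupt!`. A usage sketch following the worker pattern shown in the README; the prompt text is hypothetical:

```ruby
ctx = LLM::Context.new(llm)
worker = Thread.new { ctx.talk("Stream a long answer...") }
# From another thread: closes the active HTTP connection (and stops the
# owning Async task, if any); pending tool calls are cancelled and
# recorded as tool-return messages so the history stays valid.
ctx.interrupt!
worker.join
```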
data/lib/llm/provider/transport/http.rb
CHANGED
```diff
@@ -50,9 +50,10 @@ class LLM::Provider
 
   ##
   # Returns the current request owner.
-  # @return [
+  # @return [Object]
   def request_owner
-    Fiber.current
+    return Fiber.current unless defined?(::Async)
+    Async::Task.current || Fiber.current
   end
 
   ##
@@ -70,7 +71,7 @@ class LLM::Provider
   ##
   # @return [Boolean]
   def persistent?
-
+    !@persistent_client.nil?
   end
 
   ##
```
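A sketch of how the new owner resolution behaves; it assumes the `async` gem is loaded when the reactor branch is exercised:

```ruby
require "async" # assumption: only needed for the reactor case

llm.request_owner # => Fiber.current outside any Async reactor

Async do
  llm.request_owner # => Async::Task.current inside a reactor
end
```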
data/lib/llm/provider.rb
CHANGED
```diff
@@ -338,6 +338,14 @@ class LLM::Provider
   end
   alias_method :cancel!, :interrupt!
 
+  ##
+  # Returns the current request owner used by the transport.
+  # @return [Object]
+  # @api private
+  def request_owner
+    transport.request_owner
+  end
+
   ##
   # @param [Object] stream
   # @return [Boolean]
```
data/lib/llm/skill.rb
CHANGED
```diff
@@ -76,6 +76,8 @@ module LLM
 def call(ctx)
   instructions, tools, tracer = self.instructions, self.tools, ctx.llm.tracer
   params = ctx.params.merge(mode: ctx.mode).reject { [:tools, :schema].include?(_1) }
+  concurrency = params[:stream].extra[:concurrency] if LLM::Stream === params[:stream]
+  params[:concurrency] = concurrency if concurrency
   agent = Class.new(LLM::Agent) do
     instructions(instructions)
     tools(*tools)
```
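The concurrency being propagated here is the agent-level setting documented in `llm/agent.rb` above; the skill change forwards it to nested skill agents through the stream's `extra` slot. A minimal sketch of declaring it on a hypothetical agent class:

```ruby
# Threaded tool-loop execution, per the agent docs; nested skill agents
# now inherit this instead of falling back to direct :call execution.
agent_class = Class.new(LLM::Agent) do
  concurrency :thread
end
```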
data/lib/llm/version.rb
CHANGED