llm.rb 4.14.0 → 4.15.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: ea1addf0bff644fa11e4f69a806f8ff5b7aa04fbbbc3f0592bd51b6ebc07f0f8
- data.tar.gz: a3c846b9744e4ef230e2f23ed6ab42f6b4c84a0165b8bc066b7f6a003ee8fc00
+ metadata.gz: 40217f9b44b00028739994a8f6f6a278b366d7fa7f4799b86afd2b793f367084
+ data.tar.gz: cf7e6d7935cf6ab8479ac864e09ea7a4403a97345f98e30ae03de9ae941c97a0
  SHA512:
- metadata.gz: 7387da06d824d42753ff30455b0e464b7ca6eaa43e9410ce814ad96451c5595154d1e721fb69c9edc0971208aaf8a011ce42078827b57971e0e7c0a66eb0db6e
- data.tar.gz: 590442f434086b7215d664e6b5d474130499a14fba16810ff7e0b04878d25e46ca8983057af5fd9275d8415d95da6e1439b84388fa450b8c06bc7841c832a48e
+ metadata.gz: 3850bb93244032d3ee721c1a7bc85efd0e86f605a421960abfbab56b89b9b4c36d97efb68970759561f08f165069e4c2e213132c0bb485b3366c57bebb71e3ad
+ data.tar.gz: 2856c27c38e6d6d8d659d8a21c13b4ea514fdc0a0bd024935b2ea2f20d10eac96f324489f1c06a08c9bad6be5f97756eedc38b9f301e7a78533a8422f02cd329
data/CHANGELOG.md CHANGED
@@ -2,8 +2,57 @@
 
  ## Unreleased
 
+ Changes since `v4.15.0`.
+
+ ## v4.15.0
+
  Changes since `v4.14.0`.
 
+ ### Change
+
+ * **Reduce OpenAI stream parser merge overhead** <br>
+ Special-case the most common single-field deltas, streamline
+ incremental tool-call merging, and avoid repeated JSON parse attempts
+ until streamed tool arguments look complete.
+
+ * **Cache streaming callback capabilities in parsers** <br>
+ Cache callback support checks once at parser initialization time in
+ the OpenAI, OpenAI Responses, Anthropic, Google, and Ollama stream
+ parsers instead of repeating `respond_to?` checks on hot streaming
+ paths.
+
+ * **Reduce OpenAI Responses parser lookup overhead** <br>
+ Special-case the hot Responses API event paths and cache the current
+ output item and content part so streamed output text deltas do less
+ repeated nested lookup work.
+
+ * **Add a Sequel context persistence plugin** <br>
+ Add `plugin :llm` for Sequel models so apps can persist
+ `LLM::Context` state with default columns and pass provider setup
+ through `provider:` when needed. The plugin now also supports
+ `format: :string`, `:json`, or `:jsonb` for text and native JSON
+ storage when Sequel JSON typecasting is enabled.
+
+ * **Improve streaming parser performance** <br>
+ In the local replay-based `stream_parser` benchmark versus
+ `v4.14.0` (median of 20 samples, 5000 iterations), plain Ruby shows a
+ small overall win: the generic eventstream path is about 0.4%
+ faster, the OpenAI stream parser is about 0.5% faster, and the
+ OpenAI Responses parser is about 1.6% faster, with unchanged
+ allocations. Under YJIT on the same benchmark, the generic
+ eventstream path is about 0.9% faster and the OpenAI stream parser
+ is about 0.4% faster, while the OpenAI Responses parser is about
+ 0.7% slower, also with unchanged allocations.
+
+ Compared to `v4.13.0`, the larger `v4.14.0` streaming gains still
+ hold. The generic eventstream path remains dramatically faster than
+ `v4.13.0`, the OpenAI stream parser remains modestly faster, and the
+ OpenAI Responses parser is roughly flat to slightly better depending
+ on runtime. In other words, this release keeps the large eventstream
+ win from `v4.14.0`, adds only small incremental changes on top, and
+ does not turn the post-`v4.14.0` parser work into another large
+ benchmark jump.
+
  ## v4.14.0
 
  Changes since `v4.13.0`.
@@ -40,6 +89,18 @@ parallel tool calls can safely share one connection.
  worthwhile, which lowers allocation churn in the remaining generic
  SSE path.
 
+ * **Improve streaming parser performance** <br>
+ In the local replay-based `stream_parser` benchmark versus `v4.13.0`
+ (median of 20 samples, 5000 iterations):
+ Plain Ruby: the generic eventstream path is about 53% faster with
+ about 32% fewer allocations, the OpenAI stream parser is about 11%
+ faster with about 4% fewer allocations, and the OpenAI Responses
+ parser is about 3% faster with unchanged allocations.
+ YJIT on the current parser benchmark harness: the current tree is
+ about 26% faster than non-YJIT on the generic eventstream path,
+ about 18% faster on the OpenAI stream parser, and about 16% faster
+ on the OpenAI Responses parser, with allocations unchanged.
+
  ### Fix
 
  * **Support parallel MCP tool calls on one client** <br>
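
The Sequel plugin entry above is prose-only, so here is a minimal sketch of how `plugin :llm` with the `format:` option might be wired up. Only `plugin :llm`, `provider:`, and the `format: :string`/`:json`/`:jsonb` values come from the changelog and README below; the table name, column layout, and the exact way `format:` is passed are assumptions, not the gem's documented defaults.

```ruby
# Illustrative sketch only: table/column names are assumptions.
require "llm"
require "sequel"
require "sequel/plugins/llm"

DB = Sequel.connect(ENV.fetch("DATABASE_URL"))

class Conversation < Sequel::Model(:conversations)
  # format: :jsonb is described as enabling native PostgreSQL JSON
  # storage (with Sequel JSON typecasting); format: :string keeps text.
  plugin :llm,
    format: :jsonb,
    provider: -> { {key: ENV["OPENAI_SECRET"], persistent: true} }
end
```
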
data/README.md CHANGED
@@ -4,7 +4,7 @@
  <p align="center">
  <a href="https://0x1eef.github.io/x/llm.rb?rebuild=1"><img src="https://img.shields.io/badge/docs-0x1eef.github.io-blue.svg" alt="RubyDoc"></a>
  <a href="https://opensource.org/license/0bsd"><img src="https://img.shields.io/badge/License-0BSD-orange.svg?" alt="License"></a>
- <a href="https://github.com/llmrb/llm.rb/tags"><img src="https://img.shields.io/badge/version-4.14.0-green.svg?" alt="Version"></a>
+ <a href="https://github.com/llmrb/llm.rb/tags"><img src="https://img.shields.io/badge/version-4.15.0-green.svg?" alt="Version"></a>
  </p>
 
  ## About
@@ -17,9 +17,9 @@ state.
  It is built for engineers who want control over how these systems run. llm.rb
  stays close to Ruby, runs on the standard library by default, loads optional
  pieces only when needed, and remains easy to extend. It also works well in
- Rails or ActiveRecord applications, where a small wrapper around context
- persistence is enough to save and restore long-lived conversation state across
- requests, jobs, or retries.
+ Rails or ActiveRecord applications and ships built-in Sequel plugin support,
+ so a small wrapper around context persistence is enough to save and restore
+ long-lived conversation state across requests, jobs, or retries.
 
  Most LLM libraries stop at request/response APIs. Building real systems means
  stitching together streaming, tools, state, persistence, and external
@@ -34,7 +34,8 @@ so they compose naturally instead of becoming separate subsystems.
 
  ## Core Concept
 
- `LLM::Context` is the execution boundary in llm.rb.
+ [`LLM::Context`](https://0x1eef.github.io/x/llm.rb/LLM/Context.html)
+ is the execution boundary in llm.rb.
 
  It holds:
  - message history
@@ -50,69 +51,89 @@ same context object.
 
  ### Execution Model
 
- - **A system layer, not just an API wrapper**
+ - **A system layer, not just an API wrapper** <br>
  Put providers, tools, MCP servers, and application APIs behind one runtime
  model instead of stitching them together by hand.
- - **Contexts are central**
+ - **Contexts are central** <br>
  Keep history, tools, schema, usage, persistence, and execution state in one
  place instead of spreading them across your app.
- - **Contexts can be serialized**
+ - **Contexts can be serialized** <br>
  Save and restore live state for jobs, databases, retries, or long-running
  workflows.
 
  ### Runtime Behavior
 
- - **Streaming and tool execution work together**
+ - **Streaming and tool execution work together** <br>
  Start tool work while output is still streaming so you can hide latency
  instead of waiting for turns to finish.
- - **Requests can be interrupted cleanly**
+ - **Tool calls have an explicit lifecycle** <br>
+ A tool call can be executed, cancelled through
+ [`LLM::Function#cancel`](https://0x1eef.github.io/x/llm.rb/LLM/Function.html#cancel-instance_method),
+ or left unresolved for manual handling, but the normal runtime contract is
+ still that a model-issued tool request is answered with a tool return.
+ - **Requests can be interrupted cleanly** <br>
  Stop in-flight provider work through the same runtime instead of treating
- cancellation as a separate concern. `LLM::Context#cancel!` is inspired by
- Go's context cancellation model.
- - **Concurrency is a first-class feature**
+ cancellation as a separate concern.
+ [`LLM::Context#cancel!`](https://0x1eef.github.io/x/llm.rb/LLM/Context.html#cancel-21-instance_method)
+ is inspired by Go's context cancellation model.
+ - **Concurrency is a first-class feature** <br>
  Use threads, fibers, or async tasks without rewriting your tool layer.
- - **Advanced workloads are built in, not bolted on**
+ - **Advanced workloads are built in, not bolted on** <br>
  Streaming, concurrent tool execution, persistence, tracing, and MCP support
  all fit the same runtime model.
 
  ### Integration
 
- - **MCP is built in**
+ - **MCP is built in** <br>
  Connect to MCP servers over stdio or HTTP without bolting on a separate
  integration stack.
- - **Provider support is broad**
+ - **Sequel persistence is built in** <br>
+ Use `plugin :llm` to persist `LLM::Context` state on a Sequel model with
+ sensible default columns, then pass provider setup through
+ `provider:` when you need it. Use `format: :string` for text columns or
+ `format: :jsonb` when you want native PostgreSQL JSON storage with Sequel's
+ JSON typecasting support enabled.
+ - **Persistent HTTP pooling is shared process-wide** <br>
+ When enabled, separate
+ [`LLM::Provider`](https://0x1eef.github.io/x/llm.rb/LLM/Provider.html)
+ instances with the same endpoint settings can share one persistent
+ pool, and separate HTTP
+ [`LLM::MCP`](https://0x1eef.github.io/x/llm.rb/LLM/MCP.html)
+ instances can do the same, instead of each object creating its own
+ isolated per-instance transport.
+ - **Provider support is broad** <br>
  Work with OpenAI, OpenAI-compatible endpoints, Anthropic, Google, DeepSeek,
  Z.ai, xAI, llama.cpp, and Ollama through the same runtime.
- - **Tools are explicit**
+ - **Tools are explicit** <br>
  Run local tools, provider-native tools, and MCP tools through the same path
  with fewer special cases.
- - **Providers are normalized, not flattened**
+ - **Providers are normalized, not flattened** <br>
  Share one API surface across providers without losing access to provider-
  specific capabilities where they matter.
- - **Responses keep a uniform shape**
+ - **Responses keep a uniform shape** <br>
  Provider calls return
  [`LLM::Response`](https://0x1eef.github.io/x/llm.rb/LLM/Response.html)
  objects as a common base shape, then extend them with endpoint- or
  provider-specific behavior when needed.
- - **Low-level access is still there**
+ - **Low-level access is still there** <br>
  Normalized responses still keep the raw `Net::HTTPResponse` available when
  you need headers, status, or other HTTP details.
- - **Local model metadata is included**
+ - **Local model metadata is included** <br>
  Model capabilities, pricing, and limits are available locally without extra
  API calls.
 
  ### Design Philosophy
 
- - **Runs on the stdlib**
+ - **Runs on the stdlib** <br>
  Start with Ruby's standard library and add extra dependencies only when you
  need them.
- - **It is highly pluggable**
+ - **It is highly pluggable** <br>
  Add tools, swap providers, change JSON backends, plug in tracing, or layer
  internal APIs and MCP servers into the same execution path.
- - **It scales from scripts to long-lived systems**
+ - **It scales from scripts to long-lived systems** <br>
  The same primitives work for one-off scripts, background jobs, and more
  demanding application workloads with streaming, persistence, and tracing.
- - **Thread boundaries are clear**
+ - **Thread boundaries are clear** <br>
  Providers are shareable. Contexts are stateful and should stay thread-local.
 
  ## Capabilities
@@ -145,7 +166,11 @@ same context object.
  gem install llm.rb
  ```
 
- ## Example
+ ## Examples
+
+ **REPL**
+
+ See the [deepdive](https://0x1eef.github.io/x/llm.rb/file.deepdive.html) for more examples.
 
  ```ruby
  require "llm"
@@ -160,6 +185,24 @@ loop do
  end
  ```
 
+ **Sequel (ORM)**
+
+ See the [deepdive](https://0x1eef.github.io/x/llm.rb/file.deepdive.html) for more examples.
+
+ ```ruby
+ require "llm"
+ require "sequel"
+ require "sequel/plugins/llm"
+
+ class Context < Sequel::Model
+ plugin :llm, provider: -> { { key: ENV["#{provider.upcase}_SECRET"], persistent: true } }
+ end
+
+ ctx = Context.create(provider: "openai", model: "gpt-5.4-mini")
+ ctx.talk("Remember that my favorite language is Ruby")
+ puts ctx.talk("What is my favorite language?").content
+ ```
+
  ## Resources
 
  - [deepdive](https://0x1eef.github.io/x/llm.rb/file.deepdive.html) is the
data/lib/llm/context.rb CHANGED
@@ -2,16 +2,21 @@
 
  module LLM
  ##
- # {LLM::Context LLM::Context} represents a stateful interaction with
- # an LLM, including conversation history, tools, execution state,
- # and cost tracking. It evolves over time as the system runs.
+ # {LLM::Context LLM::Context} is the stateful execution boundary in
+ # llm.rb.
  #
- # Context is the stateful environment in which an LLM operates.
- # This is not just prompt context; it is an active, evolving
- # execution boundary for LLM workflows.
+ # It holds the evolving runtime state for an LLM workflow:
+ # conversation history, tool calls and returns, schema and streaming
+ # configuration, accumulated usage, and request ownership for
+ # interruption.
  #
- # A context can use the chat completions API that all providers
- # support or the responses API that currently only OpenAI supports.
+ # This is broader than prompt context alone. A context is the object
+ # that lets one-off prompts, streaming turns, tool execution,
+ # persistence, retries, and serialized long-lived workflows all run
+ # through the same model.
+ #
+ # A context can drive the chat completions API that all providers
+ # support or the Responses API on providers that expose it.
  #
  # @example
  # #!/usr/bin/env ruby
@@ -272,13 +277,13 @@ module LLM
  ##
  # @return [Hash]
  def to_h
- {model:, messages:}
+ {schema_version: 1, model:, messages:}
  end
 
  ##
  # @return [String]
  def to_json(...)
- {schema_version: 1}.merge!(to_h).to_json(...)
+ to_h.to_json(...)
  end
 
  ##
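
The `to_h`/`to_json` change above moves `schema_version` into `to_h` itself, so both serializers report the same keys. A small sketch of what that looks like at the call site; it assumes `context` is an existing `LLM::Context` instance, whose construction is outside this diff:

```ruby
require "json"

# Assumes `context` is an LLM::Context that already holds messages.
hash = context.to_h
hash[:schema_version]  # => 1 (now included by to_h, not just to_json)

# to_json simply serializes to_h, so a round trip carries the same keys.
JSON.parse(context.to_json).keys  # => ["schema_version", "model", "messages"]
```
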
@@ -5,6 +5,7 @@ module LLM::EventStream
  # @private
  class Parser
  COMPACT_THRESHOLD = 4096
+ Visitor = Struct.new(:target, :on_data, :on_event, :on_id, :on_retry, :on_chunk)
 
  ##
  # @return [LLM::EventStream::Parser]
@@ -20,7 +21,12 @@ module LLM::EventStream
  # @param [#on_data] visitor
  # @return [void]
  def register(visitor)
- @visitors << visitor
+ @visitors << Visitor.new(
+ visitor,
+ visitor.respond_to?(:on_data), visitor.respond_to?(:on_event),
+ visitor.respond_to?(:on_id), visitor.respond_to?(:on_retry),
+ visitor.respond_to?(:on_chunk)
+ )
  end
 
  ##
@@ -58,12 +64,16 @@
 
  private
 
- def parse!(chunk)
- field, value = Event.parse(chunk)
+ def parse_event!(chunk, field, value)
  dispatch_visitors(field, value, chunk)
  dispatch_callbacks(field, value, chunk)
  end
 
+ def parse!(chunk)
+ field, value = Event.parse(chunk)
+ parse_event!(chunk, field, value)
+ end
+
  def dispatch_visitors(field, value, chunk)
  @visitors.each { dispatch_visitor(_1, field, value, chunk) }
  end
@@ -76,11 +86,33 @@
  end
 
  def dispatch_visitor(visitor, field, value, chunk)
- method = "on_#{field}"
- if visitor.respond_to?(method)
- visitor.public_send(method, value, chunk)
- elsif visitor.respond_to?("on_chunk")
- visitor.on_chunk(nil, chunk)
+ target = visitor.target
+ if field == "data"
+ if visitor.on_data
+ target.on_data(value, chunk)
+ elsif visitor.on_chunk
+ target.on_chunk(nil, chunk)
+ end
+ elsif field == "event"
+ if visitor.on_event
+ target.on_event(value, chunk)
+ elsif visitor.on_chunk
+ target.on_chunk(nil, chunk)
+ end
+ elsif field == "id"
+ if visitor.on_id
+ target.on_id(value, chunk)
+ elsif visitor.on_chunk
+ target.on_chunk(nil, chunk)
+ end
+ elsif field == "retry"
+ if visitor.on_retry
+ target.on_retry(value, chunk)
+ elsif visitor.on_chunk
+ target.on_chunk(nil, chunk)
+ end
+ elsif visitor.on_chunk
+ target.on_chunk(nil, chunk)
  end
  end
 
@@ -16,6 +16,9 @@ class LLM::Anthropic
  def initialize(stream)
  @body = {"role" => "assistant", "content" => []}
  @stream = stream
+ @can_emit_content = stream.respond_to?(:on_content)
+ @can_emit_tool_call = stream.respond_to?(:on_tool_call)
+ @can_push_content = stream.respond_to?(:<<)
  end
 
  ##
@@ -88,15 +91,15 @@ class LLM::Anthropic
  end
 
  def emit_content(value)
- if @stream.respond_to?(:on_content)
+ if @can_emit_content
  @stream.on_content(value)
- elsif @stream.respond_to?(:<<)
+ elsif @can_push_content
  @stream << value
  end
  end
 
  def emit_tool(tool)
- return unless @stream.respond_to?(:on_tool_call)
+ return unless @can_emit_tool_call
  function, error = resolve_tool(tool)
  @stream.on_tool_call(function, error)
  end
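
The hunks above (and the parser hunks that follow) replace per-event `respond_to?` probes with flags captured once in `initialize` or `register`. A standalone sketch of that pattern, using a made-up class name rather than llm.rb internals:

```ruby
# Illustrative only: probe the stream once at construction, then branch
# on cheap booleans for every emitted chunk on the hot streaming path.
class SketchHandler
  def initialize(stream)
    @stream = stream
    @can_emit_content = stream.respond_to?(:on_content)
    @can_push_content = stream.respond_to?(:<<)
  end

  def emit_content(value)
    if @can_emit_content
      @stream.on_content(value)   # richer callback takes precedence
    elsif @can_push_content
      @stream << value            # plain append fallback (String, IO, ...)
    end
  end
end
```
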
@@ -17,6 +17,9 @@ class LLM::Google
  @body = {"candidates" => []}
  @stream = stream
  @emits = {tools: []}
+ @can_emit_content = stream.respond_to?(:on_content)
+ @can_emit_tool_call = stream.respond_to?(:on_tool_call)
+ @can_push_content = stream.respond_to?(:<<)
  end
 
  ##
@@ -126,15 +129,15 @@ class LLM::Google
  end
 
  def emit_content(value)
- if @stream.respond_to?(:on_content)
+ if @can_emit_content
  @stream.on_content(value)
- elsif @stream.respond_to?(:<<)
+ elsif @can_push_content
  @stream << value
  end
  end
 
  def emit_tool(pindex, cindex, part)
- return unless @stream.respond_to?(:on_tool_call)
+ return unless @can_emit_tool_call
  return unless complete_tool?(part)
  key = [cindex, pindex]
  return if @emits[:tools].include?(key)
@@ -14,6 +14,7 @@ class LLM::Ollama
  def initialize(stream)
  @body = {}
  @stream = stream
+ @can_push_content = stream.respond_to?(:<<)
  end
 
  ##
@@ -36,10 +37,10 @@ class LLM::Ollama
  if key == "message"
  if @body[key]
  @body[key]["content"] << value["content"]
- @stream << value["content"] if @stream.respond_to?(:<<)
+ @stream << value["content"] if @can_push_content
  else
  @body[key] = value
- @stream << value["content"] if @stream.respond_to?(:<<)
+ @stream << value["content"] if @can_push_content
  end
  else
  @body[key] = value
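
Reading across these parser hunks, the wrapped `stream` object is duck-typed: `on_content` and `on_tool_call` callbacks are preferred where they exist, `<<` is the append fallback, and the Ollama parser only probes `<<`. A hedged sketch of the two consumer shapes that contract implies; the class below is illustrative, not part of llm.rb, and how a stream object reaches these internal parsers is outside this diff:

```ruby
# Shape 1: anything appendable works as the fallback sink.
buffer = +""   # a mutable String; $stdout would work the same way

# Shape 2: an illustrative consumer implementing the callbacks the
# parsers probe for (method names taken from the hunks above).
class StreamConsumer
  def on_content(text)
    print text                               # streamed text deltas
  end

  def on_tool_call(function, error)
    warn "tool call error: #{error}" if error
  end
end
```
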