llm.rb 8.0.0 → 9.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +165 -2
  3. data/README.md +161 -509
  4. data/data/bedrock.json +2948 -0
  5. data/data/deepseek.json +8 -8
  6. data/data/openai.json +39 -2
  7. data/data/xai.json +35 -0
  8. data/data/zai.json +1 -1
  9. data/lib/llm/active_record/acts_as_llm.rb +7 -8
  10. data/lib/llm/agent.rb +36 -16
  11. data/lib/llm/context.rb +30 -26
  12. data/lib/llm/contract/completion.rb +45 -0
  13. data/lib/llm/cost.rb +81 -4
  14. data/lib/llm/error.rb +1 -1
  15. data/lib/llm/function/array.rb +8 -5
  16. data/lib/llm/function/call_group.rb +39 -0
  17. data/lib/llm/function/fork/task.rb +6 -0
  18. data/lib/llm/function/ractor/task.rb +6 -0
  19. data/lib/llm/function/task.rb +10 -0
  20. data/lib/llm/function.rb +1 -0
  21. data/lib/llm/mcp/transport/http.rb +26 -46
  22. data/lib/llm/mcp/transport/stdio.rb +0 -8
  23. data/lib/llm/mcp.rb +6 -23
  24. data/lib/llm/object.rb +8 -0
  25. data/lib/llm/provider.rb +29 -19
  26. data/lib/llm/providers/anthropic/error_handler.rb +6 -7
  27. data/lib/llm/providers/anthropic/files.rb +2 -2
  28. data/lib/llm/providers/anthropic/response_adapter/completion.rb +30 -0
  29. data/lib/llm/providers/anthropic.rb +1 -1
  30. data/lib/llm/providers/bedrock/error_handler.rb +79 -0
  31. data/lib/llm/providers/bedrock/models.rb +109 -0
  32. data/lib/llm/providers/bedrock/request_adapter/completion.rb +153 -0
  33. data/lib/llm/providers/bedrock/request_adapter.rb +95 -0
  34. data/lib/llm/providers/bedrock/response_adapter/completion.rb +173 -0
  35. data/lib/llm/providers/bedrock/response_adapter/models.rb +34 -0
  36. data/lib/llm/providers/bedrock/response_adapter.rb +40 -0
  37. data/lib/llm/providers/bedrock/signature.rb +166 -0
  38. data/lib/llm/providers/bedrock/stream_decoder.rb +140 -0
  39. data/lib/llm/providers/bedrock/stream_parser.rb +201 -0
  40. data/lib/llm/providers/bedrock.rb +272 -0
  41. data/lib/llm/providers/google/error_handler.rb +6 -7
  42. data/lib/llm/providers/google/files.rb +2 -4
  43. data/lib/llm/providers/google/images.rb +1 -1
  44. data/lib/llm/providers/google/models.rb +0 -2
  45. data/lib/llm/providers/google/response_adapter/completion.rb +30 -0
  46. data/lib/llm/providers/google.rb +1 -1
  47. data/lib/llm/providers/ollama/error_handler.rb +6 -7
  48. data/lib/llm/providers/ollama/models.rb +0 -2
  49. data/lib/llm/providers/ollama/response_adapter/completion.rb +30 -0
  50. data/lib/llm/providers/ollama.rb +1 -1
  51. data/lib/llm/providers/openai/audio.rb +3 -3
  52. data/lib/llm/providers/openai/error_handler.rb +6 -7
  53. data/lib/llm/providers/openai/files.rb +2 -2
  54. data/lib/llm/providers/openai/images.rb +3 -3
  55. data/lib/llm/providers/openai/models.rb +1 -1
  56. data/lib/llm/providers/openai/response_adapter/completion.rb +42 -0
  57. data/lib/llm/providers/openai/response_adapter/responds.rb +39 -0
  58. data/lib/llm/providers/openai/responses.rb +2 -2
  59. data/lib/llm/providers/openai/vector_stores.rb +1 -1
  60. data/lib/llm/providers/openai.rb +1 -1
  61. data/lib/llm/response.rb +10 -8
  62. data/lib/llm/sequel/plugin.rb +7 -8
  63. data/lib/llm/stream/queue.rb +15 -42
  64. data/lib/llm/stream.rb +4 -4
  65. data/lib/llm/transport/execution.rb +67 -0
  66. data/lib/llm/transport/http.rb +134 -0
  67. data/lib/llm/transport/persistent_http.rb +152 -0
  68. data/lib/llm/transport/response/http.rb +113 -0
  69. data/lib/llm/transport/response.rb +112 -0
  70. data/lib/llm/{provider/transport/http → transport}/stream_decoder.rb +8 -4
  71. data/lib/llm/transport.rb +139 -0
  72. data/lib/llm/usage.rb +14 -5
  73. data/lib/llm/version.rb +1 -1
  74. data/lib/llm.rb +10 -12
  75. data/llm.gemspec +2 -16
  76. metadata +23 -19
  77. data/lib/llm/provider/transport/http/execution.rb +0 -115
  78. data/lib/llm/provider/transport/http/interruptible.rb +0 -114
  79. data/lib/llm/provider/transport/http.rb +0 -145
  80. data/lib/llm/utils.rb +0 -19
data/README.md CHANGED
@@ -4,514 +4,240 @@
4
4
  <p align="center">
5
5
  <a href="https://0x1eef.github.io/x/llm.rb?rebuild=1"><img src="https://img.shields.io/badge/docs-0x1eef.github.io-blue.svg" alt="RubyDoc"></a>
6
6
  <a href="https://opensource.org/license/0bsd"><img src="https://img.shields.io/badge/License-0BSD-orange.svg?" alt="License"></a>
7
- <a href="https://github.com/llmrb/llm.rb/tags"><img src="https://img.shields.io/badge/version-8.0.0-green.svg?" alt="Version"></a>
7
+ <a href="https://github.com/llmrb/llm.rb/tags"><img src="https://img.shields.io/badge/version-9.0.0-green.svg?" alt="Version"></a>
8
8
  </p>
9
9
 
10
10
  ## About
11
11
 
12
- llm.rb is the most capable runtime for building AI systems in Ruby.
13
- <br>
14
-
15
- llm.rb is designed for Ruby, and although it works great in Rails, it is not tightly
16
- coupled to it. It runs on the standard library by default (zero dependencies),
17
- loads optional pieces only when needed, includes built-in ActiveRecord support through
18
- `acts_as_llm` and `acts_as_agent`, includes built-in Sequel support through
19
- `plugin :llm` and `plugin :agent`, and is designed for engineers who want control over
20
- long-lived, tool-capable, stateful AI workflows instead of just
21
- request/response helpers.
22
-
23
- It provides one runtime for providers, agents, tools, skills, MCP servers, streaming,
24
- schemas, files, and persisted state, so real systems can be built out of one coherent
25
- execution model instead of a pile of adapters.
26
-
27
- It provides concurrent tool execution with multiple strategies exposed through a single
28
- runtime: async-task, threads, fibers, ractors and processes (fork). The first three are
29
- good for IO-bound work and the last two are good for CPU-bound work. Ractor support is
30
- experimental and comes with limitations.
31
-
32
- Want to see some code? Jump to [the examples](#examples) section. <br>
33
- Want to see a self-hosted LLM environment built on llm.rb? Check out [Relay](https://github.com/llmrb/relay).
34
-
35
- ## Architecture
12
+ llm.rb is Ruby's most capable AI runtime.
36
13
 
37
- <p align="center">
38
- <img src="https://github.com/llmrb/llm.rb/raw/main/resources/architecture.png" alt="llm.rb architecture" width="790">
39
- </p>
40
-
41
- ## Core Concept
14
+ It runs on Ruby's standard library by default, loads optional pieces
15
+ only when needed, and offers a single runtime for providers, agents,
16
+ tools, skills, MCP, streaming, files, and persisted state. As a bonus,
17
+ llm.rb is also [available for mruby](https://github.com/llmrb/mruby-llm).
42
18
 
43
- [`LLM::Context`](https://0x1eef.github.io/x/llm.rb/LLM/Context.html)
44
- is the execution boundary in llm.rb.
19
+ It supports OpenAI, OpenAI-compatible endpoints, Anthropic, Google
20
+ Gemini, DeepSeek, xAI, Z.ai, AWS Bedrock, Ollama, and llama.cpp. It
21
+ also includes built-in ActiveRecord and Sequel support, plus concurrent
22
+ tool execution through threads, tasks (via async gem), fibers, ractors,
23
+ and fork (via xchan.rb gem).
45
24
 
46
- It holds:
47
- - message history
48
- - tool state
49
- - schemas
50
- - streaming configuration
51
- - usage and cost tracking
25
+ ## Quick start
52
26
 
53
- Instead of switching abstractions for each feature, everything builds on the
54
- same context object.
27
+ #### LLM::Context
55
28
 
56
- ## Standout features
57
-
58
- The following list is **not exhaustive**, but it covers a lot of ground.
59
-
60
- #### Skills
29
+ The
30
+ [LLM::Context](https://0x1eef.github.io/x/llm.rb/LLM/Context.html)
31
+ object is at the heart of the runtime. Almost all other features build
32
+ on top of it. It is a low-level interface to a model, and requires tool
33
+ execution to be managed manually. The
34
+ [LLM::Agent](https://0x1eef.github.io/x/llm.rb/LLM/Agent.html)
35
+ class is almost the same as
36
+ [LLM::Context](https://0x1eef.github.io/x/llm.rb/LLM/Context.html)
37
+ but it manages tool execution for you - we'll cover agents next:
61
38
 
62
- Skills are reusable, directory-backed capabilities loaded from `SKILL.md`.
63
- They run through the same runtime as tools, agents, and MCP. They do not
64
- require a second orchestration layer or a parallel abstraction. If you've
65
- used Claude or Codex, you know the general idea of skills, and llm.rb
66
- supports that same concept with the same execution model as the rest of the
67
- system.
39
+ ```ruby
40
+ require "llm"
68
41
 
69
- In llm.rb, a skill has frontmatter and instructions. The frontmatter can
70
- define `name`, `description`, and `tools`. The `tools` entries are tool names,
71
- and each name must resolve to a subclass of
72
- [`LLM::Tool`](https://0x1eef.github.io/x/llm.rb/LLM/Tool.html) that is already
73
- loaded in the runtime.
42
+ llm = LLM.openai(key: ENV["KEY"])
43
+ ctx = LLM::Context.new(llm, stream: $stdout)
44
+ ctx.talk "Hello world"
45
+ ```
74
46
 
75
- If you want Claude/Codex-like skills that can drive scripts or shell
76
- commands, you would typically pair the skill with a tool that can execute
77
- system commands.
47
+ #### LLM::Agent
78
48
 
79
- ```yaml
80
- ---
81
- name: release
82
- description: Prepare a release
83
- tools:
84
- - search_docs
85
- - git
86
- ---
87
- Review the release state, summarize what changed, and prepare the release.
88
- ```
49
+ The
50
+ [LLM::Agent](https://0x1eef.github.io/x/llm.rb/LLM/Agent.html)
51
+ object is implemented on top of
52
+ [LLM::Context](https://0x1eef.github.io/x/llm.rb/LLM/Context.html).
53
+ It provides the same interface, but manages tool execution for you. It
54
+ also has built-in features such as a loop guard that detects repeated
55
+ tool call patterns, and another guard that detects infinite tool call
56
+ loops. Both guards advise the model to change course rather than raise
57
+ an error:
89
58
 
90
59
  ```ruby
91
- class Agent < LLM::Agent
92
- model "gpt-5.4-mini"
93
- skills "./skills/release"
94
- tracer { LLM::Tracer::Logger.new(llm, path: "logs/release-agent.log") }
95
- end
60
+ require "llm"
96
61
 
97
62
  llm = LLM.openai(key: ENV["KEY"])
98
- Agent.new(llm, stream: $stdout).talk("Let's prepare the release!")
63
+ agent = LLM::Agent.new(llm, stream: $stdout)
64
+ agent.talk "Hello world"
99
65
  ```
100
66
 
101
- #### ORM
102
-
103
- Any ActiveRecord model or Sequel model can become an agent-capable model,
104
- including existing business and domain models, without forcing you into a
105
- separate agent table or a second persistence layer.
106
-
107
- `acts_as_agent` extends a model with agent capabilities: the same runtime
108
- surface as [`LLM::Agent`](https://0x1eef.github.io/x/llm.rb/LLM/Agent.html),
109
- because it actually wraps an `LLM::Agent`, plus persistence through one text,
110
- JSON, or JSONB-backed `data` column on the same table. If your app also has
111
- provider or model columns, provide them to llm.rb through `set_provider` and
112
- `set_context`.
67
+ #### Tools
113
68
 
69
+ The
70
+ [LLM::Tool](https://0x1eef.github.io/x/llm.rb/LLM/Tool.html)
71
+ class can be subclassed to implement your own tools that can extend the
72
+ abilities of a model:
114
73
 
115
74
  ```ruby
116
- class Ticket < ApplicationRecord
117
- acts_as_agent provider: :set_provider, context: :set_context
118
- model "gpt-5.4-mini"
119
- instructions "You are a support assistant."
120
-
121
- private
122
-
123
- def set_provider
124
- LLM.openai(key: ENV["OPENAI_SECRET"])
125
- end
126
-
127
- def set_context
128
- { mode: :responses, store: false }
75
+ class ReadFile < LLM::Tool
76
+ name "read-file"
77
+ description "Read a file"
78
+ parameter :path, String, "The filename or path"
79
+ required %i[path]
80
+
81
+ def call(path:)
82
+ {contents: File.read(path)}
129
83
  end
130
84
  end
131
85
  ```
132
86
 
133
- #### Agentic Patterns
134
-
135
- llm.rb is especially strong when you want to build agentic systems in a Ruby
136
- way. Agents can be ordinary application models with state, associations,
137
- tools, skills, and persistence, which makes it much easier to build systems
138
- where users have their own specialized agents instead of treating agents as
139
- something outside the app.
140
-
141
- That pattern works so well in llm.rb because
142
- [`LLM::Agent`](https://0x1eef.github.io/x/llm.rb/LLM/Agent.html),
143
- `acts_as_agent`, `plugin :agent`, skills, tools, and persisted runtime state
144
- all fit the same execution model. The runtime stays small enough that the
145
- main design work becomes application design, not orchestration glue.
146
-
147
- For a concrete example, see
148
- [How to build a platform of agents](https://0x1eef.github.io/posts/how-to-build-a-platform-of-agents).
149
-
150
- #### Persistence
87
+ #### MCP
151
88
 
152
- The same runtime can be serialized to disk, restored later, persisted in JSON
153
- or JSONB-backed ORM columns, resumed across process boundaries, or shared
154
- across long-lived workflows.
89
+ The
90
+ [LLM::MCP](https://0x1eef.github.io/x/llm.rb/LLM/MCP.html)
91
+ object lets llm.rb use tools provided by an MCP server. Those tools are
92
+ exposed through the same runtime as local tools, so you can pass them
93
+ to either
94
+ [LLM::Context](https://0x1eef.github.io/x/llm.rb/LLM/Context.html)
95
+ or
96
+ [LLM::Agent](https://0x1eef.github.io/x/llm.rb/LLM/Agent.html).
97
+ In this example, the MCP server runs over stdio and
98
+ [LLM::Context](https://0x1eef.github.io/x/llm.rb/LLM/Context.html)
99
+ uses the same tool loop as local tools:
155
100
 
156
101
  ```ruby
157
- ctx = LLM::Context.new(llm)
158
- ctx.talk("Remember that my favorite language is Ruby.")
159
- ctx.save(path: "context.json")
160
- ```
161
-
162
- #### Context Compaction
102
+ require "llm"
163
103
 
164
- Long-lived contexts can compact older history into a summary instead of
165
- growing forever. Compaction is built into [`LLM::Context`](https://0x1eef.github.io/x/llm.rb/LLM/Context.html)
166
- through [`LLM::Compactor`](https://0x1eef.github.io/x/llm.rb/LLM/Compactor.html),
167
- and when a stream is present it emits `on_compaction` and
168
- `on_compaction_finish` through [`LLM::Stream`](https://0x1eef.github.io/x/llm.rb/LLM/Stream.html).
169
- The compactor can also use a different model from the main context, which is
170
- useful when you want summarization to run on a cheaper or faster model.
171
- `token_threshold:` accepts either a fixed token count or a percentage string
172
- like `"90%"`, which resolves against the active model context window and
173
- triggers compaction once total token usage goes over that percentage.
104
+ llm = LLM.openai(key: ENV["KEY"])
105
+ mcp = LLM::MCP.stdio(argv: ["ruby", "server.rb"])
174
106
 
175
- ```ruby
176
- ctx = LLM::Context.new(
177
- llm,
178
- compactor: {
179
- token_threshold: "90%",
180
- retention_window: 8,
181
- model: "gpt-5.4-mini"
182
- }
183
- )
107
+ mcp.run do
108
+ ctx = LLM::Context.new(llm, stream: $stdout, tools: mcp.tools)
109
+ ctx.talk "Use the available tools to inspect the environment."
110
+ ctx.talk(ctx.wait(:call)) while ctx.functions?
111
+ end
184
112
  ```
185
113
 
186
- #### Guards
114
+ #### Skills
115
+
116
+ Skills are reusable instructions loaded from a directory that contains a `SKILL.md` file. They let
117
+ you package behavior and tool access together, and they plug into the same
118
+ runtime as tools, agents, and MCP. When a skill runs, llm.rb spawns a
119
+ subagent with the skill instructions, access to only the tools listed in the
120
+ skill, and recent conversation context:
187
121
 
188
- Guards let llm.rb supervise agentic execution, not just run it.
189
- They live on [`LLM::Context`](https://0x1eef.github.io/x/llm.rb/LLM/Context.html),
190
- can inspect the current runtime state, and can step in when a context is no
191
- longer making progress.
122
+ ```yaml
123
+ ---
124
+ name: release
125
+ description: Prepare a release
126
+ tools: ["search-docs", "git"]
127
+ ---
192
128
 
193
- [`LLM::LoopGuard`](https://0x1eef.github.io/x/llm.rb/LLM/LoopGuard.html) is
194
- the built-in implementation. It detects repeated tool-call patterns and
195
- blocks pending tool execution with in-band guarded tool errors instead of
196
- letting the loop keep spinning. [`LLM::Agent`](https://0x1eef.github.io/x/llm.rb/LLM/Agent.html)
197
- enables that guard by default through its wrapped context.
129
+ ## Task
198
130
 
199
- ```ruby
200
- ctx = LLM::Context.new(llm)
201
- ctx.guard = MyGuard.new
131
+ Review the release state, summarize what changed, and prepare the release.
202
132
  ```
203
133
 
204
- #### Transformers
205
-
206
- Transformers let llm.rb rewrite outgoing prompts and params before a request
207
- is sent to the provider. They also live on
208
- [`LLM::Context`](https://0x1eef.github.io/x/llm.rb/LLM/Context.html), but
209
- they solve a different problem from guards: instead of blocking execution,
210
- they can normalize or scrub what gets sent. When a stream is present, that
211
- lifecycle is also exposed through
212
- [`LLM::Stream`](https://0x1eef.github.io/x/llm.rb/LLM/Stream.html) with
213
- `on_transform` and `on_transform_finish`.
214
-
215
- That makes them a good fit for things like PII scrubbing, prompt
216
- normalization, or request-level param injection. A transformer just needs to
217
- implement `call(ctx, prompt, params)` and return `[prompt, params]`. That
218
- means a transformer can scrub plain text prompts, but it can also scrub
219
- [`LLM::Function::Return`](https://0x1eef.github.io/x/llm.rb/LLM/Function/Return.html)
220
- values. In other words, you can intercept a tool call's return value and
221
- modify it before sending it back to the LLM.
222
-
223
- That is also a useful UI hook. A stream can surface messages like
224
- `Anonymizing your data...` before a scrubber runs and `Data anonymized.`
225
- after it finishes.
226
-
227
134
  ```ruby
228
- class ScrubPII
229
- EMAIL = /\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b/i
135
+ require "llm"
230
136
 
231
- def call(ctx, prompt, params)
232
- [scrub(prompt), params]
233
- end
137
+ class ReleaseAgent < LLM::Agent
138
+ model "gpt-5.4-mini"
139
+ skills "./skills/release"
140
+ end
234
141
 
235
- private
142
+ llm = LLM.openai(key: ENV["KEY"])
143
+ ReleaseAgent.new(llm, stream: $stdout).talk("Prepare the next release.")
144
+ ```
236
145
 
237
- def scrub(prompt)
238
- case prompt
239
- when String then prompt.gsub(EMAIL, "[REDACTED_EMAIL]")
240
- when Array then prompt.map { scrub(_1) }
241
- when LLM::Function::Return then on_tool_return(prompt)
242
- else prompt
243
- end
244
- end
146
+ #### LLM::Stream
245
147
 
246
- def on_tool_return(result)
247
- value = case result.name
248
- when "lookup-customer" then scrub_value(result.value)
249
- else result.value
250
- end
251
- LLM::Function::Return.new(result.id, result.name, value)
252
- end
148
+ The
149
+ [LLM::Stream](https://0x1eef.github.io/x/llm.rb/LLM/Stream.html)
150
+ object lets you observe output and runtime events as they happen. You
151
+ can subclass it to handle streamed content in your own application:
253
152
 
254
- def scrub_value(value)
255
- case value
256
- when String then value.gsub(EMAIL, "[REDACTED_EMAIL]")
257
- when Array then value.map { scrub_value(_1) }
258
- when Hash then value.transform_values { scrub_value(_1) }
259
- else value
260
- end
153
+ ```ruby
154
+ require "llm"
155
+
156
+ class Stream < LLM::Stream
157
+ def on_content(content)
158
+ $stdout << content
261
159
  end
262
160
  end
263
161
 
264
- ctx = LLM::Context.new(llm)
265
- ctx.transformer = ScrubPII.new
162
+ llm = LLM.openai(key: ENV["KEY"])
163
+ ctx = LLM::Context.new(llm, stream: Stream.new)
164
+ ctx.talk "Write a haiku about Ruby."
266
165
  ```
267
166
 
268
- When a stream is present, that transformer lifecycle is also exposed through
269
- `on_transform` and `on_transform_finish` on
270
- [`LLM::Stream`](https://0x1eef.github.io/x/llm.rb/LLM/Stream.html).
271
-
272
- #### LLM::Stream
167
+ #### LLM::Stream (advanced)
273
168
 
274
- `LLM::Stream` is not just for printing tokens. It supports `on_content`,
275
- `on_reasoning_content`, `on_tool_call`, `on_tool_return`, `on_transform`,
276
- `on_transform_finish`, `on_compaction`, and `on_compaction_finish`, which
277
- means visible output, reasoning output, request rewriting, tool execution,
278
- and context compaction can all be driven through the same execution path.
169
+ The
170
+ [LLM::Stream](https://0x1eef.github.io/x/llm.rb/LLM/Stream.html)
171
+ object can also resolve tool calls while output is still streaming. In
172
+ `on_tool_call`, you can spawn the tool, push the work onto the stream
173
+ queue, and later drain it with `wait`:
279
174
 
280
175
  ```ruby
176
+ require "llm"
177
+
281
178
  class Stream < LLM::Stream
282
- def on_tool_call(tool, error)
283
- queue << (error || ctx.spawn(tool, :thread))
179
+ def on_content(content)
180
+ $stdout << content
284
181
  end
285
182
 
286
- def on_tool_return(tool, result)
287
- puts(result.value)
183
+ def on_tool_call(tool, error)
184
+ return queue << error if error
185
+ queue << ctx.spawn(tool, :thread)
288
186
  end
289
187
  end
188
+
189
+ llm = LLM.openai(key: ENV["KEY"])
190
+ ctx = LLM::Context.new(llm, stream: Stream.new, tools: [ReadFile])
191
+ ctx.talk "Read README.md and summarize the quick start."
192
+ ctx.talk(ctx.wait) while ctx.functions?
290
193
  ```
291
194
 
292
195
  #### Concurrency
293
196
 
294
- Tool execution can run sequentially with `:call` or concurrently through
295
- `:thread`, `:task`, `:fiber`, `:fork`, and experimental `:ractor`, without
296
- rewriting your tool layer. Async tasks, threads, and fibers are the
297
- I/O-bound options. Fork and ractor are the CPU-bound options. `:fork`
298
- requires [`xchan.rb`](https://github.com/0x1eef/xchan.rb#readme) support,
299
- and `:ractor` is still experimental.
300
-
301
- `:fiber` uses `Fiber.schedule`, so it requires `Fiber.scheduler`.
197
+ llm.rb can run tool work concurrently. This is useful when a model calls
198
+ multiple tools and you want to resolve them in parallel instead of one
199
+ at a time. On
200
+ [LLM::Agent](https://0x1eef.github.io/x/llm.rb/LLM/Agent.html),
201
+ you can enable this with `concurrency`. Common options are `:call` for
202
+ sequential execution, `:thread` or `:task` for concurrent IO-bound work, and
203
+ `:ractor` or `:fork` for more isolated CPU-bound work:
302
204
 
303
205
  ```ruby
206
+ require "llm"
207
+
304
208
  class Agent < LLM::Agent
305
209
  model "gpt-5.4-mini"
306
- tools FetchWeather, FetchNews, FetchStock
210
+ tools ReadFile
307
211
  concurrency :thread
308
212
  end
309
- ```
310
213
 
311
- #### MCP
312
-
313
- Remote MCP tools and prompts are not bolted on as a separate integration
314
- stack. They adapt into the same tool and prompt path used by local tools,
315
- skills, contexts, and agents.
316
-
317
- Use `mcp.run do ... end` for scoped work where the client should start and
318
- stop around one block. Use `mcp.start` and `mcp.stop` directly when you need
319
- finer sequential control across several steps before shutting the client down.
320
-
321
- ```ruby
322
- mcp = LLM::MCP.http(
323
- url: "https://api.githubcopilot.com/mcp/",
324
- headers: {"Authorization" => "Bearer #{ENV["GITHUB_PAT"]}"},
325
- persistent: true
326
- )
327
- mcp.run do
328
- ctx = LLM::Context.new(llm, tools: mcp.tools)
329
- end
214
+ llm = LLM.openai(key: ENV["KEY"])
215
+ agent = Agent.new(llm, stream: $stdout)
216
+ agent.talk "Read README.md and CHANGELOG.md and compare them."
330
217
  ```
331
218
 
332
- #### Cancellation
219
+ #### Serialization
333
220
 
334
- Cancellation is one of the harder problems to get right, and while llm.rb
335
- makes it possible, it still requires careful engineering to use effectively.
336
- The point though is that it is possible to stop in-flight provider work cleanly
337
- through the same runtime, and the model used by llm.rb is directly inspired by
338
- Go's context package. In fact, llm.rb is heavily inspired by Go but with a Ruby
339
- twist.
221
+ The [`LLM::Context`](https://0x1eef.github.io/x/llm.rb/LLM/Context.html)
222
+ object can be serialized to JSON, which makes it suitable for storing
223
+ in a file, a database column, or a Redis queue. The built-in
224
+ ActiveRecord and Sequel plugins are built on top of this feature:
340
225
 
341
226
  ```ruby
342
227
  require "llm"
343
- require "io/console"
344
228
 
345
229
  llm = LLM.openai(key: ENV["KEY"])
346
- ctx = LLM::Context.new(llm, stream: $stdout)
347
- worker = Thread.new do
348
- ctx.talk("Write a very long essay about network protocols.")
349
- rescue LLM::Interrupt
350
- puts "Request was interrupted!"
351
- end
352
230
 
353
- STDIN.getch
354
- ctx.interrupt!
355
- worker.join
356
- ```
231
+ # Serialize a context
232
+ ctx1 = LLM::Context.new(llm)
233
+ ctx1.talk "Remember that my favorite language is Ruby"
234
+ string = ctx1.to_json
357
235
 
358
- ## Differentiators
359
-
360
- ### Execution Model
361
-
362
- - **A system layer, not just an API wrapper** <br>
363
- Put providers, tools, MCP servers, and application APIs behind one runtime
364
- model instead of stitching them together by hand.
365
- - **Contexts are central** <br>
366
- Keep history, tools, schema, usage, persistence, and execution state in one
367
- place instead of spreading them across your app.
368
- - **Contexts can be serialized** <br>
369
- Save and restore live state for jobs, databases, retries, or long-running
370
- workflows.
371
-
372
- ### Runtime Behavior
373
-
374
- - **Streaming and tool execution work together** <br>
375
- Start tool work while output is still streaming so you can hide latency
376
- instead of waiting for turns to finish.
377
- - **Agents auto-manage tool execution** <br>
378
- Use `LLM::Agent` when you want the same stateful runtime surface as
379
- `LLM::Context`, but with tool loops executed automatically according to a
380
- configured concurrency mode such as `:call`, `:thread`, `:task`, `:fiber`,
381
- `:fork`, or experimental `:ractor` support for class-based tools. MCP tools
382
- are not supported by the current `:ractor` mode, but mixed tool sets can
383
- still route MCP tools and local tools through different strategies at
384
- runtime. By default, the tool attempt budget is `25`. When an agent
385
- exhausts that budget, it sends advisory tool errors back through the model
386
- instead of raising out of the runtime. Set `tool_attempts: nil` to disable
387
- that advisory behavior.
388
- - **Tool calls have an explicit lifecycle** <br>
389
- A tool call can be executed, cancelled through
390
- [`LLM::Function#cancel`](https://0x1eef.github.io/x/llm.rb/LLM/Function.html#cancel-instance_method),
391
- or left unresolved for manual handling, but the normal runtime contract is
392
- still that a model-issued tool request is answered with a tool return.
393
- - **Requests can be interrupted cleanly** <br>
394
- Stop in-flight provider work through the same runtime instead of treating
395
- cancellation as a separate concern.
396
- [`LLM::Context#cancel!`](https://0x1eef.github.io/x/llm.rb/LLM/Context.html#cancel-21-instance_method)
397
- is inspired by Go's context cancellation model.
398
- - **Concurrency is a first-class feature** <br>
399
- Use async tasks, threads, fibers, forks, or experimental ractors without
400
- rewriting your tool layer. Async tasks, threads, and fibers are the
401
- I/O-bound options. Fork and ractor are the CPU-bound options. `:fork`
402
- requires [`xchan.rb`](https://github.com/0x1eef/xchan.rb#readme) support.
403
- The current `:ractor` mode is for class-based tools, and MCP tools are
404
- not supported by ractor, but mixed workloads can branch on `tool.mcp?`
405
- and choose a supported strategy per tool. Class-based `:ractor` tools
406
- still emit normal tool tracer callbacks. `:fiber` uses `Fiber.schedule`,
407
- so it requires `Fiber.scheduler`.
408
- - **Advanced workloads are built in, not bolted on** <br>
409
- Streaming, concurrent tool execution, persistence, tracing, and MCP support
410
- all fit the same runtime model.
411
-
412
- ### Integration
413
-
414
- - **MCP is built in** <br>
415
- Connect to MCP servers over stdio or HTTP without bolting on a separate
416
- integration stack.
417
- - **ActiveRecord and Sequel persistence are built in** <br>
418
- llm.rb includes built-in ActiveRecord support through `acts_as_llm` and
419
- `acts_as_agent`, plus built-in Sequel support through `plugin :llm` and
420
- `plugin :agent`.
421
- Use `acts_as_llm` when you want to wrap `LLM::Context`, `acts_as_agent`
422
- when you want to wrap `LLM::Agent`, `plugin :llm` when you want a
423
- `LLM::Context` on a Sequel model, or `plugin :agent` when you want an
424
- `LLM::Agent`. These integrations support `provider:` and `context:` hooks,
425
- plus `format: :string` for text columns or `format: :jsonb` for native
426
- PostgreSQL JSON storage when ORM JSON typecasting support is enabled.
427
- - **ORM models can become persistent agents** <br>
428
- Turn an ActiveRecord or Sequel model into an agent-capable model with
429
- built-in persistence, stored on the same table, with `jsonb` support when
430
- your ORM and database support native JSON columns.
431
- - **Persistent HTTP pooling is shared process-wide** <br>
432
- When enabled, separate
433
- [`LLM::Provider`](https://0x1eef.github.io/x/llm.rb/LLM/Provider.html)
434
- instances with the same endpoint settings can share one persistent
435
- pool, and separate HTTP
436
- [`LLM::MCP`](https://0x1eef.github.io/x/llm.rb/LLM/MCP.html)
437
- instances can do the same, instead of each object creating its own
438
- isolated per-instance transport.
439
- - **OpenAI-compatible gateways are supported** <br>
440
- Target OpenAI-compatible services such as DeepInfra and OpenRouter, as well
441
- as proxies and self-hosted servers, with `host:` and `base_path:` when they
442
- preserve OpenAI request shapes but change the API root path.
443
- - **Provider support is broad** <br>
444
- Work with OpenAI, OpenAI-compatible endpoints, Anthropic, Google, DeepSeek,
445
- Z.ai, xAI, llama.cpp, and Ollama through the same runtime.
446
- - **Tools are explicit** <br>
447
- Run local tools, provider-native tools, and MCP tools through the same path
448
- with fewer special cases.
449
- - **Skills become bounded runtime capabilities** <br>
450
- Point llm.rb at directories with a `SKILL.md`, resolve named tools through
451
- the registry, and adapt each skill into its own callable capability through
452
- the normal runtime. Unlike a generic skill-discovery tool, each skill runs
453
- with its own bounded tool subset and behaves like a task-scoped sub-agent.
454
- - **Providers are normalized, not flattened** <br>
455
- Share one API surface across providers without losing access to provider-
456
- specific capabilities where they matter.
457
- - **Responses keep a uniform shape** <br>
458
- Provider calls return
459
- [`LLM::Response`](https://0x1eef.github.io/x/llm.rb/LLM/Response.html)
460
- objects as a common base shape, then extend them with endpoint- or
461
- provider-specific behavior when needed.
462
- - **Low-level access is still there** <br>
463
- Normalized responses still keep the raw `Net::HTTPResponse` available when
464
- you need headers, status, or other HTTP details.
465
- - **Local model metadata is included** <br>
466
- Model capabilities, pricing, and limits are available locally without extra
467
- API calls.
468
-
469
- ### Design Philosophy
470
-
471
- - **Runs on the stdlib** <br>
472
- Start with Ruby's standard library and add extra dependencies only when you
473
- need them.
474
- - **It is highly pluggable** <br>
475
- Add tools, swap providers, change JSON backends, plug in tracing, or layer
476
- internal APIs and MCP servers into the same execution path.
477
- - **It scales from scripts to long-lived systems** <br>
478
- The same primitives work for one-off scripts, background jobs, and more
479
- demanding application workloads with streaming, persistence, and tracing.
480
- - **Thread boundaries are clear** <br>
481
- Providers are shareable. Contexts are stateful and should stay thread-local.
482
-
483
- ## Capabilities
484
-
485
- Execution:
486
- - **Chat & Contexts** — stateless and stateful interactions with persistence
487
- - **Context Serialization** — save and restore state across processes or time
488
- - **Streaming** — visible output, reasoning output, tool-call events
489
- - **Request Interruption** — stop in-flight provider work cleanly
490
- - **Concurrent Execution** — threads, async tasks, and fibers
491
-
492
- Runtime Building Blocks:
493
- - **Tool Calling** — class-based tools and closure-based functions
494
- - **Run Tools While Streaming** — overlap model output with tool latency
495
- - **Agents** — reusable assistants with tool auto-execution
496
- - **Skills** — directory-backed capabilities loaded from `SKILL.md`
497
- - **MCP Support** — stdio and HTTP MCP clients with prompt and tool support
498
- - **Context Compaction** — summarize older history in long-lived contexts
499
-
500
- Data and Structure:
501
- - **Structured Outputs** — JSON Schema-based responses
502
- - **Responses API** — stateful response workflows where providers support them
503
- - **Multimodal Inputs** — text, images, audio, documents, URLs
504
- - **Audio** — speech generation, transcription, translation
505
- - **Images** — generation and editing
506
- - **Files API** — upload and reference files in prompts
507
- - **Embeddings** — vector generation for search and RAG
508
- - **Vector Stores** — retrieval workflows
509
-
510
- Operations:
511
- - **Cost Tracking** — local cost estimation without extra API calls
512
- - **Observability** — tracing, logging, telemetry
513
- - **Model Registry** — local metadata for capabilities, limits, pricing
514
- - **Persistent HTTP** — optional connection pooling for providers and MCP
236
+ # Restore a context (from JSON)
237
+ ctx2 = LLM::Context.new(llm, stream: $stdout)
238
+ ctx2.restore(string:)
239
+ ctx2.talk "What is my favorite language?"
240
+ ```
515
241
 
516
242
  ## Installation
517
243
 
@@ -561,80 +287,6 @@ ctx = LLM::Context.new(llm)
561
287
  ctx.talk ["Summarize this document.", ctx.local_file("README.md")]
562
288
  ```
563
289
 
564
- #### Agent
565
-
566
- This example uses [`LLM::Agent`](https://0x1eef.github.io/x/llm.rb/LLM/Agent.html) directly and lets the agent manage tool execution. <br> See the [deepdive (web)](https://0x1eef.github.io/x/llm.rb/file.deepdive.html) or [deepdive (markdown)](resources/deepdive.md) for more examples.
567
-
568
- ```ruby
569
- require "llm"
570
-
571
- class ShellAgent < LLM::Agent
572
- model "gpt-5.4-mini"
573
- instructions "You are a Linux system assistant."
574
- tools Shell
575
- concurrency :thread
576
- end
577
-
578
- llm = LLM.openai(key: ENV["KEY"])
579
- agent = ShellAgent.new(llm)
580
- puts agent.talk("What time is it on this system?").content
581
- ```
582
-
583
- #### Skills
584
-
585
- This example uses [`LLM::Agent`](https://0x1eef.github.io/x/llm.rb/LLM/Agent.html) with directory-backed skills so `SKILL.md` capabilities run through the normal tool path. In llm.rb, a skill is exposed as a tool in the runtime. When that tool is called, it spawns a sub-agent with relevant context plus the instructions and tool subset declared in its own `SKILL.md`. <br> See the [deepdive (web)](https://0x1eef.github.io/x/llm.rb/file.deepdive.html) or [deepdive (markdown)](resources/deepdive.md) for more examples.
586
-
587
- Each skill runs only with the tools declared in its own frontmatter.
588
-
589
- ```ruby
590
- require "llm"
591
-
592
- class Agent < LLM::Agent
593
- model "gpt-5.4-mini"
594
- instructions "You are a concise release assistant."
595
- skills "./skills/release", "./skills/review"
596
- tracer { LLM::Tracer::Logger.new(llm, path: "logs/release-agent.log") }
597
- end
598
-
599
- llm = LLM.openai(key: ENV["KEY"])
600
- puts Agent.new(llm).talk("Use the review skill.").content
601
- ```
602
-
603
- #### Streaming
604
-
605
- This example uses [`LLM::Stream`](https://0x1eef.github.io/x/llm.rb/LLM/Stream.html) directly so visible output and tool execution can happen together. <br> See the [deepdive (web)](https://0x1eef.github.io/x/llm.rb/file.deepdive.html) or [deepdive (markdown)](resources/deepdive.md) for more examples.
606
-
607
- ```ruby
608
- require "llm"
609
-
610
- class Stream < LLM::Stream
611
- def on_content(content)
612
- $stdout << content
613
- end
614
-
615
- def on_tool_call(tool, error)
616
- return queue << error if error
617
- $stdout << "\nRunning tool #{tool.name}...\n"
618
- queue << ctx.spawn(tool, :thread)
619
- end
620
-
621
- def on_tool_return(tool, result)
622
- if result.error?
623
- $stdout << "Tool #{tool.name} failed\n"
624
- else
625
- $stdout << "Finished tool #{tool.name}\n"
626
- end
627
- end
628
- end
629
-
630
- llm = LLM.openai(key: ENV["KEY"])
631
- stream = Stream.new
632
- ctx = LLM::Context.new(llm, stream:, tools: [System])
633
-
634
- ctx.talk("Run `date` and `uname -a`.")
635
- ctx.talk(ctx.wait(:thread)) while ctx.functions.any?
636
- ```
637
-
638
290
  #### Context Compaction
639
291
 
640
292
  This example uses [`LLM::Context`](https://0x1eef.github.io/x/llm.rb/LLM/Context.html),
@@ -647,7 +299,7 @@ compactor can also use its own `model:` if you want summarization to run on a
647
299
  different model from the main context. `token_threshold:` accepts either a
648
300
  fixed token count or a percentage string like `"90%"`, which resolves
649
301
  against the active model context window and triggers compaction once total
650
- token usage goes over that percentage. <br> See the [deepdive (web)](https://0x1eef.github.io/x/llm.rb/file.deepdive.html) or [deepdive (markdown)](resources/deepdive.md) for more examples.
302
+ token usage goes over that percentage. See the [deepdive (web)](https://0x1eef.github.io/x/llm.rb/file.deepdive.html) or [deepdive (markdown)](resources/deepdive.md) for more examples.
651
303
 
652
304
  ```ruby
653
305
  require "llm"
@@ -739,7 +391,7 @@ class Context < Sequel::Model
739
391
  private
740
392
 
741
393
  def set_provider
742
- LLM.openai(key: ENV["OPENAI_SECRET"])
394
+ LLM.openai(key: ENV["OPENAI_SECRET"], persistent: true)
743
395
  end
744
396
 
745
397
  def set_context
@@ -875,7 +527,7 @@ This example uses [`LLM::MCP`](https://0x1eef.github.io/x/llm.rb/LLM/MCP.html) o
875
527
  require "llm"
876
528
  require "net/http/persistent"
877
529
 
878
- llm = LLM.openai(key: ENV["KEY"])
530
+ llm = LLM.openai(key: ENV["KEY"], persistent: true)
879
531
  mcp = LLM::MCP.http(
880
532
  url: "https://api.githubcopilot.com/mcp/",
881
533
  headers: {"Authorization" => "Bearer #{ENV["GITHUB_PAT"]}"},
@@ -885,7 +537,7 @@ mcp = LLM::MCP.http(
885
537
  mcp.start
886
538
  ctx = LLM::Context.new(llm, stream: $stdout, tools: mcp.tools)
887
539
  ctx.talk("Pull information about my GitHub account.")
888
- ctx.talk(ctx.call(:functions)) while ctx.functions.any?
540
+ ctx.talk(ctx.wait(:call)) while ctx.functions?
889
541
  mcp.stop
890
542
  ```
891
543
 
@@ -900,7 +552,7 @@ mcp = LLM::MCP.http(
900
552
  mcp.run do
901
553
  ctx = LLM::Context.new(llm, stream: $stdout, tools: mcp.tools)
902
554
  ctx.talk("Pull information about my GitHub account.")
903
- ctx.talk(ctx.call(:functions)) while ctx.functions.any?
555
+ ctx.talk(ctx.wait(:call)) while ctx.functions?
904
556
  end
905
557
  ```
906
558