llm.rb 8.1.0 → 9.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +120 -2
  3. data/README.md +161 -514
  4. data/lib/llm/active_record/acts_as_llm.rb +7 -8
  5. data/lib/llm/agent.rb +36 -16
  6. data/lib/llm/context.rb +30 -26
  7. data/lib/llm/contract/completion.rb +45 -0
  8. data/lib/llm/cost.rb +81 -4
  9. data/lib/llm/error.rb +1 -1
  10. data/lib/llm/function/array.rb +8 -5
  11. data/lib/llm/function/call_group.rb +39 -0
  12. data/lib/llm/function/fork/task.rb +6 -0
  13. data/lib/llm/function/ractor/task.rb +6 -0
  14. data/lib/llm/function/task.rb +10 -0
  15. data/lib/llm/function.rb +1 -0
  16. data/lib/llm/mcp/transport/http.rb +26 -46
  17. data/lib/llm/mcp/transport/stdio.rb +0 -8
  18. data/lib/llm/mcp.rb +6 -23
  19. data/lib/llm/provider.rb +23 -20
  20. data/lib/llm/providers/anthropic/error_handler.rb +6 -7
  21. data/lib/llm/providers/anthropic/files.rb +2 -2
  22. data/lib/llm/providers/anthropic/response_adapter/completion.rb +30 -0
  23. data/lib/llm/providers/anthropic.rb +1 -1
  24. data/lib/llm/providers/bedrock/error_handler.rb +8 -9
  25. data/lib/llm/providers/bedrock/models.rb +13 -13
  26. data/lib/llm/providers/bedrock/response_adapter/completion.rb +30 -0
  27. data/lib/llm/providers/bedrock.rb +1 -1
  28. data/lib/llm/providers/google/error_handler.rb +6 -7
  29. data/lib/llm/providers/google/files.rb +2 -4
  30. data/lib/llm/providers/google/images.rb +1 -1
  31. data/lib/llm/providers/google/models.rb +0 -2
  32. data/lib/llm/providers/google/response_adapter/completion.rb +30 -0
  33. data/lib/llm/providers/google.rb +1 -1
  34. data/lib/llm/providers/ollama/error_handler.rb +6 -7
  35. data/lib/llm/providers/ollama/models.rb +0 -2
  36. data/lib/llm/providers/ollama/response_adapter/completion.rb +30 -0
  37. data/lib/llm/providers/ollama.rb +1 -1
  38. data/lib/llm/providers/openai/audio.rb +3 -3
  39. data/lib/llm/providers/openai/error_handler.rb +6 -7
  40. data/lib/llm/providers/openai/files.rb +2 -2
  41. data/lib/llm/providers/openai/images.rb +3 -3
  42. data/lib/llm/providers/openai/models.rb +1 -1
  43. data/lib/llm/providers/openai/response_adapter/completion.rb +42 -0
  44. data/lib/llm/providers/openai/response_adapter/responds.rb +39 -0
  45. data/lib/llm/providers/openai/responses.rb +2 -2
  46. data/lib/llm/providers/openai/vector_stores.rb +1 -1
  47. data/lib/llm/providers/openai.rb +1 -1
  48. data/lib/llm/response.rb +10 -8
  49. data/lib/llm/sequel/plugin.rb +7 -8
  50. data/lib/llm/stream/queue.rb +15 -42
  51. data/lib/llm/stream.rb +4 -4
  52. data/lib/llm/transport/execution.rb +67 -0
  53. data/lib/llm/transport/http.rb +134 -0
  54. data/lib/llm/transport/persistent_http.rb +152 -0
  55. data/lib/llm/transport/response/http.rb +113 -0
  56. data/lib/llm/transport/response.rb +112 -0
  57. data/lib/llm/{provider/transport/http → transport}/stream_decoder.rb +8 -4
  58. data/lib/llm/transport.rb +139 -0
  59. data/lib/llm/usage.rb +14 -5
  60. data/lib/llm/version.rb +1 -1
  61. data/lib/llm.rb +2 -12
  62. data/llm.gemspec +2 -16
  63. metadata +11 -19
  64. data/lib/llm/provider/transport/http/execution.rb +0 -115
  65. data/lib/llm/provider/transport/http/interruptible.rb +0 -114
  66. data/lib/llm/provider/transport/http.rb +0 -145
  67. data/lib/llm/utils.rb +0 -19
data/README.md CHANGED
@@ -4,519 +4,240 @@
4
4
  <p align="center">
5
5
  <a href="https://0x1eef.github.io/x/llm.rb?rebuild=1"><img src="https://img.shields.io/badge/docs-0x1eef.github.io-blue.svg" alt="RubyDoc"></a>
6
6
  <a href="https://opensource.org/license/0bsd"><img src="https://img.shields.io/badge/License-0BSD-orange.svg?" alt="License"></a>
7
- <a href="https://github.com/llmrb/llm.rb/tags"><img src="https://img.shields.io/badge/version-8.1.0-green.svg?" alt="Version"></a>
7
+ <a href="https://github.com/llmrb/llm.rb/tags"><img src="https://img.shields.io/badge/version-9.0.0-green.svg?" alt="Version"></a>
8
8
  </p>
9
9
 
10
10
  ## About
11
11
 
12
- llm.rb is the most capable runtime for building AI systems in Ruby.
13
- <br>
14
-
15
- llm.rb is designed for Ruby, and although it works great in Rails, it is not tightly
16
- coupled to it. It runs on the standard library by default (zero dependencies),
17
- loads optional pieces only when needed, includes built-in ActiveRecord support through
18
- `acts_as_llm` and `acts_as_agent`, includes built-in Sequel support through
19
- `plugin :llm` and `plugin :agent`, and is designed for engineers who want control over
20
- long-lived, tool-capable, stateful AI workflows instead of just
21
- request/response helpers.
22
-
23
- It provides one runtime for providers, agents, tools, skills, MCP servers, streaming,
24
- schemas, files, and persisted state, so real systems can be built out of one coherent
25
- execution model instead of a pile of adapters.
26
-
27
- It supports providers including OpenAI, Anthropic, Google Gemini, DeepSeek, xAI,
28
- Z.ai, and AWS Bedrock.
29
-
30
- It provides concurrent tool execution with multiple strategies exposed through a single
31
- runtime: async-task, threads, fibers, ractors and processes (fork). The first three are
32
- good for IO-bound work and the last two are good for CPU-bound work. Ractor support is
33
- experimental and comes with limitations.
34
-
35
- Want to see some code? Jump to [the examples](#examples) section. <br>
36
- Want to see a self-hosted LLM environment built on llm.rb? Check out [relay.app](https://github.com/llmrb/relay.app). <br>
37
- Want to use llm.rb with mruby? Check out [mruby-llm](https://github.com/llmrb/mruby-llm)
38
-
39
-
40
- ## Architecture
41
-
42
- <p align="center">
43
- <img src="https://github.com/llmrb/llm.rb/raw/main/resources/architecture.png" alt="llm.rb architecture" width="790">
44
- </p>
45
-
46
- ## Core Concept
12
+ llm.rb is Ruby's most capable AI runtime.
47
13
 
48
- [`LLM::Context`](https://0x1eef.github.io/x/llm.rb/LLM/Context.html)
49
- is the execution boundary in llm.rb.
14
+ It runs on Ruby's standard library by default, loads optional pieces
15
+ only when needed, and offers a single runtime for providers, agents,
16
+ tools, skills, MCP, streaming, files, and persisted state. As a bonus,
17
+ llm.rb is also [available for mruby](https://github.com/llmrb/mruby-llm).
50
18
 
51
- It holds:
52
- - message history
53
- - tool state
54
- - schemas
55
- - streaming configuration
56
- - usage and cost tracking
19
+ It supports OpenAI, OpenAI-compatible endpoints, Anthropic, Google
20
+ Gemini, DeepSeek, xAI, Z.ai, AWS Bedrock, Ollama, and llama.cpp. It
21
+ also includes built-in ActiveRecord and Sequel support, plus concurrent
22
+ tool execution through threads, tasks (via the async gem), fibers, ractors,
23
+ and fork (via the xchan.rb gem).
57
24
 
58
- Instead of switching abstractions for each feature, everything builds on the
59
- same context object.
25
+ ## Quick start
60
26
 
61
- ## Standout features
27
+ #### LLM::Context
62
28
 
63
- The following list is **not exhaustive**, but it covers a lot of ground.
29
+ The
30
+ [LLM::Context](https://0x1eef.github.io/x/llm.rb/LLM/Context.html)
31
+ object is at the heart of the runtime. Almost all other features build
32
+ on top of it. It is a low-level interface to a model, and requires tool
33
+ execution to be managed manually. The
34
+ [LLM::Agent](https://0x1eef.github.io/x/llm.rb/LLM/Agent.html)
35
+ class is almost the same as
36
+ [LLM::Context](https://0x1eef.github.io/x/llm.rb/LLM/Context.html)
37
+ but it manages tool execution for you; we'll cover agents next:
64
38
 
65
- #### Skills
39
+ ```ruby
40
+ require "llm"
66
41
 
67
- Skills are reusable, directory-backed capabilities loaded from `SKILL.md`.
68
- They run through the same runtime as tools, agents, and MCP. They do not
69
- require a second orchestration layer or a parallel abstraction. If you've
70
- used Claude or Codex, you know the general idea of skills, and llm.rb
71
- supports that same concept with the same execution model as the rest of the
72
- system.
42
+ llm = LLM.openai(key: ENV["KEY"])
43
+ ctx = LLM::Context.new(llm, stream: $stdout)
44
+ ctx.talk "Hello world"
45
+ ```
73
46
 
74
- In llm.rb, a skill has frontmatter and instructions. The frontmatter can
75
- define `name`, `description`, and `tools`. The `tools` entries are tool names,
76
- and each name must resolve to a subclass of
77
- [`LLM::Tool`](https://0x1eef.github.io/x/llm.rb/LLM/Tool.html) that is already
78
- loaded in the runtime.
47
+ #### LLM::Agent
79
48
 
80
- If you want Claude/Codex-like skills that can drive scripts or shell
81
- commands, you would typically pair the skill with a tool that can execute
82
- system commands.
83
-
84
- ```yaml
85
- ---
86
- name: release
87
- description: Prepare a release
88
- tools:
89
- - search_docs
90
- - git
91
- ---
92
- Review the release state, summarize what changed, and prepare the release.
93
- ```
49
+ The
50
+ [LLM::Agent](https://0x1eef.github.io/x/llm.rb/LLM/Agent.html)
51
+ object is implemented on top of
52
+ [LLM::Context](https://0x1eef.github.io/x/llm.rb/LLM/Context.html).
53
+ It provides the same interface, but manages tool execution for you. It
54
+ also has built-in features such as a loop guard that detects repeated
55
+ tool call patterns, and another guard that detects infinite tool call
56
+ loops. Both guards advise the model to change course rather than raise
57
+ an error:
94
58
 
95
59
  ```ruby
96
- class Agent < LLM::Agent
97
- model "gpt-5.4-mini"
98
- skills "./skills/release"
99
- tracer { LLM::Tracer::Logger.new(llm, path: "logs/release-agent.log") }
100
- end
60
+ require "llm"
101
61
 
102
62
  llm = LLM.openai(key: ENV["KEY"])
103
- Agent.new(llm, stream: $stdout).talk("Let's prepare the release!")
63
+ agent = LLM::Agent.new(llm, stream: $stdout)
64
+ agent.talk "Hello world"
104
65
  ```
105
66
 
106
- #### ORM
107
-
108
- Any ActiveRecord model or Sequel model can become an agent-capable model,
109
- including existing business and domain models, without forcing you into a
110
- separate agent table or a second persistence layer.
111
-
112
- `acts_as_agent` extends a model with agent capabilities: the same runtime
113
- surface as [`LLM::Agent`](https://0x1eef.github.io/x/llm.rb/LLM/Agent.html),
114
- because it actually wraps an `LLM::Agent`, plus persistence through one text,
115
- JSON, or JSONB-backed `data` column on the same table. If your app also has
116
- provider or model columns, provide them to llm.rb through `set_provider` and
117
- `set_context`.
67
+ #### Tools
118
68
 
69
+ The
70
+ [LLM::Tool](https://0x1eef.github.io/x/llm.rb/LLM/Tool.html)
71
+ class can be subclassed to implement your own tools that can extend the
72
+ abilities of a model:
119
73
 
120
74
  ```ruby
121
- class Ticket < ApplicationRecord
122
- acts_as_agent provider: :set_provider, context: :set_context
123
- model "gpt-5.4-mini"
124
- instructions "You are a support assistant."
125
-
126
- private
127
-
128
- def set_provider
129
- LLM.openai(key: ENV["OPENAI_SECRET"])
130
- end
131
-
132
- def set_context
133
- { mode: :responses, store: false }
75
+ class ReadFile < LLM::Tool
76
+ name "read-file"
77
+ description "Read a file"
78
+ parameter :path, String, "The filename or path"
79
+ required %i[path]
80
+
81
+ def call(path:)
82
+ {contents: File.read(path)}
134
83
  end
135
84
  end
136
85
  ```
137
86
 
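For illustration, here is a minimal sketch (not part of the upstream README) that passes the `ReadFile` tool above to an [LLM::Context](https://0x1eef.github.io/x/llm.rb/LLM/Context.html) and resolves tool calls by hand, using the same `wait`/`functions?` loop the MCP and streaming examples below rely on; an [LLM::Agent](https://0x1eef.github.io/x/llm.rb/LLM/Agent.html) runs this loop for you:

```ruby
require "llm"

# Assumes the ReadFile tool defined above is already loaded.
llm = LLM.openai(key: ENV["KEY"])
ctx = LLM::Context.new(llm, stream: $stdout, tools: [ReadFile])
ctx.talk "Read README.md and summarize it."
# Keep answering pending tool calls until the model stops issuing them.
ctx.talk(ctx.wait(:call)) while ctx.functions?
```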
138
- #### Agentic Patterns
139
-
140
- llm.rb is especially strong when you want to build agentic systems in a Ruby
141
- way. Agents can be ordinary application models with state, associations,
142
- tools, skills, and persistence, which makes it much easier to build systems
143
- where users have their own specialized agents instead of treating agents as
144
- something outside the app.
145
-
146
- That pattern works so well in llm.rb because
147
- [`LLM::Agent`](https://0x1eef.github.io/x/llm.rb/LLM/Agent.html),
148
- `acts_as_agent`, `plugin :agent`, skills, tools, and persisted runtime state
149
- all fit the same execution model. The runtime stays small enough that the
150
- main design work becomes application design, not orchestration glue.
151
-
152
- For a concrete example, see
153
- [How to build a platform of agents](https://0x1eef.github.io/posts/how-to-build-a-platform-of-agents).
154
-
155
- #### Persistence
87
+ #### MCP
156
88
 
157
- The same runtime can be serialized to disk, restored later, persisted in JSON
158
- or JSONB-backed ORM columns, resumed across process boundaries, or shared
159
- across long-lived workflows.
89
+ The
90
+ [LLM::MCP](https://0x1eef.github.io/x/llm.rb/LLM/MCP.html)
91
+ object lets llm.rb use tools provided by an MCP server. Those tools are
92
+ exposed through the same runtime as local tools, so you can pass them
93
+ to either
94
+ [LLM::Context](https://0x1eef.github.io/x/llm.rb/LLM/Context.html)
95
+ or
96
+ [LLM::Agent](https://0x1eef.github.io/x/llm.rb/LLM/Agent.html).
97
+ In this example, the MCP server runs over stdio and
98
+ [LLM::Context](https://0x1eef.github.io/x/llm.rb/LLM/Context.html)
99
+ uses the same tool loop as local tools:
160
100
 
161
101
  ```ruby
162
- ctx = LLM::Context.new(llm)
163
- ctx.talk("Remember that my favorite language is Ruby.")
164
- ctx.save(path: "context.json")
165
- ```
166
-
167
- #### Context Compaction
102
+ require "llm"
168
103
 
169
- Long-lived contexts can compact older history into a summary instead of
170
- growing forever. Compaction is built into [`LLM::Context`](https://0x1eef.github.io/x/llm.rb/LLM/Context.html)
171
- through [`LLM::Compactor`](https://0x1eef.github.io/x/llm.rb/LLM/Compactor.html),
172
- and when a stream is present it emits `on_compaction` and
173
- `on_compaction_finish` through [`LLM::Stream`](https://0x1eef.github.io/x/llm.rb/LLM/Stream.html).
174
- The compactor can also use a different model from the main context, which is
175
- useful when you want summarization to run on a cheaper or faster model.
176
- `token_threshold:` accepts either a fixed token count or a percentage string
177
- like `"90%"`, which resolves against the active model context window and
178
- triggers compaction once total token usage goes over that percentage.
104
+ llm = LLM.openai(key: ENV["KEY"])
105
+ mcp = LLM::MCP.stdio(argv: ["ruby", "server.rb"])
179
106
 
180
- ```ruby
181
- ctx = LLM::Context.new(
182
- llm,
183
- compactor: {
184
- token_threshold: "90%",
185
- retention_window: 8,
186
- model: "gpt-5.4-mini"
187
- }
188
- )
107
+ mcp.run do
108
+ ctx = LLM::Context.new(llm, stream: $stdout, tools: mcp.tools)
109
+ ctx.talk "Use the available tools to inspect the environment."
110
+ ctx.talk(ctx.wait(:call)) while ctx.functions?
111
+ end
189
112
  ```
190
113
 
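The same loop also works against an MCP server reached over HTTP. The sketch below reuses the `LLM::MCP.http` call from the HTTP example further down in this README; as there, `persistent: true` assumes the net-http-persistent gem is available:

```ruby
require "llm"
require "net/http/persistent"

llm = LLM.openai(key: ENV["KEY"])
# Connect to a remote MCP server over HTTP (mirrors the example later in this README).
mcp = LLM::MCP.http(
  url: "https://api.githubcopilot.com/mcp/",
  headers: {"Authorization" => "Bearer #{ENV["GITHUB_PAT"]}"},
  persistent: true
)

mcp.run do
  ctx = LLM::Context.new(llm, stream: $stdout, tools: mcp.tools)
  ctx.talk "Pull information about my GitHub account."
  ctx.talk(ctx.wait(:call)) while ctx.functions?
end
```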
191
- #### Guards
114
+ #### Skills
192
115
 
193
- Guards let llm.rb supervise agentic execution, not just run it.
194
- They live on [`LLM::Context`](https://0x1eef.github.io/x/llm.rb/LLM/Context.html),
195
- can inspect the current runtime state, and can step in when a context is no
196
- longer making progress.
116
+ Skills are reusable instructions loaded from a directory that contains a `SKILL.md` file. They let
117
+ you package behavior and tool access together, and they plug into the same
118
+ runtime as tools, agents, and MCP. When a skill runs, llm.rb spawns a
119
+ subagent with the skill instructions, access to only the tools listed in the
120
+ skill, and recent conversation context:
197
121
 
198
- [`LLM::LoopGuard`](https://0x1eef.github.io/x/llm.rb/LLM/LoopGuard.html) is
199
- the built-in implementation. It detects repeated tool-call patterns and
200
- blocks pending tool execution with in-band guarded tool errors instead of
201
- letting the loop keep spinning. [`LLM::Agent`](https://0x1eef.github.io/x/llm.rb/LLM/Agent.html)
202
- enables that guard by default through its wrapped context.
122
+ ```yaml
123
+ ---
124
+ name: release
125
+ description: Prepare a release
126
+ tools: ["search-docs", "git"]
127
+ ---
203
128
 
204
- ```ruby
205
- ctx = LLM::Context.new(llm)
206
- ctx.guard = MyGuard.new
207
- ```
129
+ ## Task
208
130
 
209
- #### Transformers
210
-
211
- Transformers let llm.rb rewrite outgoing prompts and params before a request
212
- is sent to the provider. They also live on
213
- [`LLM::Context`](https://0x1eef.github.io/x/llm.rb/LLM/Context.html), but
214
- they solve a different problem from guards: instead of blocking execution,
215
- they can normalize or scrub what gets sent. When a stream is present, that
216
- lifecycle is also exposed through
217
- [`LLM::Stream`](https://0x1eef.github.io/x/llm.rb/LLM/Stream.html) with
218
- `on_transform` and `on_transform_finish`.
219
-
220
- That makes them a good fit for things like PII scrubbing, prompt
221
- normalization, or request-level param injection. A transformer just needs to
222
- implement `call(ctx, prompt, params)` and return `[prompt, params]`. That
223
- means a transformer can scrub plain text prompts, but it can also scrub
224
- [`LLM::Function::Return`](https://0x1eef.github.io/x/llm.rb/LLM/Function/Return.html)
225
- values. In other words, you can intercept a tool call's return value and
226
- modify it before sending it back to the LLM.
227
-
228
- That is also a useful UI hook. A stream can surface messages like
229
- `Anonymizing your data...` before a scrubber runs and `Data anonymized.`
230
- after it finishes.
131
+ Review the release state, summarize what changed, and prepare the release.
132
+ ```
231
133
 
232
134
  ```ruby
233
- class ScrubPII
234
- EMAIL = /\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b/i
135
+ require "llm"
235
136
 
236
- def call(ctx, prompt, params)
237
- [scrub(prompt), params]
238
- end
137
+ class ReleaseAgent < LLM::Agent
138
+ model "gpt-5.4-mini"
139
+ skills "./skills/release"
140
+ end
239
141
 
240
- private
142
+ llm = LLM.openai(key: ENV["KEY"])
143
+ ReleaseAgent.new(llm, stream: $stdout).talk("Prepare the next release.")
144
+ ```
241
145
 
242
- def scrub(prompt)
243
- case prompt
244
- when String then prompt.gsub(EMAIL, "[REDACTED_EMAIL]")
245
- when Array then prompt.map { scrub(_1) }
246
- when LLM::Function::Return then on_tool_return(prompt)
247
- else prompt
248
- end
249
- end
146
+ #### LLM::Stream
250
147
 
251
- def on_tool_return(result)
252
- value = case result.name
253
- when "lookup-customer" then scrub_value(result.value)
254
- else result.value
255
- end
256
- LLM::Function::Return.new(result.id, result.name, value)
257
- end
148
+ The
149
+ [LLM::Stream](https://0x1eef.github.io/x/llm.rb/LLM/Stream.html)
150
+ object lets you observe output and runtime events as they happen. You
151
+ can subclass it to handle streamed content in your own application:
258
152
 
259
- def scrub_value(value)
260
- case value
261
- when String then value.gsub(EMAIL, "[REDACTED_EMAIL]")
262
- when Array then value.map { scrub_value(_1) }
263
- when Hash then value.transform_values { scrub_value(_1) }
264
- else value
265
- end
153
+ ```ruby
154
+ require "llm"
155
+
156
+ class Stream < LLM::Stream
157
+ def on_content(content)
158
+ $stdout << content
266
159
  end
267
160
  end
268
161
 
269
- ctx = LLM::Context.new(llm)
270
- ctx.transformer = ScrubPII.new
162
+ llm = LLM.openai(key: ENV["KEY"])
163
+ ctx = LLM::Context.new(llm, stream: Stream.new)
164
+ ctx.talk "Write a haiku about Ruby."
271
165
  ```
272
166
 
273
- When a stream is present, that transformer lifecycle is also exposed through
274
- `on_transform` and `on_transform_finish` on
275
- [`LLM::Stream`](https://0x1eef.github.io/x/llm.rb/LLM/Stream.html).
167
+ #### LLM::Stream (advanced)
276
168
 
277
- #### LLM::Stream
278
-
279
- `LLM::Stream` is not just for printing tokens. It supports `on_content`,
280
- `on_reasoning_content`, `on_tool_call`, `on_tool_return`, `on_transform`,
281
- `on_transform_finish`, `on_compaction`, and `on_compaction_finish`, which
282
- means visible output, reasoning output, request rewriting, tool execution,
283
- and context compaction can all be driven through the same execution path.
169
+ The
170
+ [LLM::Stream](https://0x1eef.github.io/x/llm.rb/LLM/Stream.html)
171
+ object can also resolve tool calls while output is still streaming. In
172
+ `on_tool_call`, you can spawn the tool, push the work onto the stream
173
+ queue, and later drain it with `wait`:
284
174
 
285
175
  ```ruby
176
+ require "llm"
177
+
286
178
  class Stream < LLM::Stream
287
- def on_tool_call(tool, error)
288
- queue << (error || ctx.spawn(tool, :thread))
179
+ def on_content(content)
180
+ $stdout << content
289
181
  end
290
182
 
291
- def on_tool_return(tool, result)
292
- puts(result.value)
183
+ def on_tool_call(tool, error)
184
+ return queue << error if error
185
+ queue << ctx.spawn(tool, :thread)
293
186
  end
294
187
  end
188
+
189
+ llm = LLM.openai(key: ENV["KEY"])
190
+ ctx = LLM::Context.new(llm, stream: Stream.new, tools: [ReadFile])
191
+ ctx.talk "Read README.md and summarize the quick start."
192
+ ctx.talk(ctx.wait) while ctx.functions?
295
193
  ```
296
194
 
297
195
  #### Concurrency
298
196
 
299
- Tool execution can run sequentially with `:call` or concurrently through
300
- `:thread`, `:task`, `:fiber`, `:fork`, and experimental `:ractor`, without
301
- rewriting your tool layer. Async tasks, threads, and fibers are the
302
- I/O-bound options. Fork and ractor are the CPU-bound options. `:fork`
303
- requires [`xchan.rb`](https://github.com/0x1eef/xchan.rb#readme) support,
304
- and `:ractor` is still experimental.
305
-
306
- `:fiber` uses `Fiber.schedule`, so it requires `Fiber.scheduler`.
197
+ llm.rb can run tool work concurrently. This is useful when a model calls
198
+ multiple tools and you want to resolve them in parallel instead of one
199
+ at a time. On
200
+ [LLM::Agent](https://0x1eef.github.io/x/llm.rb/LLM/Agent.html),
201
+ you can enable this with `concurrency`. Common options are `:call` for
202
+ sequential execution, `:thread` or `:task` for concurrent IO-bound work, and
203
+ `:ractor` or `:fork` for more isolated CPU-bound work:
307
204
 
308
205
  ```ruby
206
+ require "llm"
207
+
309
208
  class Agent < LLM::Agent
310
209
  model "gpt-5.4-mini"
311
- tools FetchWeather, FetchNews, FetchStock
210
+ tools ReadFile
312
211
  concurrency :thread
313
212
  end
314
- ```
315
-
316
- #### MCP
317
213
 
318
- Remote MCP tools and prompts are not bolted on as a separate integration
319
- stack. They adapt into the same tool and prompt path used by local tools,
320
- skills, contexts, and agents.
321
-
322
- Use `mcp.run do ... end` for scoped work where the client should start and
323
- stop around one block. Use `mcp.start` and `mcp.stop` directly when you need
324
- finer sequential control across several steps before shutting the client down.
325
-
326
- ```ruby
327
- mcp = LLM::MCP.http(
328
- url: "https://api.githubcopilot.com/mcp/",
329
- headers: {"Authorization" => "Bearer #{ENV["GITHUB_PAT"]}"},
330
- persistent: true
331
- )
332
- mcp.run do
333
- ctx = LLM::Context.new(llm, tools: mcp.tools)
334
- end
214
+ llm = LLM.openai(key: ENV["KEY"])
215
+ agent = Agent.new(llm, stream: $stdout)
216
+ agent.talk "Read README.md and CHANGELOG.md and compare them."
335
217
  ```
336
218
 
337
- #### Cancellation
219
+ #### Serialization
338
220
 
339
- Cancellation is one of the harder problems to get right, and while llm.rb
340
- makes it possible, it still requires careful engineering to use effectively.
341
- The point though is that it is possible to stop in-flight provider work cleanly
342
- through the same runtime, and the model used by llm.rb is directly inspired by
343
- Go's context package. In fact, llm.rb is heavily inspired by Go but with a Ruby
344
- twist.
221
+ The [`LLM::Context`](https://0x1eef.github.io/x/llm.rb/LLM/Context.html)
222
+ object can be serialized to JSON, which makes it suitable for storing
223
+ in a file, a database column, or a Redis queue. The built-in
224
+ ActiveRecord and Sequel plugins build on this feature:
345
225
 
346
226
  ```ruby
347
227
  require "llm"
348
- require "io/console"
349
228
 
350
229
  llm = LLM.openai(key: ENV["KEY"])
351
- ctx = LLM::Context.new(llm, stream: $stdout)
352
- worker = Thread.new do
353
- ctx.talk("Write a very long essay about network protocols.")
354
- rescue LLM::Interrupt
355
- puts "Request was interrupted!"
356
- end
357
230
 
358
- STDIN.getch
359
- ctx.interrupt!
360
- worker.join
361
- ```
231
+ # Serialize a context
232
+ ctx1 = LLM::Context.new(llm)
233
+ ctx1.talk "Remember that my favorite language is Ruby"
234
+ string = ctx1.to_json
362
235
 
363
- ## Differentiators
364
-
365
- ### Execution Model
366
-
367
- - **A system layer, not just an API wrapper** <br>
368
- Put providers, tools, MCP servers, and application APIs behind one runtime
369
- model instead of stitching them together by hand.
370
- - **Contexts are central** <br>
371
- Keep history, tools, schema, usage, persistence, and execution state in one
372
- place instead of spreading them across your app.
373
- - **Contexts can be serialized** <br>
374
- Save and restore live state for jobs, databases, retries, or long-running
375
- workflows.
376
-
377
- ### Runtime Behavior
378
-
379
- - **Streaming and tool execution work together** <br>
380
- Start tool work while output is still streaming so you can hide latency
381
- instead of waiting for turns to finish.
382
- - **Agents auto-manage tool execution** <br>
383
- Use `LLM::Agent` when you want the same stateful runtime surface as
384
- `LLM::Context`, but with tool loops executed automatically according to a
385
- configured concurrency mode such as `:call`, `:thread`, `:task`, `:fiber`,
386
- `:fork`, or experimental `:ractor` support for class-based tools. MCP tools
387
- are not supported by the current `:ractor` mode, but mixed tool sets can
388
- still route MCP tools and local tools through different strategies at
389
- runtime. By default, the tool attempt budget is `25`. When an agent
390
- exhausts that budget, it sends advisory tool errors back through the model
391
- instead of raising out of the runtime. Set `tool_attempts: nil` to disable
392
- that advisory behavior.
393
- - **Tool calls have an explicit lifecycle** <br>
394
- A tool call can be executed, cancelled through
395
- [`LLM::Function#cancel`](https://0x1eef.github.io/x/llm.rb/LLM/Function.html#cancel-instance_method),
396
- or left unresolved for manual handling, but the normal runtime contract is
397
- still that a model-issued tool request is answered with a tool return.
398
- - **Requests can be interrupted cleanly** <br>
399
- Stop in-flight provider work through the same runtime instead of treating
400
- cancellation as a separate concern.
401
- [`LLM::Context#cancel!`](https://0x1eef.github.io/x/llm.rb/LLM/Context.html#cancel-21-instance_method)
402
- is inspired by Go's context cancellation model.
403
- - **Concurrency is a first-class feature** <br>
404
- Use async tasks, threads, fibers, forks, or experimental ractors without
405
- rewriting your tool layer. Async tasks, threads, and fibers are the
406
- I/O-bound options. Fork and ractor are the CPU-bound options. `:fork`
407
- requires [`xchan.rb`](https://github.com/0x1eef/xchan.rb#readme) support.
408
- The current `:ractor` mode is for class-based tools, and MCP tools are
409
- not supported by ractor, but mixed workloads can branch on `tool.mcp?`
410
- and choose a supported strategy per tool. Class-based `:ractor` tools
411
- still emit normal tool tracer callbacks. `:fiber` uses `Fiber.schedule`,
412
- so it requires `Fiber.scheduler`.
413
- - **Advanced workloads are built in, not bolted on** <br>
414
- Streaming, concurrent tool execution, persistence, tracing, and MCP support
415
- all fit the same runtime model.
416
-
417
- ### Integration
418
-
419
- - **MCP is built in** <br>
420
- Connect to MCP servers over stdio or HTTP without bolting on a separate
421
- integration stack.
422
- - **ActiveRecord and Sequel persistence are built in** <br>
423
- llm.rb includes built-in ActiveRecord support through `acts_as_llm` and
424
- `acts_as_agent`, plus built-in Sequel support through `plugin :llm` and
425
- `plugin :agent`.
426
- Use `acts_as_llm` when you want to wrap `LLM::Context`, `acts_as_agent`
427
- when you want to wrap `LLM::Agent`, `plugin :llm` when you want a
428
- `LLM::Context` on a Sequel model, or `plugin :agent` when you want an
429
- `LLM::Agent`. These integrations support `provider:` and `context:` hooks,
430
- plus `format: :string` for text columns or `format: :jsonb` for native
431
- PostgreSQL JSON storage when ORM JSON typecasting support is enabled.
432
- - **ORM models can become persistent agents** <br>
433
- Turn an ActiveRecord or Sequel model into an agent-capable model with
434
- built-in persistence, stored on the same table, with `jsonb` support when
435
- your ORM and database support native JSON columns.
436
- - **Persistent HTTP pooling is shared process-wide** <br>
437
- When enabled, separate
438
- [`LLM::Provider`](https://0x1eef.github.io/x/llm.rb/LLM/Provider.html)
439
- instances with the same endpoint settings can share one persistent
440
- pool, and separate HTTP
441
- [`LLM::MCP`](https://0x1eef.github.io/x/llm.rb/LLM/MCP.html)
442
- instances can do the same, instead of each object creating its own
443
- isolated per-instance transport.
444
- - **OpenAI-compatible gateways are supported** <br>
445
- Target OpenAI-compatible services such as DeepInfra and OpenRouter, as well
446
- as proxies and self-hosted servers, with `host:` and `base_path:` when they
447
- preserve OpenAI request shapes but change the API root path.
448
- - **Provider support is broad** <br>
449
- Work with OpenAI, OpenAI-compatible endpoints, Anthropic, Google, DeepSeek,
450
- Z.ai, xAI, AWS Bedrock, llama.cpp, and Ollama through the same runtime.
451
- - **Tools are explicit** <br>
452
- Run local tools, provider-native tools, and MCP tools through the same path
453
- with fewer special cases.
454
- - **Skills become bounded runtime capabilities** <br>
455
- Point llm.rb at directories with a `SKILL.md`, resolve named tools through
456
- the registry, and adapt each skill into its own callable capability through
457
- the normal runtime. Unlike a generic skill-discovery tool, each skill runs
458
- with its own bounded tool subset and behaves like a task-scoped sub-agent.
459
- - **Providers are normalized, not flattened** <br>
460
- Share one API surface across providers without losing access to provider-
461
- specific capabilities where they matter.
462
- - **Responses keep a uniform shape** <br>
463
- Provider calls return
464
- [`LLM::Response`](https://0x1eef.github.io/x/llm.rb/LLM/Response.html)
465
- objects as a common base shape, then extend them with endpoint- or
466
- provider-specific behavior when needed.
467
- - **Low-level access is still there** <br>
468
- Normalized responses still keep the raw `Net::HTTPResponse` available when
469
- you need headers, status, or other HTTP details.
470
- - **Local model metadata is included** <br>
471
- Model capabilities, pricing, and limits are available locally without extra
472
- API calls.
473
-
474
- ### Design Philosophy
475
-
476
- - **Runs on the stdlib** <br>
477
- Start with Ruby's standard library and add extra dependencies only when you
478
- need them.
479
- - **It is highly pluggable** <br>
480
- Add tools, swap providers, change JSON backends, plug in tracing, or layer
481
- internal APIs and MCP servers into the same execution path.
482
- - **It scales from scripts to long-lived systems** <br>
483
- The same primitives work for one-off scripts, background jobs, and more
484
- demanding application workloads with streaming, persistence, and tracing.
485
- - **Thread boundaries are clear** <br>
486
- Providers are shareable. Contexts are stateful and should stay thread-local.
487
-
488
- ## Capabilities
489
-
490
- Execution:
491
- - **Chat & Contexts** — stateless and stateful interactions with persistence
492
- - **Context Serialization** — save and restore state across processes or time
493
- - **Streaming** — visible output, reasoning output, tool-call events
494
- - **Request Interruption** — stop in-flight provider work cleanly
495
- - **Concurrent Execution** — threads, async tasks, and fibers
496
-
497
- Runtime Building Blocks:
498
- - **Tool Calling** — class-based tools and closure-based functions
499
- - **Run Tools While Streaming** — overlap model output with tool latency
500
- - **Agents** — reusable assistants with tool auto-execution
501
- - **Skills** — directory-backed capabilities loaded from `SKILL.md`
502
- - **MCP Support** — stdio and HTTP MCP clients with prompt and tool support
503
- - **Context Compaction** — summarize older history in long-lived contexts
504
-
505
- Data and Structure:
506
- - **Structured Outputs** — JSON Schema-based responses
507
- - **Responses API** — stateful response workflows where providers support them
508
- - **Multimodal Inputs** — text, images, audio, documents, URLs
509
- - **Audio** — speech generation, transcription, translation
510
- - **Images** — generation and editing
511
- - **Files API** — upload and reference files in prompts
512
- - **Embeddings** — vector generation for search and RAG
513
- - **Vector Stores** — retrieval workflows
514
-
515
- Operations:
516
- - **Cost Tracking** — local cost estimation without extra API calls
517
- - **Observability** — tracing, logging, telemetry
518
- - **Model Registry** — local metadata for capabilities, limits, pricing
519
- - **Persistent HTTP** — optional connection pooling for providers and MCP
236
+ # Restore a context (from JSON)
237
+ ctx2 = LLM::Context.new(llm, stream: $stdout)
238
+ ctx2.restore(string:)
239
+ ctx2.talk "What is my favorite language?"
240
+ ```
520
241
 
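As a small illustration (not from the upstream README, and using nothing beyond plain `File` I/O around the `to_json`/`restore(string:)` calls shown above), the serialized context can be written to disk in one process and restored in another:

```ruby
require "llm"

llm = LLM.openai(key: ENV["KEY"])

# Process one: build a context and write its JSON to disk
ctx = LLM::Context.new(llm)
ctx.talk "Remember that my favorite language is Ruby"
File.write("context.json", ctx.to_json)

# Process two: read the JSON back and resume the conversation
ctx = LLM::Context.new(llm, stream: $stdout)
ctx.restore(string: File.read("context.json"))
ctx.talk "What is my favorite language?"
```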
521
242
  ## Installation
522
243
 
@@ -566,80 +287,6 @@ ctx = LLM::Context.new(llm)
566
287
  ctx.talk ["Summarize this document.", ctx.local_file("README.md")]
567
288
  ```
568
289
 
569
- #### Agent
570
-
571
- This example uses [`LLM::Agent`](https://0x1eef.github.io/x/llm.rb/LLM/Agent.html) directly and lets the agent manage tool execution. <br> See the [deepdive (web)](https://0x1eef.github.io/x/llm.rb/file.deepdive.html) or [deepdive (markdown)](resources/deepdive.md) for more examples.
572
-
573
- ```ruby
574
- require "llm"
575
-
576
- class ShellAgent < LLM::Agent
577
- model "gpt-5.4-mini"
578
- instructions "You are a Linux system assistant."
579
- tools Shell
580
- concurrency :thread
581
- end
582
-
583
- llm = LLM.openai(key: ENV["KEY"])
584
- agent = ShellAgent.new(llm)
585
- puts agent.talk("What time is it on this system?").content
586
- ```
587
-
588
- #### Skills
589
-
590
- This example uses [`LLM::Agent`](https://0x1eef.github.io/x/llm.rb/LLM/Agent.html) with directory-backed skills so `SKILL.md` capabilities run through the normal tool path. In llm.rb, a skill is exposed as a tool in the runtime. When that tool is called, it spawns a sub-agent with relevant context plus the instructions and tool subset declared in its own `SKILL.md`. <br> See the [deepdive (web)](https://0x1eef.github.io/x/llm.rb/file.deepdive.html) or [deepdive (markdown)](resources/deepdive.md) for more examples.
591
-
592
- Each skill runs only with the tools declared in its own frontmatter.
593
-
594
- ```ruby
595
- require "llm"
596
-
597
- class Agent < LLM::Agent
598
- model "gpt-5.4-mini"
599
- instructions "You are a concise release assistant."
600
- skills "./skills/release", "./skills/review"
601
- tracer { LLM::Tracer::Logger.new(llm, path: "logs/release-agent.log") }
602
- end
603
-
604
- llm = LLM.openai(key: ENV["KEY"])
605
- puts Agent.new(llm).talk("Use the review skill.").content
606
- ```
607
-
608
- #### Streaming
609
-
610
- This example uses [`LLM::Stream`](https://0x1eef.github.io/x/llm.rb/LLM/Stream.html) directly so visible output and tool execution can happen together. <br> See the [deepdive (web)](https://0x1eef.github.io/x/llm.rb/file.deepdive.html) or [deepdive (markdown)](resources/deepdive.md) for more examples.
611
-
612
- ```ruby
613
- require "llm"
614
-
615
- class Stream < LLM::Stream
616
- def on_content(content)
617
- $stdout << content
618
- end
619
-
620
- def on_tool_call(tool, error)
621
- return queue << error if error
622
- $stdout << "\nRunning tool #{tool.name}...\n"
623
- queue << ctx.spawn(tool, :thread)
624
- end
625
-
626
- def on_tool_return(tool, result)
627
- if result.error?
628
- $stdout << "Tool #{tool.name} failed\n"
629
- else
630
- $stdout << "Finished tool #{tool.name}\n"
631
- end
632
- end
633
- end
634
-
635
- llm = LLM.openai(key: ENV["KEY"])
636
- stream = Stream.new
637
- ctx = LLM::Context.new(llm, stream:, tools: [System])
638
-
639
- ctx.talk("Run `date` and `uname -a`.")
640
- ctx.talk(ctx.wait(:thread)) while ctx.functions.any?
641
- ```
642
-
643
290
  #### Context Compaction
644
291
 
645
292
  This example uses [`LLM::Context`](https://0x1eef.github.io/x/llm.rb/LLM/Context.html),
@@ -652,7 +299,7 @@ compactor can also use its own `model:` if you want summarization to run on a
652
299
  different model from the main context. `token_threshold:` accepts either a
653
300
  fixed token count or a percentage string like `"90%"`, which resolves
654
301
  against the active model context window and triggers compaction once total
655
- token usage goes over that percentage. <br> See the [deepdive (web)](https://0x1eef.github.io/x/llm.rb/file.deepdive.html) or [deepdive (markdown)](resources/deepdive.md) for more examples.
302
+ token usage goes over that percentage. See the [deepdive (web)](https://0x1eef.github.io/x/llm.rb/file.deepdive.html) or [deepdive (markdown)](resources/deepdive.md) for more examples.
656
303
 
657
304
  ```ruby
658
305
  require "llm"
@@ -744,7 +391,7 @@ class Context < Sequel::Model
744
391
  private
745
392
 
746
393
  def set_provider
747
- LLM.openai(key: ENV["OPENAI_SECRET"])
394
+ LLM.openai(key: ENV["OPENAI_SECRET"], persistent: true)
748
395
  end
749
396
 
750
397
  def set_context
@@ -880,7 +527,7 @@ This example uses [`LLM::MCP`](https://0x1eef.github.io/x/llm.rb/LLM/MCP.html) o
880
527
  require "llm"
881
528
  require "net/http/persistent"
882
529
 
883
- llm = LLM.openai(key: ENV["KEY"])
530
+ llm = LLM.openai(key: ENV["KEY"], persistent: true)
884
531
  mcp = LLM::MCP.http(
885
532
  url: "https://api.githubcopilot.com/mcp/",
886
533
  headers: {"Authorization" => "Bearer #{ENV["GITHUB_PAT"]}"},
@@ -890,7 +537,7 @@ mcp = LLM::MCP.http(
890
537
  mcp.start
891
538
  ctx = LLM::Context.new(llm, stream: $stdout, tools: mcp.tools)
892
539
  ctx.talk("Pull information about my GitHub account.")
893
- ctx.talk(ctx.call(:functions)) while ctx.functions.any?
540
+ ctx.talk(ctx.wait(:call)) while ctx.functions?
894
541
  mcp.stop
895
542
  ```
896
543
 
@@ -905,7 +552,7 @@ mcp = LLM::MCP.http(
905
552
  mcp.run do
906
553
  ctx = LLM::Context.new(llm, stream: $stdout, tools: mcp.tools)
907
554
  ctx.talk("Pull information about my GitHub account.")
908
- ctx.talk(ctx.call(:functions)) while ctx.functions.any?
555
+ ctx.talk(ctx.wait(:call)) while ctx.functions?
909
556
  end
910
557
  ```
911
558