riffer 0.11.0 → 0.12.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.agents/code-style.md +2 -1
- data/.agents/rbs-inline.md +123 -0
- data/.agents/rdoc.md +14 -8
- data/.release-please-manifest.json +1 -1
- data/AGENTS.md +3 -0
- data/CHANGELOG.md +24 -0
- data/CLAUDE.md +1 -0
- data/Guardfile +7 -0
- data/Rakefile +23 -1
- data/Steepfile +14 -0
- data/docs/03_AGENTS.md +24 -1
- data/docs/06_STREAM_EVENTS.md +34 -0
- data/docs/08_EVALS.md +316 -0
- data/docs/09_GUARDRAILS.md +386 -0
- data/lib/riffer/agent/response.rb +51 -0
- data/lib/riffer/agent.rb +205 -116
- data/lib/riffer/config.rb +19 -13
- data/lib/riffer/core.rb +5 -6
- data/lib/riffer/evals/evaluator.rb +107 -0
- data/lib/riffer/evals/evaluators/answer_relevancy.rb +60 -0
- data/lib/riffer/evals/evaluators.rb +59 -0
- data/lib/riffer/evals/judge.rb +110 -0
- data/lib/riffer/evals/metric.rb +66 -0
- data/lib/riffer/evals/profile.rb +100 -0
- data/lib/riffer/evals/result.rb +70 -0
- data/lib/riffer/evals/run_result.rb +99 -0
- data/lib/riffer/evals/runner.rb +47 -0
- data/lib/riffer/evals.rb +11 -0
- data/lib/riffer/guardrail.rb +99 -0
- data/lib/riffer/guardrails/max_length.rb +69 -0
- data/lib/riffer/guardrails/modification.rb +38 -0
- data/lib/riffer/guardrails/result.rb +94 -0
- data/lib/riffer/guardrails/runner.rb +104 -0
- data/lib/riffer/guardrails/tripwire.rb +60 -0
- data/lib/riffer/guardrails.rb +10 -0
- data/lib/riffer/helpers/class_name_converter.rb +3 -5
- data/lib/riffer/helpers/dependencies.rb +3 -5
- data/lib/riffer/helpers/validations.rb +3 -5
- data/lib/riffer/helpers.rb +11 -0
- data/lib/riffer/messages/assistant.rb +9 -16
- data/lib/riffer/messages/base.rb +6 -7
- data/lib/riffer/messages/converter.rb +4 -4
- data/lib/riffer/messages/system.rb +2 -1
- data/lib/riffer/messages/tool.rb +9 -22
- data/lib/riffer/messages/user.rb +2 -1
- data/lib/riffer/messages.rb +1 -0
- data/lib/riffer/providers/amazon_bedrock.rb +16 -8
- data/lib/riffer/providers/anthropic.rb +15 -7
- data/lib/riffer/providers/base.rb +8 -14
- data/lib/riffer/providers/open_ai.rb +19 -9
- data/lib/riffer/providers/repository.rb +9 -10
- data/lib/riffer/providers/test.rb +17 -22
- data/lib/riffer/providers.rb +1 -0
- data/lib/riffer/stream_events/base.rb +5 -6
- data/lib/riffer/stream_events/guardrail_modification.rb +43 -0
- data/lib/riffer/stream_events/guardrail_tripwire.rb +52 -0
- data/lib/riffer/stream_events/reasoning_delta.rb +4 -10
- data/lib/riffer/stream_events/reasoning_done.rb +4 -10
- data/lib/riffer/stream_events/text_delta.rb +4 -10
- data/lib/riffer/stream_events/text_done.rb +4 -10
- data/lib/riffer/stream_events/token_usage_done.rb +4 -10
- data/lib/riffer/stream_events/tool_call_delta.rb +6 -12
- data/lib/riffer/stream_events/tool_call_done.rb +7 -14
- data/lib/riffer/stream_events.rb +1 -0
- data/lib/riffer/token_usage.rb +10 -23
- data/lib/riffer/tool.rb +58 -76
- data/lib/riffer/tools/param.rb +12 -15
- data/lib/riffer/tools/params.rb +8 -19
- data/lib/riffer/tools/response.rb +14 -20
- data/lib/riffer/tools.rb +1 -0
- data/lib/riffer/version.rb +2 -1
- data/lib/riffer.rb +22 -26
- data/sig/generated/riffer/agent/response.rbs +42 -0
- data/sig/generated/riffer/agent.rbs +175 -0
- data/sig/generated/riffer/config.rbs +60 -0
- data/sig/generated/riffer/core.rbs +17 -0
- data/sig/generated/riffer/evals/evaluator.rbs +66 -0
- data/sig/generated/riffer/evals/evaluators/answer_relevancy.rbs +24 -0
- data/sig/generated/riffer/evals/evaluators.rbs +42 -0
- data/sig/generated/riffer/evals/judge.rbs +52 -0
- data/sig/generated/riffer/evals/metric.rbs +46 -0
- data/sig/generated/riffer/evals/profile.rbs +60 -0
- data/sig/generated/riffer/evals/result.rbs +49 -0
- data/sig/generated/riffer/evals/run_result.rbs +62 -0
- data/sig/generated/riffer/evals/runner.rbs +27 -0
- data/sig/generated/riffer/evals.rbs +10 -0
- data/sig/generated/riffer/guardrail.rbs +79 -0
- data/sig/generated/riffer/guardrails/max_length.rbs +38 -0
- data/sig/generated/riffer/guardrails/modification.rbs +31 -0
- data/sig/generated/riffer/guardrails/result.rbs +73 -0
- data/sig/generated/riffer/guardrails/runner.rbs +51 -0
- data/sig/generated/riffer/guardrails/tripwire.rbs +45 -0
- data/sig/generated/riffer/guardrails.rbs +9 -0
- data/sig/generated/riffer/helpers/class_name_converter.rbs +11 -0
- data/sig/generated/riffer/helpers/dependencies.rbs +25 -0
- data/sig/generated/riffer/helpers/validations.rbs +11 -0
- data/sig/generated/riffer/helpers.rbs +10 -0
- data/sig/generated/riffer/messages/assistant.rbs +41 -0
- data/sig/generated/riffer/messages/base.rbs +24 -0
- data/sig/generated/riffer/messages/converter.rbs +18 -0
- data/sig/generated/riffer/messages/system.rbs +11 -0
- data/sig/generated/riffer/messages/tool.rbs +41 -0
- data/sig/generated/riffer/messages/user.rbs +11 -0
- data/sig/generated/riffer/messages.rbs +11 -0
- data/sig/generated/riffer/providers/amazon_bedrock.rbs +42 -0
- data/sig/generated/riffer/providers/anthropic.rbs +39 -0
- data/sig/generated/riffer/providers/base.rbs +37 -0
- data/sig/generated/riffer/providers/open_ai.rbs +46 -0
- data/sig/generated/riffer/providers/repository.rbs +12 -0
- data/sig/generated/riffer/providers/test.rbs +41 -0
- data/sig/generated/riffer/providers.rbs +10 -0
- data/sig/generated/riffer/stream_events/base.rbs +19 -0
- data/sig/generated/riffer/stream_events/guardrail_modification.rbs +37 -0
- data/sig/generated/riffer/stream_events/guardrail_tripwire.rbs +37 -0
- data/sig/generated/riffer/stream_events/reasoning_delta.rbs +16 -0
- data/sig/generated/riffer/stream_events/reasoning_done.rbs +16 -0
- data/sig/generated/riffer/stream_events/text_delta.rbs +15 -0
- data/sig/generated/riffer/stream_events/text_done.rbs +15 -0
- data/sig/generated/riffer/stream_events/token_usage_done.rbs +19 -0
- data/sig/generated/riffer/stream_events/tool_call_delta.rbs +21 -0
- data/sig/generated/riffer/stream_events/tool_call_done.rbs +24 -0
- data/sig/generated/riffer/stream_events.rbs +13 -0
- data/sig/generated/riffer/token_usage.rbs +48 -0
- data/sig/generated/riffer/tool.rbs +92 -0
- data/sig/generated/riffer/tools/param.rbs +39 -0
- data/sig/generated/riffer/tools/params.rbs +38 -0
- data/sig/generated/riffer/tools/response.rbs +62 -0
- data/sig/generated/riffer/tools.rbs +10 -0
- data/sig/generated/riffer/version.rbs +5 -0
- data/sig/generated/riffer.rbs +36 -0
- metadata +118 -6
- data/sig/riffer.rbs +0 -4
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 0cf5738c3a643db6e1626862456336f6b4bba78bbb1e0efa599ebf61e0b80ee5
|
|
4
|
+
data.tar.gz: 683586055701aa2ffb62dd88559ceac640b6e0221120dee6fa589ef3e44a6cb4
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 2c3fed6e20f275474ac0e9388a9b5200eab9b899c32534c7ba67b2df5e804097774c35821045e1dfbdaf65db426f7d56a827cd86de2eb32cbd8b65cb7aa8d343
|
|
7
|
+
data.tar.gz: 3db7dd0aa5fde3a20f97bd462b634ef01d8221db5f9c1d6c71a6430d915dc17b2ddebdd625d38514731ba63d6803cab6e94f5f61205e69cbf8d8d47237a3ee42
|
data/.agents/code-style.md
CHANGED
data/.agents/rbs-inline.md
ADDED
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
# RBS Inline
|
|
2
|
+
|
|
3
|
+
Type annotations are added directly in Ruby source files using [rbs-inline](https://github.com/soutaro/rbs-inline).
|
|
4
|
+
|
|
5
|
+
## Magic Comment
|
|
6
|
+
|
|
7
|
+
Every `lib/**/*.rb` file must include the `rbs_inline: enabled` comment on line 2:
|
|
8
|
+
|
|
9
|
+
```ruby
|
|
10
|
+
# frozen_string_literal: true
|
|
11
|
+
# rbs_inline: enabled
|
|
12
|
+
```
|
|
13
|
+
|
|
14
|
+
## Annotation Syntax
|
|
15
|
+
|
|
16
|
+
The **`#:`** prefix is used — standalone lines above methods (type signatures) or inline on the same line (attributes, constants).
|
|
17
|
+
|
|
18
|
+
### Method Parameters and Return Types
|
|
19
|
+
|
|
20
|
+
Use a single `#:` line above the method with the RBS method signature:
|
|
21
|
+
|
|
22
|
+
```ruby
|
|
23
|
+
#: (String, Integer) -> bool
|
|
24
|
+
def valid?(name, age)
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
#### Parameter Mapping
|
|
28
|
+
|
|
29
|
+
| Ruby param | RBS signature |
|
|
30
|
+
| ---------------------------- | ------------------------ |
|
|
31
|
+
| `def foo(x)` | `(Type)` |
|
|
32
|
+
| `def foo(x = nil)` | `(?Type?)` |
|
|
33
|
+
| `def foo(x = val)` | `(?Type)` |
|
|
34
|
+
| `def foo(x:)` | `(x: Type)` |
|
|
35
|
+
| `def foo(x: nil)` | `(?x: Type?)` |
|
|
36
|
+
| `def foo(x: val)` | `(?x: Type)` |
|
|
37
|
+
| `def foo(*args)` | `(*untyped)` |
|
|
38
|
+
| `def foo(**kwargs)` | `(**untyped)` |
|
|
39
|
+
| `def foo(&block)` (required) | `() { (Type) -> void }` |
|
|
40
|
+
| `def foo(&block)` (optional) | `() ?{ (Type) -> void }` |
|
|
41
|
+
| `def foo(...)` | `(*untyped, **untyped)` |
|
|
42
|
+
|
|
43
|
+
#### Examples
|
|
44
|
+
|
|
45
|
+
```ruby
|
|
46
|
+
# No parameters
|
|
47
|
+
#: () -> String
|
|
48
|
+
def name
|
|
49
|
+
|
|
50
|
+
# Positional parameters
|
|
51
|
+
#: (String, Integer) -> bool
|
|
52
|
+
def valid?(name, age)
|
|
53
|
+
|
|
54
|
+
# Optional positional parameter
|
|
55
|
+
#: (?String?) -> String
|
|
56
|
+
def self.identifier(value = nil)
|
|
57
|
+
|
|
58
|
+
# Required keyword parameters
|
|
59
|
+
#: (input: String, output: String) -> Riffer::Evals::Result
|
|
60
|
+
def evaluate(input:, output:)
|
|
61
|
+
|
|
62
|
+
# Mixed keyword parameters (required + optional)
|
|
63
|
+
#: (input: String, output: String, ?context: Hash[Symbol, untyped]?) -> Riffer::Evals::Result
|
|
64
|
+
def evaluate(input:, output:, context: nil)
|
|
65
|
+
|
|
66
|
+
# Positional + keyword parameters
|
|
67
|
+
#: (String, ?tool_context: Hash[Symbol, untyped]?) -> String
|
|
68
|
+
def generate(prompt, tool_context: nil)
|
|
69
|
+
|
|
70
|
+
# Splat/double-splat
|
|
71
|
+
#: (**untyped) -> void
|
|
72
|
+
def initialize(**options)
|
|
73
|
+
|
|
74
|
+
# Forward arguments
|
|
75
|
+
#: (*untyped, **untyped) -> String
|
|
76
|
+
def self.generate(...)
|
|
77
|
+
|
|
78
|
+
# Block parameter (required)
|
|
79
|
+
#: () { (Riffer::Messages::Base) -> void } -> self
|
|
80
|
+
def on_message(&block)
|
|
81
|
+
|
|
82
|
+
# Block parameter (optional)
|
|
83
|
+
#: () ?{ (Riffer::Config) -> void } -> void
|
|
84
|
+
def configure(&block)
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
### Attributes
|
|
88
|
+
|
|
89
|
+
```ruby
|
|
90
|
+
attr_reader :name #: String
|
|
91
|
+
attr_reader :items #: Array[String]
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
### Constants
|
|
95
|
+
|
|
96
|
+
```ruby
|
|
97
|
+
VERSION = "1.0.0" #: String
|
|
98
|
+
DEFAULTS = {}.freeze #: Hash[Symbol, untyped]
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
## Common Type Patterns
|
|
102
|
+
|
|
103
|
+
| Pattern | Meaning |
|
|
104
|
+
| ------------------------- | --------------------------- |
|
|
105
|
+
| `String?` | Optional (String or nil) |
|
|
106
|
+
| `(String \| Integer)` | Union type |
|
|
107
|
+
| `Array[String]` | Typed array |
|
|
108
|
+
| `Hash[Symbol, untyped]` | Typed hash |
|
|
109
|
+
| `^(String) -> void` | Block/proc type |
|
|
110
|
+
| `singleton(Riffer::Tool)` | Class object (not instance) |
|
|
111
|
+
| `bool` | Boolean (true or false) |
|
|
112
|
+
| `untyped` | Any type |
|
|
113
|
+
| `void` | No meaningful return |
|
|
114
|
+
|
|
115
|
+
## Workflow
|
|
116
|
+
|
|
117
|
+
After changing type annotations:
|
|
118
|
+
|
|
119
|
+
1. Run `bundle exec rake rbs:generate` to regenerate `sig/generated/` files
|
|
120
|
+
2. Commit both the source changes and the generated `.rbs` files
|
|
121
|
+
3. CI checks for drift between source annotations and committed `.rbs` files
|
|
122
|
+
|
|
123
|
+
Use `bundle exec rake rbs:watch` during development to auto-regenerate on file changes.
|
data/.agents/rdoc.md
CHANGED
|
@@ -1,24 +1,30 @@
|
|
|
1
1
|
# RDoc Documentation
|
|
2
2
|
|
|
3
|
-
Use
|
|
3
|
+
Use RDoc prose comments for public API descriptions and RBS inline annotations for types.
|
|
4
4
|
|
|
5
|
-
## Parameters
|
|
5
|
+
## Parameters and Return Types
|
|
6
6
|
|
|
7
|
-
Use
|
|
7
|
+
Describe parameters in the RDoc prose comment. Use a single `#:` line for the RBS method signature (see [rbs-inline.md](rbs-inline.md) for the full type annotation syntax):
|
|
8
8
|
|
|
9
9
|
```ruby
|
|
10
10
|
# Creates a new agent.
|
|
11
11
|
#
|
|
12
|
-
# name
|
|
13
|
-
# options
|
|
12
|
+
# +name+ - the agent name.
|
|
13
|
+
# +options+ - optional configuration.
|
|
14
|
+
#
|
|
15
|
+
#: (String, ?options: Hash[Symbol, untyped]) -> void
|
|
16
|
+
def initialize(name, options: {})
|
|
14
17
|
```
|
|
15
18
|
|
|
16
|
-
##
|
|
19
|
+
## Attributes and Constants
|
|
17
20
|
|
|
18
|
-
|
|
21
|
+
Use `#:` inline syntax (on the same line) for attribute and constant types:
|
|
19
22
|
|
|
20
23
|
```ruby
|
|
21
|
-
#
|
|
24
|
+
# The agent name.
|
|
25
|
+
attr_reader :name #: String
|
|
26
|
+
|
|
27
|
+
DEFAULT_TIMEOUT = 10 #: Integer
|
|
22
28
|
```
|
|
23
29
|
|
|
24
30
|
## Exceptions
|
data/AGENTS.md
CHANGED
|
@@ -16,6 +16,7 @@ Ruby gem framework for building AI-powered agents with LLM provider adapters.
|
|
|
16
16
|
- [Code Style](.agents/code-style.md) - StandardRB and comment conventions
|
|
17
17
|
- [RDoc](.agents/rdoc.md) - Documentation format for public APIs
|
|
18
18
|
- [Providers](.agents/providers.md) - Adding new LLM provider adapters
|
|
19
|
+
- [RBS Inline](.agents/rbs-inline.md) - Type annotations with rbs-inline
|
|
19
20
|
|
|
20
21
|
## Commands
|
|
21
22
|
|
|
@@ -25,4 +26,6 @@ Ruby gem framework for building AI-powered agents with LLM provider adapters.
|
|
|
25
26
|
| `bundle exec rake test` | Run tests only |
|
|
26
27
|
| `bundle exec rake standard` | Check code style |
|
|
27
28
|
| `bundle exec rake standard:fix` | Auto-fix style issues |
|
|
29
|
+
| `bundle exec rake rbs:generate` | Generate RBS type signatures |
|
|
30
|
+
| `bundle exec rake rbs:watch` | Watch and regenerate RBS files |
|
|
28
31
|
| `bin/console` | Interactive console |
|
data/CHANGELOG.md
CHANGED
|
@@ -5,6 +5,30 @@ All notable changes to this project will be documented in this file.
|
|
|
5
5
|
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
|
|
6
6
|
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
7
|
|
|
8
|
+
## [0.12.0](https://github.com/janeapp/riffer/compare/riffer/v0.11.0...riffer/v0.12.0) (2026-02-11)
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
### ⚠ BREAKING CHANGES
|
|
12
|
+
|
|
13
|
+
* Agent#generate now returns Riffer::Agent::Response instead of String. Use response.content or response.to_s for the text.
|
|
14
|
+
|
|
15
|
+
### Features
|
|
16
|
+
|
|
17
|
+
* add Claude Code Review GitHub Action ([#108](https://github.com/janeapp/riffer/issues/108)) ([f4b281c](https://github.com/janeapp/riffer/commit/f4b281c43e6ad50430c38323bcb876b60efc994a))
|
|
18
|
+
* add evals primitive for LLM-as-judge evaluations ([#101](https://github.com/janeapp/riffer/issues/101)) ([8fd7b36](https://github.com/janeapp/riffer/commit/8fd7b369f2bd0236ea4c7d30cc12e71b960211dd))
|
|
19
|
+
* add guardrails primitive for input/output processing ([#100](https://github.com/janeapp/riffer/issues/100)) ([48d8bad](https://github.com/janeapp/riffer/commit/48d8badce98c0bf9110bafebd3097e25f46c8444))
|
|
20
|
+
* add inline RBS type annotations with Steep type checking ([#103](https://github.com/janeapp/riffer/issues/103)) ([02ae559](https://github.com/janeapp/riffer/commit/02ae559fa580ef4353bd969f2e50e056ab538e2d))
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
### Bug Fixes
|
|
24
|
+
|
|
25
|
+
* correct RBS inline annotations and remove ivar declarations ([#109](https://github.com/janeapp/riffer/issues/109)) ([d59076d](https://github.com/janeapp/riffer/commit/d59076d40b88f581b51ddbb9ee3d50ed57e84451))
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
### Miscellaneous Chores
|
|
29
|
+
|
|
30
|
+
* set next version ([#111](https://github.com/janeapp/riffer/issues/111)) ([faf41b9](https://github.com/janeapp/riffer/commit/faf41b92032e302c3f0d2d06ab93140137c1b199))
|
|
31
|
+
|
|
8
32
|
## [0.11.0](https://github.com/janeapp/riffer/compare/riffer/v0.10.0...riffer/v0.11.0) (2026-02-04)
|
|
9
33
|
|
|
10
34
|
|
data/CLAUDE.md
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
AGENTS.md
|
data/Guardfile
ADDED
data/Rakefile
CHANGED
|
@@ -23,4 +23,26 @@ end
|
|
|
23
23
|
|
|
24
24
|
task docs: :rdoc
|
|
25
25
|
|
|
26
|
-
|
|
26
|
+
namespace :rbs do
|
|
27
|
+
desc "Generate RBS type signatures from inline annotations"
|
|
28
|
+
task :generate do
|
|
29
|
+
sh "bundle exec rbs-inline --output sig/generated lib"
|
|
30
|
+
end
|
|
31
|
+
|
|
32
|
+
desc "Watch lib/ for changes and regenerate RBS files"
|
|
33
|
+
task :watch do
|
|
34
|
+
require "guard"
|
|
35
|
+
require "guard/commander"
|
|
36
|
+
|
|
37
|
+
Guard.start(no_interactions: true)
|
|
38
|
+
end
|
|
39
|
+
end
|
|
40
|
+
|
|
41
|
+
namespace :steep do
|
|
42
|
+
desc "Run Steep type checker"
|
|
43
|
+
task :check do
|
|
44
|
+
sh "bundle exec steep check"
|
|
45
|
+
end
|
|
46
|
+
end
|
|
47
|
+
|
|
48
|
+
task default: %i[test standard steep:check]
|
data/Steepfile
ADDED
data/docs/03_AGENTS.md
CHANGED
|
@@ -100,15 +100,38 @@ class MyAgent < Riffer::Agent
|
|
|
100
100
|
end
|
|
101
101
|
```
|
|
102
102
|
|
|
103
|
+
### guardrail
|
|
104
|
+
|
|
105
|
+
Registers guardrails for pre/post processing of messages. Pass the guardrail class and any options:
|
|
106
|
+
|
|
107
|
+
```ruby
|
|
108
|
+
class MyAgent < Riffer::Agent
|
|
109
|
+
model 'openai/gpt-4o'
|
|
110
|
+
|
|
111
|
+
# Input-only guardrail
|
|
112
|
+
guardrail :before, with: InputValidator
|
|
113
|
+
|
|
114
|
+
# Output-only guardrail
|
|
115
|
+
guardrail :after, with: ResponseFilter
|
|
116
|
+
|
|
117
|
+
# Both input and output, with options
|
|
118
|
+
guardrail :around, with: Riffer::Guardrails::MaxLength, max: 1000
|
|
119
|
+
end
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
See [Guardrails](09_GUARDRAILS.md) for detailed documentation.
|
|
123
|
+
|
|
103
124
|
## Instance Methods
|
|
104
125
|
|
|
105
126
|
### generate
|
|
106
127
|
|
|
107
|
-
Generates a response synchronously:
|
|
128
|
+
Generates a response synchronously. Returns a `Riffer::Agent::Response` object:
|
|
108
129
|
|
|
109
130
|
```ruby
|
|
110
131
|
# Class method (recommended for simple calls)
|
|
111
132
|
response = MyAgent.generate('Hello')
|
|
133
|
+
puts response.content # Access the response text
|
|
134
|
+
puts response.blocked? # Check if guardrail blocked (always false without guardrails)
|
|
112
135
|
|
|
113
136
|
# Instance method (when you need message history or callbacks)
|
|
114
137
|
agent = MyAgent.new
|
data/docs/06_STREAM_EVENTS.md
CHANGED
|
@@ -109,6 +109,40 @@ event.role # => "assistant"
|
|
|
109
109
|
event.content # => "Let me think about this step by step..."
|
|
110
110
|
```
|
|
111
111
|
|
|
112
|
+
### GuardrailTripwire
|
|
113
|
+
|
|
114
|
+
Emitted when a guardrail blocks execution during streaming:
|
|
115
|
+
|
|
116
|
+
```ruby
|
|
117
|
+
agent.stream("Hello").each do |event|
|
|
118
|
+
case event
|
|
119
|
+
when Riffer::StreamEvents::GuardrailTripwire
|
|
120
|
+
puts "Blocked by: #{event.guardrail_id}"
|
|
121
|
+
puts "Reason: #{event.reason}"
|
|
122
|
+
puts "Phase: #{event.phase}" # :before or :after
|
|
123
|
+
end
|
|
124
|
+
end
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
See [Guardrails](09_GUARDRAILS.md) for more information.
|
|
128
|
+
|
|
129
|
+
### GuardrailModification
|
|
130
|
+
|
|
131
|
+
Emitted when a guardrail transforms data during streaming:
|
|
132
|
+
|
|
133
|
+
```ruby
|
|
134
|
+
agent.stream("Hello").each do |event|
|
|
135
|
+
case event
|
|
136
|
+
when Riffer::StreamEvents::GuardrailModification
|
|
137
|
+
puts "Modified by: #{event.guardrail_id}"
|
|
138
|
+
puts "Phase: #{event.phase}" # :before or :after
|
|
139
|
+
puts "Changed: #{event.message_indices}" # Array of affected indices
|
|
140
|
+
end
|
|
141
|
+
end
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
See [Guardrails](09_GUARDRAILS.md) for more information.
|
|
145
|
+
|
|
112
146
|
### TokenUsageDone
|
|
113
147
|
|
|
114
148
|
Emitted when token usage data is available at the end of a response:
|
data/docs/08_EVALS.md
ADDED
|
@@ -0,0 +1,316 @@
|
|
|
1
|
+
# Evals
|
|
2
|
+
|
|
3
|
+
Evals let you measure the quality of agent outputs using LLM-as-judge evaluations.
|
|
4
|
+
|
|
5
|
+
## Overview
|
|
6
|
+
|
|
7
|
+
Riffer Evals provides a framework for evaluating agent responses against configurable quality metrics. It uses an LLM-as-judge approach where a separate model evaluates the outputs of your agents.
|
|
8
|
+
|
|
9
|
+
Key concepts:
|
|
10
|
+
|
|
11
|
+
- **Evaluators** - Classes that evaluate input/output pairs and return scores
|
|
12
|
+
- **Metrics** - Evaluator configurations with pass/fail thresholds
|
|
13
|
+
- **Profiles** - Collections of metrics that can be included in agents
|
|
14
|
+
- **Results** - Individual evaluation scores and aggregate pass/fail status
|
|
15
|
+
|
|
16
|
+
## Quick Start
|
|
17
|
+
|
|
18
|
+
```ruby
|
|
19
|
+
# 1. Configure the judge model
|
|
20
|
+
Riffer.config.evals.judge_model = "anthropic/claude-opus-4-5-20251101"
|
|
21
|
+
|
|
22
|
+
# 2. Define an eval profile
|
|
23
|
+
module QualityEvals
|
|
24
|
+
include Riffer::Evals::Profile
|
|
25
|
+
|
|
26
|
+
ai_evals do
|
|
27
|
+
metric :answer_relevancy, min: 0.85
|
|
28
|
+
end
|
|
29
|
+
end
|
|
30
|
+
|
|
31
|
+
# 3. Include in your agent
|
|
32
|
+
class MyAgent < Riffer::Agent
|
|
33
|
+
include QualityEvals
|
|
34
|
+
model "anthropic/claude-haiku-4-5-20251001"
|
|
35
|
+
instructions "You are a helpful assistant."
|
|
36
|
+
end
|
|
37
|
+
|
|
38
|
+
# 4. Run evals
|
|
39
|
+
result = MyAgent.run_eval(input: "What is Ruby?")
|
|
40
|
+
result.passed? # => true/false
|
|
41
|
+
result.aggregate_score # => 0.91
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
## Configuration
|
|
45
|
+
|
|
46
|
+
Before using evals, configure the judge model:
|
|
47
|
+
|
|
48
|
+
```ruby
|
|
49
|
+
Riffer.config.evals.judge_model = "anthropic/claude-opus-4-5-20251101"
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
The judge model is the LLM that evaluates agent outputs. You can use any configured provider.
|
|
53
|
+
|
|
54
|
+
## Built-in Evaluators
|
|
55
|
+
|
|
56
|
+
### answer_relevancy
|
|
57
|
+
|
|
58
|
+
Evaluates how well a response addresses the input question.
|
|
59
|
+
|
|
60
|
+
- **higher_is_better**: true
|
|
61
|
+
- **Score range**: 0.0 to 1.0
|
|
62
|
+
- **1.0**: Perfectly relevant, directly addresses the question
|
|
63
|
+
- **0.7-0.9**: Mostly relevant with minor tangents
|
|
64
|
+
- **0.4-0.6**: Partially relevant, some off-topic content
|
|
65
|
+
- **0.1-0.3**: Mostly irrelevant
|
|
66
|
+
- **0.0**: Completely irrelevant
|
|
67
|
+
|
|
68
|
+
```ruby
|
|
69
|
+
ai_evals do
|
|
70
|
+
metric :answer_relevancy, min: 0.85
|
|
71
|
+
end
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
## Eval Profiles
|
|
75
|
+
|
|
76
|
+
Eval profiles define which evaluators to run and their pass/fail thresholds.
|
|
77
|
+
|
|
78
|
+
### Defining a Profile
|
|
79
|
+
|
|
80
|
+
```ruby
|
|
81
|
+
module QualityEvals
|
|
82
|
+
include Riffer::Evals::Profile
|
|
83
|
+
|
|
84
|
+
ai_evals do
|
|
85
|
+
metric :answer_relevancy, min: 0.85
|
|
86
|
+
end
|
|
87
|
+
end
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
### Metric Options
|
|
91
|
+
|
|
92
|
+
- `min` - Minimum score to pass (for higher_is_better evaluators)
|
|
93
|
+
- `max` - Maximum score to pass (for lower_is_better evaluators)
|
|
94
|
+
- `weight` - Weight for aggregate scoring (default: 1.0)
|
|
95
|
+
|
|
96
|
+
```ruby
|
|
97
|
+
ai_evals do
|
|
98
|
+
metric :answer_relevancy, min: 0.85, weight: 2.0 # Weighted more heavily
|
|
99
|
+
end
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
### Including in Agents
|
|
103
|
+
|
|
104
|
+
```ruby
|
|
105
|
+
class MyAgent < Riffer::Agent
|
|
106
|
+
include QualityEvals
|
|
107
|
+
model "anthropic/claude-haiku-4-5-20251001"
|
|
108
|
+
end
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
## Running Evals
|
|
112
|
+
|
|
113
|
+
Once a profile is included, call `.run_eval` on the agent class:
|
|
114
|
+
|
|
115
|
+
```ruby
|
|
116
|
+
result = MyAgent.run_eval(
|
|
117
|
+
input: "What is the capital of France?",
|
|
118
|
+
context: { ground_truth: "Paris" } # Optional context
|
|
119
|
+
)
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
### RunResult Object
|
|
123
|
+
|
|
124
|
+
The `run_eval` method returns a `Riffer::Evals::RunResult`:
|
|
125
|
+
|
|
126
|
+
```ruby
|
|
127
|
+
result.passed? # => true if all metrics pass thresholds
|
|
128
|
+
result.aggregate_score # => Weighted average of normalized scores (0.0-1.0)
|
|
129
|
+
result.failures # => Array of Result objects that failed
|
|
130
|
+
result.results # => Array of all Result objects
|
|
131
|
+
result.input # => The input that was evaluated
|
|
132
|
+
result.output # => The agent's output
|
|
133
|
+
result.to_h # => Hash representation
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
### Result Object
|
|
137
|
+
|
|
138
|
+
Individual evaluation results:
|
|
139
|
+
|
|
140
|
+
```ruby
|
|
141
|
+
result.results.first.evaluator # => "answer_relevancy"
|
|
142
|
+
result.results.first.score # => 0.92
|
|
143
|
+
result.results.first.reason # => "The response directly addresses..."
|
|
144
|
+
result.results.first.higher_is_better # => true
|
|
145
|
+
```
|
|
146
|
+
|
|
147
|
+
## Defining Custom Evaluators
|
|
148
|
+
|
|
149
|
+
Create evaluators by subclassing `Riffer::Evals::Evaluator`:
|
|
150
|
+
|
|
151
|
+
```ruby
|
|
152
|
+
# app/evals/medical_accuracy_evaluator.rb
|
|
153
|
+
class MedicalAccuracyEvaluator < Riffer::Evals::Evaluator
|
|
154
|
+
identifier "medical_accuracy"
|
|
155
|
+
description "Evaluates medical information accuracy"
|
|
156
|
+
higher_is_better true
|
|
157
|
+
judge_model "anthropic/claude-opus-4-5-20251101" # Optional override
|
|
158
|
+
|
|
159
|
+
SYSTEM_PROMPT = <<~PROMPT
|
|
160
|
+
You are an evaluation assistant that assesses medical accuracy.
|
|
161
|
+
|
|
162
|
+
Use the evaluation tool to submit your score (0.0-1.0) and reasoning.
|
|
163
|
+
PROMPT
|
|
164
|
+
|
|
165
|
+
def evaluate(input:, output:, context: nil)
|
|
166
|
+
user_prompt = <<~PROMPT
|
|
167
|
+
Question: #{input}
|
|
168
|
+
Response: #{output}
|
|
169
|
+
Ground truth: #{context&.dig(:ground_truth)}
|
|
170
|
+
PROMPT
|
|
171
|
+
|
|
172
|
+
evaluation = judge.evaluate(
|
|
173
|
+
system_prompt: SYSTEM_PROMPT,
|
|
174
|
+
user_prompt: user_prompt
|
|
175
|
+
)
|
|
176
|
+
|
|
177
|
+
result(score: evaluation[:score], reason: evaluation[:reason])
|
|
178
|
+
end
|
|
179
|
+
end
|
|
180
|
+
```
|
|
181
|
+
|
|
182
|
+
### Registering Custom Evaluators
|
|
183
|
+
|
|
184
|
+
Register custom evaluators in your app initialization. Built-in evaluators are always available.
|
|
185
|
+
|
|
186
|
+
```ruby
|
|
187
|
+
# config/initializers/riffer.rb
|
|
188
|
+
Riffer::Evals::Evaluators::Repository.register(:medical_accuracy, MedicalAccuracyEvaluator)
|
|
189
|
+
```
|
|
190
|
+
|
|
191
|
+
### Evaluator DSL
|
|
192
|
+
|
|
193
|
+
Class methods:
|
|
194
|
+
|
|
195
|
+
- `identifier(value)` - Set the evaluator identifier (defaults to snake_case class name)
|
|
196
|
+
- `description(value)` - Human-readable description
|
|
197
|
+
- `higher_is_better(value)` - Whether higher scores are better (default: true)
|
|
198
|
+
- `judge_model(value)` - Override the global judge model
|
|
199
|
+
|
|
200
|
+
Instance methods:
|
|
201
|
+
|
|
202
|
+
- `evaluate(input:, output:, context:)` - Must be implemented, returns a Result
|
|
203
|
+
- `judge` - Returns a Judge instance for LLM-as-judge calls
|
|
204
|
+
- `result(score:, reason:, metadata:)` - Helper to build Result objects
|
|
205
|
+
|
|
206
|
+
### Judge Options
|
|
207
|
+
|
|
208
|
+
The `judge.evaluate` method accepts either `system_prompt:` and `user_prompt:` or a `messages:` array:
|
|
209
|
+
|
|
210
|
+
```ruby
|
|
211
|
+
# Using system_prompt and user_prompt
|
|
212
|
+
evaluation = judge.evaluate(
|
|
213
|
+
system_prompt: "You are a judge.",
|
|
214
|
+
user_prompt: "Evaluate this response."
|
|
215
|
+
)
|
|
216
|
+
|
|
217
|
+
# Using messages array (for more control)
|
|
218
|
+
evaluation = judge.evaluate(
|
|
219
|
+
messages: [
|
|
220
|
+
{ role: "system", content: "You are a judge." },
|
|
221
|
+
{ role: "user", content: "Evaluate this response." }
|
|
222
|
+
]
|
|
223
|
+
)
|
|
224
|
+
```
|
|
225
|
+
|
|
226
|
+
The Judge uses tool calling internally to get structured output. An `evaluation` tool with `score` (Float) and `reason` (String) parameters is automatically provided to the judge model, so your prompts should instruct the model to use the evaluation tool rather than respond with raw JSON.
|
|
227
|
+
|
|
228
|
+
### Rule-Based Evaluators
|
|
229
|
+
|
|
230
|
+
Evaluators don't have to use LLM-as-judge:
|
|
231
|
+
|
|
232
|
+
```ruby
|
|
233
|
+
class LengthEvaluator < Riffer::Evals::Evaluator
|
|
234
|
+
identifier "response_length"
|
|
235
|
+
description "Checks response is within expected length"
|
|
236
|
+
higher_is_better true
|
|
237
|
+
|
|
238
|
+
def evaluate(input:, output:, context: nil)
|
|
239
|
+
min_length = context&.dig(:min_length) || 50
|
|
240
|
+
max_length = context&.dig(:max_length) || 500
|
|
241
|
+
|
|
242
|
+
length = output.length
|
|
243
|
+
|
|
244
|
+
if length < min_length
|
|
245
|
+
score = length.to_f / min_length
|
|
246
|
+
reason = "Response too short (#{length} < #{min_length})"
|
|
247
|
+
elsif length > max_length
|
|
248
|
+
score = max_length.to_f / length
|
|
249
|
+
reason = "Response too long (#{length} > #{max_length})"
|
|
250
|
+
else
|
|
251
|
+
score = 1.0
|
|
252
|
+
reason = "Response length is appropriate"
|
|
253
|
+
end
|
|
254
|
+
|
|
255
|
+
result(score: score, reason: reason)
|
|
256
|
+
end
|
|
257
|
+
end
|
|
258
|
+
```
|
|
259
|
+
|
|
260
|
+
## Aggregate Scoring
|
|
261
|
+
|
|
262
|
+
The `aggregate_score` normalizes all scores so higher is always better:
|
|
263
|
+
|
|
264
|
+
- For `higher_is_better` evaluators: score is used directly
|
|
265
|
+
- For `lower_is_better` evaluators: score is inverted (1.0 - score)
|
|
266
|
+
|
|
267
|
+
Scores are then weighted:
|
|
268
|
+
|
|
269
|
+
```ruby
|
|
270
|
+
# With weights: relevancy=2.0, toxicity=1.0
|
|
271
|
+
# relevancy score: 0.9 (higher_is_better)
|
|
272
|
+
# toxicity score: 0.1 (lower_is_better)
|
|
273
|
+
|
|
274
|
+
# Normalized: relevancy=0.9, toxicity=0.9 (1.0 - 0.1)
|
|
275
|
+
# Weighted average: (0.9 * 2.0 + 0.9 * 1.0) / (2.0 + 1.0) = 0.9
|
|
276
|
+
```
|
|
277
|
+
|
|
278
|
+
## Example: Full Integration
|
|
279
|
+
|
|
280
|
+
```ruby
|
|
281
|
+
# config/initializers/riffer.rb
|
|
282
|
+
Riffer.configure do |config|
|
|
283
|
+
config.anthropic.api_key = ENV["ANTHROPIC_API_KEY"]
|
|
284
|
+
config.evals.judge_model = "anthropic/claude-opus-4-5-20251101"
|
|
285
|
+
end
|
|
286
|
+
|
|
287
|
+
# app/evals/quality_evals.rb
|
|
288
|
+
module QualityEvals
|
|
289
|
+
include Riffer::Evals::Profile
|
|
290
|
+
|
|
291
|
+
ai_evals do
|
|
292
|
+
metric :answer_relevancy, min: 0.85, weight: 2.0
|
|
293
|
+
end
|
|
294
|
+
end
|
|
295
|
+
|
|
296
|
+
# app/agents/support_agent.rb
|
|
297
|
+
class SupportAgent < Riffer::Agent
|
|
298
|
+
include QualityEvals
|
|
299
|
+
|
|
300
|
+
model "anthropic/claude-opus-4-5-20251101"
|
|
301
|
+
instructions "You are a helpful customer support agent."
|
|
302
|
+
end
|
|
303
|
+
|
|
304
|
+
# test/agents/support_agent_test.rb
|
|
305
|
+
class SupportAgentTest < Minitest::Test
|
|
306
|
+
def test_response_quality
|
|
307
|
+
result = SupportAgent.run_eval(
|
|
308
|
+
input: "How do I reset my password?",
|
|
309
|
+
context: {}
|
|
310
|
+
)
|
|
311
|
+
|
|
312
|
+
assert result.passed?, "Expected eval to pass: #{result.failures.map(&:reason)}"
|
|
313
|
+
assert result.aggregate_score >= 0.85
|
|
314
|
+
end
|
|
315
|
+
end
|
|
316
|
+
```
|