ruby_llm-tribunal 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +13 -1
- data/lib/ruby_llm/tribunal/assertions/judge.rb +6 -1
- data/lib/ruby_llm/tribunal/reporters/github.rb +2 -2
- data/lib/ruby_llm/tribunal/version.rb +1 -1
- data/lib/tasks/tribunal.rake +9 -3
- metadata +6 -6
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 95f293e61287cbd2f94763540bf159667d3e02223e0c8950d0f5e46622e1be40
|
|
4
|
+
data.tar.gz: b0e0c398b24b7847d3e4defd7f2200733493621f44f65129a818e0bef6c08894
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 05ee643802ca6de43a4cf89bbb5e9a528030c663e69097fadd26a15543be3b44bfa520956a6f1033c8add68cc505c92043786febcb7fe2b7c77972580102c541
|
|
7
|
+
data.tar.gz: 231b545756fca7e53210c9bc4598cc10ae5da16e8a576a6d2801c59c11bb65a4378c252b10bd269369530de26ece1f51e923342bc512cf73ea4b2df667787310
|
data/CHANGELOG.md
CHANGED
|
@@ -6,6 +6,17 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
|
|
6
6
|
|
|
7
7
|
## [Unreleased]
|
|
8
8
|
|
|
9
|
+
## [0.1.1] - 2026-01-16
|
|
10
|
+
|
|
11
|
+
### Fixed
|
|
12
|
+
|
|
13
|
+
- **Critical**: Fixed incorrect threshold logic for negative metrics (toxicity, hallucination, bias, etc.) with `partial` verdicts. High scores on negative metrics now correctly result in failures.
|
|
14
|
+
- **Concurrency**: The `--concurrency` option now gracefully falls back to sequential execution when the `parallel` gem is not installed, with a helpful warning message.
|
|
15
|
+
|
|
16
|
+
### Added
|
|
17
|
+
|
|
18
|
+
- Tests for negative metric edge cases (partial verdicts with inverted threshold logic)
|
|
19
|
+
|
|
9
20
|
## [0.1.0] - 2026-01-15
|
|
10
21
|
|
|
11
22
|
### Added
|
|
@@ -28,5 +39,6 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
|
|
28
39
|
- Requires `ruby_llm` >= 1.0
|
|
29
40
|
- Optional: `neighbor` gem for embedding-based similarity
|
|
30
41
|
|
|
31
|
-
[Unreleased]: https://github.com/Alqemist-labs/ruby_llm-tribunal/compare/v0.1.0...HEAD
|
|
42
|
+
[Unreleased]: https://github.com/Alqemist-labs/ruby_llm-tribunal/compare/v0.1.1...HEAD
|
|
43
|
+
[0.1.1]: https://github.com/Alqemist-labs/ruby_llm-tribunal/compare/v0.1.0...v0.1.1
|
|
32
44
|
[0.1.0]: https://github.com/Alqemist-labs/ruby_llm-tribunal/releases/tag/v0.1.0
|
|
data/lib/ruby_llm/tribunal/assertions/judge.rb
CHANGED
|
@@ -140,7 +140,12 @@ module RubyLLM
|
|
|
140
140
|
passed = case verdict
|
|
141
141
|
when 'yes' then !negative_metric
|
|
142
142
|
when 'no' then negative_metric
|
|
143
|
-
when 'partial'
|
|
143
|
+
when 'partial'
|
|
144
|
+
return [:fail, details] unless score.is_a?(Numeric)
|
|
145
|
+
|
|
146
|
+
# For negative metrics (toxicity, hallucination, etc.), high score = bad
|
|
147
|
+
# So we invert the comparison: pass if score is LOW enough
|
|
148
|
+
negative_metric ? score <= (1.0 - threshold) : score >= threshold
|
|
144
149
|
end
|
|
145
150
|
|
|
146
151
|
passed ? [:pass, details] : [:fail, details]
|
|
data/lib/ruby_llm/tribunal/reporters/github.rb
CHANGED
|
@@ -10,8 +10,8 @@ module RubyLLM
|
|
|
10
10
|
annotations = results[:cases]
|
|
11
11
|
.select { |c| c[:status] == :failed }
|
|
12
12
|
.map do |c|
|
|
13
|
-
|
|
14
|
-
|
|
13
|
+
reasons = c[:failures].map { |type, reason| "#{type}: #{reason}" }.join('; ')
|
|
14
|
+
"::error::#{c[:input]}: #{reasons}"
|
|
15
15
|
end
|
|
16
16
|
|
|
17
17
|
summary = "::notice::Tribunal: #{results[:summary][:passed]}/#{results[:summary][:total]} passed " \
|
data/lib/tasks/tribunal.rake
CHANGED
|
@@ -133,9 +133,15 @@ namespace :tribunal do
|
|
|
133
133
|
cases = RubyLLM::Tribunal::Dataset.load_with_assertions(path)
|
|
134
134
|
|
|
135
135
|
if concurrency > 1
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
136
|
+
begin
|
|
137
|
+
require 'parallel'
|
|
138
|
+
Parallel.map(cases, in_threads: concurrency) do |test_case, assertions|
|
|
139
|
+
run_case(test_case, assertions, provider)
|
|
140
|
+
end
|
|
141
|
+
rescue LoadError
|
|
142
|
+
warn "Warning: 'parallel' gem not installed, falling back to sequential execution."
|
|
143
|
+
warn ' Install with: gem install parallel'
|
|
144
|
+
cases.map { |test_case, assertions| run_case(test_case, assertions, provider) }
|
|
139
145
|
end
|
|
140
146
|
else
|
|
141
147
|
cases.map { |test_case, assertions| run_case(test_case, assertions, provider) }
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: ruby_llm-tribunal
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.1.0
|
|
4
|
+
version: 0.1.1
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Florian
|
|
8
|
-
autorequire:
|
|
8
|
+
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-01-
|
|
11
|
+
date: 2026-01-16 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: ruby_llm
|
|
@@ -77,7 +77,7 @@ metadata:
|
|
|
77
77
|
source_code_uri: https://github.com/Alqemist-labs/ruby_llm-tribunal
|
|
78
78
|
changelog_uri: https://github.com/Alqemist-labs/ruby_llm-tribunal/blob/main/CHANGELOG.md
|
|
79
79
|
rubygems_mfa_required: 'true'
|
|
80
|
-
post_install_message:
|
|
80
|
+
post_install_message:
|
|
81
81
|
rdoc_options: []
|
|
82
82
|
require_paths:
|
|
83
83
|
- lib
|
|
@@ -92,8 +92,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
|
92
92
|
- !ruby/object:Gem::Version
|
|
93
93
|
version: '0'
|
|
94
94
|
requirements: []
|
|
95
|
-
rubygems_version: 3.
|
|
96
|
-
signing_key:
|
|
95
|
+
rubygems_version: 3.4.19
|
|
96
|
+
signing_key:
|
|
97
97
|
specification_version: 4
|
|
98
98
|
summary: LLM evaluation framework for Ruby
|
|
99
99
|
test_files: []
|