relevant_chunks 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +73 -0
- data/LICENSE +21 -0
- data/README.md +86 -0
- data/exe/relevant_chunks +78 -0
- data/lib/relevant_chunks/chunker.rb +161 -0
- data/lib/relevant_chunks/processor.rb +165 -0
- data/lib/relevant_chunks/version.rb +6 -0
- data/lib/relevant_chunks.rb +73 -0
- metadata +71 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 46f8f4e6500675c68339102a9be2646c6366c8e190a72cb3e4f3eb3b1dbf6f20
|
4
|
+
data.tar.gz: 9ad4948a253d78402bc42ee00c46320c7b714abefcd49ba3ba08f4031acc3bd1
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 603f04bc204e25034d0feb8d523934fa23567e42fbcbf3f78b7e1ef59a7033bb6766ed1eba4ec41b54f46c20b0982d5193ce03bca7f866f9a25dbe92546f06ad
|
7
|
+
data.tar.gz: 79e57c25c813941aea5184ffedec7eb54b6296b700d922bb2b42a3edd35feadf2e406ace9279dcb9aef4bbb21b27968f6ee13c619ce6abc3371b10f64a4033c7
|
data/CHANGELOG.md
ADDED
@@ -0,0 +1,73 @@
|
|
1
|
+
# Changelog
|
2
|
+
|
3
|
+
All notable changes to this project will be documented in this file.
|
4
|
+
|
5
|
+
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
6
|
+
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
7
|
+
|
8
|
+
## [0.3.0] - 2025-02-27
|
9
|
+
|
10
|
+
### Changed
|
11
|
+
|
12
|
+
- Renamed gem from `relevantchunks` to `relevant_chunks` to follow Ruby conventions
|
13
|
+
- Updated all file paths and references to use snake_case
|
14
|
+
- Enhanced consistency across codebase
|
15
|
+
|
16
|
+
## [0.2.0] - 2025-02-02
|
17
|
+
|
18
|
+
### Changed
|
19
|
+
|
20
|
+
- Renamed project from TokenTrim to RelevantChunks
|
21
|
+
- Updated all module references and documentation
|
22
|
+
- Renamed executable from `tokentrim` to `relevant_chunks`
|
23
|
+
|
24
|
+
## [0.1.2] - 2025-02-02
|
25
|
+
|
26
|
+
### Fixed
|
27
|
+
|
28
|
+
- Updated gem description to remove references to commercial features
|
29
|
+
|
30
|
+
## [0.1.1] - 2025-02-02
|
31
|
+
|
32
|
+
### Changed
|
33
|
+
|
34
|
+
- Simplified gem by removing parallelization and paid features
|
35
|
+
- Improved code readability with heredoc strings
|
36
|
+
- Updated documentation to reflect MIT-only license
|
37
|
+
|
38
|
+
## [0.1.0] - 2025-02-02
|
39
|
+
|
40
|
+
### Added
|
41
|
+
|
42
|
+
- Initial release
|
43
|
+
- Smart text chunking with natural boundary detection
|
44
|
+
- Configurable chunk size and overlap
|
45
|
+
- Relevance scoring using Claude/Anthropic API
|
46
|
+
- Advanced configuration options:
|
47
|
+
- Model selection (claude-3-5-sonnet-latest)
|
48
|
+
- Temperature control (0.0-1.0)
|
49
|
+
- Custom system prompts
|
50
|
+
- Configurable scoring range (default 0-100)
|
51
|
+
- Comprehensive documentation and examples
|
52
|
+
- CLI tool for command-line usage
|
53
|
+
|
54
|
+
### Changed
|
55
|
+
|
56
|
+
- N/A (initial release)
|
57
|
+
|
58
|
+
### Deprecated
|
59
|
+
|
60
|
+
- N/A (initial release)
|
61
|
+
|
62
|
+
### Removed
|
63
|
+
|
64
|
+
- N/A (initial release)
|
65
|
+
|
66
|
+
### Fixed
|
67
|
+
|
68
|
+
- N/A (initial release)
|
69
|
+
|
70
|
+
### Security
|
71
|
+
|
72
|
+
- Secure API key handling
|
73
|
+
- No key storage in code
|
data/LICENSE
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
MIT License
|
2
|
+
|
3
|
+
Copyright (c) 2025 Robert Lucy
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
13
|
+
copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21
|
+
SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,86 @@
|
|
1
|
+
# RelevantChunks
|
2
|
+
|
3
|
+
RelevantChunks is a Ruby gem that provides intelligent text chunking and relevance scoring using Claude/Anthropic's AI models. It features smart boundary detection and configurable overlap.
|
4
|
+
|
5
|
+
## Features
|
6
|
+
|
7
|
+
- Smart text chunking with natural boundary detection
|
8
|
+
- Configurable chunk size and overlap
|
9
|
+
- Relevance scoring using Claude/Anthropic
|
10
|
+
- BYOK (Bring Your Own Key) model
|
11
|
+
|
12
|
+
## Installation
|
13
|
+
|
14
|
+
Add this line to your application's Gemfile:
|
15
|
+
|
16
|
+
```ruby
|
17
|
+
gem 'relevant_chunks'
|
18
|
+
```
|
19
|
+
|
20
|
+
And then execute:
|
21
|
+
|
22
|
+
```bash
|
23
|
+
$ bundle install
|
24
|
+
```
|
25
|
+
|
26
|
+
Or install it yourself as:
|
27
|
+
|
28
|
+
```bash
|
29
|
+
$ gem install relevant_chunks
|
30
|
+
```
|
31
|
+
|
32
|
+
## Usage
|
33
|
+
|
34
|
+
First, configure RelevantChunks with your Anthropic API key:
|
35
|
+
|
36
|
+
```ruby
|
37
|
+
RelevantChunks.configure do |config|
|
38
|
+
config.api_key = "your_anthropic_api_key"
|
39
|
+
config.max_tokens = 1000 # optional, default: 1000
|
40
|
+
config.overlap_size = 100 # optional, default: 100
|
41
|
+
end
|
42
|
+
```
|
43
|
+
|
44
|
+
Then use it to process text:
|
45
|
+
|
46
|
+
```ruby
|
47
|
+
text = "The solar system consists of the Sun and everything that orbits around it. " \
|
48
|
+
"This includes eight planets, numerous moons, asteroids, comets, and other celestial objects. " \
|
49
|
+
"Earth is the third planet from the Sun and the only known planet to harbor life. " \
|
50
|
+
"Mars, often called the Red Planet, has been the subject of numerous exploration missions."
|
51
|
+
|
52
|
+
# Query about Mars
|
53
|
+
results = RelevantChunks.process(text, "Tell me about Mars")
|
54
|
+
results.each do |result|
|
55
|
+
puts "Chunk: #{result[:chunk]}"
|
56
|
+
puts "Score: #{result[:score]}/100"
|
57
|
+
puts "---"
|
58
|
+
end
|
59
|
+
```
|
60
|
+
|
61
|
+
Example output:
|
62
|
+
|
63
|
+
```
|
64
|
+
Chunk: "Mars, often called the Red Planet, has been the subject of numerous exploration missions."
|
65
|
+
Score: 95/100
|
66
|
+
---
|
67
|
+
Chunk: "Earth is the third planet from the Sun and the only known planet to harbor life."
|
68
|
+
Score: 35/100
|
69
|
+
---
|
70
|
+
Chunk: "The solar system consists of the Sun and everything that orbits around it."
|
71
|
+
Score: 10/100
|
72
|
+
```
|
73
|
+
|
74
|
+
## Development
|
75
|
+
|
76
|
+
After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
77
|
+
|
78
|
+
To install this gem onto your local machine, run `bundle exec rake install`.
|
79
|
+
|
80
|
+
## Contributing
|
81
|
+
|
82
|
+
Bug reports and pull requests are welcome on GitHub at https://github.com/robert10997/relevant_chunks.
|
83
|
+
|
84
|
+
## License
|
85
|
+
|
86
|
+
The gem is available as open source under the terms of the MIT License.
|
data/exe/relevant_chunks
ADDED
@@ -0,0 +1,78 @@
|
|
1
|
+
#!/usr/bin/env ruby
# frozen_string_literal: true

# Command-line interface for RelevantChunks.
#
# Reads a text file, scores its chunks against a query via
# RelevantChunks.process, and prints the results either as human-readable
# text or as pretty-printed JSON.
#
# Usage: relevant_chunks [options] <text_file> <query>

require "relevant_chunks"
require "optparse"
require "json"

# Defaults mirror RelevantChunks::Configuration (1000-char chunks, 100 overlap).
options = {
  max_tokens: 1000,
  overlap_size: 100,
  format: "text"
}

parser = OptionParser.new do |opts|
  opts.banner = "Usage: relevant_chunks [options] <text_file> <query>"

  # API key may also come from prior RelevantChunks.configure; only set when given.
  opts.on("-k", "--api-key KEY", "Anthropic API key") do |key|
    options[:api_key] = key
  end

  opts.on("-t", "--max-tokens N", Integer, "Maximum tokens per chunk") do |n|
    options[:max_tokens] = n
  end

  opts.on("-o", "--overlap N", Integer, "Overlap size between chunks") do |n|
    options[:overlap_size] = n
  end

  # OptionParser validates FORMAT against the %w[text json] whitelist.
  opts.on("-f", "--format FORMAT", %w[text json], "Output format (text/json)") do |f|
    options[:format] = f
  end

  opts.on("-h", "--help", "Show this help message") do
    puts opts
    exit
  end
end

parser.parse!

# Exactly two positional arguments are required: the input file and the query.
if ARGV.length != 2
  puts parser
  exit 1
end

text_file, query = ARGV

begin
  text = File.read(text_file)
rescue StandardError => e
  # Covers missing file, permission errors, etc.; exit non-zero for scripting.
  puts "Error reading file: #{e.message}"
  exit 1
end

begin
  # Push CLI options into the gem-wide configuration before processing.
  RelevantChunks.configure do |config|
    config.api_key = options[:api_key] if options[:api_key]
    config.max_tokens = options[:max_tokens]
    config.overlap_size = options[:overlap_size]
  end

  results = RelevantChunks.process(text, query)

  case options[:format]
  when "json"
    puts JSON.pretty_generate(results)
  else
    # Text output: numbered chunks (1-based) with their relevance scores.
    results.each.with_index(1) do |result, i|
      puts "Chunk #{i} (Score: #{result[:score]}):"
      puts "-" * 40
      puts result[:chunk]
      puts "\n"
    end
  end
rescue RelevantChunks::Error => e
  # Raised e.g. when no API key is configured.
  puts "Error: #{e.message}"
  exit 1
end
|
@@ -0,0 +1,161 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "logger"
|
4
|
+
|
5
|
+
module RelevantChunks
  # Handles text chunking with smart boundary detection and configurable overlap
  #
  # The Chunker class splits text into chunks while trying to maintain natural
  # boundaries like sentence endings and paragraphs. It also supports overlapping
  # chunks to ensure context is maintained across chunk boundaries.
  #
  # NOTE: sizes are measured in characters (String#length), not true model
  # tokens, despite the +max_tokens+ naming.
  #
  # @example
  #   chunker = RelevantChunks::Chunker.new(max_tokens: 1000, overlap_size: 100)
  #   chunks = chunker.chunk_text("Your long text here...")
  class Chunker
    class << self
      attr_accessor :configuration
    end

    # @return [Integer] Size of overlap between chunks
    attr_reader :overlap_size

    # @return [Integer] Maximum number of tokens per chunk
    attr_reader :max_tokens

    # Initialize a new Chunker instance
    #
    # @param max_tokens [Integer] Maximum number of tokens per chunk
    # @param overlap_size [Integer] Number of tokens to overlap between chunks
    # @return [Chunker]
    def initialize(max_tokens: 1000, overlap_size: 100)
      @max_tokens = max_tokens
      @overlap_size = overlap_size
      @logger = Logger.new($stdout)
    end

    # Split text into chunks with smart boundary detection
    #
    # @param text [String] The text to split into chunks
    # @return [Array<String>] Array of text chunks
    # @example
    #   chunker = RelevantChunks::Chunker.new
    #   chunks = chunker.chunk_text("First sentence. Second sentence.")
    def chunk_text(text)
      @logger.info "Starting chunk_text with text length: #{text.length}"
      return [text] if text.length <= max_tokens

      chunks = []
      current_position = 0

      while current_position < text.length
        chunk_end = find_chunk_boundary(text, current_position)
        add_chunk(text, current_position, chunk_end, chunks)

        # If we've reached the end, break
        break if chunk_end >= text.length - 1

        # Calculate next position with overlap
        next_position = calculate_next_position(current_position, chunk_end)
        # BUG FIX: chunks must be passed down so the trailing remainder is
        # actually appended; previously add_final_chunk built the final chunk
        # but its return value was discarded, silently dropping the text tail.
        break if should_stop_chunking?(next_position, current_position, text, chunks)

        current_position = next_position
        @logger.info "Moving to position: #{current_position}"
      end

      @logger.info "Final chunks: #{chunks.inspect}"
      chunks
    end

    private

    # Append text[start_pos..end_pos] (inclusive) to the chunks array.
    def add_chunk(text, start_pos, end_pos, chunks)
      @logger.info "Found chunk boundary at position #{end_pos}"
      chunk = text[start_pos..end_pos]
      @logger.info "Created chunk: #{chunk.inspect}"
      chunks << chunk
    end

    # Next start position: step back by the overlap, but always advance by
    # at least one character to guarantee forward progress.
    def calculate_next_position(current_pos, chunk_end)
      next_pos = [chunk_end - overlap_size + 1, current_pos + 1].max
      @logger.info "Next position would be: #{next_pos}"
      next_pos
    end

    # Decide whether to stop the chunking loop. When stopping with text left
    # over, the remainder is emitted as one final chunk so no text is lost.
    def should_stop_chunking?(next_pos, current_pos, text, chunks)
      remaining_length = text.length - next_pos
      if next_pos <= current_pos || remaining_length <= overlap_size
        add_final_chunk(text, next_pos, chunks) if remaining_length.positive?
        true
      else
        false
      end
    end

    # Append the trailing remainder of the text as the last chunk.
    def add_final_chunk(text, start_pos, chunks)
      final_chunk = text[start_pos..]
      @logger.info "Adding final chunk: #{final_chunk.inspect}"
      chunks << final_chunk
    end

    # Choose the end index (inclusive) for a chunk starting at start_position,
    # preferring sentence/paragraph boundaries, then spaces, then a hard cut.
    def find_chunk_boundary(text, start_position)
      target_end = start_position + max_tokens
      @logger.info "Target end position: #{target_end}"

      return handle_text_end(text) if target_end >= text.length

      # Look for natural boundaries
      boundary = find_natural_boundary(text, start_position, target_end)
      return boundary if boundary

      # If no natural boundary found, break at the last space
      boundary = find_space_boundary(text, start_position, target_end)
      return boundary if boundary

      target_end
    end

    # The chunk would run past the end of the text: use the last index.
    def handle_text_end(text)
      @logger.info "Target end exceeds text length, returning #{text.length - 1}"
      text.length - 1
    end

    # Scan backwards (up to 30 chars) from target_end for a sentence end or
    # paragraph break; returns its index or nil.
    def find_natural_boundary(text, start_pos, target_end)
      search_start = [target_end - 30, start_pos].max
      @logger.info "Looking for natural boundaries between #{search_start} and #{target_end}"

      target_end.downto(search_start) do |i|
        break if i >= text.length || i + 1 >= text.length

        if natural_boundary?(text, i)
          @logger.info "Found natural boundary at #{i}"
          return i
        end
      end
      nil
    end

    # True at sentence-ending punctuation followed by a space, or a blank line.
    def natural_boundary?(text, pos)
      return false if pos + 1 >= text.length

      char = text[pos]
      next_char = text[pos + 1]
      @logger.info "Checking position #{pos}: char='#{char}', next_char='#{next_char}'"

      (char == "." && next_char == " ") ||
        (char == "?" && next_char == " ") ||
        (char == "!" && next_char == " ") ||
        (char == "\n" && next_char == "\n")
    end

    # Fall back to the last space within the window; returns its index or nil.
    def find_space_boundary(text, start_pos, target_end)
      search_text = text[start_pos..target_end]
      @logger.info "Looking for last space in: #{search_text.inspect}"
      last_space = search_text.rindex(" ")
      return unless last_space

      @logger.info "Found last space at offset #{last_space}"
      start_pos + last_space
    end
  end
end
|
@@ -0,0 +1,165 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "faraday"
|
4
|
+
require "json"
|
5
|
+
|
6
|
+
module RelevantChunks
  # Scores text chunks for relevance to a query using the Anthropic
  # Messages API.
  #
  # The input text is split with {Chunker}; every chunk is then sent to
  # Claude with a scoring prompt and the numeric reply is returned alongside
  # the chunk and the raw API response.
  #
  # @example Basic usage
  #   processor = RelevantChunks::Processor.new(api_key: "your_key")
  #   results = processor.process("Long text here", "What is this about?")
  #
  # @example Advanced configuration
  #   processor = RelevantChunks::Processor.new(
  #     api_key: "your_key",
  #     model: "claude-3-5-sonnet-latest", # Use a different model variant
  #     temperature: 0.1,                  # Add slight variation to scores
  #     system_prompt: "Custom scoring system prompt...",
  #     max_score: 10                      # Use 0-10 scoring range
  #   )
  class Processor
    class << self
      attr_accessor :configuration
    end

    # @return [String] Anthropic API key
    attr_reader :api_key

    # @return [Chunker] Text chunker instance
    attr_reader :chunker

    # @return [String] Claude model to use
    attr_reader :model

    # @return [Float] Temperature for scoring (0.0-1.0)
    attr_reader :temperature

    # @return [String] System prompt for scoring
    attr_reader :system_prompt

    # @return [Integer] Maximum score in the scoring range
    attr_reader :max_score

    # Build a new Processor.
    #
    # @param api_key [String] Anthropic API key
    # @param max_tokens [Integer] Maximum tokens per chunk
    # @param overlap_size [Integer] Overlap size between chunks
    # @param model [String] Claude model to use (default: "claude-3-5-sonnet-latest")
    # @param temperature [Float] Temperature for scoring (0.0-1.0, default: 0.0)
    # @param system_prompt [String, nil] Custom system prompt for scoring (default: nil)
    # @param max_score [Integer] Maximum score in range (default: 100)
    # @return [Processor]
    def initialize(api_key:, max_tokens: 1000, overlap_size: 100,
                   model: "claude-3-5-sonnet-latest", temperature: 0.0,
                   system_prompt: nil, max_score: 100)
      @api_key = api_key
      @model = model
      @temperature = temperature
      # @max_score must be assigned before the default prompt is built,
      # because default_system_prompt interpolates it.
      @max_score = max_score
      @system_prompt = system_prompt || default_system_prompt
      @chunker = Chunker.new(max_tokens: max_tokens, overlap_size: overlap_size)
      @conn = build_connection(api_key)
    end

    # Chunk +text+ and score every chunk against +query+.
    #
    # @param text [String] The text to process
    # @param query [String] The query to score chunks against
    # @return [Array<Hash>] one hash per chunk with keys:
    #   - :chunk [String] the scored text chunk
    #   - :score [Integer] relevance score (0..max_score)
    #   - :response [Hash] raw Anthropic API response
    def process(text, query)
      chunker.chunk_text(text).map { |piece| score_chunk(piece, query) }
    end

    private

    # Faraday connection preconfigured for the Anthropic Messages API.
    def build_connection(key)
      Faraday.new(url: "https://api.anthropic.com") do |f|
        f.request :json
        f.response :json
        f.adapter :net_http
        f.headers = {
          "accept" => "application/json",
          "anthropic-version" => "2023-06-01",
          "content-type" => "application/json",
          "x-api-key" => key
        }
      end
    end

    # POST one chunk to the Messages endpoint and extract its numeric score.
    #
    # @param piece [String] chunk to score
    # @param query [String] query to score against
    # @return [Hash] :chunk, :score, :response
    def score_chunk(piece, query)
      http_response = @conn.post("/v1/messages") do |req|
        req.body = scoring_payload(piece, query)
      end

      # Some adapters hand back an unparsed JSON string; normalize to a Hash.
      body = http_response.body
      body = JSON.parse(body) if body.is_a?(String)

      { chunk: piece, score: body.dig("content", 0, "text").to_i, response: body }
    end

    # Request body for a single scoring call.
    def scoring_payload(piece, query)
      {
        model: model,
        temperature: temperature,
        system: system_prompt,
        messages: [
          {
            role: "user",
            content: <<~STRING
              Text chunk to evaluate: #{piece}

              Query: #{query}

              Please output only a number from 0-#{max_score} indicating how relevant this text chunk is to the query.
            STRING
          }
        ]
      }
    end

    # Default system prompt for scoring
    #
    # @return [String]
    def default_system_prompt
      <<~STRING
        You are a text relevance scoring system. Your task is to evaluate how relevant a given text chunk is to a query.
        Score on a scale of 0-#{max_score}, where 0 means completely irrelevant and #{max_score} means highly relevant.
        Consider semantic meaning, not just keyword matching. Output only the numeric score.
      STRING
    end
  end
end
|
@@ -0,0 +1,73 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative "relevant_chunks/version"
|
4
|
+
require_relative "relevant_chunks/chunker"
|
5
|
+
require_relative "relevant_chunks/processor"
|
6
|
+
require "net/http"
|
7
|
+
require "json"
|
8
|
+
|
9
|
+
# RelevantChunks provides intelligent text chunking and relevance scoring using Claude/Anthropic.
|
10
|
+
#
|
11
|
+
# @example Basic usage
|
12
|
+
# RelevantChunks.configure do |config|
|
13
|
+
# config.api_key = "your_anthropic_api_key"
|
14
|
+
# end
|
15
|
+
#
|
16
|
+
# results = RelevantChunks.process("Your text here", "Your query")
|
17
|
+
# RelevantChunks provides intelligent text chunking and relevance scoring using Claude/Anthropic.
#
# @example Basic usage
#   RelevantChunks.configure do |config|
#     config.api_key = "your_anthropic_api_key"
#   end
#
#   results = RelevantChunks.process("Your text here", "Your query")
module RelevantChunks
  # Raised for configuration and processing failures within the gem.
  class Error < StandardError; end

  # Gem-wide settings; an instance is yielded by {RelevantChunks.configure}.
  class Configuration
    attr_accessor :api_key, :max_tokens, :overlap_size

    # Defaults: 1000-character chunks with 100 characters of overlap.
    #
    # @return [Configuration]
    def initialize
      @max_tokens = 1000
      @overlap_size = 100
    end
  end

  class << self
    attr_accessor :configuration

    # Lazily create the global configuration and yield it for mutation.
    #
    # @yield [config] Configuration object
    # @example
    #   RelevantChunks.configure do |config|
    #     config.api_key = "your_anthropic_api_key"
    #     config.max_tokens = 1000
    #   end
    def configure
      self.configuration ||= Configuration.new
      yield(configuration) if block_given?
    end

    # Chunk +text+ and score every chunk against +query+ using the
    # globally configured settings.
    #
    # @param text [String] The text to process
    # @param query [String] The query to score chunks against
    # @return [Array<Hash>] Array of chunks with relevance scores
    # @raise [Error] If API key is not configured
    # @example
    #   results = RelevantChunks.process("Long text here", "What is this about?")
    def process(text, query)
      config = configuration
      raise Error, "API key not configured" unless config&.api_key

      Processor.new(
        api_key: config.api_key,
        max_tokens: config.max_tokens,
        overlap_size: config.overlap_size
      ).process(text, query)
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,71 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: relevant_chunks
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.3.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Robert Lucy
|
8
|
+
autorequire:
|
9
|
+
bindir: exe
|
10
|
+
cert_chain: []
|
11
|
+
date: 2025-02-28 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: faraday
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '2.9'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '2.9'
|
27
|
+
description: RelevantChunks provides intelligent text chunking with smart boundaries
|
28
|
+
and overlap, plus relevance scoring using Claude/Anthropic's AI models.
|
29
|
+
email:
|
30
|
+
- robertlucy@gmail.com
|
31
|
+
executables:
|
32
|
+
- relevant_chunks
|
33
|
+
extensions: []
|
34
|
+
extra_rdoc_files: []
|
35
|
+
files:
|
36
|
+
- CHANGELOG.md
|
37
|
+
- LICENSE
|
38
|
+
- README.md
|
39
|
+
- exe/relevant_chunks
|
40
|
+
- lib/relevant_chunks.rb
|
41
|
+
- lib/relevant_chunks/chunker.rb
|
42
|
+
- lib/relevant_chunks/processor.rb
|
43
|
+
- lib/relevant_chunks/version.rb
|
44
|
+
homepage: https://github.com/robert10997/relevant_chunks
|
45
|
+
licenses:
|
46
|
+
- MIT
|
47
|
+
metadata:
|
48
|
+
homepage_uri: https://github.com/robert10997/relevant_chunks
|
49
|
+
source_code_uri: https://github.com/robert10997/relevant_chunks
|
50
|
+
changelog_uri: https://github.com/robert10997/relevant_chunks/blob/main/CHANGELOG.md
|
51
|
+
rubygems_mfa_required: 'true'
|
52
|
+
post_install_message:
|
53
|
+
rdoc_options: []
|
54
|
+
require_paths:
|
55
|
+
- lib
|
56
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
57
|
+
requirements:
|
58
|
+
- - ">="
|
59
|
+
- !ruby/object:Gem::Version
|
60
|
+
version: 3.0.0
|
61
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
62
|
+
requirements:
|
63
|
+
- - ">="
|
64
|
+
- !ruby/object:Gem::Version
|
65
|
+
version: '0'
|
66
|
+
requirements: []
|
67
|
+
rubygems_version: 3.4.10
|
68
|
+
signing_key:
|
69
|
+
specification_version: 4
|
70
|
+
summary: Smart text chunking and relevance scoring using Claude/Anthropic
|
71
|
+
test_files: []
|