boxcars 0.2.7 → 0.2.8
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +16 -1
- data/Gemfile +2 -0
- data/Gemfile.lock +19 -1
- data/README.md +0 -1
- data/boxcars.gemspec +1 -0
- data/lib/boxcars/boxcar/embeddings/document.rb +14 -0
- data/lib/boxcars/boxcar/embeddings/embed_via_open_ai.rb +50 -0
- data/lib/boxcars/boxcar/embeddings/hnswlib/build_vector_store.rb +159 -0
- data/lib/boxcars/boxcar/embeddings/hnswlib/hnswlib_config.rb +56 -0
- data/lib/boxcars/boxcar/embeddings/hnswlib/hnswlib_search.rb +54 -0
- data/lib/boxcars/boxcar/embeddings/hnswlib/save_to_hnswlib.rb +80 -0
- data/lib/boxcars/boxcar/embeddings/similarity_search.rb +51 -0
- data/lib/boxcars/boxcar/embeddings/split_text.rb +104 -0
- data/lib/boxcars/boxcar/embeddings.rb +31 -0
- data/lib/boxcars/boxcar/google_search.rb +9 -1
- data/lib/boxcars/embedding.rb +11 -0
- data/lib/boxcars/engine/gpt4all_eng.rb +56 -0
- data/lib/boxcars/engine.rb +1 -0
- data/lib/boxcars/version.rb +1 -1
- data/lib/boxcars.rb +1 -0
- metadata +27 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 69b70e1d02b1ec206438eaaf857a0495fe35ab01e64a265656fe21230675306f
|
4
|
+
data.tar.gz: 8681b9625a0684f1091eea7a4626964929b271370068c90b82dabeee4253d803
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: eb5c0c00f8fcdbbd6d8a1999d7544fc584c701fdf9a8a9c271fff6d9795f75ef9cab058fee2c6829808a764c892cc3e2f4e4a8717155d34d6514b46d744e632c
|
7
|
+
data.tar.gz: b8fb4ad34d7b93d47388f037d1d93e9e7245303740bc04d58d21942112ff97315e5dd31fdba77e275b52ddba85ca1055b897646ae1e606daee485583d50c52a6
|
data/CHANGELOG.md
CHANGED
@@ -2,7 +2,22 @@
|
|
2
2
|
|
3
3
|
## [Unreleased](https://github.com/BoxcarsAI/boxcars/tree/HEAD)
|
4
4
|
|
5
|
-
[Full Changelog](https://github.com/BoxcarsAI/boxcars/compare/v0.2.
|
5
|
+
[Full Changelog](https://github.com/BoxcarsAI/boxcars/compare/v0.2.7...HEAD)
|
6
|
+
|
7
|
+
**Closed issues:**
|
8
|
+
|
9
|
+
- Getting the same verbosity as in the examples [\#54](https://github.com/BoxcarsAI/boxcars/issues/54)
|
10
|
+
|
11
|
+
**Merged pull requests:**
|
12
|
+
|
13
|
+
- Add Engine for Gpt4all [\#55](https://github.com/BoxcarsAI/boxcars/pull/55) ([francis](https://github.com/francis))
|
14
|
+
- update google search to return URL for result if present [\#53](https://github.com/BoxcarsAI/boxcars/pull/53) ([francis](https://github.com/francis))
|
15
|
+
- Draft: added gpt4all [\#49](https://github.com/BoxcarsAI/boxcars/pull/49) ([jaigouk](https://github.com/jaigouk))
|
16
|
+
- Embeddings with hnswlib [\#48](https://github.com/BoxcarsAI/boxcars/pull/48) ([jaigouk](https://github.com/jaigouk))
|
17
|
+
|
18
|
+
## [v0.2.7](https://github.com/BoxcarsAI/boxcars/tree/v0.2.7) (2023-04-13)
|
19
|
+
|
20
|
+
[Full Changelog](https://github.com/BoxcarsAI/boxcars/compare/v0.2.5...v0.2.7)
|
6
21
|
|
7
22
|
**Closed issues:**
|
8
23
|
|
data/Gemfile
CHANGED
data/Gemfile.lock
CHANGED
@@ -1,8 +1,9 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
boxcars (0.2.7)
|
4
|
+
boxcars (0.2.8)
|
5
5
|
google_search_results (~> 2.2)
|
6
|
+
gpt4all (~> 0.0.4)
|
6
7
|
ruby-openai (~> 3.0)
|
7
8
|
|
8
9
|
GEM
|
@@ -71,7 +72,12 @@ GEM
|
|
71
72
|
rainbow (>= 2.2.1)
|
72
73
|
rake (>= 10.0)
|
73
74
|
google_search_results (2.2.0)
|
75
|
+
gpt4all (0.0.5)
|
76
|
+
faraday (~> 2.7)
|
77
|
+
os (~> 1.1)
|
78
|
+
tty-progressbar (~> 0.18.2)
|
74
79
|
hashdiff (1.0.1)
|
80
|
+
hnswlib (0.8.1)
|
75
81
|
http-accept (1.7.0)
|
76
82
|
http-cookie (1.0.5)
|
77
83
|
domain_name (~> 0.5)
|
@@ -100,6 +106,7 @@ GEM
|
|
100
106
|
octokit (4.25.1)
|
101
107
|
faraday (>= 1, < 3)
|
102
108
|
sawyer (~> 0.9)
|
109
|
+
os (1.1.4)
|
103
110
|
parallel (1.22.1)
|
104
111
|
parser (3.2.1.1)
|
105
112
|
ast (~> 2.4.1)
|
@@ -163,10 +170,19 @@ GEM
|
|
163
170
|
faraday (>= 0.17.3, < 3)
|
164
171
|
sqlite3 (1.6.2)
|
165
172
|
mini_portile2 (~> 2.8.0)
|
173
|
+
sqlite3 (1.6.2-arm64-darwin)
|
166
174
|
sqlite3 (1.6.2-x86_64-darwin)
|
167
175
|
sqlite3 (1.6.2-x86_64-linux)
|
176
|
+
strings-ansi (0.2.0)
|
168
177
|
timers (4.3.5)
|
169
178
|
traces (0.9.1)
|
179
|
+
tty-cursor (0.7.1)
|
180
|
+
tty-progressbar (0.18.2)
|
181
|
+
strings-ansi (~> 0.2)
|
182
|
+
tty-cursor (~> 0.7)
|
183
|
+
tty-screen (~> 0.8)
|
184
|
+
unicode-display_width (>= 1.6, < 3.0)
|
185
|
+
tty-screen (0.8.1)
|
170
186
|
tzinfo (2.0.6)
|
171
187
|
concurrent-ruby (~> 1.0)
|
172
188
|
unf (0.1.4)
|
@@ -181,6 +197,7 @@ GEM
|
|
181
197
|
hashdiff (>= 0.4.0, < 2.0.0)
|
182
198
|
|
183
199
|
PLATFORMS
|
200
|
+
arm64-darwin-22
|
184
201
|
universal-java-11
|
185
202
|
x86_64-darwin-21
|
186
203
|
x86_64-darwin-22
|
@@ -194,6 +211,7 @@ DEPENDENCIES
|
|
194
211
|
dotenv (~> 2.8)
|
195
212
|
faraday-retry (~> 2.0)
|
196
213
|
github_changelog_generator (~> 1.16)
|
214
|
+
hnswlib (~> 0.8.1)
|
197
215
|
rake (~> 13.0)
|
198
216
|
rest-client (~> 2.1)
|
199
217
|
rspec (~> 3.2)
|
data/README.md
CHANGED
data/boxcars.gemspec
CHANGED
@@ -37,6 +37,7 @@ Gem::Specification.new do |spec|
|
|
37
37
|
|
38
38
|
# runtime dependencies
|
39
39
|
spec.add_dependency "google_search_results", "~> 2.2"
|
40
|
+
spec.add_dependency "gpt4all", "~> 0.0.4"
|
40
41
|
spec.add_dependency "ruby-openai", "~> 3.0"
|
41
42
|
|
42
43
|
# For more information and examples about making a new gem, checkout our
|
@@ -0,0 +1,14 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Boxcars
  module Embeddings
    # Plain value holder that pairs a piece of text with arbitrary metadata.
    class Document
      attr_accessor :page_content, :metadata

      # @param fields [Hash] optional attributes; only +:page_content+ and
      #   +:metadata+ are read. A missing (or nil/false) +:page_content+
      #   becomes an empty String and a missing +:metadata+ becomes an empty Hash.
      def initialize(fields = {})
        self.page_content = fields[:page_content] || ""
        self.metadata = fields[:metadata] || {}
      end
    end
  end
end
|
@@ -0,0 +1,50 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'openai'
|
4
|
+
|
5
|
+
module Boxcars
  module Embeddings
    # Turns a list of strings into embedding vectors using the OpenAI
    # embeddings endpoint, one request per string.
    class EmbedViaOpenAI
      include Embeddings

      attr_accessor :texts, :openai_connection, :model

      # @param texts [Array<String>] the strings to embed.
      # @param openai_connection [OpenAI::Client] client used to issue the requests.
      # @param model [String] embedding model name sent with every request.
      # @raise [Boxcars::ValueError] when +texts+ is not an Array of Strings or
      #   +openai_connection+ is not an OpenAI::Client.
      def initialize(texts:, openai_connection:, model: 'text-embedding-ada-002')
        validate_params(texts, openai_connection)
        @texts = texts
        @openai_connection = openai_connection
        @model = model
      end

      # Embeds every text in order.
      #
      # @return [Array<Hash>] one +{ embedding:, dim: }+ hash per input text,
      #   where +dim+ is the length of the embedding vector.
      # @raise [Boxcars::Error] when a response carries no embedding.
      def call
        texts.map do |text|
          embedding = embedding_with_retry(model: model, input: strip_new_lines(text))
          { embedding: embedding, dim: embedding.size }
        end
      end

      private

      def validate_params(texts, openai_connection)
        raise_error 'texts must be an array of strings' unless texts.is_a?(Array) && texts.all? { |text| text.is_a?(String) }
        raise_error 'openai_connection must be an OpenAI::Client' unless openai_connection.is_a?(OpenAI::Client)
      end

      # Issues a single embeddings request and extracts the vector.
      # Despite the name, no retry is performed here.
      #
      # A failed request comes back without a usable "data" entry; surface
      # that as a descriptive error instead of a NoMethodError on nil.
      def embedding_with_retry(request)
        response = openai_connection.embeddings(parameters: request)
        first_item = response['data']&.first
        embedding = first_item && first_item['embedding']
        return embedding if embedding

        reason = response['error'] || 'no embedding returned'
        raise ::Boxcars::Error, "OpenAI embeddings request failed: #{reason}"
      end

      # Newlines are replaced with spaces before the text is sent.
      def strip_new_lines(text)
        text.gsub("\n", ' ')
      end

      def raise_error(message)
        raise ::Boxcars::ValueError, message
      end
    end
  end
end
|
@@ -0,0 +1,159 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'fileutils'
|
4
|
+
require 'hnswlib'
|
5
|
+
require 'json'
|
6
|
+
|
7
|
+
module Boxcars
  module Embeddings
    module Hnswlib
      # Builds the vector store used by the hnswlib similarity search.
      #
      # When a rebuild is needed it reads the training files, splits them into
      # chunks, embeds the chunks via OpenAI and saves the index, the hnswlib
      # config and a JSON file holding the chunks with their index numbers.
      # In every case it then loads the saved index back into memory.
      class BuildVectorStore
        include Embeddings

        # @param training_data_path [String] The path to the training data. Can be a glob pattern.
        # @param index_file_path [String] The path to the index file.
        # @param split_chunk_size [Integer] The chunk size passed to the text splitter. default 2000
        # @param json_doc_file_path [String, nil] The json file containing the document text.
        #   If nil, the index file name is reused with its ".bin" suffix replaced by ".json".
        # @param force_rebuild [Boolean] If true (the default), rebuilds the index even if it already exists.
        def initialize(
          training_data_path:,
          index_file_path:,
          split_chunk_size: 2000,
          json_doc_file_path: nil,
          force_rebuild: true
        )
          @training_data_path = training_data_path
          @index_file_path = index_file_path
          @split_chunk_size = split_chunk_size
          @json_doc_file_path = json_doc_file_path || index_file_path.gsub(/\.bin$/, '.json')
          @force_rebuild = force_rebuild
        end

        # @return [Hash] +{ vector_store:, document_embeddings: }+ with the loaded
        #   index and the parsed contents of the JSON document file.
        # @raise [Boxcars::Error] on invalid parameters, missing training files,
        #   an empty set of chunks, or unparsable JSON files.
        def call
          validate_params
          # Training data is only read and embedded when the index must be (re)built.
          build_vector_store if rebuild_required?
          load_hnsw
        end

        private

        attr_reader :training_data_path, :index_file_path, :split_chunk_size, :json_doc_file_path, :force_rebuild

        def validate_params
          training_data_dir = File.dirname(training_data_path.gsub(/\*{1,2}/, ''))
          raise_error('training_data_path parent directory must exist') unless File.directory?(training_data_dir)
          raise_error('No files found at the training_data_path pattern') if Dir.glob(training_data_path).empty?

          index_dir = File.dirname(index_file_path)
          raise_error('index_file_path parent directory must exist') unless File.directory?(index_dir)

          raise_error('split_chunk_size must be an integer') unless split_chunk_size.is_a?(Integer)
        end

        # Runs the load -> split -> embed -> save pipeline.
        def build_vector_store
          documents = split_text_into_chunks(load_files)
          save_vector_store(generate_embeddings(documents))
        end

        # Location of the hnswlib config file, kept next to the index file.
        def config_file_path
          "#{File.dirname(index_file_path)}/hnswlib_config.json"
        end

        # A rebuild is needed when forced, or when either the index file or
        # its config file is missing.
        def rebuild_required?
          return true if force_rebuild

          !(File.exist?(index_file_path) && File.exist?(config_file_path))
        end

        # @return [Array<String>] the contents of every file matching the pattern.
        def load_files
          files = Dir.glob(training_data_path)
          raise_error "No files found at #{training_data_path}" if files.empty?

          data = files.map { |file| File.read(file) }
          puts "Added #{files.length} files to data. Splitting text into chunks..."
          data
        end

        # @return [Array<String>] the chunks of all files, in file order.
        def split_text_into_chunks(data)
          data.flat_map do |chunk|
            Boxcars::Embeddings::SplitText.call(
              separator: "\n", chunk_size: split_chunk_size, chunk_overlap: 0, text: chunk
            )
          end
        end

        # @return [Hash] +{ document_embeddings:, dim: }+ where each document
        #   embedding carries its doc_id (the chunk's position), vector and text.
        def generate_embeddings(documents)
          # Without chunks there is no vector to read the dimension from.
          raise_error('No text chunks were produced from the training data') if documents.empty?

          puts "Initializing Store..."
          openai_client = OpenAI::Client.new(access_token: ENV.fetch('OPENAI_API_KEY', nil))

          embeddings_with_dim = Boxcars::Embeddings::EmbedViaOpenAI.call(texts: documents, openai_connection: openai_client)

          document_embeddings = embeddings_with_dim.map.with_index do |item, index|
            { doc_id: index, embedding: item[:embedding], document: documents[index] }
          end

          { document_embeddings: document_embeddings, dim: embeddings_with_dim.first[:dim] }
        end

        def save_vector_store(embeddings_with_config)
          puts "Saving Vectorstore"
          Boxcars::Embeddings::Hnswlib::SaveToHnswlib.call(
            document_embeddings: embeddings_with_config[:document_embeddings],
            index_file_path: index_file_path,
            json_doc_file_path: json_doc_file_path,
            hnswlib_config: hnswlib_config(embeddings_with_config[:dim])
          )
          puts "VectorStore saved"
        end

        def hnswlib_config(dim)
          # dim: length of datum point vector that will be indexed.
          Boxcars::Embeddings::Hnswlib::HnswlibConfig.new(
            metric: "l2", max_item: 10000, dim: dim
          )
        end

        def load_hnsw
          puts "Loading Hnswlib"

          json_config = parse_json_file(config_file_path)
          document_embeddings = parse_json_file(json_doc_file_path)

          search_index = ::Hnswlib::HierarchicalNSW.new(space: json_config[:metric], dim: json_config[:dim])
          search_index.load_index(index_file_path)

          { vector_store: search_index, document_embeddings: document_embeddings }
        end

        # Reads and parses a JSON file with symbolized keys.
        # Returns an empty Array when no path is given.
        def parse_json_file(file_path)
          return [] if file_path.nil?

          file_content = File.read(file_path)
          JSON.parse(file_content, symbolize_names: true)
        rescue JSON::ParserError => e
          # Name the file that actually failed; this helper parses both the
          # config file and the document file.
          raise_error("Error parsing #{file_path}: #{e.message}")
        end

        def raise_error(message)
          raise ::Boxcars::Error, message
        end
      end
    end
  end
end
|
@@ -0,0 +1,56 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'json'
|
4
|
+
|
5
|
+
module Boxcars
  module Embeddings
    module Hnswlib
      # Settings for an hnswlib search index, serializable to JSON.
      class HnswlibConfig
        attr_reader :metric, :max_item, :dim, :ef_construction, :max_outgoing_connection

        # Short name for +max_outgoing_connection+. The reader +m+ was declared
        # before but read an instance variable that was never assigned, so it
        # always returned nil.
        alias m max_outgoing_connection

        # used for search index.
        #
        # @param max_item [Integer] The maximum number of items.
        #
        # @param metric [String] The distance metric between vectors ('l2', 'dot', or 'cosine').
        #
        # @param dim [Integer] The length of the vectors that will be indexed.
        #
        # @param ef_construction [Integer] The size of the dynamic list for the nearest neighbors.
        #   It controls the index time/accuracy trade-off.
        #
        # @param max_outgoing_connection [Integer] The maximum number of outgoing connections in the graph
        #
        # reference: https://yoshoku.github.io/hnswlib.rb/doc/
        def initialize(
          metric: "l2",
          max_item: 10000,
          dim: 2,
          ef_construction: 200,
          max_outgoing_connection: 16
        )
          @metric = metric
          @max_item = max_item
          @dim = dim
          @ef_construction = ef_construction
          @max_outgoing_connection = max_outgoing_connection
        end

        # @return [String] 'ip' when the metric is 'dot', otherwise 'l2'.
        # NOTE(review): 'cosine' also maps to 'l2' here — confirm that is intended.
        def space
          metric == 'dot' ? 'ip' : 'l2'
        end

        # @return [String] pretty-printed JSON of all settings.
        def to_json(*args)
          JSON.pretty_generate(
            {
              metric: metric,
              max_item: max_item,
              dim: dim,
              ef_construction: ef_construction,
              max_outgoing_connection: max_outgoing_connection
            },
            *args
          )
        end
      end
    end
  end
end
|
@@ -0,0 +1,54 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'hnswlib'
|
4
|
+
require 'json'
|
5
|
+
|
6
|
+
module Boxcars
  module Embeddings
    module Hnswlib
      # Runs a nearest-neighbour query against an hnswlib index and maps the
      # hits back to the documents stored in a JSON file.
      class HnswlibSearch
        # @param vector_store [::Hnswlib::HierarchicalNSW] the loaded index.
        # @param options [Hash] reads +:json_doc_path+ (path of the JSON file
        #   holding +doc_id+/+document+ entries) and +:num_neighbors+ (default 1).
        # @raise [ArgumentError] when +vector_store+ is not a HierarchicalNSW.
        def initialize(vector_store:, options: {})
          validate_params(vector_store)
          @vector_store = vector_store
          @json_doc_path = options[:json_doc_path]
          @num_neighbors = options[:num_neighbors] || 1
        end

        # @param query [Array<Float>] the query vector.
        # @return [Array<Hash>] +{ document:, distance: }+ per hit whose doc_id
        #   is present in the JSON document file.
        def call(query)
          search(query)
        end

        private

        attr_reader :json_doc_path, :vector_store, :num_neighbors

        def validate_params(vector_store)
          raise_error 'vector_store must be an Hnswlib::HierarchicalNSW' unless vector_store.is_a?(::Hnswlib::HierarchicalNSW)
        end

        # hnswlib.rb documents search_knn as returning two parallel arrays
        # (neighbour ids, then distances), so they are paired up here rather
        # than iterated as [id, distance] tuples.
        def search(query)
          doc_ids, distances = vector_store.search_knn(query, num_neighbors)
          hits = Array(doc_ids).zip(Array(distances))
          hits.map { |doc_id, distance| lookup_document(doc_id, distance) }.compact
        end

        # @return [Hash, nil] the hit, or nil when the doc_id is unknown.
        def lookup_document(doc_id, distance)
          embedding_data = documents_by_id[doc_id]
          return unless embedding_data

          { document: embedding_data[:document], distance: distance }
        end

        # Index of the parsed entries by doc_id, built once so each hit is a
        # hash lookup instead of a scan. The first entry wins on duplicates.
        def documents_by_id
          @documents_by_id ||= parsed_data.each_with_object({}) do |entry, by_id|
            by_id[entry[:doc_id]] ||= entry
          end
        end

        def parsed_data
          raise_error 'json_doc_path option is required to look up documents' if json_doc_path.nil?

          @parsed_data ||= JSON.parse(
            File.read(json_doc_path),
            symbolize_names: true
          )
        end

        def raise_error(message)
          raise ArgumentError, message
        end
      end
    end
  end
end
|
@@ -0,0 +1,80 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'hnswlib'
|
4
|
+
require 'json'
|
5
|
+
require 'fileutils'
|
6
|
+
|
7
|
+
module Boxcars
  module Embeddings
    module Hnswlib
      # Adds document embeddings to an hnswlib index and writes the index, the
      # hnswlib config and the documents to disk.
      class SaveToHnswlib
        include Embeddings

        # @param document_embeddings [Array] An array of hashes containing the document id, document text, and embedding.
        # @param index_file_path [String] The path to the index file.
        # @param hnswlib_config [Boxcars::Embeddings::Hnswlib::HnswlibConfig] The config object for the hnswlib index.
        # @param json_doc_file_path [String, nil] Optional. The path to the json file containing the document text.
        #   Defaults to the index file path with its ".bin" suffix replaced by ".json".
        def initialize(document_embeddings:, index_file_path:, hnswlib_config:, json_doc_file_path: nil)
          @hnswlib_config = hnswlib_config
          @document_embeddings = document_embeddings
          @index_file_path = index_file_path
          @json_doc_file_path = json_doc_file_path || index_file_path.gsub(/\.bin$/, '.json')

          @index = ::Hnswlib::HnswIndex.new(
            n_features: hnswlib_config.dim,
            max_item: hnswlib_config.max_item,
            metric: hnswlib_config.metric
          )
        end

        # Validates the inputs, feeds every embedding into the index and
        # writes all files.
        #
        # @raise [Boxcars::ValueError] when a parameter is invalid or a target
        #   directory does not exist.
        def call
          validate_params

          document_texts = document_embeddings.map do |embedding|
            index.add_item(embedding[:doc_id], embedding[:embedding])

            { doc_id: embedding[:doc_id], embedding: embedding[:embedding], document: embedding[:document] }
          end

          write_files(index, document_texts)
        end

        private

        attr_reader :index, :document_embeddings, :index_file_path, :json_doc_file_path, :hnswlib_config

        # Writes the documents JSON first, then the config JSON next to the
        # index file, then the index itself.
        def write_files(index, document_texts)
          FileUtils.mkdir_p(File.dirname(json_doc_file_path))
          File.write(json_doc_file_path, document_texts.to_json)

          index_dir = File.dirname(index_file_path)
          FileUtils.mkdir_p(index_dir)
          File.write("#{index_dir}/hnswlib_config.json", hnswlib_config.to_json)

          index.save(index_file_path)
        end

        def validate_params
          raise_error("document_embeddings must be an array") unless document_embeddings.is_a?(Array)
          raise_error("dim must be an integer") unless hnswlib_config.dim.is_a?(Integer)
          raise_error("index_file_path must be a string") unless index_file_path.is_a?(String)

          check_parent_directory(index_file_path)
          check_parent_directory(json_doc_file_path)
        end

        # A nil path is accepted; otherwise its directory must already exist.
        def check_parent_directory(path)
          return if path.nil? || path == false
          return if File.directory?(File.dirname(path))

          raise_error('parent directory must exist')
        end

        def raise_error(message)
          raise ::Boxcars::ValueError, message
        end
      end
    end
  end
end
|
@@ -0,0 +1,51 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'hnswlib'
|
4
|
+
|
5
|
+
module Boxcars
  module Embeddings
    # Embeds a text query via OpenAI and looks up its nearest neighbours in a
    # vector store.
    class SimilaritySearch
      # @param embeddings [String] passed to the hnswlib search as its
      #   +json_doc_path+ option, i.e. the JSON file holding the documents.
      # @param vector_store [::Hnswlib::HierarchicalNSW] the index to query.
      # @param openai_connection [OpenAI::Client] client used to embed the query.
      # @param num_neighbors [Integer] how many neighbours to request per query.
      #   Defaults to 2, the value that used to be hard-coded.
      # @raise [ArgumentError] when the vector store type is not supported.
      def initialize(embeddings:, vector_store:, openai_connection:, num_neighbors: 2)
        @embeddings = embeddings
        @vector_store = vector_store
        @openai_connection = openai_connection
        @num_neighbors = num_neighbors
        @similarity_search_instance = create_similarity_search_instance
      end

      # @param query [String] a non-empty query text.
      # @return the result of the underlying search for the embedded query.
      # @raise [ArgumentError] when +query+ is not a String or is empty.
      def call(query:)
        validate_query(query)
        query_vector = convert_query_to_vector(query)
        @similarity_search_instance.call(query_vector)
      end

      private

      attr_reader :embeddings, :vector_store, :openai_connection, :num_neighbors

      def validate_query(query)
        raise_error 'query must be a string' unless query.is_a?(String)
        raise_error 'query must not be empty' if query.empty?
      end

      def convert_query_to_vector(query)
        Boxcars::Embeddings::EmbedViaOpenAI.call(texts: [query], openai_connection: openai_connection).first[:embedding]
      end

      # Picks the search implementation that matches the vector store class.
      def create_similarity_search_instance
        case vector_store
        when ::Hnswlib::HierarchicalNSW
          Boxcars::Embeddings::Hnswlib::HnswlibSearch.new(
            vector_store: vector_store,
            options: { json_doc_path: embeddings, num_neighbors: num_neighbors }
          )
        else
          raise_error 'Unsupported vector store provided'
        end
      end

      def raise_error(message)
        raise ArgumentError, message
      end
    end
  end
end
|
@@ -0,0 +1,104 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Boxcars
  module Embeddings
    # Breaks a text on a separator and regroups the pieces into chunks whose
    # summed piece lengths stay around +chunk_size+.
    class SplitText
      include Embeddings

      attr_reader :separator, :chunk_size, :chunk_overlap, :text

      # @param separator [String] The string to use to split the text.
      # @param chunk_size [Integer] The size of each chunk.
      # @param chunk_overlap [Integer] The amount of overlap between chunks.
      # @param text [String] The text to split.
      # @raise [Boxcars::ValueError] when a parameter has the wrong type or
      #   +chunk_overlap+ is not smaller than +chunk_size+.
      def initialize(separator: "Search", chunk_size: 7, chunk_overlap: 3, text: "")
        validate_params(separator, chunk_size, chunk_overlap, text)

        @text = text
        @separator = separator
        @chunk_size = chunk_size
        @chunk_overlap = chunk_overlap
      end

      # @return [Array<String>] the merged chunks, sorted.
      def call
        merge_splits(text.split(separator), separator)&.sort
      end

      private

      def validate_params(separator, chunk_size, chunk_overlap, text)
        raise_error("separator must be a string") unless separator.is_a?(String)
        raise_error("chunk_size must be an integer") unless chunk_size.is_a?(Integer)
        raise_error("chunk_overlap must be an integer") unless chunk_overlap.is_a?(Integer)
        raise_error("text must be a string") unless text.is_a?(String)
        raise_error("chunk_overlap must be less than chunk_size") if chunk_overlap >= chunk_size
      end

      def raise_error(message)
        raise ::Boxcars::ValueError, message
      end

      # Walks the pieces, keeping a window of pieces that form the chunk being
      # built and the summed length of the pieces in that window.
      def merge_splits(splits, separator)
        chunks = []
        window = []
        running = 0

        splits.each do |piece|
          running = flush_window(running, piece.length, window, chunks, separator)
          window.push(piece)
          running += piece.length
        end

        tail = join_docs(window, separator)
        chunks << tail if tail
        chunks
      end

      # Emits the current window as a chunk when adding the next piece would
      # reach +chunk_size+, then trims the window. Returns the new running length.
      def flush_window(running, piece_len, window, chunks, separator)
        return running if running + piece_len < chunk_size

        puts "Created a chunk of size #{running}, which is longer than the specified #{chunk_size}" if running > chunk_size
        return running if window.empty?

        joined = join_docs(window, separator)
        chunks << joined if joined
        trim_window(running, piece_len, window)
      end

      # Drops pieces from the front of the window until at most
      # +chunk_overlap+ characters remain and the next piece fits (or the
      # window length is no longer positive).
      def trim_window(running, piece_len, window)
        until running <= chunk_overlap && (running + piece_len <= chunk_size || !running.positive?)
          running -= window.shift.length
        end
        running
      end

      # Joins the pieces with the separator; nil when the result is blank.
      def join_docs(docs, separator)
        joined = docs.join(separator).strip
        joined unless joined.empty?
      end
    end
  end
end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# Boxcars is a framework for running a series of tools to get an answer to a question.
|
4
|
+
module Boxcars
  # Mixin for service-style classes: including it adds a class-level +call+
  # that builds an instance and invokes its +call+, and makes +new+ private.
  module Embeddings
    module ClassMethods
      class EmbeddingsError < StandardError; end

      # Builds an instance with the given arguments and runs it.
      #
      # @return whatever the instance's +call+ returns.
      def call(*args, **kw_args)
        instance = new(*args, **kw_args)
        instance.call
      end
    end

    # Hook run when the module is included: adds the class methods and hides
    # the constructor so instances are only created through +call+.
    def self.included(base)
      base.extend(ClassMethods)
      base.private_class_method :new
    end
  end
end
|
23
|
+
|
24
|
+
require_relative "embeddings/document"
|
25
|
+
require_relative "embeddings/embed_via_open_ai"
|
26
|
+
require_relative "embeddings/split_text"
|
27
|
+
require_relative "embeddings/similarity_search"
|
28
|
+
require_relative "embeddings/hnswlib/hnswlib_config"
|
29
|
+
require_relative "embeddings/hnswlib/save_to_hnswlib"
|
30
|
+
require_relative "embeddings/hnswlib/build_vector_store"
|
31
|
+
require_relative "embeddings/hnswlib/hnswlib_search"
|
@@ -57,7 +57,15 @@ module Boxcars
|
|
57
57
|
raise Error, "Got error from SerpAPI: {res[:error]}" if res[:error]
|
58
58
|
|
59
59
|
ANSWER_LOCATIONS.each do |path|
|
60
|
-
|
60
|
+
next unless res.dig(*path)
|
61
|
+
|
62
|
+
Boxcars.debug("Found SERP answer at #{path}", :cyan)
|
63
|
+
path_link = path.dup
|
64
|
+
last_word = path_link.pop
|
65
|
+
path_link << :link
|
66
|
+
return { last_word => res.dig(*path), url: res.dig(*path_link) } if last_word.is_a?(Symbol) && res.dig(*path_link)
|
67
|
+
|
68
|
+
return res.dig(*path)
|
61
69
|
end
|
62
70
|
"No good search result found"
|
63
71
|
end
|
@@ -0,0 +1,11 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# Boxcars is a framework for running a series of tools to get an answer to a question.
|
4
|
+
module Boxcars
  # Boxcar subclass that namespaces the embedding support; it adds only its
  # own error class.
  class Embedding < Boxcar
    # Raised for embedding-related failures.
    class Error < StandardError; end
  end
end
|
10
|
+
|
11
|
+
require "boxcars/boxcar/embeddings"
|
@@ -0,0 +1,56 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'gpt4all'
|
4
|
+
# Boxcars is a framework for running a series of tools to get an answer to a question.
|
5
|
+
module Boxcars
  # An engine that answers prompts with a local GPT4All model.
  class Gpt4allEng < Engine
    attr_reader :prompts, :model_kwargs, :batch_size

    # the default name of the engine
    DEFAULT_NAME = "Gpt4all engine"
    # the default description of the engine
    DEFAULT_DESCRIPTION = "useful for when you need to use local AI to answer questions. " \
                          "You should ask targeted questions"

    # An engine is a container for a single tool to run.
    # @param name [String] The name of the engine. Defaults to "Gpt4all engine".
    # @param description [String] A description of the engine. Defaults to:
    #        "useful for when you need to use local AI to answer questions. You should ask targeted questions".
    # @param prompts [Array<String>] The prompts to use when asking the engine. Defaults to [].
    # @param batch_size [Integer] The number of prompts to send to the engine at once. Defaults to 2.
    def initialize(name: DEFAULT_NAME, description: DEFAULT_DESCRIPTION, prompts: [], batch_size: 2, **_kwargs)
      @prompts = prompts
      @batch_size = batch_size
      super(description: description, name: name)
    end

    # Get an answer from the engine.
    # Starts a GPT4All bot, sends it the formatted prompt and always tries to
    # stop the bot afterwards.
    #
    # @param prompt [Boxcars::Prompt] The prompt to use when asking the engine.
    # @param inputs [Hash] The values used to format the prompt.
    # @param _kwargs [Hash] Accepted for interface compatibility and ignored.
    # @return the bot's reply; when a StandardError is raised it is logged with
    #   Boxcars.error and that call's result is returned instead.
    def client(prompt:, inputs: {}, **_kwargs)
      gpt4all = Gpt4all::ConversationalAI.new
      gpt4all.prepare_resources(force_download: false)
      gpt4all.start_bot
      input_text = prompt.as_prompt(inputs)[:prompt]
      Boxcars.debug("Prompt after formatting:\n#{input_text}", :cyan) if Boxcars.configuration.log_prompts
      gpt4all.prompt(input_text)
    rescue StandardError => e
      # last(5) keeps short backtraces too; a [-5..-1] slice is nil for fewer than five frames.
      Boxcars.error(["Error from gpt4all engine: #{e}", Array(e.backtrace).last(5)].flatten.join("\n "))
    ensure
      # gpt4all is still nil when the constructor itself raised; calling
      # stop_bot on nil would replace the real error with a NoMethodError.
      gpt4all&.stop_bot
    end

    # get an answer from the engine for a question.
    # @param question [String] The question to ask the engine.
    # @param kwargs [Hash] Additional parameters to pass to the engine if wanted.
    # @return the answer produced by +client+.
    def run(question, **kwargs)
      prompt = Prompt.new(template: question)
      answer = client(prompt: prompt, **kwargs)
      Boxcars.debug("Answer: #{answer}", :cyan)
      answer
    end
  end
end
|
data/lib/boxcars/engine.rb
CHANGED
data/lib/boxcars/version.rb
CHANGED
data/lib/boxcars.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: boxcars
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.7
|
4
|
+
version: 0.2.8
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Francis Sullivan
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: exe
|
11
11
|
cert_chain: []
|
12
|
-
date: 2023-04-
|
12
|
+
date: 2023-04-19 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: debug
|
@@ -67,6 +67,20 @@ dependencies:
|
|
67
67
|
- - "~>"
|
68
68
|
- !ruby/object:Gem::Version
|
69
69
|
version: '2.2'
|
70
|
+
- !ruby/object:Gem::Dependency
|
71
|
+
name: gpt4all
|
72
|
+
requirement: !ruby/object:Gem::Requirement
|
73
|
+
requirements:
|
74
|
+
- - "~>"
|
75
|
+
- !ruby/object:Gem::Version
|
76
|
+
version: 0.0.4
|
77
|
+
type: :runtime
|
78
|
+
prerelease: false
|
79
|
+
version_requirements: !ruby/object:Gem::Requirement
|
80
|
+
requirements:
|
81
|
+
- - "~>"
|
82
|
+
- !ruby/object:Gem::Version
|
83
|
+
version: 0.0.4
|
70
84
|
- !ruby/object:Gem::Dependency
|
71
85
|
name: ruby-openai
|
72
86
|
requirement: !ruby/object:Gem::Requirement
|
@@ -106,6 +120,15 @@ files:
|
|
106
120
|
- lib/boxcars/boxcar.rb
|
107
121
|
- lib/boxcars/boxcar/active_record.rb
|
108
122
|
- lib/boxcars/boxcar/calculator.rb
|
123
|
+
- lib/boxcars/boxcar/embeddings.rb
|
124
|
+
- lib/boxcars/boxcar/embeddings/document.rb
|
125
|
+
- lib/boxcars/boxcar/embeddings/embed_via_open_ai.rb
|
126
|
+
- lib/boxcars/boxcar/embeddings/hnswlib/build_vector_store.rb
|
127
|
+
- lib/boxcars/boxcar/embeddings/hnswlib/hnswlib_config.rb
|
128
|
+
- lib/boxcars/boxcar/embeddings/hnswlib/hnswlib_search.rb
|
129
|
+
- lib/boxcars/boxcar/embeddings/hnswlib/save_to_hnswlib.rb
|
130
|
+
- lib/boxcars/boxcar/embeddings/similarity_search.rb
|
131
|
+
- lib/boxcars/boxcar/embeddings/split_text.rb
|
109
132
|
- lib/boxcars/boxcar/engine_boxcar.rb
|
110
133
|
- lib/boxcars/boxcar/google_search.rb
|
111
134
|
- lib/boxcars/boxcar/sql.rb
|
@@ -113,8 +136,10 @@ files:
|
|
113
136
|
- lib/boxcars/boxcar/wikipedia_search.rb
|
114
137
|
- lib/boxcars/conversation.rb
|
115
138
|
- lib/boxcars/conversation_prompt.rb
|
139
|
+
- lib/boxcars/embedding.rb
|
116
140
|
- lib/boxcars/engine.rb
|
117
141
|
- lib/boxcars/engine/engine_result.rb
|
142
|
+
- lib/boxcars/engine/gpt4all_eng.rb
|
118
143
|
- lib/boxcars/engine/openai.rb
|
119
144
|
- lib/boxcars/generation.rb
|
120
145
|
- lib/boxcars/prompt.rb
|