boxcars 0.2.7 → 0.2.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +16 -1
- data/Gemfile +2 -0
- data/Gemfile.lock +19 -1
- data/README.md +0 -1
- data/boxcars.gemspec +1 -0
- data/lib/boxcars/boxcar/embeddings/document.rb +14 -0
- data/lib/boxcars/boxcar/embeddings/embed_via_open_ai.rb +50 -0
- data/lib/boxcars/boxcar/embeddings/hnswlib/build_vector_store.rb +159 -0
- data/lib/boxcars/boxcar/embeddings/hnswlib/hnswlib_config.rb +56 -0
- data/lib/boxcars/boxcar/embeddings/hnswlib/hnswlib_search.rb +54 -0
- data/lib/boxcars/boxcar/embeddings/hnswlib/save_to_hnswlib.rb +80 -0
- data/lib/boxcars/boxcar/embeddings/similarity_search.rb +51 -0
- data/lib/boxcars/boxcar/embeddings/split_text.rb +104 -0
- data/lib/boxcars/boxcar/embeddings.rb +31 -0
- data/lib/boxcars/boxcar/google_search.rb +9 -1
- data/lib/boxcars/embedding.rb +11 -0
- data/lib/boxcars/engine/gpt4all_eng.rb +56 -0
- data/lib/boxcars/engine.rb +1 -0
- data/lib/boxcars/version.rb +1 -1
- data/lib/boxcars.rb +1 -0
- metadata +27 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 69b70e1d02b1ec206438eaaf857a0495fe35ab01e64a265656fe21230675306f
|
|
4
|
+
data.tar.gz: 8681b9625a0684f1091eea7a4626964929b271370068c90b82dabeee4253d803
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: eb5c0c00f8fcdbbd6d8a1999d7544fc584c701fdf9a8a9c271fff6d9795f75ef9cab058fee2c6829808a764c892cc3e2f4e4a8717155d34d6514b46d744e632c
|
|
7
|
+
data.tar.gz: b8fb4ad34d7b93d47388f037d1d93e9e7245303740bc04d58d21942112ff97315e5dd31fdba77e275b52ddba85ca1055b897646ae1e606daee485583d50c52a6
|
data/CHANGELOG.md
CHANGED
|
@@ -2,7 +2,22 @@
|
|
|
2
2
|
|
|
3
3
|
## [Unreleased](https://github.com/BoxcarsAI/boxcars/tree/HEAD)
|
|
4
4
|
|
|
5
|
-
[Full Changelog](https://github.com/BoxcarsAI/boxcars/compare/v0.2.
|
|
5
|
+
[Full Changelog](https://github.com/BoxcarsAI/boxcars/compare/v0.2.7...HEAD)
|
|
6
|
+
|
|
7
|
+
**Closed issues:**
|
|
8
|
+
|
|
9
|
+
- Getting the same verbosity as in the examples [\#54](https://github.com/BoxcarsAI/boxcars/issues/54)
|
|
10
|
+
|
|
11
|
+
**Merged pull requests:**
|
|
12
|
+
|
|
13
|
+
- Add Engine for Gpt4all [\#55](https://github.com/BoxcarsAI/boxcars/pull/55) ([francis](https://github.com/francis))
|
|
14
|
+
- update google search to return URL for result if present [\#53](https://github.com/BoxcarsAI/boxcars/pull/53) ([francis](https://github.com/francis))
|
|
15
|
+
- Draft: added gpt4all [\#49](https://github.com/BoxcarsAI/boxcars/pull/49) ([jaigouk](https://github.com/jaigouk))
|
|
16
|
+
- Embeddings with hnswlib [\#48](https://github.com/BoxcarsAI/boxcars/pull/48) ([jaigouk](https://github.com/jaigouk))
|
|
17
|
+
|
|
18
|
+
## [v0.2.7](https://github.com/BoxcarsAI/boxcars/tree/v0.2.7) (2023-04-13)
|
|
19
|
+
|
|
20
|
+
[Full Changelog](https://github.com/BoxcarsAI/boxcars/compare/v0.2.5...v0.2.7)
|
|
6
21
|
|
|
7
22
|
**Closed issues:**
|
|
8
23
|
|
data/Gemfile
CHANGED
data/Gemfile.lock
CHANGED
|
@@ -1,8 +1,9 @@
|
|
|
1
1
|
PATH
|
|
2
2
|
remote: .
|
|
3
3
|
specs:
|
|
4
|
-
boxcars (0.2.7)
|
|
4
|
+
boxcars (0.2.8)
|
|
5
5
|
google_search_results (~> 2.2)
|
|
6
|
+
gpt4all (~> 0.0.4)
|
|
6
7
|
ruby-openai (~> 3.0)
|
|
7
8
|
|
|
8
9
|
GEM
|
|
@@ -71,7 +72,12 @@ GEM
|
|
|
71
72
|
rainbow (>= 2.2.1)
|
|
72
73
|
rake (>= 10.0)
|
|
73
74
|
google_search_results (2.2.0)
|
|
75
|
+
gpt4all (0.0.5)
|
|
76
|
+
faraday (~> 2.7)
|
|
77
|
+
os (~> 1.1)
|
|
78
|
+
tty-progressbar (~> 0.18.2)
|
|
74
79
|
hashdiff (1.0.1)
|
|
80
|
+
hnswlib (0.8.1)
|
|
75
81
|
http-accept (1.7.0)
|
|
76
82
|
http-cookie (1.0.5)
|
|
77
83
|
domain_name (~> 0.5)
|
|
@@ -100,6 +106,7 @@ GEM
|
|
|
100
106
|
octokit (4.25.1)
|
|
101
107
|
faraday (>= 1, < 3)
|
|
102
108
|
sawyer (~> 0.9)
|
|
109
|
+
os (1.1.4)
|
|
103
110
|
parallel (1.22.1)
|
|
104
111
|
parser (3.2.1.1)
|
|
105
112
|
ast (~> 2.4.1)
|
|
@@ -163,10 +170,19 @@ GEM
|
|
|
163
170
|
faraday (>= 0.17.3, < 3)
|
|
164
171
|
sqlite3 (1.6.2)
|
|
165
172
|
mini_portile2 (~> 2.8.0)
|
|
173
|
+
sqlite3 (1.6.2-arm64-darwin)
|
|
166
174
|
sqlite3 (1.6.2-x86_64-darwin)
|
|
167
175
|
sqlite3 (1.6.2-x86_64-linux)
|
|
176
|
+
strings-ansi (0.2.0)
|
|
168
177
|
timers (4.3.5)
|
|
169
178
|
traces (0.9.1)
|
|
179
|
+
tty-cursor (0.7.1)
|
|
180
|
+
tty-progressbar (0.18.2)
|
|
181
|
+
strings-ansi (~> 0.2)
|
|
182
|
+
tty-cursor (~> 0.7)
|
|
183
|
+
tty-screen (~> 0.8)
|
|
184
|
+
unicode-display_width (>= 1.6, < 3.0)
|
|
185
|
+
tty-screen (0.8.1)
|
|
170
186
|
tzinfo (2.0.6)
|
|
171
187
|
concurrent-ruby (~> 1.0)
|
|
172
188
|
unf (0.1.4)
|
|
@@ -181,6 +197,7 @@ GEM
|
|
|
181
197
|
hashdiff (>= 0.4.0, < 2.0.0)
|
|
182
198
|
|
|
183
199
|
PLATFORMS
|
|
200
|
+
arm64-darwin-22
|
|
184
201
|
universal-java-11
|
|
185
202
|
x86_64-darwin-21
|
|
186
203
|
x86_64-darwin-22
|
|
@@ -194,6 +211,7 @@ DEPENDENCIES
|
|
|
194
211
|
dotenv (~> 2.8)
|
|
195
212
|
faraday-retry (~> 2.0)
|
|
196
213
|
github_changelog_generator (~> 1.16)
|
|
214
|
+
hnswlib (~> 0.8.1)
|
|
197
215
|
rake (~> 13.0)
|
|
198
216
|
rest-client (~> 2.1)
|
|
199
217
|
rspec (~> 3.2)
|
data/README.md
CHANGED
data/boxcars.gemspec
CHANGED
|
@@ -37,6 +37,7 @@ Gem::Specification.new do |spec|
|
|
|
37
37
|
|
|
38
38
|
# runtime dependencies
|
|
39
39
|
spec.add_dependency "google_search_results", "~> 2.2"
|
|
40
|
+
spec.add_dependency "gpt4all", "~> 0.0.4"
|
|
40
41
|
spec.add_dependency "ruby-openai", "~> 3.0"
|
|
41
42
|
|
|
42
43
|
# For more information and examples about making a new gem, checkout our
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Boxcars
  module Embeddings
    # A piece of text together with a free-form metadata hash.
    class Document
      attr_accessor :page_content, :metadata

      # @param fields [Hash] optional initial values.
      # @option fields [String] :page_content the document text. Defaults to an empty string.
      # @option fields [Hash] :metadata data attached to the text. Defaults to an empty hash.
      def initialize(fields = {})
        # `+""` yields an unfrozen string even under `frozen_string_literal: true`,
        # so callers can append to the default through the accessor.
        @page_content = fields[:page_content] || +""
        @metadata = fields[:metadata] || {}
      end
    end
  end
end
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'openai'
|
|
4
|
+
|
|
5
|
+
module Boxcars
  module Embeddings
    # Converts an array of strings into embedding vectors by calling the
    # embeddings endpoint of an OpenAI client, one request per string.
    class EmbedViaOpenAI
      include Embeddings

      attr_accessor :texts, :openai_connection, :model

      # @param texts [Array<String>] the strings to embed.
      # @param openai_connection [OpenAI::Client] the client used for the requests.
      # @param model [String] the embedding model name.
      # @raise [Boxcars::ValueError] if texts or openai_connection have the wrong type.
      def initialize(texts:, openai_connection:, model: 'text-embedding-ada-002')
        validate_params(texts, openai_connection)
        @texts = texts
        @openai_connection = openai_connection
        @model = model
      end

      # @return [Array<Hash>] one `{ embedding:, dim: }` hash per input text, in input order.
      # @raise [Boxcars::Error] if a response carries no embedding data.
      def call
        texts.map do |text|
          embedding = embedding_with_retry(model: model, input: strip_new_lines(text))
          { embedding: embedding, dim: embedding.size }
        end
      end

      private

      def validate_params(texts, openai_connection)
        raise_error 'texts must be an array of strings' unless texts.is_a?(Array) && texts.all? { |text| text.is_a?(String) }
        raise_error 'openai_connection must be an OpenAI::Client' unless openai_connection.is_a?(OpenAI::Client)
      end

      # Performs a single embeddings request (no retry is attempted, despite the name).
      # @param request [Hash] the parameters sent to the embeddings endpoint.
      # @return [Array] the embedding of the first data entry in the response.
      def embedding_with_retry(request)
        response = @openai_connection.embeddings(parameters: request)
        data = response['data']
        first = data.first if data.is_a?(Array)
        embedding = first['embedding'] if first.is_a?(Hash)
        return embedding if embedding

        # Without this guard a failed request surfaces as NoMethodError on nil,
        # which hides the reason reported in the response.
        raise ::Boxcars::Error, "OpenAI embeddings request failed: #{failure_reason(response)}"
      end

      # Pulls a human readable reason out of a response that has no embedding.
      def failure_reason(response)
        error = response['error']
        reason = error.is_a?(Hash) ? error['message'] : error
        reason || 'no embedding data in response'
      end

      def strip_new_lines(text)
        text.tr("\n", ' ')
      end

      def raise_error(message)
        raise ::Boxcars::ValueError, message
      end
    end
  end
end
|
|
@@ -0,0 +1,159 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'fileutils'
|
|
4
|
+
require 'hnswlib'
|
|
5
|
+
require 'json'
|
|
6
|
+
|
|
7
|
+
module Boxcars
  module Embeddings
    module Hnswlib
      # Builds the hnswlib vector store for similarity search when a rebuild is
      # required, then loads the saved index and the saved document data.
      #
      # A rebuild reads the training files, splits them into chunks, requests an
      # embedding per chunk, and saves the index, the config JSON and the document JSON.
      class BuildVectorStore
        include Embeddings

        # @param training_data_path [String] The path to the training data. Can be a glob pattern.
        # @param index_file_path [String] The path to the index file.
        # @param split_chunk_size [Integer] The chunk size handed to SplitText. Defaults to 2000.
        # @param json_doc_file_path [String, nil] The json file holding the document data.
        #   If nil, the index file path with `.bin` replaced by `.json` is used.
        # @param force_rebuild [Boolean] If true, rebuild even when the index already exists.
        def initialize(
          training_data_path:,
          index_file_path:,
          split_chunk_size: 2000,
          json_doc_file_path: nil,
          force_rebuild: true
        )
          @training_data_path = training_data_path
          @index_file_path = index_file_path
          @split_chunk_size = split_chunk_size
          @json_doc_file_path = json_doc_file_path || index_file_path.gsub(/\.bin$/, '.json')
          @force_rebuild = force_rebuild
        end

        # @return [Hash] `{ vector_store:, document_embeddings: }` loaded from disk.
        # @raise [Boxcars::Error] on invalid parameters, missing files or unparsable JSON.
        def call
          validate_params
          # Decide once: the training files are only read when a rebuild will happen.
          rebuild_vector_store if rebuild_required?
          load_hnsw
        end

        private

        attr_reader :training_data_path, :index_file_path, :split_chunk_size, :json_doc_file_path, :force_rebuild

        def validate_params
          training_data_dir = File.dirname(training_data_path.gsub(/\*{1,2}/, ''))
          raise_error('training_data_path parent directory must exist') unless File.directory?(training_data_dir)
          raise_error('No files found at the training_data_path pattern') if Dir.glob(training_data_path).empty?

          index_dir = File.dirname(index_file_path)
          raise_error('index_file_path parent directory must exist') unless File.directory?(index_dir)

          raise_error('split_chunk_size must be an integer') unless split_chunk_size.is_a?(Integer)
        end

        # Runs the load -> split -> embed -> save pipeline.
        def rebuild_vector_store
          documents = split_text_into_chunks(load_files)
          save_vector_store(generate_embeddings(documents))
        end

        # @return [Array<String>] the contents of every file matching training_data_path.
        def load_files
          files = Dir.glob(training_data_path)
          raise_error "No files found at #{training_data_path}" if files.empty?

          data = files.map { |file| File.read(file) }
          puts "Added #{files.length} files to data. Splitting text into chunks..."
          data
        end

        # @param data [Array<String>] file contents.
        # @return [Array<String>] the chunks of all files, concatenated in file order.
        def split_text_into_chunks(data)
          data.flat_map do |file_text|
            Boxcars::Embeddings::SplitText.call(
              separator: "\n", chunk_size: split_chunk_size, chunk_overlap: 0, text: file_text
            )
          end
        end

        # A rebuild is needed when forced, or when the index file or its config file is missing.
        def rebuild_required?
          return true if force_rebuild

          !(File.exist?(index_file_path) && File.exist?(hnswlib_config_path))
        end

        # The config JSON always sits next to the index file.
        def hnswlib_config_path
          "#{File.dirname(index_file_path)}/hnswlib_config.json"
        end

        # @param documents [Array<String>] the text chunks to embed.
        # @return [Hash] `{ document_embeddings:, dim: }`; dim is taken from the first embedding.
        def generate_embeddings(documents)
          # With no chunks there is no first embedding to read the dimension from.
          raise_error('No text chunks were produced from the training data') if documents.empty?

          puts "Initializing Store..."
          openai_client = OpenAI::Client.new(access_token: ENV.fetch('OPENAI_API_KEY', nil))

          embeddings_with_dim = Boxcars::Embeddings::EmbedViaOpenAI.call(texts: documents, openai_connection: openai_client)

          document_embeddings = embeddings_with_dim.map.with_index do |item, index|
            { doc_id: index, embedding: item[:embedding], document: documents[index] }
          end

          { document_embeddings: document_embeddings, dim: embeddings_with_dim.first[:dim] }
        end

        def save_vector_store(embeddings_with_config)
          puts "Saving Vectorstore"
          Boxcars::Embeddings::Hnswlib::SaveToHnswlib.call(
            document_embeddings: embeddings_with_config[:document_embeddings],
            index_file_path: index_file_path,
            json_doc_file_path: json_doc_file_path,
            hnswlib_config: hnswlib_config(embeddings_with_config[:dim])
          )
          puts "VectorStore saved"
        end

        def hnswlib_config(dim)
          # dim: length of datum point vector that will be indexed.
          Boxcars::Embeddings::Hnswlib::HnswlibConfig.new(
            metric: "l2", max_item: 10_000, dim: dim
          )
        end

        def load_hnsw
          puts "Loading Hnswlib"

          json_config = parse_json_file(hnswlib_config_path)
          document_embeddings = parse_json_file(json_doc_file_path)

          # The config stores a metric name; translate it to a space name with the
          # same mapping HnswlibConfig#space uses ('dot' -> 'ip', anything else -> 'l2').
          space = json_config[:metric] == 'dot' ? 'ip' : 'l2'
          search_index = ::Hnswlib::HierarchicalNSW.new(space: space, dim: json_config[:dim])
          search_index.load_index(index_file_path)

          { vector_store: search_index, document_embeddings: document_embeddings }
        end

        # @param file_path [String, nil] a JSON file; nil yields an empty array.
        # @return [Object] the parsed content with symbolized keys.
        def parse_json_file(file_path)
          return [] if file_path.nil?

          file_content = File.read(file_path)
          JSON.parse(file_content, symbolize_names: true)
        rescue JSON::ParserError => e
          # Name the file that actually failed; this helper parses more than the config.
          raise_error("Error parsing #{file_path}: #{e.message}")
        end

        def raise_error(message)
          raise ::Boxcars::Error, message
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'json'
|
|
4
|
+
|
|
5
|
+
module Boxcars
  module Embeddings
    module Hnswlib
      # Value object holding the settings used for the hnswlib search index,
      # serializable to JSON.
      class HnswlibConfig
        attr_reader :metric, :max_item, :dim, :ef_construction, :max_outgoing_connection

        # `m` used to be declared as a reader without a backing value; it now
        # answers with the max_outgoing_connection setting.
        alias m max_outgoing_connection

        # used for search index.
        #
        # @param metric [String] The distance metric between vectors ('l2', 'dot', or 'cosine').
        #
        # @param max_item [Integer] The maximum number of items.
        #
        # @param dim [Integer] The length of the vectors that will be indexed.
        #
        # @param ef_construction [Integer] The size of the dynamic list for the nearest neighbors.
        #   It controls the index time/accuracy trade-off.
        #
        # @param max_outgoing_connection [Integer] The maximum number of outgoing connections in the graph
        #
        # reference: https://yoshoku.github.io/hnswlib.rb/doc/
        def initialize(
          metric: "l2",
          max_item: 10_000,
          dim: 2,
          ef_construction: 200,
          max_outgoing_connection: 16
        )
          @metric = metric
          @max_item = max_item
          @dim = dim
          @ef_construction = ef_construction
          @max_outgoing_connection = max_outgoing_connection
        end

        # @return [String] 'ip' when the metric is 'dot', otherwise 'l2'.
        def space
          @metric == 'dot' ? 'ip' : 'l2'
        end

        # @return [String] the settings as pretty-printed JSON.
        def to_json(*args)
          JSON.pretty_generate(
            {
              metric: @metric,
              max_item: @max_item,
              dim: @dim,
              ef_construction: @ef_construction,
              max_outgoing_connection: @max_outgoing_connection
            },
            *args
          )
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'hnswlib'
|
|
4
|
+
require 'json'
|
|
5
|
+
|
|
6
|
+
module Boxcars
  module Embeddings
    module Hnswlib
      # Runs a k-nearest-neighbour query against an hnswlib index and maps the
      # returned ids back to the documents stored in a JSON file.
      class HnswlibSearch
        # @param vector_store [::Hnswlib::HierarchicalNSW] the loaded search index.
        # @param options [Hash]
        # @option options [String] :json_doc_path path of the JSON file with the document entries.
        # @option options [Integer] :num_neighbors how many neighbours to request. Defaults to 1.
        # @raise [ArgumentError] if vector_store is not an Hnswlib::HierarchicalNSW.
        def initialize(vector_store:, options: {})
          validate_params(vector_store)
          @vector_store = vector_store
          @json_doc_path = options[:json_doc_path]
          @num_neighbors = options[:num_neighbors] || 1
        end

        # @param query [Array<Float>] the query vector.
        # @return [Array<Hash>] `{ document:, distance: }` for every hit whose id is in the JSON file.
        def call(query)
          search(query)
        end

        private

        attr_reader :json_doc_path, :vector_store, :num_neighbors

        def validate_params(vector_store)
          raise_error 'vector_store must be an Hnswlib::HierarchicalNSW' unless vector_store.is_a?(::Hnswlib::HierarchicalNSW)
        end

        def search(query)
          raw_results = vector_store.search_knn(query, num_neighbors)
          neighbor_pairs(raw_results).map { |doc_id, distance| lookup_document(doc_id, distance) }.compact
        end

        # Normalizes the index result into `[doc_id, distance]` pairs.
        # NOTE(review): hnswlib.rb appears to return two parallel arrays
        # (`[ids, distances]`) from search_knn — confirm against the gem docs.
        # A result that is already a list of pairs is passed through unchanged.
        def neighbor_pairs(raw_results)
          ids, distances = raw_results
          parallel = raw_results.size == 2 && ids.is_a?(Array) && distances.is_a?(Array) &&
                     ids.size == distances.size && ids.all?(Integer)
          parallel ? ids.zip(distances) : raw_results
        end

        def lookup_document(doc_id, distance)
          entry = documents_by_id[doc_id]
          return unless entry

          { document: entry[:document], distance: distance }
        end

        # Index built once so each hit is a hash lookup instead of a scan of the
        # whole file; the first entry wins when a doc_id repeats.
        def documents_by_id
          @documents_by_id ||= parsed_data.each_with_object({}) do |entry, index|
            index[entry[:doc_id]] ||= entry
          end
        end

        def parsed_data
          @parsed_data ||= JSON.parse(
            File.read(json_doc_path),
            symbolize_names: true
          )
        end

        def raise_error(message)
          raise ArgumentError, message
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'hnswlib'
|
|
4
|
+
require 'json'
|
|
5
|
+
require 'fileutils'
|
|
6
|
+
|
|
7
|
+
module Boxcars
  module Embeddings
    module Hnswlib
      # Adds document embeddings to a new hnswlib index and writes three files:
      # the document JSON, `hnswlib_config.json` next to the index, and the index itself.
      class SaveToHnswlib
        include Embeddings

        # @param document_embeddings [Array] An array of hashes containing the document id, document text, and embedding.
        # @param index_file_path [String] The path to the index file.
        # @param hnswlib_config [Boxcars::Embeddings::Hnswlib::HnswlibConfig] The config object for the hnswlib index.
        # @param json_doc_file_path [String, nil] Optional. The path to the json file containing the document text.
        #   If nil, the index file path with `.bin` replaced by `.json` is used.
        def initialize(document_embeddings:, index_file_path:, hnswlib_config:, json_doc_file_path: nil)
          # Only store the arguments here: nothing is derived from them until
          # #call has validated them.
          @document_embeddings = document_embeddings
          @index_file_path = index_file_path
          @json_doc_file_path = json_doc_file_path
          @hnswlib_config = hnswlib_config
        end

        # @return [Object] whatever the index's #save returns.
        # @raise [Boxcars::ValueError] if a parameter is invalid or a parent directory is missing.
        def call
          validate_params

          index = build_index
          document_embeddings.each do |embedding|
            index.add_item(embedding[:doc_id], embedding[:embedding])
          end

          write_files(index, document_texts)
        end

        private

        attr_reader :document_embeddings, :index_file_path, :hnswlib_config

        # Resolved on first use so a non-string index_file_path is reported by validation.
        def json_doc_file_path
          @json_doc_file_path ||= index_file_path.gsub(/\.bin$/, '.json')
        end

        def build_index
          ::Hnswlib::HnswIndex.new(
            n_features: hnswlib_config.dim,
            max_item: hnswlib_config.max_item,
            metric: hnswlib_config.metric
          )
        end

        # The entries written to the document JSON file.
        def document_texts
          document_embeddings.map do |embedding|
            { doc_id: embedding[:doc_id], embedding: embedding[:embedding], document: embedding[:document] }
          end
        end

        def write_files(index, document_texts)
          FileUtils.mkdir_p(File.dirname(json_doc_file_path))
          File.write(json_doc_file_path, document_texts.to_json)

          FileUtils.mkdir_p(File.dirname(index_file_path))
          File.write("#{File.dirname(index_file_path)}/hnswlib_config.json", hnswlib_config.to_json)

          index.save(index_file_path)
        end

        def validate_params
          raise_error("document_embeddings must be an array") unless document_embeddings.is_a?(Array)
          raise_error("dim must be an integer") unless hnswlib_config.dim.is_a?(Integer)
          raise_error("index_file_path must be a string") unless index_file_path.is_a?(String)

          [index_file_path, json_doc_file_path].each do |path|
            check_parent_directory(path)
          end
        end

        def check_parent_directory(path)
          return unless path

          parent_dir = File.dirname(path)
          raise_error('parent directory must exist') unless File.directory?(parent_dir)
        end

        def raise_error(message)
          raise ::Boxcars::ValueError, message
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'hnswlib'
|
|
4
|
+
|
|
5
|
+
module Boxcars
  module Embeddings
    # Embeds a text query through OpenAI and looks up its nearest neighbours
    # in a vector store.
    class SimilaritySearch
      # @param embeddings [String] handed to the search backend as `json_doc_path`,
      #   i.e. the path of the JSON file holding the document entries.
      # @param vector_store [::Hnswlib::HierarchicalNSW] the loaded search index.
      # @param openai_connection [OpenAI::Client] the client used to embed the query.
      # @param num_neighbors [Integer] how many neighbours to request per query. Defaults to 2.
      # @raise [ArgumentError] if the vector store type is not supported.
      def initialize(embeddings:, vector_store:, openai_connection:, num_neighbors: 2)
        @embeddings = embeddings
        @vector_store = vector_store
        @openai_connection = openai_connection
        @num_neighbors = num_neighbors
        @similarity_search_instance = create_similarity_search_instance
      end

      # @param query [String] the text to search for.
      # @return [Object] the result of the backend search for the embedded query.
      # @raise [ArgumentError] if query is not a non-empty string.
      def call(query:)
        validate_query(query)
        query_vector = convert_query_to_vector(query)
        @similarity_search_instance.call(query_vector)
      end

      private

      attr_reader :embeddings, :vector_store, :openai_connection, :num_neighbors

      def validate_query(query)
        raise_error 'query must be a string' unless query.is_a?(String)
        raise_error 'query must not be empty' if query.empty?
      end

      def convert_query_to_vector(query)
        Boxcars::Embeddings::EmbedViaOpenAI.call(texts: [query], openai_connection: openai_connection).first[:embedding]
      end

      # Picks the search backend that matches the vector store's class.
      def create_similarity_search_instance
        case vector_store
        when ::Hnswlib::HierarchicalNSW
          Boxcars::Embeddings::Hnswlib::HnswlibSearch.new(
            vector_store: vector_store,
            options: { json_doc_path: embeddings, num_neighbors: num_neighbors }
          )
        else
          raise_error 'Unsupported vector store provided'
        end
      end

      def raise_error(message)
        raise ArgumentError, message
      end
    end
  end
end
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Boxcars
  module Embeddings
    # Split a text into chunks of a given size.
    #
    # The text is cut at every separator and the pieces are merged back into
    # chunks; sizes are counted as the summed length of the pieces, without
    # the separators used to join them.
    class SplitText
      include Embeddings

      attr_reader :separator, :chunk_size, :chunk_overlap, :text

      # @param separator [String] The string to use to split the text.
      # @param chunk_size [Integer] The size of each chunk.
      # @param chunk_overlap [Integer] The amount of overlap between chunks.
      # @param text [String] The text to split.
      # @raise [Boxcars::ValueError] if a parameter has the wrong type or
      #   chunk_overlap is not less than chunk_size.
      def initialize(separator: "Search", chunk_size: 7, chunk_overlap: 3, text: "")
        validate_params(separator, chunk_size, chunk_overlap, text)

        @separator = separator
        @chunk_size = chunk_size
        @chunk_overlap = chunk_overlap
        @text = text
      end

      # @return [Array<String>] the chunks, ordered by Array#sort (not by position in the text).
      def call
        merge_splits(text.split(separator), separator).sort
      end

      private

      def validate_params(separator, chunk_size, chunk_overlap, text)
        raise_error("separator must be a string") unless separator.is_a?(String)
        raise_error("chunk_size must be an integer") unless chunk_size.is_a?(Integer)
        raise_error("chunk_overlap must be an integer") unless chunk_overlap.is_a?(Integer)
        raise_error("text must be a string") unless text.is_a?(String)
        raise_error("chunk_overlap must be less than chunk_size") if chunk_overlap >= chunk_size
      end

      def raise_error(message)
        raise ::Boxcars::ValueError, message
      end

      # Walks the pieces, flushing the buffered pieces into a chunk whenever
      # adding the next piece would reach chunk_size.
      def merge_splits(splits, separator)
        merged_splits = []
        current_doc = []
        total = 0

        splits.each do |split|
          split_len = split.length
          total = process_split(total, split_len, current_doc, merged_splits, separator)
          current_doc << split
          total += split_len
        end

        add_remaining_doc(current_doc, merged_splits, separator)
        merged_splits
      end

      # @return [Integer] the buffered length after an optional flush.
      def process_split(total, split_len, current_doc, merged_splits, separator)
        if total + split_len >= chunk_size
          warn_if_chunk_too_large(total)
          total = handle_large_chunk(total, split_len, current_doc, merged_splits, separator)
        end
        total
      end

      def warn_if_chunk_too_large(total)
        return unless total > chunk_size

        puts "Created a chunk of size #{total}, which is longer than the specified #{chunk_size}"
      end

      # Emits the buffered pieces as one chunk, then trims the buffer down to the overlap.
      def handle_large_chunk(total, split_len, current_doc, merged_splits, separator)
        if current_doc.length.positive?
          doc = join_docs(current_doc, separator)
          merged_splits << doc if doc
          total = remove_overlap(total, split_len, current_doc)
        end
        total
      end

      # Drops pieces from the front of the buffer until what remains fits the
      # overlap and leaves room for the next piece.
      def remove_overlap(total, split_len, current_doc)
        # Stop once the buffer is empty: with a negative chunk_overlap the size
        # condition alone never becomes false and the loop would read a nil piece.
        while !current_doc.empty? &&
              (total > chunk_overlap || (total + split_len > chunk_size && total.positive?))
          total -= current_doc.shift.length
        end
        total
      end

      def add_remaining_doc(current_doc, merged_splits, separator)
        doc = join_docs(current_doc, separator)
        merged_splits << doc if doc
      end

      # @return [String, nil] the joined, stripped pieces; nil when nothing but whitespace is left.
      def join_docs(docs, separator)
        text = docs.join(separator).strip
        text.empty? ? nil : text
      end
    end
  end
end
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
# Boxcars is a framework for running a series of tools to get an answer to a question.
|
|
4
|
+
module Boxcars
  # Mixin for service-style classes: including it gives the class a `call`
  # class method and makes `new` private on that class.
  module Embeddings
    # Class-level API added to every class that includes Embeddings.
    module ClassMethods
      # Error class namespaced under ClassMethods.
      class EmbeddingsError < StandardError; end

      # Instantiates the class with the given arguments and runs the instance's #call.
      # @return [Object] whatever the instance's #call returns.
      def call(*args, **kw_args)
        instance = new(*args, **kw_args)
        instance.call
      end
    end

    # Hook run on `include Embeddings`.
    # @param base [Class] the including class.
    def self.included(base)
      base.extend(ClassMethods)
      base.private_class_method :new
    end
  end
end
|
|
23
|
+
|
|
24
|
+
require_relative "embeddings/document"
|
|
25
|
+
require_relative "embeddings/embed_via_open_ai"
|
|
26
|
+
require_relative "embeddings/split_text"
|
|
27
|
+
require_relative "embeddings/similarity_search"
|
|
28
|
+
require_relative "embeddings/hnswlib/hnswlib_config"
|
|
29
|
+
require_relative "embeddings/hnswlib/save_to_hnswlib"
|
|
30
|
+
require_relative "embeddings/hnswlib/build_vector_store"
|
|
31
|
+
require_relative "embeddings/hnswlib/hnswlib_search"
|
|
@@ -57,7 +57,15 @@ module Boxcars
|
|
|
57
57
|
raise Error, "Got error from SerpAPI: {res[:error]}" if res[:error]
|
|
58
58
|
|
|
59
59
|
ANSWER_LOCATIONS.each do |path|
|
|
60
|
-
|
|
60
|
+
next unless res.dig(*path)
|
|
61
|
+
|
|
62
|
+
Boxcars.debug("Found SERP answer at #{path}", :cyan)
|
|
63
|
+
path_link = path.dup
|
|
64
|
+
last_word = path_link.pop
|
|
65
|
+
path_link << :link
|
|
66
|
+
return { last_word => res.dig(*path), url: res.dig(*path_link) } if last_word.is_a?(Symbol) && res.dig(*path_link)
|
|
67
|
+
|
|
68
|
+
return res.dig(*path)
|
|
61
69
|
end
|
|
62
70
|
"No good search result found"
|
|
63
71
|
end
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
# Boxcars is a framework for running a series of tools to get an answer to a question.
|
|
4
|
+
module Boxcars
  # A Boxcar subclass for embedding-related boxcars; it only declares its own error class.
  class Embedding < Boxcar
    # Error raised under the Embedding namespace.
    class Error < StandardError; end
  end
end
|
|
10
|
+
|
|
11
|
+
require "boxcars/boxcar/embeddings"
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'gpt4all'
|
|
4
|
+
# Boxcars is a framework for running a series of tools to get an answer to a question.
|
|
5
|
+
module Boxcars
  # An engine that uses the local GPT4All API.
  class Gpt4allEng < Engine
    attr_reader :prompts, :model_kwargs, :batch_size

    # the default name of the engine
    DEFAULT_NAME = "Gpt4all engine"
    # the default description of the engine
    DEFAULT_DESCRIPTION = "useful for when you need to use local AI to answer questions. " \
                          "You should ask targeted questions"

    # An engine is a container for a single tool to run.
    # @param name [String] The name of the engine. Defaults to DEFAULT_NAME.
    # @param description [String] A description of the engine. Defaults to DEFAULT_DESCRIPTION.
    # @param prompts [Array<String>] The prompts to use when asking the engine. Defaults to [].
    # @param batch_size [Integer] The number of prompts to send to the engine at once. Defaults to 2.
    def initialize(name: DEFAULT_NAME, description: DEFAULT_DESCRIPTION, prompts: [], batch_size: 2, **_kwargs)
      @prompts = prompts
      @batch_size = batch_size
      super(description: description, name: name)
    end

    # Get an answer from the engine.
    # @param prompt [#as_prompt] The prompt to use when asking the engine.
    # @param inputs [Hash] The values used to format the prompt.
    # @param _kwargs [Hash] Additional parameters; accepted and ignored.
    # @return [Object] the bot's reply, or the result of Boxcars.error when a StandardError was raised.
    def client(prompt:, inputs: {}, **_kwargs)
      gpt4all = Gpt4all::ConversationalAI.new
      gpt4all.prepare_resources(force_download: false)
      gpt4all.start_bot
      input_text = prompt.as_prompt(inputs)[:prompt]
      Boxcars.debug("Prompt after formatting:\n#{input_text}", :cyan) if Boxcars.configuration.log_prompts
      gpt4all.prompt(input_text)
    rescue StandardError => e
      # Array(...).last(5) also copes with a missing or short backtrace.
      Boxcars.error(["Error from gpt4all engine: #{e}", Array(e.backtrace).last(5)].flatten.join("\n "))
    ensure
      # gpt4all is still nil when the constructor itself raised; calling stop_bot
      # on nil would raise from the ensure block and replace the rescue result.
      gpt4all&.stop_bot
    end

    # get an answer from the engine for a question.
    # @param question [String] The question to ask the engine.
    # @param kwargs [Hash] Additional parameters to pass to the engine if wanted.
    # @return [Object] whatever #client returned for the question.
    def run(question, **kwargs)
      prompt = Prompt.new(template: question)
      answer = client(prompt: prompt, **kwargs)
      Boxcars.debug("Answer: #{answer}", :cyan)
      answer
    end
  end
end
|
data/lib/boxcars/engine.rb
CHANGED
data/lib/boxcars/version.rb
CHANGED
data/lib/boxcars.rb
CHANGED
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: boxcars
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.2.7
|
|
4
|
+
version: 0.2.8
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Francis Sullivan
|
|
@@ -9,7 +9,7 @@ authors:
|
|
|
9
9
|
autorequire:
|
|
10
10
|
bindir: exe
|
|
11
11
|
cert_chain: []
|
|
12
|
-
date: 2023-04-
|
|
12
|
+
date: 2023-04-19 00:00:00.000000000 Z
|
|
13
13
|
dependencies:
|
|
14
14
|
- !ruby/object:Gem::Dependency
|
|
15
15
|
name: debug
|
|
@@ -67,6 +67,20 @@ dependencies:
|
|
|
67
67
|
- - "~>"
|
|
68
68
|
- !ruby/object:Gem::Version
|
|
69
69
|
version: '2.2'
|
|
70
|
+
- !ruby/object:Gem::Dependency
|
|
71
|
+
name: gpt4all
|
|
72
|
+
requirement: !ruby/object:Gem::Requirement
|
|
73
|
+
requirements:
|
|
74
|
+
- - "~>"
|
|
75
|
+
- !ruby/object:Gem::Version
|
|
76
|
+
version: 0.0.4
|
|
77
|
+
type: :runtime
|
|
78
|
+
prerelease: false
|
|
79
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
80
|
+
requirements:
|
|
81
|
+
- - "~>"
|
|
82
|
+
- !ruby/object:Gem::Version
|
|
83
|
+
version: 0.0.4
|
|
70
84
|
- !ruby/object:Gem::Dependency
|
|
71
85
|
name: ruby-openai
|
|
72
86
|
requirement: !ruby/object:Gem::Requirement
|
|
@@ -106,6 +120,15 @@ files:
|
|
|
106
120
|
- lib/boxcars/boxcar.rb
|
|
107
121
|
- lib/boxcars/boxcar/active_record.rb
|
|
108
122
|
- lib/boxcars/boxcar/calculator.rb
|
|
123
|
+
- lib/boxcars/boxcar/embeddings.rb
|
|
124
|
+
- lib/boxcars/boxcar/embeddings/document.rb
|
|
125
|
+
- lib/boxcars/boxcar/embeddings/embed_via_open_ai.rb
|
|
126
|
+
- lib/boxcars/boxcar/embeddings/hnswlib/build_vector_store.rb
|
|
127
|
+
- lib/boxcars/boxcar/embeddings/hnswlib/hnswlib_config.rb
|
|
128
|
+
- lib/boxcars/boxcar/embeddings/hnswlib/hnswlib_search.rb
|
|
129
|
+
- lib/boxcars/boxcar/embeddings/hnswlib/save_to_hnswlib.rb
|
|
130
|
+
- lib/boxcars/boxcar/embeddings/similarity_search.rb
|
|
131
|
+
- lib/boxcars/boxcar/embeddings/split_text.rb
|
|
109
132
|
- lib/boxcars/boxcar/engine_boxcar.rb
|
|
110
133
|
- lib/boxcars/boxcar/google_search.rb
|
|
111
134
|
- lib/boxcars/boxcar/sql.rb
|
|
@@ -113,8 +136,10 @@ files:
|
|
|
113
136
|
- lib/boxcars/boxcar/wikipedia_search.rb
|
|
114
137
|
- lib/boxcars/conversation.rb
|
|
115
138
|
- lib/boxcars/conversation_prompt.rb
|
|
139
|
+
- lib/boxcars/embedding.rb
|
|
116
140
|
- lib/boxcars/engine.rb
|
|
117
141
|
- lib/boxcars/engine/engine_result.rb
|
|
142
|
+
- lib/boxcars/engine/gpt4all_eng.rb
|
|
118
143
|
- lib/boxcars/engine/openai.rb
|
|
119
144
|
- lib/boxcars/generation.rb
|
|
120
145
|
- lib/boxcars/prompt.rb
|