boxcars 0.2.11 → 0.2.13
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.env_sample +1 -0
- data/.rubocop.yml +16 -0
- data/CHANGELOG.md +12 -0
- data/Gemfile +15 -11
- data/Gemfile.lock +40 -32
- data/README.md +4 -1
- data/boxcars.gemspec +4 -7
- data/lib/boxcars/boxcar/active_record.rb +2 -2
- data/lib/boxcars/boxcar/engine_boxcar.rb +2 -2
- data/lib/boxcars/boxcar/sql.rb +1 -1
- data/lib/boxcars/boxcar/swagger.rb +1 -1
- data/lib/boxcars/boxcar/vector_answer.rb +71 -0
- data/lib/boxcars/boxcar.rb +2 -0
- data/lib/boxcars/engine/openai.rb +8 -1
- data/lib/boxcars/train/zero_shot.rb +1 -1
- data/lib/boxcars/train.rb +1 -1
- data/lib/boxcars/vector_search.rb +66 -2
- data/lib/boxcars/vector_store/document.rb +3 -2
- data/lib/boxcars/vector_store/embed_via_open_ai.rb +2 -2
- data/lib/boxcars/vector_store/hnswlib/build_from_files.rb +104 -0
- data/lib/boxcars/vector_store/hnswlib/load_from_disk.rb +57 -0
- data/lib/boxcars/vector_store/hnswlib/save_to_hnswlib.rb +48 -38
- data/lib/boxcars/vector_store/hnswlib/search.rb +70 -0
- data/lib/boxcars/vector_store/in_memory/build_from_document_array.rb +51 -0
- data/lib/boxcars/vector_store/in_memory/build_from_files.rb +61 -0
- data/lib/boxcars/vector_store/in_memory/search.rb +29 -49
- data/lib/boxcars/vector_store/pgvector/build_from_array.rb +95 -0
- data/lib/boxcars/vector_store/pgvector/build_from_files.rb +97 -0
- data/lib/boxcars/vector_store/pgvector/save_to_database.rb +152 -0
- data/lib/boxcars/vector_store/pgvector/search.rb +144 -0
- data/lib/boxcars/vector_store/split_text.rb +2 -3
- data/lib/boxcars/vector_store.rb +73 -7
- data/lib/boxcars/version.rb +1 -1
- data/lib/boxcars.rb +1 -1
- metadata +31 -40
- data/lib/boxcars/vector_store/hnswlib/build_vector_store.rb +0 -157
- data/lib/boxcars/vector_store/hnswlib/hnswlib_config.rb +0 -56
- data/lib/boxcars/vector_store/hnswlib/hnswlib_search.rb +0 -54
- data/lib/boxcars/vector_store/in_memory/add_documents.rb +0 -67
- data/lib/boxcars/vector_store/similarity_search.rb +0 -55
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: boxcars
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.
|
4
|
+
version: 0.2.13
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Francis Sullivan
|
@@ -9,92 +9,78 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: exe
|
11
11
|
cert_chain: []
|
12
|
-
date: 2023-05-
|
12
|
+
date: 2023-05-24 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
|
-
name:
|
16
|
-
requirement: !ruby/object:Gem::Requirement
|
17
|
-
requirements:
|
18
|
-
- - "~>"
|
19
|
-
- !ruby/object:Gem::Version
|
20
|
-
version: '1.1'
|
21
|
-
type: :development
|
22
|
-
prerelease: false
|
23
|
-
version_requirements: !ruby/object:Gem::Requirement
|
24
|
-
requirements:
|
25
|
-
- - "~>"
|
26
|
-
- !ruby/object:Gem::Version
|
27
|
-
version: '1.1'
|
28
|
-
- !ruby/object:Gem::Dependency
|
29
|
-
name: dotenv
|
15
|
+
name: google_search_results
|
30
16
|
requirement: !ruby/object:Gem::Requirement
|
31
17
|
requirements:
|
32
18
|
- - "~>"
|
33
19
|
- !ruby/object:Gem::Version
|
34
|
-
version: '2.
|
35
|
-
type: :
|
20
|
+
version: '2.2'
|
21
|
+
type: :runtime
|
36
22
|
prerelease: false
|
37
23
|
version_requirements: !ruby/object:Gem::Requirement
|
38
24
|
requirements:
|
39
25
|
- - "~>"
|
40
26
|
- !ruby/object:Gem::Version
|
41
|
-
version: '2.
|
27
|
+
version: '2.2'
|
42
28
|
- !ruby/object:Gem::Dependency
|
43
|
-
name:
|
29
|
+
name: gpt4all
|
44
30
|
requirement: !ruby/object:Gem::Requirement
|
45
31
|
requirements:
|
46
32
|
- - "~>"
|
47
33
|
- !ruby/object:Gem::Version
|
48
|
-
version:
|
49
|
-
type: :
|
34
|
+
version: 0.0.4
|
35
|
+
type: :runtime
|
50
36
|
prerelease: false
|
51
37
|
version_requirements: !ruby/object:Gem::Requirement
|
52
38
|
requirements:
|
53
39
|
- - "~>"
|
54
40
|
- !ruby/object:Gem::Version
|
55
|
-
version:
|
41
|
+
version: 0.0.4
|
56
42
|
- !ruby/object:Gem::Dependency
|
57
|
-
name:
|
43
|
+
name: hnswlib
|
58
44
|
requirement: !ruby/object:Gem::Requirement
|
59
45
|
requirements:
|
60
46
|
- - "~>"
|
61
47
|
- !ruby/object:Gem::Version
|
62
|
-
version: '
|
48
|
+
version: '0.8'
|
63
49
|
type: :runtime
|
64
50
|
prerelease: false
|
65
51
|
version_requirements: !ruby/object:Gem::Requirement
|
66
52
|
requirements:
|
67
53
|
- - "~>"
|
68
54
|
- !ruby/object:Gem::Version
|
69
|
-
version: '
|
55
|
+
version: '0.8'
|
70
56
|
- !ruby/object:Gem::Dependency
|
71
|
-
name:
|
57
|
+
name: ruby-openai
|
72
58
|
requirement: !ruby/object:Gem::Requirement
|
73
59
|
requirements:
|
74
60
|
- - "~>"
|
75
61
|
- !ruby/object:Gem::Version
|
76
|
-
version:
|
62
|
+
version: '4.1'
|
77
63
|
type: :runtime
|
78
64
|
prerelease: false
|
79
65
|
version_requirements: !ruby/object:Gem::Requirement
|
80
66
|
requirements:
|
81
67
|
- - "~>"
|
82
68
|
- !ruby/object:Gem::Version
|
83
|
-
version:
|
69
|
+
version: '4.1'
|
84
70
|
- !ruby/object:Gem::Dependency
|
85
|
-
name:
|
71
|
+
name: pgvector
|
86
72
|
requirement: !ruby/object:Gem::Requirement
|
87
73
|
requirements:
|
88
74
|
- - "~>"
|
89
75
|
- !ruby/object:Gem::Version
|
90
|
-
version: '
|
76
|
+
version: '0.2'
|
91
77
|
type: :runtime
|
92
78
|
prerelease: false
|
93
79
|
version_requirements: !ruby/object:Gem::Requirement
|
94
80
|
requirements:
|
95
81
|
- - "~>"
|
96
82
|
- !ruby/object:Gem::Version
|
97
|
-
version: '
|
83
|
+
version: '0.2'
|
98
84
|
description: You simply set an OpenAI key, give a number of Boxcars to a Train, and
|
99
85
|
magic ensues when you run it.
|
100
86
|
email:
|
@@ -124,6 +110,7 @@ files:
|
|
124
110
|
- lib/boxcars/boxcar/google_search.rb
|
125
111
|
- lib/boxcars/boxcar/sql.rb
|
126
112
|
- lib/boxcars/boxcar/swagger.rb
|
113
|
+
- lib/boxcars/boxcar/vector_answer.rb
|
127
114
|
- lib/boxcars/boxcar/wikipedia_search.rb
|
128
115
|
- lib/boxcars/conversation.rb
|
129
116
|
- lib/boxcars/conversation_prompt.rb
|
@@ -144,13 +131,17 @@ files:
|
|
144
131
|
- lib/boxcars/vector_store/document.rb
|
145
132
|
- lib/boxcars/vector_store/embed_via_open_ai.rb
|
146
133
|
- lib/boxcars/vector_store/embed_via_tensorflow.rb
|
147
|
-
- lib/boxcars/vector_store/hnswlib/
|
148
|
-
- lib/boxcars/vector_store/hnswlib/
|
149
|
-
- lib/boxcars/vector_store/hnswlib/hnswlib_search.rb
|
134
|
+
- lib/boxcars/vector_store/hnswlib/build_from_files.rb
|
135
|
+
- lib/boxcars/vector_store/hnswlib/load_from_disk.rb
|
150
136
|
- lib/boxcars/vector_store/hnswlib/save_to_hnswlib.rb
|
151
|
-
- lib/boxcars/vector_store/
|
137
|
+
- lib/boxcars/vector_store/hnswlib/search.rb
|
138
|
+
- lib/boxcars/vector_store/in_memory/build_from_document_array.rb
|
139
|
+
- lib/boxcars/vector_store/in_memory/build_from_files.rb
|
152
140
|
- lib/boxcars/vector_store/in_memory/search.rb
|
153
|
-
- lib/boxcars/vector_store/
|
141
|
+
- lib/boxcars/vector_store/pgvector/build_from_array.rb
|
142
|
+
- lib/boxcars/vector_store/pgvector/build_from_files.rb
|
143
|
+
- lib/boxcars/vector_store/pgvector/save_to_database.rb
|
144
|
+
- lib/boxcars/vector_store/pgvector/search.rb
|
154
145
|
- lib/boxcars/vector_store/split_text.rb
|
155
146
|
- lib/boxcars/version.rb
|
156
147
|
homepage: https://github.com/BoxcarsAI/boxcars
|
@@ -169,14 +160,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
169
160
|
requirements:
|
170
161
|
- - ">="
|
171
162
|
- !ruby/object:Gem::Version
|
172
|
-
version:
|
163
|
+
version: '3.0'
|
173
164
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
174
165
|
requirements:
|
175
166
|
- - ">="
|
176
167
|
- !ruby/object:Gem::Version
|
177
168
|
version: '0'
|
178
169
|
requirements: []
|
179
|
-
rubygems_version: 3.
|
170
|
+
rubygems_version: 3.2.32
|
180
171
|
signing_key:
|
181
172
|
specification_version: 4
|
182
173
|
summary: Boxcars is a gem that enables you to create new systems with AI composability.
|
@@ -1,157 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
require 'fileutils'
|
4
|
-
require 'hnswlib'
|
5
|
-
require 'json'
|
6
|
-
|
7
|
-
module Boxcars
|
8
|
-
module VectorStore
|
9
|
-
module Hnswlib
|
10
|
-
class BuildVectorStore
|
11
|
-
include VectorStore
|
12
|
-
|
13
|
-
# This class is responsible for building the vector store for the hnswlib similarity search.
|
14
|
-
# It will load the training data, generate the embeddings, and save the vector store.
|
15
|
-
# It will also load the vector store into memory.
|
16
|
-
# For later use, it will save the splitted document with index numbers to a json file.
|
17
|
-
#
|
18
|
-
# @param training_data_path [String] The path to the training data. Can be a glob pattern.
|
19
|
-
# @param index_file_path [String] The path to the index file.
|
20
|
-
# @param split_chunk_size [Integer] The number of documents to split the text into. default 2000
|
21
|
-
# @option json_doc_file_path [String]. The json file containing the document text.
|
22
|
-
# if nil, it will reuse index file name.
|
23
|
-
# @option force_rebuild [Boolean] Optional. If true, will rebuild the index even if it already exists.
|
24
|
-
def initialize(
|
25
|
-
training_data_path:,
|
26
|
-
index_file_path:,
|
27
|
-
split_chunk_size: 2000,
|
28
|
-
json_doc_file_path: nil,
|
29
|
-
force_rebuild: true
|
30
|
-
)
|
31
|
-
@training_data_path = training_data_path
|
32
|
-
@index_file_path = index_file_path
|
33
|
-
@split_chunk_size = split_chunk_size
|
34
|
-
@json_doc_file_path = json_doc_file_path || index_file_path.gsub(/\.bin$/, '.json')
|
35
|
-
@force_rebuild = force_rebuild
|
36
|
-
end
|
37
|
-
|
38
|
-
def call
|
39
|
-
validate_params
|
40
|
-
data = load_files
|
41
|
-
documents = split_text_into_chunks(data)
|
42
|
-
embeddings_with_config = generate_embeddings(documents)
|
43
|
-
save_vector_store(embeddings_with_config)
|
44
|
-
load_hnsw
|
45
|
-
end
|
46
|
-
|
47
|
-
private
|
48
|
-
|
49
|
-
attr_reader :training_data_path, :index_file_path, :split_chunk_size, :json_doc_file_path, :force_rebuild
|
50
|
-
|
51
|
-
def validate_params
|
52
|
-
training_data_dir = File.dirname(training_data_path.gsub(/\*{1,2}/, ''))
|
53
|
-
raise_error('training_data_path parent directory must exist') unless File.directory?(training_data_dir)
|
54
|
-
raise_error('No files found at the training_data_path pattern') if Dir.glob(training_data_path).empty?
|
55
|
-
|
56
|
-
index_dir = File.dirname(index_file_path)
|
57
|
-
raise_error('index_file_path parent directory must exist') unless File.directory?(index_dir)
|
58
|
-
|
59
|
-
raise_error('split_chunk_size must be an integer') unless split_chunk_size.is_a?(Integer)
|
60
|
-
end
|
61
|
-
|
62
|
-
def load_files
|
63
|
-
data = []
|
64
|
-
files = Dir.glob(training_data_path)
|
65
|
-
raise_error "No files found at #{training_data_path}" if files.empty?
|
66
|
-
|
67
|
-
files.each do |file|
|
68
|
-
data << File.read(file)
|
69
|
-
end
|
70
|
-
puts "Added #{files.length} files to data. Splitting text into chunks..."
|
71
|
-
data
|
72
|
-
end
|
73
|
-
|
74
|
-
def split_text_into_chunks(data)
|
75
|
-
return true unless rebuild_required?
|
76
|
-
|
77
|
-
docs = []
|
78
|
-
data.each do |chunk|
|
79
|
-
doc_output = Boxcars::VectorStore::SplitText.call(
|
80
|
-
separator: "\n", chunk_size: split_chunk_size, chunk_overlap: 0, text: chunk
|
81
|
-
)
|
82
|
-
docs.concat(doc_output)
|
83
|
-
end
|
84
|
-
docs
|
85
|
-
end
|
86
|
-
|
87
|
-
def rebuild_required?
|
88
|
-
hnswlib_config_json = "#{File.dirname(index_file_path)}/hnswlib_config.json"
|
89
|
-
return true unless File.exist?(index_file_path)
|
90
|
-
return true if File.exist?(index_file_path) && !File.exist?(hnswlib_config_json)
|
91
|
-
return true if force_rebuild
|
92
|
-
|
93
|
-
false
|
94
|
-
end
|
95
|
-
|
96
|
-
def generate_embeddings(documents)
|
97
|
-
return true unless rebuild_required?
|
98
|
-
|
99
|
-
puts "Initializing Store..."
|
100
|
-
openai_client = Openai.open_ai_client
|
101
|
-
embeddings_with_dim = Boxcars::VectorStore::EmbedViaOpenAI.call(texts: documents, client: openai_client)
|
102
|
-
document_embeddings = embeddings_with_dim.map.with_index do |item, index|
|
103
|
-
{ doc_id: index, embedding: item[:embedding], document: documents[index] }
|
104
|
-
end
|
105
|
-
|
106
|
-
{ document_embeddings: document_embeddings, dim: embeddings_with_dim.first[:dim] }
|
107
|
-
end
|
108
|
-
|
109
|
-
def save_vector_store(embeddings_with_config)
|
110
|
-
return true unless rebuild_required?
|
111
|
-
|
112
|
-
puts "Saving Vectorstore"
|
113
|
-
Boxcars::VectorStore::Hnswlib::SaveToHnswlib.call(
|
114
|
-
document_embeddings: embeddings_with_config[:document_embeddings],
|
115
|
-
index_file_path: index_file_path,
|
116
|
-
json_doc_file_path: json_doc_file_path,
|
117
|
-
hnswlib_config: hnswlib_config(embeddings_with_config[:dim])
|
118
|
-
)
|
119
|
-
puts "VectorStore saved"
|
120
|
-
end
|
121
|
-
|
122
|
-
def hnswlib_config(dim)
|
123
|
-
# dim: length of datum point vector that will be indexed.
|
124
|
-
Boxcars::VectorStore::Hnswlib::HnswlibConfig.new(
|
125
|
-
metric: "l2", max_item: 10000, dim: dim
|
126
|
-
)
|
127
|
-
end
|
128
|
-
|
129
|
-
def load_hnsw
|
130
|
-
puts "Loading Hnswlib"
|
131
|
-
|
132
|
-
config_file = "#{File.dirname(index_file_path)}/hnswlib_config.json"
|
133
|
-
json_config = parse_json_file(config_file)
|
134
|
-
document_embeddings = parse_json_file(json_doc_file_path)
|
135
|
-
|
136
|
-
search_index = ::Hnswlib::HierarchicalNSW.new(space: json_config[:metric], dim: json_config[:dim])
|
137
|
-
search_index.load_index(index_file_path)
|
138
|
-
|
139
|
-
{ vector_store: search_index, document_embeddings: document_embeddings }
|
140
|
-
end
|
141
|
-
|
142
|
-
def parse_json_file(file_path)
|
143
|
-
return [] if file_path.nil?
|
144
|
-
|
145
|
-
file_content = File.read(file_path)
|
146
|
-
JSON.parse(file_content, symbolize_names: true)
|
147
|
-
rescue JSON::ParserError => e
|
148
|
-
raise_error("Error parsing hnswlib_config.json: #{e.message}")
|
149
|
-
end
|
150
|
-
|
151
|
-
def raise_error(message)
|
152
|
-
raise ::Boxcars::Error, message
|
153
|
-
end
|
154
|
-
end
|
155
|
-
end
|
156
|
-
end
|
157
|
-
end
|
@@ -1,56 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
require 'json'
|
4
|
-
|
5
|
-
module Boxcars
|
6
|
-
module VectorStore
|
7
|
-
module Hnswlib
|
8
|
-
class HnswlibConfig
|
9
|
-
attr_reader :metric, :max_item, :dim, :ef_construction, :m
|
10
|
-
|
11
|
-
# used for search index.
|
12
|
-
#
|
13
|
-
# @param max_item [Integer] The maximum number of items.
|
14
|
-
#
|
15
|
-
# @param metric [String] The distance metric between vectors ('l2', 'dot', or 'cosine').
|
16
|
-
#
|
17
|
-
# @param ef_construction [Integer] The size of the dynamic list for the nearest neighbors.
|
18
|
-
# It controls the index time/accuracy trade-off.
|
19
|
-
#
|
20
|
-
# @param max_outgoing_connection [Integer] The maximum number of outgoing connections in the graph
|
21
|
-
#
|
22
|
-
# reference: https://yoshoku.github.io/hnswlib.rb/doc/
|
23
|
-
def initialize(
|
24
|
-
metric: "l2",
|
25
|
-
max_item: 10000,
|
26
|
-
dim: 2,
|
27
|
-
ef_construction: 200,
|
28
|
-
max_outgoing_connection: 16
|
29
|
-
)
|
30
|
-
@metric = metric
|
31
|
-
@max_item = max_item
|
32
|
-
@dim = dim
|
33
|
-
@ef_construction = ef_construction
|
34
|
-
@max_outgoing_connection = max_outgoing_connection
|
35
|
-
end
|
36
|
-
|
37
|
-
def space
|
38
|
-
@metric == 'dot' ? 'ip' : 'l2'
|
39
|
-
end
|
40
|
-
|
41
|
-
def to_json(*args)
|
42
|
-
JSON.pretty_generate(
|
43
|
-
{
|
44
|
-
metric: @metric,
|
45
|
-
max_item: @max_item,
|
46
|
-
dim: @dim,
|
47
|
-
ef_construction: @ef_construction,
|
48
|
-
max_outgoing_connection: @max_outgoing_connection
|
49
|
-
},
|
50
|
-
*args
|
51
|
-
)
|
52
|
-
end
|
53
|
-
end
|
54
|
-
end
|
55
|
-
end
|
56
|
-
end
|
@@ -1,54 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
require 'hnswlib'
|
4
|
-
require 'json'
|
5
|
-
|
6
|
-
module Boxcars
|
7
|
-
module VectorStore
|
8
|
-
module Hnswlib
|
9
|
-
class HnswlibSearch
|
10
|
-
def initialize(vector_store:, options: {})
|
11
|
-
validate_params(vector_store)
|
12
|
-
@vector_store = vector_store
|
13
|
-
@json_doc_path = options[:json_doc_path]
|
14
|
-
@num_neighbors = options[:num_neighbors] || 1
|
15
|
-
end
|
16
|
-
|
17
|
-
def call(query)
|
18
|
-
search(query)
|
19
|
-
end
|
20
|
-
|
21
|
-
private
|
22
|
-
|
23
|
-
attr_reader :json_doc_path, :vector_store, :num_neighbors
|
24
|
-
|
25
|
-
def validate_params(vector_store)
|
26
|
-
raise_error 'vector_store must be an Hnswlib::HierarchicalNSW' unless vector_store.is_a?(::Hnswlib::HierarchicalNSW)
|
27
|
-
end
|
28
|
-
|
29
|
-
def search(query)
|
30
|
-
raw_results = vector_store.search_knn(query, num_neighbors)
|
31
|
-
raw_results.map { |doc_id, distance| lookup_embedding2(doc_id, distance) }.compact
|
32
|
-
end
|
33
|
-
|
34
|
-
def lookup_embedding2(doc_id, distance)
|
35
|
-
embedding_data = parsed_data.find { |embedding| embedding[:doc_id] == doc_id }
|
36
|
-
return unless embedding_data
|
37
|
-
|
38
|
-
{ document: embedding_data[:document], distance: distance }
|
39
|
-
end
|
40
|
-
|
41
|
-
def parsed_data
|
42
|
-
@parsed_data ||= JSON.parse(
|
43
|
-
File.read(json_doc_path),
|
44
|
-
symbolize_names: true
|
45
|
-
)
|
46
|
-
end
|
47
|
-
|
48
|
-
def raise_error(message)
|
49
|
-
raise ::Boxcars::ArgumentError, message
|
50
|
-
end
|
51
|
-
end
|
52
|
-
end
|
53
|
-
end
|
54
|
-
end
|
@@ -1,67 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
module Boxcars
|
4
|
-
module VectorStore
|
5
|
-
module InMemory
|
6
|
-
MemoryVector = Struct.new(:content, :embedding, :metadatax)
|
7
|
-
|
8
|
-
class AddDocuments
|
9
|
-
include VectorStore
|
10
|
-
|
11
|
-
def initialize(embedding_tool: :openai, documents: nil)
|
12
|
-
validate_params(embedding_tool, documents)
|
13
|
-
@embedding_tool = embedding_tool
|
14
|
-
@documents = documents
|
15
|
-
@memory_vectors = []
|
16
|
-
end
|
17
|
-
|
18
|
-
def call
|
19
|
-
texts = @documents.map { |doc| doc[:page_content] }
|
20
|
-
vectors = generate_vectors(texts)
|
21
|
-
add_vectors(vectors, @documents)
|
22
|
-
@memory_vectors
|
23
|
-
end
|
24
|
-
|
25
|
-
private
|
26
|
-
|
27
|
-
def validate_params(embedding_tool, documents)
|
28
|
-
raise ::Boxcars::ArgumentError, 'documents is nil' unless documents
|
29
|
-
return if %i[openai tensorflow].include?(embedding_tool)
|
30
|
-
|
31
|
-
raise ::Boxcars::ArgumentError, 'embedding_tool is invalid'
|
32
|
-
end
|
33
|
-
|
34
|
-
# returns array of documents with vectors
|
35
|
-
def add_vectors(vectors, documents)
|
36
|
-
vectors.zip(documents).each do |vector, doc|
|
37
|
-
memory_vector = MemoryVector.new(doc[:page_content], vector, doc[:metadata])
|
38
|
-
@memory_vectors << memory_vector
|
39
|
-
end
|
40
|
-
end
|
41
|
-
|
42
|
-
def generate_vectors(texts)
|
43
|
-
embeddings_method[:klass].call(
|
44
|
-
texts: texts, client: embeddings_method[:client]
|
45
|
-
)
|
46
|
-
end
|
47
|
-
|
48
|
-
def embeddings_method
|
49
|
-
@embeddings_method ||=
|
50
|
-
case @embedding_tool
|
51
|
-
when :openai
|
52
|
-
{ klass: Boxcars::VectorStore::EmbedViaOpenAI, client: openai_client }
|
53
|
-
when :tensorflow
|
54
|
-
{ klass: Boxcars::VectorStore::EmbedViaTensorflow, client: nil }
|
55
|
-
end
|
56
|
-
end
|
57
|
-
|
58
|
-
# Get the OpenAI client
|
59
|
-
# @param openai_access_token [String] the OpenAI access token
|
60
|
-
# @return [OpenAI::Client]
|
61
|
-
def openai_client(openai_access_token: nil)
|
62
|
-
@openai_client ||= Openai.open_ai_client(openai_access_token: openai_access_token)
|
63
|
-
end
|
64
|
-
end
|
65
|
-
end
|
66
|
-
end
|
67
|
-
end
|
@@ -1,55 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
require 'hnswlib'
|
4
|
-
|
5
|
-
module Boxcars
|
6
|
-
module VectorStore
|
7
|
-
class SimilaritySearch
|
8
|
-
def initialize(embeddings:, vector_store:, openai_connection: nil, openai_access_token: nil)
|
9
|
-
@embeddings = embeddings
|
10
|
-
@vector_store = vector_store
|
11
|
-
@similarity_search_instance = create_similarity_search_instance
|
12
|
-
@openai_connection = openai_connection || default_connection(openai_access_token: openai_access_token)
|
13
|
-
end
|
14
|
-
|
15
|
-
def call(query:)
|
16
|
-
validate_query(query)
|
17
|
-
query_vector = convert_query_to_vector(query)
|
18
|
-
@similarity_search_instance.call(query_vector)
|
19
|
-
end
|
20
|
-
|
21
|
-
private
|
22
|
-
|
23
|
-
attr_reader :embeddings, :vector_store, :openai_connection
|
24
|
-
|
25
|
-
def default_connection(openai_access_token: nil)
|
26
|
-
Openai.open_ai_client(openai_access_token: openai_access_token)
|
27
|
-
end
|
28
|
-
|
29
|
-
def validate_query(query)
|
30
|
-
raise_error 'query must be a string' unless query.is_a?(String)
|
31
|
-
raise_error 'query must not be empty' if query.empty?
|
32
|
-
end
|
33
|
-
|
34
|
-
def convert_query_to_vector(query)
|
35
|
-
Boxcars::VectorStore::EmbedViaOpenAI.call(texts: [query], client: openai_connection).first[:embedding]
|
36
|
-
end
|
37
|
-
|
38
|
-
def create_similarity_search_instance
|
39
|
-
case vector_store
|
40
|
-
when ::Hnswlib::HierarchicalNSW
|
41
|
-
Boxcars::VectorStore::Hnswlib::HnswlibSearch.new(
|
42
|
-
vector_store: vector_store,
|
43
|
-
options: { json_doc_path: embeddings, num_neighbors: 2 }
|
44
|
-
)
|
45
|
-
else
|
46
|
-
raise_error 'Unsupported vector store provided'
|
47
|
-
end
|
48
|
-
end
|
49
|
-
|
50
|
-
def raise_error(message)
|
51
|
-
raise ArgumentError, message
|
52
|
-
end
|
53
|
-
end
|
54
|
-
end
|
55
|
-
end
|