boxcars 0.2.10 → 0.2.12
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.env_sample +1 -0
- data/.rubocop.yml +16 -0
- data/CHANGELOG.md +26 -2
- data/Gemfile +12 -12
- data/Gemfile.lock +34 -28
- data/README.md +4 -1
- data/boxcars.gemspec +2 -2
- data/lib/boxcars/boxcar/active_record.rb +1 -1
- data/lib/boxcars/boxcar.rb +2 -1
- data/lib/boxcars/engine/openai.rb +8 -1
- data/lib/boxcars/vector_search.rb +75 -0
- data/lib/boxcars/{boxcar/vector_stores → vector_store}/document.rb +4 -3
- data/lib/boxcars/{boxcar/vector_stores → vector_store}/embed_via_open_ai.rb +3 -3
- data/lib/boxcars/{boxcar/vector_stores → vector_store}/embed_via_tensorflow.rb +1 -1
- data/lib/boxcars/vector_store/hnswlib/build_from_files.rb +100 -0
- data/lib/boxcars/vector_store/hnswlib/load_from_disk.rb +57 -0
- data/lib/boxcars/vector_store/hnswlib/save_to_hnswlib.rb +90 -0
- data/lib/boxcars/vector_store/hnswlib/search.rb +70 -0
- data/lib/boxcars/vector_store/in_memory/build_from_document_array.rb +51 -0
- data/lib/boxcars/vector_store/in_memory/build_from_files.rb +61 -0
- data/lib/boxcars/vector_store/in_memory/search.rb +61 -0
- data/lib/boxcars/vector_store/pgvector/build_from_array.rb +95 -0
- data/lib/boxcars/vector_store/pgvector/build_from_files.rb +97 -0
- data/lib/boxcars/vector_store/pgvector/save_to_database.rb +152 -0
- data/lib/boxcars/vector_store/pgvector/search.rb +144 -0
- data/lib/boxcars/{boxcar/vector_stores → vector_store}/split_text.rb +3 -4
- data/lib/boxcars/vector_store.rb +100 -0
- data/lib/boxcars/version.rb +1 -1
- data/lib/boxcars.rb +1 -1
- metadata +22 -18
- data/lib/boxcars/boxcar/vector_search.rb +0 -11
- data/lib/boxcars/boxcar/vector_store.rb +0 -34
- data/lib/boxcars/boxcar/vector_stores/hnswlib/build_vector_store.rb +0 -157
- data/lib/boxcars/boxcar/vector_stores/hnswlib/hnswlib_config.rb +0 -56
- data/lib/boxcars/boxcar/vector_stores/hnswlib/hnswlib_search.rb +0 -54
- data/lib/boxcars/boxcar/vector_stores/hnswlib/save_to_hnswlib.rb +0 -80
- data/lib/boxcars/boxcar/vector_stores/in_memory/add_documents.rb +0 -67
- data/lib/boxcars/boxcar/vector_stores/in_memory/search.rb +0 -81
- data/lib/boxcars/boxcar/vector_stores/similarity_search.rb +0 -55
@@ -1,54 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
require 'hnswlib'
|
4
|
-
require 'json'
|
5
|
-
|
6
|
-
module Boxcars
  module VectorStores
    module Hnswlib
      # Queries an hnswlib index for the nearest neighbours of a query and maps
      # the returned ids back to the documents stored in a companion JSON file.
      class HnswlibSearch
        # @param vector_store [::Hnswlib::HierarchicalNSW] the index to query.
        # @param options [Hash] recognised keys:
        #   :json_doc_path - path of the JSON file holding the documents,
        #   :num_neighbors - number of neighbours to request (defaults to 1).
        # @raise [Boxcars::ArgumentError] if vector_store is not a HierarchicalNSW.
        def initialize(vector_store:, options: {})
          validate_params(vector_store)
          @vector_store = vector_store
          @json_doc_path = options[:json_doc_path]
          @num_neighbors = options[:num_neighbors] || 1
        end

        # @param query [Object] forwarded unchanged to search_knn
        #   (presumably the query embedding - verify against the caller).
        # @return [Array<Hash>] one { document:, distance: } hash per returned id
        #   that is present in the JSON file; unknown ids are dropped.
        def call(query)
          search(query)
        end

        private

        attr_reader :json_doc_path, :vector_store, :num_neighbors

        def validate_params(vector_store)
          raise_error 'vector_store must be an Hnswlib::HierarchicalNSW' unless vector_store.is_a?(::Hnswlib::HierarchicalNSW)
        end

        def search(query)
          # NOTE(review): every element of the search_knn result is destructured as
          # a (doc_id, distance) pair - confirm this matches the shape returned by
          # the installed hnswlib gem.
          raw_results = vector_store.search_knn(query, num_neighbors)
          raw_results.map { |doc_id, distance| lookup_embedding2(doc_id, distance) }.compact
        end

        # Returns the result hash for doc_id, or nil when the id is not in the JSON file.
        def lookup_embedding2(doc_id, distance)
          embedding_data = documents_by_id[doc_id]
          return unless embedding_data

          { document: embedding_data[:document], distance: distance }
        end

        # Index of the parsed entries keyed by :doc_id, built once so each
        # neighbour lookup is a hash access instead of a scan of the whole file.
        # The first entry wins when a doc_id appears more than once.
        def documents_by_id
          @documents_by_id ||= parsed_data.each_with_object({}) do |entry, by_id|
            by_id[entry[:doc_id]] ||= entry
          end
        end

        # Parsed content of the JSON document file, with symbolized keys.
        def parsed_data
          @parsed_data ||= JSON.parse(
            File.read(json_doc_path),
            symbolize_names: true
          )
        end

        def raise_error(message)
          raise ::Boxcars::ArgumentError, message
        end
      end
    end
  end
end
|
@@ -1,80 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
require 'hnswlib'
|
4
|
-
require 'json'
|
5
|
-
require 'fileutils'
|
6
|
-
|
7
|
-
module Boxcars
  module VectorStores
    module Hnswlib
      # Adds document embeddings to an hnswlib index and writes three files:
      # the index itself, a JSON file with the documents, and
      # hnswlib_config.json next to the index.
      class SaveToHnswlib
        include VectorStore

        # @param document_embeddings [Array<Hash>] hashes with :doc_id, :embedding and :document.
        # @param index_file_path [String] The path to the index file.
        # @param hnswlib_config [Object] config responding to dim, max_item, metric and to_json.
        # @param json_doc_file_path [String, nil] Optional. Path of the JSON document file;
        #   defaults to index_file_path with a trailing ".bin" replaced by ".json".
        def initialize(document_embeddings:, index_file_path:, hnswlib_config:, json_doc_file_path: nil)
          @document_embeddings = document_embeddings
          @index_file_path = index_file_path
          @json_doc_file_path = json_doc_file_path || default_json_doc_file_path(index_file_path)
          @hnswlib_config = hnswlib_config
        end

        # Validates the inputs, fills the index and writes all files.
        #
        # @raise [Boxcars::ValueError] when an argument is invalid or a parent directory is missing.
        def call
          validate_params

          document_texts = document_embeddings.map do |embedding|
            index.add_item(embedding[:doc_id], embedding[:embedding])

            { doc_id: embedding[:doc_id], embedding: embedding[:embedding], document: embedding[:document] }
          end

          write_files(index, document_texts)
        end

        private

        attr_reader :document_embeddings, :index_file_path, :json_doc_file_path, :hnswlib_config

        # Built on first use rather than in initialize, so that validate_params
        # gets to reject a bad config before the index is constructed from it.
        def index
          @index ||= ::Hnswlib::HnswIndex.new(
            n_features: hnswlib_config.dim,
            max_item: hnswlib_config.max_item,
            metric: hnswlib_config.metric
          )
        end

        # Returns nil for a non-String path; validate_params then reports it
        # with a ValueError instead of this method failing with NoMethodError.
        def default_json_doc_file_path(path)
          return unless path.is_a?(String)

          path.sub(/\.bin\z/, '.json')
        end

        def write_files(index, document_texts)
          # validate_params has already required both parent directories to
          # exist, so the mkdir_p calls are only a safety net.
          FileUtils.mkdir_p(File.dirname(json_doc_file_path))
          File.write(json_doc_file_path, document_texts.to_json)

          FileUtils.mkdir_p(File.dirname(index_file_path))
          File.write(File.join(File.dirname(index_file_path), 'hnswlib_config.json'), hnswlib_config.to_json)

          index.save(index_file_path)
        end

        def validate_params
          raise_error("document_embeddings must be an array") unless document_embeddings.is_a?(Array)
          raise_error("dim must be an integer") unless hnswlib_config.dim.is_a?(Integer)
          raise_error("index_file_path must be a string") unless index_file_path.is_a?(String)

          [index_file_path, json_doc_file_path].each do |path|
            check_parent_directory(path)
          end
        end

        def check_parent_directory(path)
          return unless path

          parent_dir = File.dirname(path)
          raise_error('parent directory must exist') unless File.directory?(parent_dir)
        end

        def raise_error(message)
          raise ::Boxcars::ValueError, message
        end
      end
    end
  end
end
|
@@ -1,67 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
module Boxcars
  module VectorStores
    module InMemory
      # Value object pairing a document's text with its embedding.
      # NOTE(review): the third member is spelled :metadatax although it is filled
      # from doc[:metadata] below - possibly a typo; renaming it would change the
      # public accessor, so confirm with callers first.
      MemoryVector = Struct.new(:content, :embedding, :metadatax)

      # Embeds a list of documents and returns them as MemoryVector structs.
      class AddDocuments
        include VectorStore

        # @param embedding_tool [Symbol] :openai or :tensorflow.
        # @param documents [Array<Hash>] hashes read via :page_content and :metadata.
        # @raise [Boxcars::ArgumentError] if documents is nil or the tool is unknown.
        def initialize(embedding_tool: :openai, documents: nil)
          validate_params(embedding_tool, documents)
          @embedding_tool = embedding_tool
          @documents = documents
          @memory_vectors = []
        end

        # @return [Array<MemoryVector>] one struct per (vector, document) pair.
        def call
          page_contents = @documents.map { |document| document[:page_content] }
          add_vectors(generate_vectors(page_contents), @documents)
          @memory_vectors
        end

        private

        def validate_params(embedding_tool, documents)
          raise ::Boxcars::ArgumentError, 'documents is nil' unless documents
          raise ::Boxcars::ArgumentError, 'embedding_tool is invalid' unless %i[openai tensorflow].include?(embedding_tool)
        end

        # Appends one MemoryVector per vector, paired positionally with documents.
        def add_vectors(vectors, documents)
          vectors.zip(documents) do |vector, document|
            @memory_vectors << MemoryVector.new(document[:page_content], vector, document[:metadata])
          end
        end

        # Delegates to the embedder selected by @embedding_tool.
        def generate_vectors(texts)
          embedder = embeddings_method
          embedder[:klass].call(texts: texts, client: embedder[:client])
        end

        # Embedder class and client for the configured tool, resolved once.
        def embeddings_method
          @embeddings_method ||=
            if @embedding_tool == :openai
              { klass: Boxcars::VectorStores::EmbedViaOpenAI, client: openai_client }
            elsif @embedding_tool == :tensorflow
              { klass: Boxcars::VectorStores::EmbedViaTensorflow, client: nil }
            end
        end

        # Memoized OpenAI client.
        # @param openai_access_token [String] the OpenAI access token
        # @return [OpenAI::Client]
        def openai_client(openai_access_token: nil)
          @openai_client ||= Openai.open_ai_client(openai_access_token: openai_access_token)
        end
      end
    end
  end
end
|
@@ -1,81 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
# require 'openai'
|
4
|
-
#
|
5
|
-
# documents = [
|
6
|
-
# { page_content: "hello", metadata: { a: 1 } },
|
7
|
-
# { page_content: "hi", metadata: { a: 1 } },
|
8
|
-
# { page_content: "bye", metadata: { a: 1 } },
|
9
|
-
# { page_content: "what's this", metadata: { a: 1 } },
|
10
|
-
# ]
|
11
|
-
#
|
12
|
-
# vector_documents = Boxcars::VectorStores::InMemory::AddDocuments.call(embedding_tool: :openai, documents: documents)
|
13
|
-
#
|
14
|
-
# result = Boxcars::VectorStores::InMemory::Search.call(vector_documents: vector_documents, query: "hello")
|
15
|
-
#
|
16
|
-
# expect(result).to eq(Boxcars::VectorStores::Document.new({ page_content: "hello", metadata: { a: 1 } }))
|
17
|
-
|
18
|
-
module Boxcars
  module VectorStores
    module InMemory
      # Finds the stored document whose vector has the highest cosine
      # similarity to the embedding of a text query.
      class Search
        include VectorStore

        # @param vector_documents [Array<Hash>] hashes with :document and :vector keys.
        # @param query [String] the text to search for; must not be empty.
        # @param embedding_tool [Symbol] :openai or :tensorflow.
        # @raise [Boxcars::ArgumentError] if any argument is invalid.
        def initialize(vector_documents:, query:, embedding_tool: :openai)
          validate_params(vector_documents, query, embedding_tool)
          @vector_documents = vector_documents
          @query = query
          @embedding_tool = embedding_tool
        end

        # @return [Hash, nil] the element of vector_documents most similar to the
        #   query (first one wins on ties), or nil when vector_documents is empty.
        def call
          # Nothing to rank; also avoids an embedding request for no benefit.
          return if @vector_documents.empty?

          # Embed the query once, not once per stored document.
          target = query_vector
          @vector_documents.max_by { |doc| cosine_similarity(target, doc[:vector]) }
        end

        private

        def validate_params(vector_documents, query, embedding_tool)
          raise ::Boxcars::ArgumentError, 'query is empty' if query.to_s.empty?
          raise ::Boxcars::ArgumentError, 'embedding_tool is invalid' unless %i[openai tensorflow].include?(embedding_tool)

          documents_valid = vector_documents.is_a?(Array) &&
                            vector_documents.all? { |doc| doc.is_a?(Hash) && doc.key?(:document) && doc.key?(:vector) }
          raise ::Boxcars::ArgumentError, "vector_documents is not valid" unless documents_valid
        end

        # Embedding of the query, computed on first use and then reused.
        # NOTE(review): the first element returned by the embedder is used as the
        # vector itself - confirm it is a numeric array and not a wrapper hash.
        def query_vector
          @query_vector ||= begin
            embedder = embeddings_method(@embedding_tool)
            embedder[:klass].call(texts: [@query], client: embedder[:client]).first
          end
        end

        def openai_client(openai_access_token: nil)
          @openai_client ||= Openai.open_ai_client(openai_access_token: openai_access_token)
        end

        # Embedder class and client for the given tool.
        def embeddings_method(embedding_tool)
          case embedding_tool
          when :openai
            { klass: Boxcars::VectorStores::EmbedViaOpenAI, client: openai_client }
          when :tensorflow
            { klass: Boxcars::VectorStores::EmbedViaTensorflow, client: nil }
          end
        end

        # Cosine of the angle between two equally sized numeric vectors.
        # Returns 0.0 when either vector has zero magnitude, because the ratio is
        # undefined there and would otherwise be NaN.
        def cosine_similarity(vector1, vector2)
          dot_product = vector1.zip(vector2).sum { |a, b| a * b }
          magnitude1 = Math.sqrt(vector1.sum { |a| a**2 })
          magnitude2 = Math.sqrt(vector2.sum { |b| b**2 })
          denominator = magnitude1 * magnitude2
          return 0.0 if denominator.zero?

          dot_product / denominator
        end
      end
    end
  end
end
|
@@ -1,55 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
require 'hnswlib'
|
4
|
-
|
5
|
-
module Boxcars
  module VectorStores
    # Embeds a text query through OpenAI and delegates the nearest-neighbour
    # lookup to the search class that matches the supplied vector store.
    class SimilaritySearch
      # Number of neighbours requested when the caller does not specify one.
      DEFAULT_NUM_NEIGHBORS = 2

      # @param embeddings [String] passed to the search class as :json_doc_path.
      # @param vector_store [::Hnswlib::HierarchicalNSW] the index to search.
      # @param openai_connection [Object, nil] client used for embedding the query;
      #   a default client is created when nil.
      # @param openai_access_token [String, nil] token for the default client.
      # @param num_neighbors [Integer] how many neighbours to request per query.
      # @raise [ArgumentError] if the vector store type is not supported.
      def initialize(embeddings:, vector_store:, openai_connection: nil, openai_access_token: nil,
                     num_neighbors: DEFAULT_NUM_NEIGHBORS)
        @embeddings = embeddings
        @vector_store = vector_store
        @num_neighbors = num_neighbors
        @similarity_search_instance = create_similarity_search_instance
        @openai_connection = openai_connection || default_connection(openai_access_token: openai_access_token)
      end

      # @param query [String] non-empty text to search for.
      # @return [Object] whatever the underlying search instance returns.
      # @raise [ArgumentError] if query is not a String or is empty.
      def call(query:)
        validate_query(query)
        query_vector = convert_query_to_vector(query)
        @similarity_search_instance.call(query_vector)
      end

      private

      attr_reader :embeddings, :vector_store, :openai_connection, :num_neighbors

      def default_connection(openai_access_token: nil)
        Openai.open_ai_client(openai_access_token: openai_access_token)
      end

      def validate_query(query)
        raise_error 'query must be a string' unless query.is_a?(String)
        raise_error 'query must not be empty' if query.empty?
      end

      # Embeds the single query and reads :embedding from the first result.
      def convert_query_to_vector(query)
        Boxcars::VectorStores::EmbedViaOpenAI.call(texts: [query], client: openai_connection).first[:embedding]
      end

      # Picks the search implementation by the class of the vector store.
      def create_similarity_search_instance
        case vector_store
        when ::Hnswlib::HierarchicalNSW
          Boxcars::VectorStores::Hnswlib::HnswlibSearch.new(
            vector_store: vector_store,
            options: { json_doc_path: embeddings, num_neighbors: num_neighbors }
          )
        else
          raise_error 'Unsupported vector store provided'
        end
      end

      def raise_error(message)
        raise ArgumentError, message
      end
    end
  end
end
|