boxcars 0.2.10 → 0.2.12

Sign up to get free protection for your applications and to get access to all the features.
Files changed (40) hide show
  1. checksums.yaml +4 -4
  2. data/.env_sample +1 -0
  3. data/.rubocop.yml +16 -0
  4. data/CHANGELOG.md +26 -2
  5. data/Gemfile +12 -12
  6. data/Gemfile.lock +34 -28
  7. data/README.md +4 -1
  8. data/boxcars.gemspec +2 -2
  9. data/lib/boxcars/boxcar/active_record.rb +1 -1
  10. data/lib/boxcars/boxcar.rb +2 -1
  11. data/lib/boxcars/engine/openai.rb +8 -1
  12. data/lib/boxcars/vector_search.rb +75 -0
  13. data/lib/boxcars/{boxcar/vector_stores → vector_store}/document.rb +4 -3
  14. data/lib/boxcars/{boxcar/vector_stores → vector_store}/embed_via_open_ai.rb +3 -3
  15. data/lib/boxcars/{boxcar/vector_stores → vector_store}/embed_via_tensorflow.rb +1 -1
  16. data/lib/boxcars/vector_store/hnswlib/build_from_files.rb +100 -0
  17. data/lib/boxcars/vector_store/hnswlib/load_from_disk.rb +57 -0
  18. data/lib/boxcars/vector_store/hnswlib/save_to_hnswlib.rb +90 -0
  19. data/lib/boxcars/vector_store/hnswlib/search.rb +70 -0
  20. data/lib/boxcars/vector_store/in_memory/build_from_document_array.rb +51 -0
  21. data/lib/boxcars/vector_store/in_memory/build_from_files.rb +61 -0
  22. data/lib/boxcars/vector_store/in_memory/search.rb +61 -0
  23. data/lib/boxcars/vector_store/pgvector/build_from_array.rb +95 -0
  24. data/lib/boxcars/vector_store/pgvector/build_from_files.rb +97 -0
  25. data/lib/boxcars/vector_store/pgvector/save_to_database.rb +152 -0
  26. data/lib/boxcars/vector_store/pgvector/search.rb +144 -0
  27. data/lib/boxcars/{boxcar/vector_stores → vector_store}/split_text.rb +3 -4
  28. data/lib/boxcars/vector_store.rb +100 -0
  29. data/lib/boxcars/version.rb +1 -1
  30. data/lib/boxcars.rb +1 -1
  31. metadata +22 -18
  32. data/lib/boxcars/boxcar/vector_search.rb +0 -11
  33. data/lib/boxcars/boxcar/vector_store.rb +0 -34
  34. data/lib/boxcars/boxcar/vector_stores/hnswlib/build_vector_store.rb +0 -157
  35. data/lib/boxcars/boxcar/vector_stores/hnswlib/hnswlib_config.rb +0 -56
  36. data/lib/boxcars/boxcar/vector_stores/hnswlib/hnswlib_search.rb +0 -54
  37. data/lib/boxcars/boxcar/vector_stores/hnswlib/save_to_hnswlib.rb +0 -80
  38. data/lib/boxcars/boxcar/vector_stores/in_memory/add_documents.rb +0 -67
  39. data/lib/boxcars/boxcar/vector_stores/in_memory/search.rb +0 -81
  40. data/lib/boxcars/boxcar/vector_stores/similarity_search.rb +0 -55
@@ -1,54 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'hnswlib'
4
- require 'json'
5
-
6
- module Boxcars
7
- module VectorStores
8
- module Hnswlib
9
- class HnswlibSearch
10
- def initialize(vector_store:, options: {})
11
- validate_params(vector_store)
12
- @vector_store = vector_store
13
- @json_doc_path = options[:json_doc_path]
14
- @num_neighbors = options[:num_neighbors] || 1
15
- end
16
-
17
- def call(query)
18
- search(query)
19
- end
20
-
21
- private
22
-
23
- attr_reader :json_doc_path, :vector_store, :num_neighbors
24
-
25
- def validate_params(vector_store)
26
- raise_error 'vector_store must be an Hnswlib::HierarchicalNSW' unless vector_store.is_a?(::Hnswlib::HierarchicalNSW)
27
- end
28
-
29
- def search(query)
30
- raw_results = vector_store.search_knn(query, num_neighbors)
31
- raw_results.map { |doc_id, distance| lookup_embedding2(doc_id, distance) }.compact
32
- end
33
-
34
- def lookup_embedding2(doc_id, distance)
35
- embedding_data = parsed_data.find { |embedding| embedding[:doc_id] == doc_id }
36
- return unless embedding_data
37
-
38
- { document: embedding_data[:document], distance: distance }
39
- end
40
-
41
- def parsed_data
42
- @parsed_data ||= JSON.parse(
43
- File.read(json_doc_path),
44
- symbolize_names: true
45
- )
46
- end
47
-
48
- def raise_error(message)
49
- raise ::Boxcars::ArgumentError, message
50
- end
51
- end
52
- end
53
- end
54
- end
@@ -1,80 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'hnswlib'
4
- require 'json'
5
- require 'fileutils'
6
-
7
- module Boxcars
8
- module VectorStores
9
- module Hnswlib
10
- class SaveToHnswlib
11
- include VectorStore
12
-
13
- # @param document_embeddings [Array] An array of hashes containing the document id, document text, and embedding.
14
- # @param index_file_path [String] The path to the index file.
15
- # @param hnswlib_config [Boxcars::VectorStores::Hnswlib::Config] The config object for the hnswlib index.
16
- # @option json_doc_file_path [String] Optional. The path to the json file containing the document text.
17
- def initialize(document_embeddings:, index_file_path:, hnswlib_config:, json_doc_file_path: nil)
18
- @document_embeddings = document_embeddings
19
- @index_file_path = index_file_path
20
- @json_doc_file_path = json_doc_file_path || index_file_path.gsub(/\.bin$/, '.json')
21
-
22
- @hnswlib_config = hnswlib_config
23
- @index = ::Hnswlib::HnswIndex.new(
24
- n_features: hnswlib_config.dim,
25
- max_item: hnswlib_config.max_item,
26
- metric: hnswlib_config.metric
27
- )
28
- end
29
-
30
- def call
31
- validate_params
32
- document_texts = []
33
-
34
- document_embeddings.each do |embedding|
35
- index.add_item(embedding[:doc_id], embedding[:embedding])
36
-
37
- document_texts << { doc_id: embedding[:doc_id], embedding: embedding[:embedding], document: embedding[:document] }
38
- end
39
-
40
- write_files(index, document_texts)
41
- end
42
-
43
- private
44
-
45
- def write_files(index, document_texts)
46
- FileUtils.mkdir_p(File.dirname(json_doc_file_path))
47
- File.write(json_doc_file_path, document_texts.to_json)
48
-
49
- FileUtils.mkdir_p(File.dirname(index_file_path))
50
- File.write("#{File.dirname(index_file_path)}/hnswlib_config.json", hnswlib_config.to_json)
51
-
52
- index.save(index_file_path)
53
- end
54
-
55
- attr_reader :index, :document_embeddings, :index_file_path, :json_doc_file_path, :hnswlib_config
56
-
57
- def validate_params
58
- raise_error("document_embeddings must be an array") unless document_embeddings.is_a?(Array)
59
- raise_error("dim must be an integer") unless hnswlib_config.dim.is_a?(Integer)
60
- raise_error("index_file_path must be a string") unless index_file_path.is_a?(String)
61
-
62
- [index_file_path, json_doc_file_path].each do |path|
63
- check_parent_directory(path)
64
- end
65
- end
66
-
67
- def check_parent_directory(path)
68
- return unless path
69
-
70
- parent_dir = File.dirname(path)
71
- raise_error('parent directory must exist') unless File.directory?(parent_dir)
72
- end
73
-
74
- def raise_error(message)
75
- raise ::Boxcars::ValueError, message
76
- end
77
- end
78
- end
79
- end
80
- end
@@ -1,67 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- module Boxcars
4
- module VectorStores
5
- module InMemory
6
- MemoryVector = Struct.new(:content, :embedding, :metadatax)
7
-
8
- class AddDocuments
9
- include VectorStore
10
-
11
- def initialize(embedding_tool: :openai, documents: nil)
12
- validate_params(embedding_tool, documents)
13
- @embedding_tool = embedding_tool
14
- @documents = documents
15
- @memory_vectors = []
16
- end
17
-
18
- def call
19
- texts = @documents.map { |doc| doc[:page_content] }
20
- vectors = generate_vectors(texts)
21
- add_vectors(vectors, @documents)
22
- @memory_vectors
23
- end
24
-
25
- private
26
-
27
- def validate_params(embedding_tool, documents)
28
- raise ::Boxcars::ArgumentError, 'documents is nil' unless documents
29
- return if %i[openai tensorflow].include?(embedding_tool)
30
-
31
- raise ::Boxcars::ArgumentError, 'embedding_tool is invalid'
32
- end
33
-
34
- # returns array of documents with vectors
35
- def add_vectors(vectors, documents)
36
- vectors.zip(documents).each do |vector, doc|
37
- memory_vector = MemoryVector.new(doc[:page_content], vector, doc[:metadata])
38
- @memory_vectors << memory_vector
39
- end
40
- end
41
-
42
- def generate_vectors(texts)
43
- embeddings_method[:klass].call(
44
- texts: texts, client: embeddings_method[:client]
45
- )
46
- end
47
-
48
- def embeddings_method
49
- @embeddings_method ||=
50
- case @embedding_tool
51
- when :openai
52
- { klass: Boxcars::VectorStores::EmbedViaOpenAI, client: openai_client }
53
- when :tensorflow
54
- { klass: Boxcars::VectorStores::EmbedViaTensorflow, client: nil }
55
- end
56
- end
57
-
58
- # Get the OpenAI client
59
- # @param openai_access_token [String] the OpenAI access token
60
- # @return [OpenAI::Client]
61
- def openai_client(openai_access_token: nil)
62
- @openai_client ||= Openai.open_ai_client(openai_access_token: openai_access_token)
63
- end
64
- end
65
- end
66
- end
67
- end
@@ -1,81 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- # require 'openai'
4
- #
5
- # documents = [
6
- # { page_content: "hello", metadata: { a: 1 } },
7
- # { page_content: "hi", metadata: { a: 1 } },
8
- # { page_content: "bye", metadata: { a: 1 } },
9
- # { page_content: "what's this", metadata: { a: 1 } },
10
- # ]
11
- #
12
- # vector_documents = Boxcars::VectorStores::InMemory::AddDocuments.call(embedding_tool: :openai, documents: documents)
13
- #
14
- # result = Boxcars::VectorStores::InMemory::Search.call(vecotr_documents: vector_documents, query: "hello")
15
- #
16
- # expect(result).to eq(Boxcars::VectorStores::Document.new({ page_content: "hello", metadata: { a: 1 } }))
17
-
18
- module Boxcars
19
- module VectorStores
20
- module InMemory
21
- class Search
22
- include VectorStore
23
- def initialize(vector_documents:, query:, embedding_tool: :openai)
24
- validate_params(vector_documents, query, embedding_tool)
25
- @vector_documents = vector_documents
26
- @query = query
27
- @embedding_tool = embedding_tool
28
- end
29
-
30
- def call
31
- results = @vector_documents.map do |doc|
32
- {
33
- document: doc,
34
- similarity: cosine_similarity(query_vector, doc[:vector])
35
- }
36
- end
37
- results.min_by { |result| -result[:similarity] }[:document]
38
- end
39
-
40
- private
41
-
42
- def validate_params(vector_documents, query, embedding_tool)
43
- raise ::Boxcars::ArgumentError, 'query is empty' if query.to_s.empty?
44
- raise ::Boxcars::ArgumentError, 'embedding_tool is invalid' unless %i[openai tensorflow].include?(embedding_tool)
45
-
46
- unless vector_documents.is_a?(Array) && vector_documents.all? do |doc|
47
- doc.is_a?(Hash) && doc.key?(:document) && doc.key?(:vector)
48
- end
49
- raise ::Boxcars::ArgumentError, "vector_documents is not valid"
50
- end
51
- end
52
-
53
- def query_vector
54
- embeddings_method(@embedding_tool)[:klass].call(
55
- texts: [@query], client: embeddings_method(@embedding_tool)[:client]
56
- ).first
57
- end
58
-
59
- def openai_client(openai_access_token: nil)
60
- @openai_client ||= Openai.open_ai_client(openai_access_token: openai_access_token)
61
- end
62
-
63
- def embeddings_method(embedding_tool)
64
- case embedding_tool
65
- when :openai
66
- { klass: Boxcars::VectorStores::EmbedViaOpenAI, client: openai_client }
67
- when :tensorflow
68
- { klass: Boxcars::VectorStores::EmbedViaTensorflow, client: nil }
69
- end
70
- end
71
-
72
- def cosine_similarity(vector1, vector2)
73
- dot_product = vector1.zip(vector2).reduce(0) { |sum, (a, b)| sum + (a * b) }
74
- magnitude1 = Math.sqrt(vector1.reduce(0) { |sum, a| sum + (a**2) })
75
- magnitude2 = Math.sqrt(vector2.reduce(0) { |sum, b| sum + (b**2) })
76
- dot_product / (magnitude1 * magnitude2)
77
- end
78
- end
79
- end
80
- end
81
- end
@@ -1,55 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'hnswlib'
4
-
5
- module Boxcars
6
- module VectorStores
7
- class SimilaritySearch
8
- def initialize(embeddings:, vector_store:, openai_connection: nil, openai_access_token: nil)
9
- @embeddings = embeddings
10
- @vector_store = vector_store
11
- @similarity_search_instance = create_similarity_search_instance
12
- @openai_connection = openai_connection || default_connection(openai_access_token: openai_access_token)
13
- end
14
-
15
- def call(query:)
16
- validate_query(query)
17
- query_vector = convert_query_to_vector(query)
18
- @similarity_search_instance.call(query_vector)
19
- end
20
-
21
- private
22
-
23
- attr_reader :embeddings, :vector_store, :openai_connection
24
-
25
- def default_connection(openai_access_token: nil)
26
- Openai.open_ai_client(openai_access_token: openai_access_token)
27
- end
28
-
29
- def validate_query(query)
30
- raise_error 'query must be a string' unless query.is_a?(String)
31
- raise_error 'query must not be empty' if query.empty?
32
- end
33
-
34
- def convert_query_to_vector(query)
35
- Boxcars::VectorStores::EmbedViaOpenAI.call(texts: [query], client: openai_connection).first[:embedding]
36
- end
37
-
38
- def create_similarity_search_instance
39
- case vector_store
40
- when ::Hnswlib::HierarchicalNSW
41
- Boxcars::VectorStores::Hnswlib::HnswlibSearch.new(
42
- vector_store: vector_store,
43
- options: { json_doc_path: embeddings, num_neighbors: 2 }
44
- )
45
- else
46
- raise_error 'Unsupported vector store provided'
47
- end
48
- end
49
-
50
- def raise_error(message)
51
- raise ArgumentError, message
52
- end
53
- end
54
- end
55
- end