boxcars 0.2.10 → 0.2.12

Files changed (40)
  1. checksums.yaml +4 -4
  2. data/.env_sample +1 -0
  3. data/.rubocop.yml +16 -0
  4. data/CHANGELOG.md +26 -2
  5. data/Gemfile +12 -12
  6. data/Gemfile.lock +34 -28
  7. data/README.md +4 -1
  8. data/boxcars.gemspec +2 -2
  9. data/lib/boxcars/boxcar/active_record.rb +1 -1
  10. data/lib/boxcars/boxcar.rb +2 -1
  11. data/lib/boxcars/engine/openai.rb +8 -1
  12. data/lib/boxcars/vector_search.rb +75 -0
  13. data/lib/boxcars/{boxcar/vector_stores → vector_store}/document.rb +4 -3
  14. data/lib/boxcars/{boxcar/vector_stores → vector_store}/embed_via_open_ai.rb +3 -3
  15. data/lib/boxcars/{boxcar/vector_stores → vector_store}/embed_via_tensorflow.rb +1 -1
  16. data/lib/boxcars/vector_store/hnswlib/build_from_files.rb +100 -0
  17. data/lib/boxcars/vector_store/hnswlib/load_from_disk.rb +57 -0
  18. data/lib/boxcars/vector_store/hnswlib/save_to_hnswlib.rb +90 -0
  19. data/lib/boxcars/vector_store/hnswlib/search.rb +70 -0
  20. data/lib/boxcars/vector_store/in_memory/build_from_document_array.rb +51 -0
  21. data/lib/boxcars/vector_store/in_memory/build_from_files.rb +61 -0
  22. data/lib/boxcars/vector_store/in_memory/search.rb +61 -0
  23. data/lib/boxcars/vector_store/pgvector/build_from_array.rb +95 -0
  24. data/lib/boxcars/vector_store/pgvector/build_from_files.rb +97 -0
  25. data/lib/boxcars/vector_store/pgvector/save_to_database.rb +152 -0
  26. data/lib/boxcars/vector_store/pgvector/search.rb +144 -0
  27. data/lib/boxcars/{boxcar/vector_stores → vector_store}/split_text.rb +3 -4
  28. data/lib/boxcars/vector_store.rb +100 -0
  29. data/lib/boxcars/version.rb +1 -1
  30. data/lib/boxcars.rb +1 -1
  31. metadata +22 -18
  32. data/lib/boxcars/boxcar/vector_search.rb +0 -11
  33. data/lib/boxcars/boxcar/vector_store.rb +0 -34
  34. data/lib/boxcars/boxcar/vector_stores/hnswlib/build_vector_store.rb +0 -157
  35. data/lib/boxcars/boxcar/vector_stores/hnswlib/hnswlib_config.rb +0 -56
  36. data/lib/boxcars/boxcar/vector_stores/hnswlib/hnswlib_search.rb +0 -54
  37. data/lib/boxcars/boxcar/vector_stores/hnswlib/save_to_hnswlib.rb +0 -80
  38. data/lib/boxcars/boxcar/vector_stores/in_memory/add_documents.rb +0 -67
  39. data/lib/boxcars/boxcar/vector_stores/in_memory/search.rb +0 -81
  40. data/lib/boxcars/boxcar/vector_stores/similarity_search.rb +0 -55

data/lib/boxcars/vector_store/hnswlib/save_to_hnswlib.rb
@@ -0,0 +1,90 @@
+ # frozen_string_literal: true
+
+ require 'hnswlib'
+ require 'json'
+ require 'fileutils'
+
+ module Boxcars
+   module VectorStore
+     module Hnswlib
+       class SaveToHnswlib
+         include VectorStore
+
+         # @param hnsw_vectors_array [Array<Boxcars::VectorStore::Document>] vectors to save. Each document's
+         #   metadata must include :dim and :index_file_path; :json_doc_file_path, :metric and :max_item are
+         #   optional and default to the index path with a .json extension, "l2" and 10000 respectively.
+         def initialize(hnsw_vectors_array)
+           @metadata = hnsw_vectors_array&.first&.metadata
+           validate_params(hnsw_vectors_array, metadata)
+
+           @vectors = hnsw_vectors_array
+           @index_file_path = metadata[:index_file_path]
+           @json_doc_file_path = metadata[:json_doc_file_path] || @index_file_path.gsub(/\.bin$/, '.json')
+
+           @metric = metadata[:metric] || "l2"
+           @dim = metadata[:dim]
+           @max_item = metadata[:max_item] || 10000
+
+           @index = ::Hnswlib::HierarchicalNSW.new(
+             space: @metric,
+             dim: @dim
+           )
+           @index.init_index(max_elements: @max_item)
+         end
+
+         def call
+           document_texts = add_vectors_to_index
+           write_files(index, document_texts)
+         end
+
+         private
+
+         attr_reader :metadata, :index, :vectors, :index_file_path, :json_doc_file_path, :metric, :dim, :max_item
+
+         def validate_params(hnsw_vectors_array, metadata)
+           raise_argument_error('argument must be an array') unless hnsw_vectors_array.is_a?(Array)
+           raise_argument_error('missing data') if hnsw_vectors_array.empty?
+           raise_argument_error('missing metadata') if metadata.nil? || metadata.empty?
+
+           raise_argument_error('missing dim') unless metadata[:dim]
+           raise_argument_error('dim must be an integer') unless metadata[:dim].is_a?(Integer)
+           raise_argument_error('missing index_file_path') unless metadata[:index_file_path]
+
+           check_parent_directory(metadata[:index_file_path])
+           check_parent_directory(metadata[:json_doc_file_path])
+         end
+
+         def add_vectors_to_index
+           document_texts = []
+
+           vectors.each do |item|
+             index.add_point(item.embedding, item.metadata[:doc_id])
+
+             document_texts << {
+               doc_id: item.metadata[:doc_id],
+               embedding: item.embedding,
+               document: item.content,
+               metadata: item.metadata
+             }
+           end
+           document_texts
+         end
+
+         def write_files(index, document_texts)
+           FileUtils.mkdir_p(File.dirname(json_doc_file_path))
+           File.write(json_doc_file_path, document_texts.to_json)
+
+           FileUtils.mkdir_p(File.dirname(index_file_path))
+           index.save_index(index_file_path)
+         end
+
+         def check_parent_directory(path)
+           return unless path
+
+           parent_dir = File.dirname(path)
+           raise_argument_error('parent directory must exist') unless File.directory?(parent_dir)
+         end
+       end
+     end
+   end
+ end
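
A minimal usage sketch for the class above. The Document constructor and the metadata keys (:doc_id, :dim, :index_file_path) come from the other files in this diff; the embedding values and the ./tmp path are illustrative, and the parent directory of the index path must already exist:

  doc = Boxcars::VectorStore::Document.new(
    content: "hello world",
    embedding: [0.1, 0.2, 0.3],
    metadata: { doc_id: 0, dim: 3, index_file_path: "./tmp/test_hnsw_index.bin" }
  )
  Boxcars::VectorStore::Hnswlib::SaveToHnswlib.new([doc]).call
  # writes ./tmp/test_hnsw_index.bin and ./tmp/test_hnsw_index.json
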
data/lib/boxcars/vector_store/hnswlib/search.rb
@@ -0,0 +1,70 @@
+ # frozen_string_literal: true
+
+ require 'hnswlib'
+ require 'json'
+
+ module Boxcars
+   module VectorStore
+     module Hnswlib
+       class Search
+         include VectorStore
+
+         def initialize(params)
+           validate_params(params[:vector_documents])
+           @vector_documents = params[:vector_documents]
+           @search_index = load_index(params[:vector_documents])
+         end
+
+         def call(query_vector:, count: 1)
+           search(query_vector, count)
+         end
+
+         private
+
+         attr_reader :vector_documents, :vector_store, :json_doc, :search_index, :metadata
+
+         def validate_params(vector_documents)
+           raise_argument_error('vector_documents is nil') unless vector_documents
+           raise_argument_error('vector_documents must be a hash') unless vector_documents.is_a?(Hash)
+           raise_argument_error('type must be hnswlib') unless vector_documents[:type] == :hnswlib
+           raise_argument_error('vector_store is nil') unless vector_documents[:vector_store]
+           raise_argument_error('vector_store must be an array') unless vector_documents[:vector_store].is_a?(Array)
+
+           unless vector_documents[:vector_store].all? { |doc| doc.is_a?(Document) }
+             raise_argument_error('vector_store must be an array of Document objects')
+           end
+
+           true
+         end
+
+         def load_index(vector_documents)
+           @metadata = vector_documents[:vector_store].first.metadata
+           @json_doc = @metadata[:json_doc_file_path]
+
+           search_index = ::Hnswlib::HierarchicalNSW.new(
+             space: metadata[:metric],
+             dim: metadata[:dim]
+           )
+           search_index.load_index(metadata[:index_file_path])
+           @search_index = search_index
+           @vector_store = vector_documents[:vector_store]
+
+           search_index
+         end
+
+         def search(query_vector, num_neighbors)
+           raw_results = search_index.search_knn(query_vector, num_neighbors)
+           raw_results.map { |doc_id, distance| lookup_embedding(doc_id, distance) }.compact
+         rescue StandardError => e
+           raise_argument_error("Error searching for #{query_vector}: #{e.message}")
+         end
+
+         def lookup_embedding(doc_id, distance)
+           return unless vector_store[doc_id]
+
+           { document: vector_store[doc_id], distance: distance }
+         end
+       end
+     end
+   end
+ end
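
A usage sketch for Hnswlib::Search, assuming vector_documents is the hash produced by the Hnswlib build step (for example Boxcars::VectorStore::Hnswlib::BuildFromFiles, also added in this release); the query vector and count are illustrative:

  searcher = Boxcars::VectorStore::Hnswlib::Search.new(vector_documents: vector_documents)
  results = searcher.call(query_vector: [0.1, 0.2, 0.3], count: 2)
  # => [{ document: #<Boxcars::VectorStore::Document ...>, distance: 0.05 }, ...]

Note that lookup_embedding indexes the :vector_store array by the doc_id returned from the index, so the documents must sit in the array at the position matching their doc_id.
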
data/lib/boxcars/vector_store/in_memory/build_from_document_array.rb
@@ -0,0 +1,51 @@
+ # frozen_string_literal: true
+
+ module Boxcars
+   module VectorStore
+     module InMemory
+       class BuildFromDocumentArray
+         include VectorStore
+
+         def initialize(embedding_tool: :openai, documents: nil)
+           validate_params(embedding_tool, documents)
+           @embedding_tool = embedding_tool
+           @documents = documents
+           @memory_vectors = []
+         end
+
+         def call
+           texts = documents
+           vectors = generate_vectors(texts)
+           add_vectors(vectors, documents)
+           {
+             type: :in_memory,
+             vector_store: memory_vectors
+           }
+         end
+
+         private
+
+         attr_reader :documents, :memory_vectors
+
+         def validate_params(embedding_tool, documents)
+           raise_argument_error('documents is nil') unless documents
+           return if %i[openai tensorflow].include?(embedding_tool)
+
+           raise_argument_error('embedding_tool is invalid')
+         end
+
+         # returns array of documents with vectors
+         def add_vectors(vectors, documents)
+           vectors.zip(documents).each do |vector, doc|
+             memory_vector = Document.new(
+               content: doc[:content],
+               embedding: vector[:embedding],
+               metadata: doc[:metadata].merge(dim: vector[:dim])
+             )
+             @memory_vectors << memory_vector
+           end
+         end
+       end
+     end
+   end
+ end
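
A usage sketch for the in-memory build from an array of documents. Each item needs :content and :metadata keys (see add_vectors above); the :openai embedding tool requires an OpenAI API key to be configured, and the example texts are illustrative:

  documents = [
    { content: "hello", metadata: { source: "greeting" } },
    { content: "goodbye", metadata: { source: "farewell" } }
  ]
  store = Boxcars::VectorStore::InMemory::BuildFromDocumentArray.new(
    embedding_tool: :openai,
    documents: documents
  ).call
  # => { type: :in_memory, vector_store: [#<Boxcars::VectorStore::Document ...>, ...] }
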
data/lib/boxcars/vector_store/in_memory/build_from_files.rb
@@ -0,0 +1,61 @@
+ # frozen_string_literal: true
+
+ module Boxcars
+   module VectorStore
+     module InMemory
+       class BuildFromFiles
+         include VectorStore
+
+         def initialize(params)
+           @split_chunk_size = params[:split_chunk_size] || 2000
+           @training_data_path = File.absolute_path(params[:training_data_path])
+           @embedding_tool = params[:embedding_tool] || :openai
+
+           validate_params(embedding_tool, training_data_path)
+           @memory_vectors = []
+         end
+
+         def call
+           data = load_data_files(training_data_path)
+           texts = split_text_into_chunks(data)
+           vectors = generate_vectors(texts)
+           add_vectors(vectors, texts)
+
+           {
+             type: :in_memory,
+             vector_store: memory_vectors
+           }
+         end
+
+         private
+
+         attr_reader :split_chunk_size, :training_data_path, :embedding_tool, :memory_vectors
+
+         def validate_params(embedding_tool, training_data_path)
+           training_data_dir = File.dirname(training_data_path.gsub(/\*{1,2}/, ''))
+
+           raise_argument_error('training_data_path parent directory must exist') unless File.directory?(training_data_dir)
+           raise_argument_error('No files found at the training_data_path pattern') if Dir.glob(training_data_path).empty?
+
+           return if %i[openai tensorflow].include?(embedding_tool)
+
+           raise_argument_error('embedding_tool is invalid')
+         end
+
+         def add_vectors(vectors, texts)
+           vectors.map.with_index do |vector, index|
+             memory_vector = Document.new(
+               content: texts[index],
+               embedding: vector[:embedding],
+               metadata: {
+                 doc_id: index,
+                 training_data_path: training_data_path
+               }
+             )
+             memory_vectors << memory_vector
+           end
+         end
+       end
+     end
+   end
+ end
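
A usage sketch for building the in-memory store from files on disk; training_data_path is a glob pattern whose parent directory must exist, and the path and chunk size here are illustrative:

  store = Boxcars::VectorStore::InMemory::BuildFromFiles.new(
    training_data_path: "./training_data/**/*.md",
    split_chunk_size: 400,
    embedding_tool: :openai
  ).call
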
data/lib/boxcars/vector_store/in_memory/search.rb
@@ -0,0 +1,61 @@
+ # frozen_string_literal: true
+
+ module Boxcars
+   module VectorStore
+     module InMemory
+       class Search
+         include VectorStore
+
+         def initialize(params)
+           validate_params(params[:vector_documents])
+           @vector_documents = params[:vector_documents]
+         end
+
+         def call(query_vector:, count: 1)
+           raise ::Boxcars::ArgumentError, 'query_vector is empty' if query_vector.empty?
+
+           search(query_vector, count)
+         end
+
+         private
+
+         attr_reader :vector_documents
+
+         def validate_params(vector_documents)
+           return if valid_vector_store?(vector_documents)
+
+           raise ::Boxcars::ArgumentError, "vector_documents is not valid"
+         end
+
+         def valid_vector_store?(vector_documents)
+           vector_documents && vector_documents[:type] == :in_memory &&
+             vector_documents[:vector_store].is_a?(Array) &&
+             vector_documents[:vector_store].all? do |doc|
+               doc.is_a?(Boxcars::VectorStore::Document)
+             end
+         end
+
+         def search(query_vector, num_neighbors)
+           results = vector_documents[:vector_store].map do |doc|
+             {
+               document: doc,
+               similarity: cosine_similarity(query_vector, doc.embedding)
+             }
+           end
+           results.sort_by { |result| -result[:similarity] }
+                  .first(num_neighbors)
+         rescue StandardError => e
+           raise_argument_error("Error searching for #{query_vector}: #{e.message}")
+           raise_error
+         end
+
+         def cosine_similarity(vector1, vector2)
+           dot_product = vector1.zip(vector2).reduce(0) { |sum, (a, b)| sum + (a * b) }
+           magnitude1 = Math.sqrt(vector1.reduce(0) { |sum, a| sum + (a**2) })
+           magnitude2 = Math.sqrt(vector2.reduce(0) { |sum, b| sum + (b**2) })
+           dot_product / (magnitude1 * magnitude2)
+         end
+       end
+     end
+   end
+ end
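
A usage sketch for the in-memory search; store is the hash returned by either in-memory build class above, and the query vector (which must have the same dimension as the stored embeddings) is illustrative:

  search = Boxcars::VectorStore::InMemory::Search.new(vector_documents: store)
  results = search.call(query_vector: [0.1, 0.2, 0.3], count: 2)
  results.first # => { document: #<Boxcars::VectorStore::Document ...>, similarity: 0.97 }
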
data/lib/boxcars/vector_store/pgvector/build_from_array.rb
@@ -0,0 +1,95 @@
+ # frozen_string_literal: true
+
+ module Boxcars
+   module VectorStore
+     # install pgvector: https://github.com/pgvector/pgvector#installation-notes
+     module Pgvector
+       class BuildFromArray
+         include VectorStore
+
+         # params = {
+         #   embedding_tool: embedding_tool,
+         #   input_array: input_array,
+         #   database_url: db_url,
+         #   table_name: table_name,
+         #   embedding_column_name: embedding_column_name,
+         #   content_column_name: content_column_name,
+         #   metadata_column_name: metadata_column_name
+         # }
+         def initialize(params)
+           @embedding_tool = params[:embedding_tool] || :openai
+
+           validate_params(embedding_tool, params[:input_array])
+
+           @database_url = params[:database_url]
+           @table_name = params[:table_name]
+           @embedding_column_name = params[:embedding_column_name]
+           @content_column_name = params[:content_column_name]
+           @metadata_column_name = params[:metadata_column_name]
+
+           @input_array = params[:input_array]
+           @pg_vectors = []
+         end
+
+         def call
+           texts = input_array
+           vectors = generate_vectors(texts)
+           add_vectors(vectors, texts)
+           documents = save_vector_store
+
+           {
+             type: :pgvector,
+             vector_store: documents
+           }
+         end
+
+         private
+
+         attr_reader :input_array, :embedding_tool, :pg_vectors, :database_url,
+                     :table_name, :embedding_column_name, :content_column_name,
+                     :metadata_column_name
+
+         def validate_params(embedding_tool, input_array)
+           raise_argument_error('input_array is nil') unless input_array
+           raise_argument_error('input_array must be an array of hashes') unless input_array.is_a?(Array)
+           raise_argument_error('embedding_tool is invalid') unless %i[openai tensorflow].include?(embedding_tool)
+
+           # each item needs :content and :metadata so it can be embedded and stored
+           input_array.each do |item|
+             next if item.key?(:content) && item.key?(:metadata)
+
+             raise_argument_error('each input_array item must include :content and :metadata')
+           end
+         end
+
+         def add_vectors(vectors, texts)
+           raise_argument_error("vectors are nil") unless vectors
+           raise_argument_error("vectors and texts are not the same size") unless vectors.size == texts.size
+
+           vectors.zip(texts) do |vector, doc|
+             pg_vector = Document.new(
+               content: doc[:content],
+               embedding: vector[:embedding],
+               metadata: doc[:metadata]
+             )
+             @pg_vectors << pg_vector
+           end
+         end
+
+         def save_vector_store
+           result = Boxcars::VectorStore::Pgvector::SaveToDatabase.call(
+             pg_vectors: pg_vectors,
+             database_url: database_url,
+             table_name: table_name,
+             embedding_column_name: embedding_column_name,
+             content_column_name: content_column_name,
+             metadata_column_name: metadata_column_name
+           )
+           raise_argument_error('Error saving vector store to database.') unless result
+
+           result
+         end
+       end
+     end
+   end
+ end
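
A usage sketch for the pgvector build from an array. The parameter names come from the comment above initialize; the table, column names, and DATABASE_URL are illustrative, and SaveToDatabase expects the table and columns to already exist (see save_to_database.rb below):

  Boxcars::VectorStore::Pgvector::BuildFromArray.new(
    embedding_tool: :openai,
    input_array: [{ content: "hello", metadata: { source: "greeting" } }],
    database_url: ENV['DATABASE_URL'],
    table_name: 'items',
    embedding_column_name: 'embedding',
    content_column_name: 'content',
    metadata_column_name: 'metadata'
  ).call
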
data/lib/boxcars/vector_store/pgvector/build_from_files.rb
@@ -0,0 +1,97 @@
+ # frozen_string_literal: true
+
+ require 'pgvector'
+ require 'fileutils'
+ require 'json'
+
+ module Boxcars
+   module VectorStore
+     module Pgvector
+       class BuildFromFiles
+         include VectorStore
+
+         # params = {
+         #   training_data_path: training_data_path,
+         #   split_chunk_size: 200,
+         #   embedding_tool: embedding_tool,
+         #   database_url: db_url,
+         #   table_name: table_name,
+         #   embedding_column_name: embedding_column_name,
+         #   content_column_name: content_column_name
+         # }
+         def initialize(params)
+           @split_chunk_size = params[:split_chunk_size] || 2000
+           @training_data_path = File.absolute_path(params[:training_data_path])
+           @embedding_tool = params[:embedding_tool] || :openai
+
+           validate_params(embedding_tool, training_data_path)
+
+           @database_url = params[:database_url]
+           @table_name = params[:table_name]
+           @embedding_column_name = params[:embedding_column_name]
+           @content_column_name = params[:content_column_name]
+           @metadata_column_name = params[:metadata_column_name]
+
+           @pg_vectors = []
+         end
+
+         def call
+           data = load_data_files(training_data_path)
+           texts = split_text_into_chunks(data)
+           embeddings = generate_vectors(texts)
+           add_vectors(embeddings, texts)
+           documents = save_vector_store
+
+           {
+             type: :pgvector,
+             vector_store: documents
+           }
+         end
+
+         private
+
+         attr_reader :split_chunk_size, :training_data_path, :embedding_tool, :database_url,
+                     :table_name, :embedding_column_name, :content_column_name,
+                     :metadata_column_name, :pg_vectors
+
+         def validate_params(embedding_tool, training_data_path)
+           training_data_dir = File.dirname(training_data_path.gsub(/\*{1,2}/, ''))
+
+           raise_argument_error('training_data_path parent directory must exist') unless File.directory?(training_data_dir)
+           raise_argument_error('No files found at the training_data_path pattern') if Dir.glob(training_data_path).empty?
+           return if %i[openai tensorflow].include?(embedding_tool)
+
+           raise_argument_error('embedding_tool is invalid')
+         end
+
+         def add_vectors(vectors, texts)
+           vectors.map.with_index do |vector, index|
+             pg_vector = Document.new(
+               content: texts[index],
+               embedding: vector[:embedding],
+               metadata: {
+                 doc_id: index,
+                 training_data_path: training_data_path
+               }
+             )
+             pg_vectors << pg_vector
+           end
+         end
+
+         def save_vector_store
+           result = Boxcars::VectorStore::Pgvector::SaveToDatabase.call(
+             pg_vectors: pg_vectors,
+             database_url: database_url,
+             table_name: table_name,
+             embedding_column_name: embedding_column_name,
+             content_column_name: content_column_name,
+             metadata_column_name: metadata_column_name
+           )
+           raise_argument_error('Error saving vector store to database.') unless result
+
+           result
+         end
+       end
+     end
+   end
+ end
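
A usage sketch for the file-based pgvector build, combining the glob-pattern loading of the in-memory variant with the database parameters above; all concrete values are illustrative:

  Boxcars::VectorStore::Pgvector::BuildFromFiles.new(
    training_data_path: "./training_data/**/*.md",
    split_chunk_size: 400,
    embedding_tool: :openai,
    database_url: ENV['DATABASE_URL'],
    table_name: 'items',
    embedding_column_name: 'embedding',
    content_column_name: 'content',
    metadata_column_name: 'metadata'
  ).call
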
data/lib/boxcars/vector_store/pgvector/save_to_database.rb
@@ -0,0 +1,152 @@
+ # frozen_string_literal: true
+
+ require 'pg'
+ require 'pgvector'
+
+ module Boxcars
+   module VectorStore
+     module Pgvector
+       class SaveToDatabase
+         include VectorStore
+
+         # params = {
+         #   pg_vectors: pg_vectors,
+         #   database_url: db_url,
+         #   table_name: table_name,
+         #   embedding_column_name: embedding_column_name,
+         #   content_column_name: content_column_name
+         # }
+         def initialize(params)
+           @errors = []
+           validate_param_types(params)
+           @db_connection = test_db_params(params)
+
+           @table_name = params[:table_name]
+           @content_column_name = params[:content_column_name]
+           @embedding_column_name = params[:embedding_column_name]
+           @metadata_column_name = params[:metadata_column_name]
+
+           @pg_vectors = params[:pg_vectors]
+         end
+
+         def call
+           return { success: false, error: errors } unless errors.empty?
+
+           add_vectors_to_database
+         end
+
+         private
+
+         attr_reader :database_url, :pg_vectors, :db_connection, :table_name,
+                     :embedding_column_name, :content_column_name,
+                     :metadata_column_name, :errors
+
+         def validate_param_types(params)
+           pg_vectors = params[:pg_vectors]
+
+           raise_argument_error('pg_vectors must be an array') unless pg_vectors.is_a?(Array)
+           raise_argument_error('missing data') if pg_vectors.empty?
+           raise_argument_error('invalid vector_store') unless valid_vector_store?(pg_vectors)
+           @database_url = params[:database_url]
+           raise_argument_error('missing database_url argument') if @database_url.to_s.empty?
+         end
+
+         def valid_vector_store?(pg_vectors)
+           pg_vectors.all? do |doc|
+             doc.is_a?(Boxcars::VectorStore::Document)
+           end
+         rescue TypeError => e
+           raise_argument_error(e.message)
+         end
+
+         def test_db_params(params)
+           conn = ::PG::Connection.new(@database_url)
+
+           check_db_connection(conn)
+           check_vector_extension(conn)
+           check_table_exists(conn, params[:table_name])
+           check_column_exists(conn, params)
+
+           registry = PG::BasicTypeRegistry.new.define_default_types
+           ::Pgvector::PG.register_vector(registry)
+           conn.type_map_for_queries = PG::BasicTypeMapForQueries.new(conn, registry: registry)
+           conn.type_map_for_results = PG::BasicTypeMapForResults.new(conn, registry: registry)
+           conn
+         rescue PG::Error, NameError => e
+           raise_argument_error(e.message)
+         end
+
+         def check_db_connection(conn)
+           return if conn.status == PG::CONNECTION_OK
+
+           raise_argument_error("PostgreSQL connection is not ok")
+         end
+
+         def check_vector_extension(conn)
+           return if conn.exec("SELECT 1 FROM pg_extension WHERE extname = 'vector'").any?
+
+           raise_argument_error("PostgreSQL 'vector' extension is not installed")
+         end
+
+         def check_table_exists(conn, table_name)
+           table_exists = conn.exec_params(
+             "SELECT EXISTS (SELECT 1 FROM information_schema.tables WHERE table_name = $1)", [table_name]
+           ).getvalue(0, 0) == "t"
+           return if table_exists
+
+           raise_argument_error("Table '#{table_name}' does not exist")
+         end
+
+         def check_column_exists(conn, params)
+           column_names = %i[embedding_column_name content_column_name metadata_column_name]
+           table_name = params[:table_name]
+
+           column_names.each do |target|
+             column_name = params[target]
+             column_exists = conn.exec_params(
+               "SELECT EXISTS (SELECT 1 FROM information_schema.columns WHERE table_name = $1 AND column_name = $2)",
+               [table_name, column_name]
+             ).getvalue(0, 0) == "t"
+             next if column_exists
+
+             raise_argument_error("Column '#{column_name}' does not exist in table '#{table_name}'")
+           end
+         end
+
+         def add_vectors_to_database
+           pg_vectors.each do |document|
+             embedding = document.embedding.map(&:to_f)
+             content = document.content
+             metadata = document.metadata.to_json
+
+             if document.metadata[:id]
+               id = document.metadata[:id]
+               # directly inserting table_name, embedding_column_name, and content_column_name
+               # into the SQL command. If these values are coming from an untrusted source,
+               # there is a risk of SQL injection
+               sql = <<-SQL
+                 INSERT INTO #{table_name} (id, #{embedding_column_name}, #{content_column_name}, #{metadata_column_name})
+                 VALUES ($1, $2, $3, $4)
+                 ON CONFLICT (id) DO UPDATE
+                 SET #{embedding_column_name} = EXCLUDED.#{embedding_column_name},
+                     #{content_column_name} = EXCLUDED.#{content_column_name},
+                     #{metadata_column_name} = EXCLUDED.#{metadata_column_name}
+               SQL
+               # parameters are given separately from the SQL command,
+               # there's no risk of them being interpreted as part of the command.
+               db_connection.exec_params(sql, [id, embedding, content, metadata])
+             else
+               sql = <<-SQL
+                 INSERT INTO #{table_name} (#{embedding_column_name}, #{content_column_name}, #{metadata_column_name})
+                 VALUES ($1, $2, $3)
+               SQL
+               db_connection.exec_params(sql, [embedding, content, metadata])
+             end
+           end
+         rescue PG::Error => e
+           raise_argument_error(e.message)
+         end
+       end
+     end
+   end
+ end
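
SaveToDatabase only checks that the vector extension, the table, and the three columns exist; it does not create them. A schema sketch that would satisfy those checks, assuming the illustrative table and column names used above and a 1536-dimension embedding (the size produced by OpenAI's text-embedding-ada-002 model):

  conn = PG::Connection.new(ENV['DATABASE_URL'])
  conn.exec('CREATE EXTENSION IF NOT EXISTS vector')
  conn.exec(<<-SQL)
    CREATE TABLE IF NOT EXISTS items (
      id bigserial PRIMARY KEY,
      content text,
      metadata jsonb,
      embedding vector(1536)
    )
  SQL

Documents whose metadata include an :id are upserted via ON CONFLICT (id), so the id column needs a primary key or unique constraint for that path to work.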