boxcars 0.2.11 → 0.2.12

Sign up to get free protection for your applications and to get access to all the features.
Files changed (35) hide show
  1. checksums.yaml +4 -4
  2. data/.env_sample +1 -0
  3. data/.rubocop.yml +16 -0
  4. data/CHANGELOG.md +12 -0
  5. data/Gemfile +12 -12
  6. data/Gemfile.lock +34 -28
  7. data/README.md +4 -1
  8. data/boxcars.gemspec +2 -2
  9. data/lib/boxcars/boxcar/active_record.rb +1 -1
  10. data/lib/boxcars/boxcar.rb +1 -0
  11. data/lib/boxcars/engine/openai.rb +8 -1
  12. data/lib/boxcars/vector_search.rb +66 -2
  13. data/lib/boxcars/vector_store/document.rb +3 -2
  14. data/lib/boxcars/vector_store/embed_via_open_ai.rb +2 -2
  15. data/lib/boxcars/vector_store/hnswlib/build_from_files.rb +100 -0
  16. data/lib/boxcars/vector_store/hnswlib/load_from_disk.rb +57 -0
  17. data/lib/boxcars/vector_store/hnswlib/save_to_hnswlib.rb +48 -38
  18. data/lib/boxcars/vector_store/hnswlib/search.rb +70 -0
  19. data/lib/boxcars/vector_store/in_memory/build_from_document_array.rb +51 -0
  20. data/lib/boxcars/vector_store/in_memory/build_from_files.rb +61 -0
  21. data/lib/boxcars/vector_store/in_memory/search.rb +29 -49
  22. data/lib/boxcars/vector_store/pgvector/build_from_array.rb +95 -0
  23. data/lib/boxcars/vector_store/pgvector/build_from_files.rb +97 -0
  24. data/lib/boxcars/vector_store/pgvector/save_to_database.rb +152 -0
  25. data/lib/boxcars/vector_store/pgvector/search.rb +144 -0
  26. data/lib/boxcars/vector_store/split_text.rb +2 -3
  27. data/lib/boxcars/vector_store.rb +73 -7
  28. data/lib/boxcars/version.rb +1 -1
  29. data/lib/boxcars.rb +1 -1
  30. metadata +14 -10
  31. data/lib/boxcars/vector_store/hnswlib/build_vector_store.rb +0 -157
  32. data/lib/boxcars/vector_store/hnswlib/hnswlib_config.rb +0 -56
  33. data/lib/boxcars/vector_store/hnswlib/hnswlib_search.rb +0 -54
  34. data/lib/boxcars/vector_store/in_memory/add_documents.rb +0 -67
  35. data/lib/boxcars/vector_store/similarity_search.rb +0 -55
@@ -0,0 +1,152 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'pg'
4
+ require 'pgvector'
5
+
module Boxcars
  module VectorStore
    module Pgvector
      # Writes an array of Boxcars::VectorStore::Document objects into an
      # existing PostgreSQL table that has a pgvector embedding column.
      #
      # All validation (argument types, connection, 'vector' extension, table
      # and column existence) happens in the constructor; failures are reported
      # through raise_argument_error.
      class SaveToDatabase
        include VectorStore

        # @param params [Hash] expected keys:
        #   pg_vectors: [Array<Boxcars::VectorStore::Document>] documents to store
        #   database_url: [String] PostgreSQL connection string
        #   table_name: [String] target table (must already exist)
        #   embedding_column_name: [String]
        #   content_column_name: [String]
        #   metadata_column_name: [String]
        def initialize(params)
          @errors = []
          validate_param_types(params)
          @db_connection = test_db_params(params)

          @table_name = params[:table_name]
          @content_column_name = params[:content_column_name]
          @embedding_column_name = params[:embedding_column_name]
          @metadata_column_name = params[:metadata_column_name]

          @pg_vectors = params[:pg_vectors]
        end

        # Inserts (or upserts, when a document carries metadata[:id]) every document.
        #
        # @return [Array, Hash] the pg_vectors array on success; the error hash is
        #   only returned if @errors is non-empty (nothing in this class appends to it).
        def call
          return { success: false, error: errors } unless errors.empty?

          add_vectors_to_database
        end

        private

        attr_reader :database_url, :pg_vectors, :db_connection, :table_name,
                    :embedding_column_name, :content_column_name,
                    :metadata_column_name, :errors

        # Checks the shape of the arguments before any database work is done.
        def validate_param_types(params)
          pg_vectors = params[:pg_vectors]

          raise_argument_error('pg_vectors must be an array') unless pg_vectors.is_a?(Array)
          raise_argument_error('missing data') if pg_vectors.empty?
          raise_argument_error('invalid vector_store') unless valid_vector_store?(pg_vectors)
          @database_url = params[:database_url]
          raise_argument_error('missing database_url argument') if @database_url.to_s.empty?
        end

        # @return [Boolean] true when every element is a Boxcars::VectorStore::Document
        def valid_vector_store?(pg_vectors)
          pg_vectors.all? { |doc| doc.is_a?(Boxcars::VectorStore::Document) }
        end

        # Opens the connection, verifies the schema and registers the pgvector
        # type so Ruby arrays can be bound to vector columns.
        #
        # @return [PG::Connection] a verified, type-mapped connection
        def test_db_params(params)
          conn = ::PG::Connection.new(@database_url)

          check_db_connection(conn)
          check_vector_extension(conn)
          check_table_exists(conn, params[:table_name])
          check_column_exists(conn, params)
          register_vector_type(conn)

          verified = true
          conn
        rescue ::PG::Error, NameError => e
          raise_argument_error(e.message)
        ensure
          # Do not leak the socket when any of the checks above failed.
          conn.close if conn && !verified && !conn.finished?
        end

        def register_vector_type(conn)
          registry = ::PG::BasicTypeRegistry.new.define_default_types
          ::Pgvector::PG.register_vector(registry)
          conn.type_map_for_queries = ::PG::BasicTypeMapForQueries.new(conn, registry: registry)
          conn.type_map_for_results = ::PG::BasicTypeMapForResults.new(conn, registry: registry)
        end

        def check_db_connection(conn)
          return if conn.status == ::PG::CONNECTION_OK

          raise_argument_error("PostgreSQL connection is not ok")
        end

        def check_vector_extension(conn)
          return if conn.exec("SELECT 1 FROM pg_extension WHERE extname = 'vector'").any?

          raise_argument_error("PostgreSQL 'vector' extension is not installed")
        end

        def check_table_exists(conn, table_name)
          table_exists = conn.exec_params(
            "SELECT EXISTS (SELECT 1 FROM information_schema.tables WHERE table_name = $1)", [table_name]
          ).getvalue(0, 0) == "t"
          return if table_exists

          raise_argument_error("Table '#{table_name}' does not exist")
        end

        def check_column_exists(conn, params)
          column_names = %i[embedding_column_name content_column_name metadata_column_name]
          table_name = params[:table_name]

          column_names.each do |target|
            column_name = params[target]
            column_exists = conn.exec_params(
              "SELECT EXISTS (SELECT 1 FROM information_schema.columns WHERE table_name = $1 AND column_name = $2)",
              [table_name, column_name]
            ).getvalue(0, 0) == "t"
            next if column_exists

            raise_argument_error("Column '#{column_name}' does not exist in table '#{table_name}'")
          end
        end

        # Stores every document. Documents whose metadata has an :id are upserted
        # on that id; all others are plain inserts.
        def add_vectors_to_database
          # The statements are identical for every row, so build them once.
          upsert_sql = build_upsert_sql
          insert_sql = build_insert_sql

          pg_vectors.each do |document|
            embedding = document.embedding.map(&:to_f)
            content = document.content
            metadata = document.metadata.to_json
            id = document.metadata[:id]

            # Values always travel as bind parameters, never inside the SQL text.
            if id
              db_connection.exec_params(upsert_sql, [id, embedding, content, metadata])
            else
              db_connection.exec_params(insert_sql, [embedding, content, metadata])
            end
          end
        rescue ::PG::Error => e
          raise_argument_error(e.message)
        end

        # Identifiers cannot be sent as bind parameters, so they are quoted with
        # the driver's own escaping before being placed in the statement.
        def quoted(identifier)
          db_connection.quote_ident(identifier.to_s)
        end

        def build_upsert_sql
          table = quoted(table_name)
          embedding = quoted(embedding_column_name)
          content = quoted(content_column_name)
          metadata = quoted(metadata_column_name)

          <<~SQL
            INSERT INTO #{table} (id, #{embedding}, #{content}, #{metadata})
            VALUES ($1, $2, $3, $4)
            ON CONFLICT (id) DO UPDATE
            SET #{embedding} = EXCLUDED.#{embedding},
                #{content} = EXCLUDED.#{content},
                #{metadata} = EXCLUDED.#{metadata}
          SQL
        end

        def build_insert_sql
          <<~SQL
            INSERT INTO #{quoted(table_name)} (#{quoted(embedding_column_name)}, #{quoted(content_column_name)}, #{quoted(metadata_column_name)})
            VALUES ($1, $2, $3)
          SQL
        end
      end
    end
  end
end
@@ -0,0 +1,144 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'pg'
4
+ require 'json'
5
+
module Boxcars
  module VectorStore
    module Pgvector
      # Nearest-neighbour search against a PostgreSQL table with a pgvector
      # embedding column, ordered by the `<->` distance operator.
      class Search
        include VectorStore

        # @param params [Hash] required shape:
        #   {
        #     vector_documents: {
        #       type: :pgvector,
        #       vector_store: {
        #         database_url: database_url,
        #         table_name: table_name,
        #         embedding_column_name: embedding_column_name,
        #         content_column_name: content_column_name,
        #         metadata_column_name: metadata_column_name # optional
        #       }
        #     }
        #   }
        def initialize(params)
          vector_store = validate_params(params)
          db_url = validate_vector_store(vector_store)
          @db_connection = test_db(db_url)
        end

        # @param query_vector [Array<Numeric>] embedding to search with
        # @param count [Integer] maximum number of neighbours to return
        # @return [Array<Hash>] hashes of the form { document:, distance: }
        def call(query_vector:, count: 1)
          raise ::Boxcars::ArgumentError, 'query_vector is empty' if query_vector.empty?

          search(query_vector, count)
        end

        private

        attr_reader :vector_documents, :vector_store, :db_connection,
                    :table_name, :embedding_column_name, :content_column_name,
                    :metadata_column_name

        # Validates the outer hash and remembers the nested vector_store hash.
        def validate_params(params)
          @vector_documents = params[:vector_documents]

          raise_argument_error('vector_documents is nil') unless vector_documents
          raise_argument_error('vector_documents must be a hash') unless vector_documents.is_a?(Hash)
          raise_argument_error('type must be pgvector') unless vector_documents[:type] == :pgvector

          @vector_store = vector_documents[:vector_store]
        end

        # @return [String] the database url taken from the vector_store hash
        def validate_vector_store(vector_store)
          raise_argument_error('vector_store is nil') unless vector_store
          raise_argument_error('vector_store must be a hash') unless vector_store.is_a?(Hash)
          raise_argument_error('vector_store must have a table_name') unless vector_store[:table_name]
          raise_argument_error('vector_store must have a embedding_column_name') unless vector_store[:embedding_column_name]
          raise_argument_error('vector_store must have a content_column_name') unless vector_store[:content_column_name]
          raise_argument_error('missing DATABASE_URL') unless vector_store[:database_url]

          vector_store[:database_url]
        end

        # Opens the connection and verifies extension, table and columns.
        #
        # @return [PG::Connection]
        def test_db(db_url)
          conn = ::PG::Connection.new(db_url)

          check_db_connection(conn)
          check_vector_extension(conn)
          check_table_exists(conn, vector_store[:table_name])
          check_column_exists(conn)

          @table_name = vector_store[:table_name]
          @embedding_column_name = vector_store[:embedding_column_name]
          @content_column_name = vector_store[:content_column_name]
          @metadata_column_name = vector_store[:metadata_column_name]

          verified = true
          conn
        rescue ::PG::Error, NameError => e
          raise_argument_error(e.message)
        ensure
          # Do not leak the socket when any of the checks above failed.
          conn.close if conn && !verified && !conn.finished?
        end

        def check_db_connection(conn)
          return if conn.status == ::PG::CONNECTION_OK

          raise_argument_error("PostgreSQL connection is not ok")
        end

        def check_vector_extension(conn)
          return if conn.exec("SELECT 1 FROM pg_extension WHERE extname = 'vector'").any?

          raise_argument_error("PostgreSQL 'vector' extension is not installed")
        end

        def check_table_exists(conn, table_name)
          table_exists = conn.exec_params(
            "SELECT EXISTS (SELECT 1 FROM information_schema.tables WHERE table_name = $1)", [table_name]
          ).getvalue(0, 0) == "t"
          return if table_exists

          raise_argument_error("Table '#{table_name}' does not exist")
        end

        def check_column_exists(conn)
          column_names = %i[embedding_column_name content_column_name]
          table_name = vector_store[:table_name]

          column_names.each do |target|
            column_name = vector_store[target]
            column_exists = conn.exec_params(
              "SELECT EXISTS (SELECT 1 FROM information_schema.columns WHERE table_name = $1 AND column_name = $2)",
              [table_name, column_name]
            ).getvalue(0, 0) == "t"
            next if column_exists

            raise_argument_error("Column '#{column_name}' does not exist in table '#{table_name}'")
          end
        end

        # Runs the distance query and converts each row into a result hash.
        # Any failure (SQL, JSON parsing, bad count) is re-raised through
        # raise_argument_error.
        def search(query_vector, num_neighbors)
          # Identifiers cannot be bound as parameters, so quote them with the driver.
          embedding_ident = db_connection.quote_ident(embedding_column_name.to_s)
          table_ident = db_connection.quote_ident(table_name.to_s)
          sql = <<~SQL
            SELECT *, #{embedding_ident} <-> $1 AS distance FROM #{table_ident}
            ORDER BY #{embedding_ident} <-> $1
            LIMIT #{Integer(num_neighbors)}
          SQL

          rows = db_connection.exec_params(sql, [query_vector.to_s]).to_a
          rows.map { |row| build_result(row.transform_keys(&:to_sym)) }
        rescue StandardError => e
          raise_argument_error("Error searching for #{query_vector}: #{e.message}")
        end

        # Builds one { document:, distance: } hash, reading the columns the
        # caller configured rather than assuming they are called
        # content / embedding / metadata.
        def build_result(row)
          {
            document: Boxcars::VectorStore::Document.new(
              content: row[content_column_name.to_sym],
              embedding: JSON.parse(row[embedding_column_name.to_sym]),
              metadata: JSON.parse(metadata_value(row), symbolize_names: true)
            ),
            distance: row[:distance].to_f
          }
        end

        # Falls back to the historical :metadata key when no metadata column was
        # configured or the configured one is absent from the row.
        def metadata_value(row)
          key = (metadata_column_name || :metadata).to_sym
          row.key?(key) ? row[key] : row[:metadata]
        end
      end
    end
  end
end
@@ -6,14 +6,11 @@ module Boxcars
6
6
  class SplitText
7
7
  include VectorStore
8
8
 
9
- attr_reader :separator, :chunk_size, :chunk_overlap, :text
10
-
11
9
  # @param separator [String] The string to use to split the text.
12
10
  # @param chunk_size [Integer] The size of each chunk.
13
11
  # @param chunk_overlap [Integer] The amount of overlap between chunks.
14
12
  # @param text [String] The text to split.
15
13
  def initialize(separator: "Search", chunk_size: 7, chunk_overlap: 3, text: "")
16
- # require 'debugger'; debugger
17
14
  validate_params(separator, chunk_size, chunk_overlap, text)
18
15
 
19
16
  @separator = separator
@@ -31,6 +28,8 @@ module Boxcars
31
28
 
32
29
  private
33
30
 
31
+ attr_reader :separator, :chunk_size, :chunk_overlap, :text
32
+
34
33
  def validate_params(separator, chunk_size, chunk_overlap, text)
35
34
  raise_error("separator must be a string") unless separator.is_a?(String)
36
35
  raise_error("chunk_size must be an integer") unless chunk_size.is_a?(Integer)
@@ -13,10 +13,72 @@ module Boxcars
13
13
 
14
14
  def self.included(base)
15
15
  base.extend(ClassMethods)
16
+ end
17
+
18
+ private
19
+
20
+ attr_reader :embedding_tool
21
+
# Embeds the given texts with the configured embedding tool.
# When no tool has been chosen, :openai is used.
#
# @param texts [Array] the texts handed to the embedding class
# @return [Array<Hash>] the embedding class's results with symbolized keys
def generate_vectors(texts)
  @embedding_tool = embedding_tool || :openai

  embedder = embeddings_method
  raw_vectors = embedder[:klass].call(texts: texts, client: embedder[:client])
  raw_vectors.map { |vector| vector.transform_keys(&:to_sym) }
end
31
+
# Maps the selected embedding tool to the class that produces embeddings and
# the client that class should be called with.
#
# @return [Hash] { klass:, client: }
# @raise [Boxcars::ArgumentError] when @embedding_tool is not a supported tool
#   (previously this returned nil and the caller failed with NoMethodError)
def embeddings_method
  case @embedding_tool
  when :openai
    { klass: Boxcars::VectorStore::EmbedViaOpenAI, client: openai_client }
  when :tensorflow
    { klass: Boxcars::VectorStore::EmbedViaTensorflow, client: nil }
  else
    raise_argument_error("unsupported embedding tool: #{@embedding_tool.inspect}")
  end
end
40
+
# Returns the memoized OpenAI client, creating it on first use via
# Openai.open_ai_client. The token is only used when the client is first built.
#
# @param openai_access_token [String, nil] the OpenAI access token
# @return [OpenAI::Client] (per the original annotation; the value is whatever
#   Openai.open_ai_client returns)
def openai_client(openai_access_token: nil)
  return @openai_client if @openai_client

  @openai_client = Openai.open_ai_client(openai_access_token: openai_access_token)
end
47
+
# Single exit point for invalid-argument failures in vector store classes.
#
# @param message [String] text carried by the raised error
# @raise [Boxcars::ArgumentError] always
def raise_argument_error(message)
  raise(::Boxcars::ArgumentError, message)
end
51
+
# Reads a JSON file and parses it with symbolized keys.
#
# @param file_path [String, nil] path of the JSON file
# @return [Object] the parsed JSON, or an empty array when file_path is nil
# @raise [Boxcars::ArgumentError] (via raise_argument_error) on malformed JSON
def parse_json_file(file_path)
  if file_path.nil?
    []
  else
    JSON.parse(File.read(file_path), symbolize_names: true)
  end
rescue JSON::ParserError => e
  raise_argument_error("Error parsing #{file_path}: #{e.message}")
end
60
+
# Reads every regular file matched by a glob pattern.
#
# Directories matched by the pattern (e.g. with "**/*") are skipped, since
# File.read cannot read them.
#
# NOTE(review): the original called `raise_error`, which is not defined in the
# visible part of this module; `raise_argument_error` is the helper this
# module provides — confirm no includer relied on its own `raise_error`.
#
# @param training_data_path [String] glob pattern of the files to load
# @return [Array<String>] the content of each matched file
# @raise [Boxcars::ArgumentError] (via raise_argument_error) when nothing matches
def load_data_files(training_data_path)
  files = Dir.glob(training_data_path).select { |path| File.file?(path) }
  raise_argument_error("No files found at #{training_data_path}") if files.empty?

  data = files.map { |file| File.read(file) }
  puts "Added #{files.length} files to data. Splitting text into chunks..."
  data
end
16
72
 
17
- class << base
18
- private :new
# Splits every loaded text on newlines via SplitText and gathers all resulting
# chunks into one flat array.
#
# Relies on the including class to provide `split_chunk_size`.
#
# @param data [Array<String>] the texts to split
# @return [Array] all chunks returned by SplitText, in input order
def split_text_into_chunks(data)
  data.each_with_object([]) do |text, chunks|
    pieces = Boxcars::VectorStore::SplitText.call(
      separator: "\n", chunk_size: split_chunk_size, chunk_overlap: 0, text: text
    )
    chunks.concat(pieces)
  end
end
21
83
  end
22
84
  end
@@ -25,10 +87,14 @@ require_relative "vector_store/document"
25
87
  require_relative "vector_store/embed_via_open_ai"
26
88
  require_relative "vector_store/embed_via_tensorflow"
27
89
  require_relative "vector_store/split_text"
28
- require_relative "vector_store/similarity_search"
29
- require_relative "vector_store/hnswlib/hnswlib_config"
90
+ require_relative "vector_store/hnswlib/load_from_disk"
30
91
  require_relative "vector_store/hnswlib/save_to_hnswlib"
31
- require_relative "vector_store/hnswlib/build_vector_store"
32
- require_relative "vector_store/hnswlib/hnswlib_search"
33
- require_relative "vector_store/in_memory/add_documents"
92
+ require_relative "vector_store/hnswlib/build_from_files"
93
+ require_relative "vector_store/hnswlib/search"
94
+ require_relative "vector_store/in_memory/build_from_files"
95
+ require_relative "vector_store/in_memory/build_from_document_array"
34
96
  require_relative "vector_store/in_memory/search"
97
+ require_relative "vector_store/pgvector/build_from_files"
98
+ require_relative "vector_store/pgvector/build_from_array"
99
+ require_relative "vector_store/pgvector/save_to_database"
100
+ require_relative "vector_store/pgvector/search"
@@ -2,5 +2,5 @@
2
2
 
3
3
  module Boxcars
4
4
  # The current version of the gem.
5
- VERSION = "0.2.11"
5
+ VERSION = "0.2.12"
6
6
  end
data/lib/boxcars.rb CHANGED
@@ -58,7 +58,7 @@ module Boxcars
58
58
  # override with kwargs if present
59
59
  kwargs[key]
60
60
  elsif (provided_val = instance_variable_get("@#{key}"))
61
- # use saved value if present. Set using Boxcars::configuration.the_key = "abcde"
61
+ # use saved value if present. Set using Boxcars.configuration.the_key = "abcde"
62
62
  provided_val
63
63
  else
64
64
  # otherwise, dig out of the environment
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: boxcars
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.11
4
+ version: 0.2.12
5
5
  platform: ruby
6
6
  authors:
7
7
  - Francis Sullivan
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: exe
11
11
  cert_chain: []
12
- date: 2023-05-05 00:00:00.000000000 Z
12
+ date: 2023-05-22 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: debug
@@ -87,14 +87,14 @@ dependencies:
87
87
  requirements:
88
88
  - - "~>"
89
89
  - !ruby/object:Gem::Version
90
- version: '3.0'
90
+ version: '4.0'
91
91
  type: :runtime
92
92
  prerelease: false
93
93
  version_requirements: !ruby/object:Gem::Requirement
94
94
  requirements:
95
95
  - - "~>"
96
96
  - !ruby/object:Gem::Version
97
- version: '3.0'
97
+ version: '4.0'
98
98
  description: You simply set an OpenAI key, give a number of Boxcars to a Train, and
99
99
  magic ensues when you run it.
100
100
  email:
@@ -144,13 +144,17 @@ files:
144
144
  - lib/boxcars/vector_store/document.rb
145
145
  - lib/boxcars/vector_store/embed_via_open_ai.rb
146
146
  - lib/boxcars/vector_store/embed_via_tensorflow.rb
147
- - lib/boxcars/vector_store/hnswlib/build_vector_store.rb
148
- - lib/boxcars/vector_store/hnswlib/hnswlib_config.rb
149
- - lib/boxcars/vector_store/hnswlib/hnswlib_search.rb
147
+ - lib/boxcars/vector_store/hnswlib/build_from_files.rb
148
+ - lib/boxcars/vector_store/hnswlib/load_from_disk.rb
150
149
  - lib/boxcars/vector_store/hnswlib/save_to_hnswlib.rb
151
- - lib/boxcars/vector_store/in_memory/add_documents.rb
150
+ - lib/boxcars/vector_store/hnswlib/search.rb
151
+ - lib/boxcars/vector_store/in_memory/build_from_document_array.rb
152
+ - lib/boxcars/vector_store/in_memory/build_from_files.rb
152
153
  - lib/boxcars/vector_store/in_memory/search.rb
153
- - lib/boxcars/vector_store/similarity_search.rb
154
+ - lib/boxcars/vector_store/pgvector/build_from_array.rb
155
+ - lib/boxcars/vector_store/pgvector/build_from_files.rb
156
+ - lib/boxcars/vector_store/pgvector/save_to_database.rb
157
+ - lib/boxcars/vector_store/pgvector/search.rb
154
158
  - lib/boxcars/vector_store/split_text.rb
155
159
  - lib/boxcars/version.rb
156
160
  homepage: https://github.com/BoxcarsAI/boxcars
@@ -169,7 +173,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
169
173
  requirements:
170
174
  - - ">="
171
175
  - !ruby/object:Gem::Version
172
- version: 2.6.0
176
+ version: '3.0'
173
177
  required_rubygems_version: !ruby/object:Gem::Requirement
174
178
  requirements:
175
179
  - - ">="
@@ -1,157 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'fileutils'
4
- require 'hnswlib'
5
- require 'json'
6
-
module Boxcars
  module VectorStore
    module Hnswlib
      # Builds the vector store used by the hnswlib similarity search: loads the
      # training files, splits them into chunks, generates embeddings, saves the
      # index through SaveToHnswlib and finally loads the saved index back into
      # memory together with the chunked documents.
      class BuildVectorStore
        include VectorStore

        # @param training_data_path [String] path to the training data; may be a glob pattern
        # @param index_file_path [String] path of the index file
        # @param split_chunk_size [Integer] chunk_size handed to SplitText (default 2000)
        # @param json_doc_file_path [String, nil] JSON file holding the document text;
        #   when nil it is derived from index_file_path
        # @param force_rebuild [Boolean] rebuild the index even if it already exists
        def initialize(
          training_data_path:,
          index_file_path:,
          split_chunk_size: 2000,
          json_doc_file_path: nil,
          force_rebuild: true
        )
          @training_data_path = training_data_path
          @index_file_path = index_file_path
          @split_chunk_size = split_chunk_size
          @json_doc_file_path = json_doc_file_path || default_json_doc_path(index_file_path)
          @force_rebuild = force_rebuild
        end

        # @return [Hash] { vector_store: <loaded index>, document_embeddings: <parsed JSON docs> }
        def call
          validate_params
          data = load_files
          documents = split_text_into_chunks(data)
          embeddings_with_config = generate_embeddings(documents)
          save_vector_store(embeddings_with_config)
          load_hnsw
        end

        private

        attr_reader :training_data_path, :index_file_path, :split_chunk_size, :json_doc_file_path, :force_rebuild

        # "x.bin" becomes "x.json". Any other name gets ".json" appended so the
        # documents file can never be the same path as the index file.
        def default_json_doc_path(index_path)
          return index_path.sub(/\.bin\z/, '.json') if index_path.end_with?('.bin')

          "#{index_path}.json"
        end

        # The hnswlib config file lives next to the index file.
        def config_file_path
          "#{File.dirname(index_file_path)}/hnswlib_config.json"
        end

        def validate_params
          training_data_dir = File.dirname(training_data_path.gsub(/\*{1,2}/, ''))
          raise_error('training_data_path parent directory must exist') unless File.directory?(training_data_dir)
          raise_error('No files found at the training_data_path pattern') if Dir.glob(training_data_path).empty?

          index_dir = File.dirname(index_file_path)
          raise_error('index_file_path parent directory must exist') unless File.directory?(index_dir)

          raise_error('split_chunk_size must be an integer') unless split_chunk_size.is_a?(Integer)
        end

        # @return [Array<String>] the content of every file matched by training_data_path
        def load_files
          files = Dir.glob(training_data_path)
          raise_error "No files found at #{training_data_path}" if files.empty?

          data = files.map { |file| File.read(file) }
          puts "Added #{files.length} files to data. Splitting text into chunks..."
          data
        end

        # Returns true (instead of chunks) when no rebuild is needed; the later
        # steps perform the same check and skip their work as well.
        def split_text_into_chunks(data)
          return true unless rebuild_required?

          data.each_with_object([]) do |chunk, docs|
            doc_output = Boxcars::VectorStore::SplitText.call(
              separator: "\n", chunk_size: split_chunk_size, chunk_overlap: 0, text: chunk
            )
            docs.concat(doc_output)
          end
        end

        # A rebuild is needed when it is forced or when either the index file or
        # its config file is missing.
        def rebuild_required?
          return true unless File.exist?(index_file_path)
          return true unless File.exist?(config_file_path)

          force_rebuild ? true : false
        end

        def generate_embeddings(documents)
          return true unless rebuild_required?

          puts "Initializing Store..."
          openai_client = Openai.open_ai_client
          embeddings_with_dim = Boxcars::VectorStore::EmbedViaOpenAI.call(texts: documents, client: openai_client)
          document_embeddings = embeddings_with_dim.map.with_index do |item, index|
            { doc_id: index, embedding: item[:embedding], document: documents[index] }
          end

          { document_embeddings: document_embeddings, dim: embeddings_with_dim.first[:dim] }
        end

        def save_vector_store(embeddings_with_config)
          return true unless rebuild_required?

          puts "Saving Vectorstore"
          Boxcars::VectorStore::Hnswlib::SaveToHnswlib.call(
            document_embeddings: embeddings_with_config[:document_embeddings],
            index_file_path: index_file_path,
            json_doc_file_path: json_doc_file_path,
            hnswlib_config: hnswlib_config(embeddings_with_config[:dim])
          )
          puts "VectorStore saved"
        end

        def hnswlib_config(dim)
          # dim: length of datum point vector that will be indexed.
          Boxcars::VectorStore::Hnswlib::HnswlibConfig.new(
            metric: "l2", max_item: 10000, dim: dim
          )
        end

        def load_hnsw
          puts "Loading Hnswlib"

          json_config = parse_json_file(config_file_path)
          document_embeddings = parse_json_file(json_doc_file_path)

          search_index = ::Hnswlib::HierarchicalNSW.new(space: json_config[:metric], dim: json_config[:dim])
          search_index.load_index(index_file_path)

          { vector_store: search_index, document_embeddings: document_embeddings }
        end

        # @return [Object] parsed JSON with symbolized keys, or [] when file_path is nil
        def parse_json_file(file_path)
          return [] if file_path.nil?

          file_content = File.read(file_path)
          JSON.parse(file_content, symbolize_names: true)
        rescue JSON::ParserError => e
          # Name the file that actually failed; this method parses both the
          # config file and the documents file.
          raise_error("Error parsing #{file_path}: #{e.message}")
        end

        def raise_error(message)
          raise ::Boxcars::Error, message
        end
      end
    end
  end
end