boxcars 0.2.11 → 0.2.12

Sign up to get free protection for your applications and to get access to all the features.
Files changed (35) hide show
  1. checksums.yaml +4 -4
  2. data/.env_sample +1 -0
  3. data/.rubocop.yml +16 -0
  4. data/CHANGELOG.md +12 -0
  5. data/Gemfile +12 -12
  6. data/Gemfile.lock +34 -28
  7. data/README.md +4 -1
  8. data/boxcars.gemspec +2 -2
  9. data/lib/boxcars/boxcar/active_record.rb +1 -1
  10. data/lib/boxcars/boxcar.rb +1 -0
  11. data/lib/boxcars/engine/openai.rb +8 -1
  12. data/lib/boxcars/vector_search.rb +66 -2
  13. data/lib/boxcars/vector_store/document.rb +3 -2
  14. data/lib/boxcars/vector_store/embed_via_open_ai.rb +2 -2
  15. data/lib/boxcars/vector_store/hnswlib/build_from_files.rb +100 -0
  16. data/lib/boxcars/vector_store/hnswlib/load_from_disk.rb +57 -0
  17. data/lib/boxcars/vector_store/hnswlib/save_to_hnswlib.rb +48 -38
  18. data/lib/boxcars/vector_store/hnswlib/search.rb +70 -0
  19. data/lib/boxcars/vector_store/in_memory/build_from_document_array.rb +51 -0
  20. data/lib/boxcars/vector_store/in_memory/build_from_files.rb +61 -0
  21. data/lib/boxcars/vector_store/in_memory/search.rb +29 -49
  22. data/lib/boxcars/vector_store/pgvector/build_from_array.rb +95 -0
  23. data/lib/boxcars/vector_store/pgvector/build_from_files.rb +97 -0
  24. data/lib/boxcars/vector_store/pgvector/save_to_database.rb +152 -0
  25. data/lib/boxcars/vector_store/pgvector/search.rb +144 -0
  26. data/lib/boxcars/vector_store/split_text.rb +2 -3
  27. data/lib/boxcars/vector_store.rb +73 -7
  28. data/lib/boxcars/version.rb +1 -1
  29. data/lib/boxcars.rb +1 -1
  30. metadata +14 -10
  31. data/lib/boxcars/vector_store/hnswlib/build_vector_store.rb +0 -157
  32. data/lib/boxcars/vector_store/hnswlib/hnswlib_config.rb +0 -56
  33. data/lib/boxcars/vector_store/hnswlib/hnswlib_search.rb +0 -54
  34. data/lib/boxcars/vector_store/in_memory/add_documents.rb +0 -67
  35. data/lib/boxcars/vector_store/similarity_search.rb +0 -55
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: da3d8b9838602151837c0cc5bb9f3cae841ba24d1c338eade82c7807d913d4bb
4
- data.tar.gz: f7be434c18f0ff2c95625fe32fae25f3a5df265331425d0c1f0430ab75761578
3
+ metadata.gz: 69712266f9506d71ed3ad1fdcbfeef5a389bdbb7157d88e3f703f9a9b3ad8323
4
+ data.tar.gz: d7a5d0796d2963b737dc018c644042fe1e744ec7bd230f581367baf84df60f76
5
5
  SHA512:
6
- metadata.gz: 57dd238c56f13f63a4665d4469efdabfa5f3c54f82cb6832c4158858d4b307a80c57f619633cdad6934d64186d560dcab7a62efa9adc727edfa61afbc5acc188
7
- data.tar.gz: d2c782acf20c6b6b13cbfadf8f5406363b347be90a058626ec1bb21fe32baf1acb57a4a72c4770a7ad820700b465c0474a498080604e23e5d0270001d5d4aec1
6
+ metadata.gz: 85876f5e1053bb8100795020c33da778a06668f9e3be856a8689d90d13728cef73e437ee6d5c0888b4a5483f698ee8288c061573a93fdff93559080e525c4254
7
+ data.tar.gz: 99e15b3fe0c5d5277c5ed123e5569bca1f1ddfca3a1b3ec054504b855bc7a005d6eb9a8f7ba71989d16ded297959fa09cddf7b31879ff37df78df5dfb21b3240
data/.env_sample CHANGED
@@ -1,2 +1,3 @@
1
1
  openai_access_token: ''
2
2
  serpapi_api_key: ''
3
+ DATABASE_URL: 'postgres://postgres:postgres@localhost:5432/boxcars_test'
data/.rubocop.yml CHANGED
@@ -3,6 +3,7 @@ require:
3
3
  - rubocop-rake
4
4
 
5
5
  AllCops:
6
+ TargetRubyVersion: 3
6
7
  Exclude:
7
8
  - 'bin/{rails,rake}'
8
9
  - 'node_modules/**/*'
@@ -152,3 +153,18 @@ Style/SlicingWithRange:
152
153
 
153
154
  Bundler/OrderedGems:
154
155
  Enabled: false
156
+
157
+ RSpec/MultipleMemoizedHelpers:
158
+ Enabled: false
159
+
160
+ RSpec/PendingWithoutReason:
161
+ Enabled: false
162
+
163
+ RSpec/NestedGroups:
164
+ Enabled: false
165
+
166
+ RSpec/ExampleLength:
167
+ Enabled: false
168
+
169
+ RSpec/MultipleExpectations:
170
+ Enabled: false
data/CHANGELOG.md CHANGED
@@ -1,5 +1,17 @@
1
1
  # Changelog
2
2
 
3
+ ## [v0.2.11](https://github.com/BoxcarsAI/boxcars/tree/v0.2.11) (2023-05-05)
4
+
5
+ [Full Changelog](https://github.com/BoxcarsAI/boxcars/compare/v0.2.10...v0.2.11)
6
+
7
+ **Closed issues:**
8
+
9
+ - Chore: move vector store to top level [\#67](https://github.com/BoxcarsAI/boxcars/issues/67)
10
+
11
+ **Merged pull requests:**
12
+
13
+ - Move vector store [\#69](https://github.com/BoxcarsAI/boxcars/pull/69) ([francis](https://github.com/francis))
14
+
3
15
  ## [v0.2.10](https://github.com/BoxcarsAI/boxcars/tree/v0.2.10) (2023-05-05)
4
16
 
5
17
  [Full Changelog](https://github.com/BoxcarsAI/boxcars/compare/v0.2.9...v0.2.10)
data/Gemfile CHANGED
@@ -7,18 +7,6 @@ gemspec
7
7
 
8
8
  gem "rake", "~> 13.0"
9
9
 
10
- gem "rspec", "~> 3.2"
11
-
12
- gem "rubocop", "~> 1.21"
13
-
14
- gem "vcr", "~> 6.1.0"
15
-
16
- gem "webmock", "~> 3.18.1"
17
-
18
- gem "rubocop-rake", "~> 0.6.0"
19
-
20
- gem "rubocop-rspec", "~> 2.17"
21
-
22
10
  gem "sqlite3", "~> 1.6"
23
11
 
24
12
  gem "activerecord", "~> 7.0"
@@ -32,3 +20,15 @@ gem "activesupport", "~> 7.0"
32
20
  gem "rest-client", "~> 2.1"
33
21
 
34
22
  gem "hnswlib", "~> 0.8.1"
23
+
24
+ gem "pg", "~> 1.5", ">= 1.5.3"
25
+ gem "pgvector", "~> 0.2.0"
26
+
27
+ group :development, :test do
28
+ gem "rspec", "~> 3.2"
29
+ gem "rubocop", "~> 1.21"
30
+ gem "vcr", "~> 6.1.0"
31
+ gem "webmock", "~> 3.18.1"
32
+ gem "rubocop-rake", "~> 0.6.0"
33
+ gem "rubocop-rspec", "~> 2.17"
34
+ end
data/Gemfile.lock CHANGED
@@ -1,10 +1,10 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- boxcars (0.2.11)
4
+ boxcars (0.2.12)
5
5
  google_search_results (~> 2.2)
6
6
  gpt4all (~> 0.0.4)
7
- ruby-openai (~> 3.0)
7
+ ruby-openai (~> 4.0)
8
8
 
9
9
  GEM
10
10
  remote: https://rubygems.org/
@@ -19,7 +19,7 @@ GEM
19
19
  i18n (>= 1.6, < 2)
20
20
  minitest (>= 5.1)
21
21
  tzinfo (~> 2.0)
22
- addressable (2.8.1)
22
+ addressable (2.8.4)
23
23
  public_suffix (>= 2.0.2, < 6.0)
24
24
  ast (2.4.2)
25
25
  async (1.31.0)
@@ -34,7 +34,7 @@ GEM
34
34
  protocol-http1 (~> 0.15.0)
35
35
  protocol-http2 (~> 0.15.0)
36
36
  traces (>= 0.8.0)
37
- async-http-faraday (0.11.0)
37
+ async-http-faraday (0.12.0)
38
38
  async-http (~> 0.42)
39
39
  faraday
40
40
  async-io (1.34.3)
@@ -46,7 +46,7 @@ GEM
46
46
  fiber-local
47
47
  crack (0.4.5)
48
48
  rexml
49
- debug (1.7.2)
49
+ debug (1.8.0)
50
50
  irb (>= 1.5.0)
51
51
  reline (>= 0.3.1)
52
52
  diff-lcs (1.5.0)
@@ -56,8 +56,10 @@ GEM
56
56
  faraday (2.7.4)
57
57
  faraday-net_http (>= 2.0, < 3.1)
58
58
  ruby2_keywords (>= 0.0.4)
59
- faraday-http-cache (2.4.1)
59
+ faraday-http-cache (2.5.0)
60
60
  faraday (>= 0.8)
61
+ faraday-multipart (1.0.4)
62
+ multipart-post (~> 2)
61
63
  faraday-net_http (3.0.2)
62
64
  faraday-retry (2.1.0)
63
65
  faraday (~> 2.0)
@@ -81,35 +83,33 @@ GEM
81
83
  http-accept (1.7.0)
82
84
  http-cookie (1.0.5)
83
85
  domain_name (~> 0.5)
84
- httparty (0.21.0)
85
- mini_mime (>= 1.0.0)
86
- multi_xml (>= 0.5.2)
87
- i18n (1.12.0)
86
+ i18n (1.13.0)
88
87
  concurrent-ruby (~> 1.0)
89
88
  io-console (0.6.0)
90
89
  io-console (0.6.0-java)
91
- irb (1.6.3)
90
+ irb (1.6.4)
92
91
  reline (>= 0.3.0)
93
92
  json (2.6.3)
94
93
  json (2.6.3-java)
95
94
  mime-types (3.4.1)
96
95
  mime-types-data (~> 3.2015)
97
96
  mime-types-data (3.2023.0218.1)
98
- mini_mime (1.1.2)
99
- mini_portile2 (2.8.1)
97
+ mini_portile2 (2.8.2)
100
98
  minitest (5.18.0)
101
99
  multi_json (1.15.0)
102
- multi_xml (0.6.0)
100
+ multipart-post (2.3.0)
103
101
  netrc (0.11.0)
104
- nio4r (2.5.8)
105
- nio4r (2.5.8-java)
102
+ nio4r (2.5.9)
103
+ nio4r (2.5.9-java)
106
104
  octokit (4.25.1)
107
105
  faraday (>= 1, < 3)
108
106
  sawyer (~> 0.9)
109
107
  os (1.1.4)
110
- parallel (1.22.1)
111
- parser (3.2.1.1)
108
+ parallel (1.23.0)
109
+ parser (3.2.2.1)
112
110
  ast (~> 2.4.1)
111
+ pg (1.5.3)
112
+ pgvector (0.2.0)
113
113
  protocol-hpack (1.4.2)
114
114
  protocol-http (0.24.1)
115
115
  protocol-http1 (0.15.0)
@@ -120,7 +120,7 @@ GEM
120
120
  public_suffix (5.0.1)
121
121
  rainbow (3.1.1)
122
122
  rake (13.0.6)
123
- regexp_parser (2.7.0)
123
+ regexp_parser (2.8.0)
124
124
  reline (0.3.3)
125
125
  io-console (~> 0.5)
126
126
  rest-client (2.1.0)
@@ -133,36 +133,40 @@ GEM
133
133
  rspec-core (~> 3.12.0)
134
134
  rspec-expectations (~> 3.12.0)
135
135
  rspec-mocks (~> 3.12.0)
136
- rspec-core (3.12.1)
136
+ rspec-core (3.12.2)
137
137
  rspec-support (~> 3.12.0)
138
- rspec-expectations (3.12.2)
138
+ rspec-expectations (3.12.3)
139
139
  diff-lcs (>= 1.2.0, < 2.0)
140
140
  rspec-support (~> 3.12.0)
141
141
  rspec-mocks (3.12.5)
142
142
  diff-lcs (>= 1.2.0, < 2.0)
143
143
  rspec-support (~> 3.12.0)
144
144
  rspec-support (3.12.0)
145
- rubocop (1.48.1)
145
+ rubocop (1.50.2)
146
146
  json (~> 2.3)
147
147
  parallel (~> 1.10)
148
148
  parser (>= 3.2.0.0)
149
149
  rainbow (>= 2.2.2, < 4.0)
150
150
  regexp_parser (>= 1.8, < 3.0)
151
151
  rexml (>= 3.2.5, < 4.0)
152
- rubocop-ast (>= 1.26.0, < 2.0)
152
+ rubocop-ast (>= 1.28.0, < 2.0)
153
153
  ruby-progressbar (~> 1.7)
154
154
  unicode-display_width (>= 2.4.0, < 3.0)
155
- rubocop-ast (1.28.0)
155
+ rubocop-ast (1.28.1)
156
156
  parser (>= 3.2.1.0)
157
- rubocop-capybara (2.17.1)
157
+ rubocop-capybara (2.18.0)
158
158
  rubocop (~> 1.41)
159
+ rubocop-factory_bot (2.22.0)
160
+ rubocop (~> 1.33)
159
161
  rubocop-rake (0.6.0)
160
162
  rubocop (~> 1.0)
161
- rubocop-rspec (2.19.0)
163
+ rubocop-rspec (2.22.0)
162
164
  rubocop (~> 1.33)
163
165
  rubocop-capybara (~> 2.17)
164
- ruby-openai (3.7.0)
165
- httparty (>= 0.18.1)
166
+ rubocop-factory_bot (~> 2.22)
167
+ ruby-openai (4.0.0)
168
+ faraday (>= 1)
169
+ faraday-multipart (>= 1)
166
170
  ruby-progressbar (1.13.0)
167
171
  ruby2_keywords (0.0.5)
168
172
  sawyer (0.9.2)
@@ -212,6 +216,8 @@ DEPENDENCIES
212
216
  faraday-retry (~> 2.0)
213
217
  github_changelog_generator (~> 1.16)
214
218
  hnswlib (~> 0.8.1)
219
+ pg (~> 1.5, >= 1.5.3)
220
+ pgvector (~> 0.2.0)
215
221
  rake (~> 13.0)
216
222
  rest-client (~> 2.1)
217
223
  rspec (~> 3.2)
data/README.md CHANGED
@@ -21,6 +21,7 @@ All of these concepts are in a module named Boxcars:
21
21
  - Train - Given a list of Boxcars and optionally an Engine, a Train breaks down a problem into pieces for individual Boxcars to solve. The individual results are then combined until a final answer is found. ZeroShot is the only current implementation of Train (but we are adding more soon), and you can either construct it directly or use `Boxcars::train` when you want to build a Train.
22
22
  - Prompt - used by an Engine to generate text results. Our Boxcars have built-in prompts, but you have the flexibility to change or augment them if you so desire.
23
23
  - Engine - an entity that generates text from a Prompt. OpenAI's LLM text generator is the default Engine if no other is specified, and you can override the default engine if so desired (`Boxcar.configuration.default_engine`).
24
+ - VectorStore - a place to store and query vectors.
24
25
 
25
26
  ## Security
26
27
  Currently, our system is designed for individuals who already possess administrative privileges for their project. It is likely possible to manipulate the system's prompts to carry out malicious actions, but if you already have administrative access, you can perform such actions without requiring boxcars in the first place.
@@ -132,7 +133,9 @@ Next Actions:
132
133
  ### More Examples
133
134
  See [this](https://github.com/BoxcarsAI/boxcars/blob/main/notebooks/boxcars_examples.ipynb) Jupyter Notebook for more examples.
134
135
 
135
- For the new Swagger boxcar, see [this](https://github.com/BoxcarsAI/boxcars/blob/main/notebooks/swagger_examples.ipynb) Jupyter Notebook.
136
+ For the Swagger boxcar, see [this](https://github.com/BoxcarsAI/boxcars/blob/main/notebooks/swagger_examples.ipynb) Jupyter Notebook.
137
+
138
+ For simple vector storage and search, see [this](https://github.com/BoxcarsAI/boxcars/blob/main/notebooks/vector_store_examples.ipynb) Jupyter Notebook.
136
139
 
137
140
  Note, some folks that we talked to didn't know that you could run Ruby Jupyter notebooks. [You can](https://github.com/SciRuby/iruby).
138
141
 
data/boxcars.gemspec CHANGED
@@ -12,7 +12,7 @@ Gem::Specification.new do |spec|
12
12
  spec.description = "You simply set an OpenAI key, give a number of Boxcars to a Train, and magic ensues when you run it."
13
13
  spec.homepage = "https://github.com/BoxcarsAI/boxcars"
14
14
  spec.license = "MIT"
15
- spec.required_ruby_version = ">= 2.6.0"
15
+ spec.required_ruby_version = ">= 3.0"
16
16
 
17
17
  spec.metadata["homepage_uri"] = spec.homepage
18
18
  spec.metadata["source_code_uri"] = spec.homepage
@@ -38,7 +38,7 @@ Gem::Specification.new do |spec|
38
38
  # runtime dependencies
39
39
  spec.add_dependency "google_search_results", "~> 2.2"
40
40
  spec.add_dependency "gpt4all", "~> 0.0.4"
41
- spec.add_dependency "ruby-openai", "~> 3.0"
41
+ spec.add_dependency "ruby-openai", "~> 4.0"
42
42
 
43
43
  # For more information and examples about making a new gem, checkout our
44
44
  # guide at: https://bundler.io/guides/creating_gem.html
@@ -161,7 +161,7 @@ module Boxcars
161
161
  begin
162
162
  return true unless changes&.positive?
163
163
  rescue StandardError => e
164
- Boscar.error "Error while computing change count: #{e.message}", :red
164
+ Boxcars.error "Error while computing change count: #{e.message}", :red
165
165
  end
166
166
 
167
167
  Boxcars.debug "#{name}(Pending Changes): #{changes}", :yellow
@@ -156,4 +156,5 @@ require "boxcars/boxcar/wikipedia_search"
156
156
  require "boxcars/boxcar/sql"
157
157
  require "boxcars/boxcar/swagger"
158
158
  require "boxcars/boxcar/active_record"
159
+ require "boxcars/vector_store"
159
160
  require "boxcars/vector_search"
@@ -43,6 +43,10 @@ module Boxcars
43
43
  ::OpenAI::Client.new(access_token: access_token, organization_id: organization_id)
44
44
  end
45
45
 
46
+ def conversation_model?(model)
47
+ ["gpt-3.5-turbo", "gpt-4"].include?(model)
48
+ end
49
+
46
50
  # Get an answer from the engine.
47
51
  # @param prompt [String] The prompt to use when asking the engine.
48
52
  # @param openai_access_token [String] The access token to use when asking the engine.
@@ -51,7 +55,7 @@ module Boxcars
51
55
  def client(prompt:, inputs: {}, openai_access_token: nil, **kwargs)
52
56
  clnt = Openai.open_ai_client(openai_access_token: openai_access_token)
53
57
  params = open_ai_params.merge(kwargs)
54
- if params[:model] == "gpt-3.5-turbo"
58
+ if conversation_model?(params[:model])
55
59
  prompt = prompt.first if prompt.is_a?(Array)
56
60
  params = prompt.as_messages(inputs).merge(params)
57
61
  if Boxcars.configuration.log_prompts
@@ -71,6 +75,9 @@ module Boxcars
71
75
  def run(question, **kwargs)
72
76
  prompt = Prompt.new(template: question)
73
77
  response = client(prompt: prompt, **kwargs)
78
+ raise Error, "OpenAI: No response from API" unless response
79
+ raise Error, "OpenAI: #{response['error']}" if response["error"]
80
+
74
81
  answer = response["choices"].map { |c| c.dig("message", "content") || c["text"] }.join("\n").strip
75
82
  puts answer
76
83
  answer
@@ -3,8 +3,72 @@
3
3
  # Boxcars is a framework for running a series of tools to get an answer to a question.
4
4
  module Boxcars
5
5
  # Runs a similarity search for a text query against a vector store (hnswlib, in-memory, or pgvector).
6
- class VectorSearch < Boxcar
7
- Error = Class.new(StandardError)
6
+ class VectorSearch
7
+ def initialize(params)
8
+ @vector_documents = params[:vector_documents]
9
+ @embedding_tool = params[:embedding_tool] || :openai
10
+ @vector_search_instance = vector_search_instance
11
+ @openai_connection = params[:openai_connection] || default_connection(openai_access_token: openai_access_token)
12
+ end
13
+
14
+ def call(query:, count: 1)
15
+ validate_query(query)
16
+ query_vector = convert_query_to_vector(query)
17
+ @vector_search_instance.call(query_vector: query_vector, count: count)
18
+ end
19
+
20
+ private
21
+
22
+ attr_reader :vector_documents, :embedding_tool, :openai_connection
23
+
24
+ def vector_search_instance
25
+ case vector_documents[:type]
26
+ when :hnswlib
27
+ Boxcars::VectorStore::Hnswlib::Search.new(
28
+ vector_documents: vector_documents
29
+ )
30
+ when :in_memory
31
+ Boxcars::VectorStore::InMemory::Search.new(
32
+ vector_documents: vector_documents
33
+ )
34
+ when :pgvector
35
+ Boxcars::VectorStore::Pgvector::Search.new(
36
+ vector_documents: vector_documents
37
+ )
38
+ else
39
+ raise_argument_error('Unsupported vector store provided')
40
+ end
41
+ end
42
+
43
+ def default_connection(openai_access_token: nil)
44
+ Openai.open_ai_client(openai_access_token: openai_access_token)
45
+ end
46
+
47
+ def validate_query(query)
48
+ raise_argument_error('query must be a string') unless query.is_a?(String)
49
+ raise_argument_error('query must not be empty') if query.empty?
50
+ end
51
+
52
+ def convert_query_to_vector(query)
53
+ tool = embeddings_method(embedding_tool)
54
+ res = tool[:klass].call(
55
+ texts: [query], client: tool[:client]
56
+ ).first
57
+ res[:embedding]
58
+ end
59
+
60
+ def embeddings_method(embedding_tool)
61
+ case embedding_tool
62
+ when :openai
63
+ { klass: Boxcars::VectorStore::EmbedViaOpenAI, client: openai_connection }
64
+ when :tensorflow
65
+ { klass: Boxcars::VectorStore::EmbedViaTensorflow, client: nil }
66
+ end
67
+ end
68
+
69
+ def raise_argument_error(message)
70
+ raise ::Boxcars::ArgumentError, message
71
+ end
8
72
  end
9
73
  end
10
74
 
@@ -3,10 +3,11 @@
3
3
  module Boxcars
4
4
  module VectorStore
5
5
  class Document
6
- attr_accessor :page_content, :metadata
6
+ attr_accessor :content, :metadata, :embedding
7
7
 
8
8
  def initialize(fields = {})
9
- @page_content = fields[:page_content] || ""
9
+ @content = fields[:content] || ""
10
+ @embedding = fields[:embedding] || []
10
11
  @metadata = fields[:metadata] || {}
11
12
  end
12
13
  end
@@ -7,8 +7,6 @@ module Boxcars
7
7
  class EmbedViaOpenAI
8
8
  include VectorStore
9
9
 
10
- attr_accessor :texts, :client, :model
11
-
12
10
  def initialize(texts:, client:, model: 'text-embedding-ada-002')
13
11
  validate_params(texts, client)
14
12
  @texts = texts
@@ -28,6 +26,8 @@ module Boxcars
28
26
 
29
27
  private
30
28
 
29
+ attr_accessor :texts, :client, :model
30
+
31
31
  def validate_params(texts, client)
32
32
  raise_error 'texts must be an array of strings' unless texts.is_a?(Array) && texts.all? { |text| text.is_a?(String) }
33
33
  raise_error 'openai_connection must be an OpenAI::Client' unless client.is_a?(OpenAI::Client)
@@ -0,0 +1,100 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'fileutils'
4
+ require 'hnswlib'
5
+ require 'json'
6
+
7
+ module Boxcars
8
+ module VectorStore
9
+ module Hnswlib
10
+ # This class is responsible for building the vector store for the hnswlib similarity search.
11
+ # It will load the training data, generate the embeddings, and save the vector store.
12
+ # It will also load the vector store into memory.
13
+ # For later use, it will save the split documents with index numbers to a JSON file.
14
+ class BuildFromFiles
15
+ include VectorStore
16
+
17
+ def initialize(params)
18
+ @split_chunk_size = params[:split_chunk_size] || 2000
19
+ @training_data_path = File.absolute_path(params[:training_data_path])
20
+ @index_file_path = File.absolute_path(params[:index_file_path])
21
+
22
+ validate_params(@training_data_path, @index_file_path, split_chunk_size)
23
+
24
+ @json_doc_file_path = absolute_json_doc_file_path(@index_file_path, params[:json_doc_file_path])
25
+ @force_rebuild = params[:force_rebuild] || true
26
+ @hnsw_vectors = []
27
+ end
28
+
29
+ def call
30
+ if !force_rebuild && File.exist?(index_file_path)
31
+ load_existing_vector_store
32
+ else
33
+ puts "Building Hnswlib vector store..."
34
+ data = load_data_files(training_data_path)
35
+ texts = split_text_into_chunks(data)
36
+ vectors = generate_vectors(texts)
37
+ add_vectors(vectors, texts)
38
+ save_vector_store
39
+
40
+ {
41
+ type: :hnswlib,
42
+ vector_store: hnsw_vectors
43
+ }
44
+ end
45
+ end
46
+
47
+ private
48
+
49
+ attr_reader :training_data_path, :index_file_path, :split_chunk_size, :json_doc_file_path, :force_rebuild, :hnsw_vectors
50
+
51
+ def validate_params(training_data_path, index_file_path, split_chunk_size)
52
+ training_data_dir = File.dirname(training_data_path.gsub(/\*{1,2}/, ''))
53
+
54
+ raise_argument_error('training_data_path parent directory must exist') unless File.directory?(training_data_dir)
55
+ raise_argument_error('No files found at the training_data_path pattern') if Dir.glob(training_data_path).empty?
56
+
57
+ index_dir = File.dirname(index_file_path)
58
+
59
+ raise_argument_error('index_file_path parent directory must exist') unless File.directory?(index_dir)
60
+ raise_argument_error('split_chunk_size must be an integer') unless split_chunk_size.is_a?(Integer)
61
+ end
62
+
63
+ def absolute_json_doc_file_path(index_file_path, json_doc_file_path)
64
+ return index_file_path.gsub(/\.bin$/, '.json') unless json_doc_file_path
65
+
66
+ File.absolute_path(json_doc_file_path)
67
+ end
68
+
69
+ def add_vectors(vectors, texts)
70
+ vectors.map.with_index do |vector, index|
71
+ hnsw_vector = Document.new(
72
+ content: texts[index],
73
+ embedding: vector[:embedding],
74
+ metadata: {
75
+ doc_id: index,
76
+ dim: vector[:dim],
77
+ metric: 'l2',
78
+ max_item: 10000,
79
+ index_file_path: index_file_path,
80
+ json_doc_file_path: json_doc_file_path
81
+ }
82
+ )
83
+ hnsw_vectors << hnsw_vector
84
+ end
85
+ end
86
+
87
+ def save_vector_store
88
+ Boxcars::VectorStore::Hnswlib::SaveToHnswlib.call(hnsw_vectors)
89
+ end
90
+
91
+ def load_existing_vector_store
92
+ Boxcars::VectorStore::Hnswlib::LoadFromDisk.call(
93
+ index_file_path: index_file_path,
94
+ json_doc_file_path: json_doc_file_path
95
+ )
96
+ end
97
+ end
98
+ end
99
+ end
100
+ end
@@ -0,0 +1,57 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'fileutils'
4
+ require 'hnswlib'
5
+ require 'json'
6
+
7
+ module Boxcars
8
+ module VectorStore
9
+ module Hnswlib
10
+ class LoadFromDisk
11
+ include VectorStore
12
+
13
+ def initialize(params)
14
+ validate_params(params[:index_file_path], params[:json_doc_file_path])
15
+
16
+ @index_file_path = File.absolute_path(params[:index_file_path])
17
+ @json_doc_file_path = File.absolute_path(params[:json_doc_file_path])
18
+ end
19
+
20
+ def call
21
+ vectors = parse_json_file(json_doc_file_path)
22
+ hnsw_vectors = load_as_hnsw_vectors(vectors)
23
+
24
+ {
25
+ type: :hnswlib,
26
+ vector_store: hnsw_vectors
27
+ }
28
+ end
29
+
30
+ private
31
+
32
+ attr_reader :index_file_path, :json_doc_file_path
33
+
34
+ def validate_params(index_file_path, json_doc_file_path)
35
+ raise_argument_error("index_file_path must be a string") unless index_file_path.is_a?(String)
36
+ raise_argument_error("json_doc_file_path must be a string") unless json_doc_file_path.is_a?(String)
37
+
38
+ raise_argument_error("index_file_path must exist") unless File.exist?(index_file_path)
39
+ raise_argument_error("json_doc_file_path must exist") unless File.exist?(json_doc_file_path)
40
+ end
41
+
42
+ def load_as_hnsw_vectors(vectors)
43
+ hnsw_vectors = []
44
+ vectors.each do |vector|
45
+ hnsw_vector = Document.new(
46
+ content: vector[:document],
47
+ embedding: vector[:embedding],
48
+ metadata: vector[:metadata]
49
+ )
50
+ hnsw_vectors[vectors.first[:doc_id].to_i] = hnsw_vector
51
+ end
52
+ hnsw_vectors
53
+ end
54
+ end
55
+ end
56
+ end
57
+ end