boxcars 0.2.5 → 0.2.8

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: a3593a9df2d9d8a867729e0b6081b300125933566923ca905e2eefb16a933394
4
- data.tar.gz: 90c03ea9b328b8cff10f828f8b9bf4375cce0fd574dd8a26c690776e58a58c1b
3
+ metadata.gz: 69b70e1d02b1ec206438eaaf857a0495fe35ab01e64a265656fe21230675306f
4
+ data.tar.gz: 8681b9625a0684f1091eea7a4626964929b271370068c90b82dabeee4253d803
5
5
  SHA512:
6
- metadata.gz: 5de41be1f154b2c21fcd6602159a6428d8702dc318904e6e03db9de1b0ed1788b03aa10365203557b39fe268f04b4594cf6915faa941b2c64e475cd6cbb55d09
7
- data.tar.gz: 2ad84e5f416b19759807d658d739d1abb8405ba6ded3b38bdf5ce9406efeadccbc2102b7b477a90df67beaa287a9810b09e59b5843090fbe4619b03500d0a4f4
6
+ metadata.gz: eb5c0c00f8fcdbbd6d8a1999d7544fc584c701fdf9a8a9c271fff6d9795f75ef9cab058fee2c6829808a764c892cc3e2f4e4a8717155d34d6514b46d744e632c
7
+ data.tar.gz: b8fb4ad34d7b93d47388f037d1d93e9e7245303740bc04d58d21942112ff97315e5dd31fdba77e275b52ddba85ca1055b897646ae1e606daee485583d50c52a6
data/CHANGELOG.md CHANGED
@@ -1,5 +1,44 @@
1
1
  # Changelog
2
2
 
3
+ ## [Unreleased](https://github.com/BoxcarsAI/boxcars/tree/HEAD)
4
+
5
+ [Full Changelog](https://github.com/BoxcarsAI/boxcars/compare/v0.2.7...HEAD)
6
+
7
+ **Closed issues:**
8
+
9
+ - Getting the same verbosity as in the examples [\#54](https://github.com/BoxcarsAI/boxcars/issues/54)
10
+
11
+ **Merged pull requests:**
12
+
13
+ - Add Engine for Gpt4all [\#55](https://github.com/BoxcarsAI/boxcars/pull/55) ([francis](https://github.com/francis))
14
+ - update google search to return URL for result if present [\#53](https://github.com/BoxcarsAI/boxcars/pull/53) ([francis](https://github.com/francis))
15
+ - Draft: added gpt4all [\#49](https://github.com/BoxcarsAI/boxcars/pull/49) ([jaigouk](https://github.com/jaigouk))
16
+ - Embeddings with hnswlib [\#48](https://github.com/BoxcarsAI/boxcars/pull/48) ([jaigouk](https://github.com/jaigouk))
17
+
18
+ ## [v0.2.7](https://github.com/BoxcarsAI/boxcars/tree/v0.2.7) (2023-04-13)
19
+
20
+ [Full Changelog](https://github.com/BoxcarsAI/boxcars/compare/v0.2.5...v0.2.7)
21
+
22
+ **Closed issues:**
23
+
24
+ - The class name in the sample code of BoxCar-Google-Search wiki has not been changed. [\#50](https://github.com/BoxcarsAI/boxcars/issues/50)
25
+
26
+ **Merged pull requests:**
27
+
28
+ - Add Swagger Boxcar [\#51](https://github.com/BoxcarsAI/boxcars/pull/51) ([francis](https://github.com/francis))
29
+ - Boxcars::SQL tables and except\_tables [\#47](https://github.com/BoxcarsAI/boxcars/pull/47) ([arihh](https://github.com/arihh))
30
+ - ActiveRecord updates and new Wikipedia Search boxcar [\#46](https://github.com/BoxcarsAI/boxcars/pull/46) ([francis](https://github.com/francis))
31
+ - Fix README.md log\_prompts settings [\#45](https://github.com/BoxcarsAI/boxcars/pull/45) ([arihh](https://github.com/arihh))
32
+ - Update README.md to use the GoogleSearch Boxcar [\#44](https://github.com/BoxcarsAI/boxcars/pull/44) ([stockandawe](https://github.com/stockandawe))
33
+
34
+ ## [v0.2.5](https://github.com/BoxcarsAI/boxcars/tree/v0.2.5) (2023-03-30)
35
+
36
+ [Full Changelog](https://github.com/BoxcarsAI/boxcars/compare/v0.2.4...v0.2.5)
37
+
38
+ **Merged pull requests:**
39
+
40
+ - switch to safe level 4 for eval, and rerun tests [\#43](https://github.com/BoxcarsAI/boxcars/pull/43) ([francis](https://github.com/francis))
41
+
3
42
  ## [v0.2.4](https://github.com/BoxcarsAI/boxcars/tree/v0.2.4) (2023-03-28)
4
43
 
5
44
  [Full Changelog](https://github.com/BoxcarsAI/boxcars/compare/v0.2.3...v0.2.4)
data/Gemfile CHANGED
@@ -26,3 +26,9 @@ gem "activerecord", "~> 7.0"
26
26
  gem "github_changelog_generator", "~> 1.16"
27
27
 
28
28
  gem "faraday-retry", "~> 2.0"
29
+
30
+ gem "activesupport", "~> 7.0"
31
+
32
+ gem "rest-client", "~> 2.1"
33
+
34
+ gem "hnswlib", "~> 0.8.1"
data/Gemfile.lock CHANGED
@@ -1,8 +1,9 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- boxcars (0.2.5)
4
+ boxcars (0.2.8)
5
5
  google_search_results (~> 2.2)
6
+ gpt4all (~> 0.0.4)
6
7
  ruby-openai (~> 3.0)
7
8
 
8
9
  GEM
@@ -49,6 +50,8 @@ GEM
49
50
  irb (>= 1.5.0)
50
51
  reline (>= 0.3.1)
51
52
  diff-lcs (1.5.0)
53
+ domain_name (0.5.20190701)
54
+ unf (>= 0.0.5, < 1.0.0)
52
55
  dotenv (2.8.1)
53
56
  faraday (2.7.4)
54
57
  faraday-net_http (>= 2.0, < 3.1)
@@ -69,7 +72,15 @@ GEM
69
72
  rainbow (>= 2.2.1)
70
73
  rake (>= 10.0)
71
74
  google_search_results (2.2.0)
75
+ gpt4all (0.0.5)
76
+ faraday (~> 2.7)
77
+ os (~> 1.1)
78
+ tty-progressbar (~> 0.18.2)
72
79
  hashdiff (1.0.1)
80
+ hnswlib (0.8.1)
81
+ http-accept (1.7.0)
82
+ http-cookie (1.0.5)
83
+ domain_name (~> 0.5)
73
84
  httparty (0.21.0)
74
85
  mini_mime (>= 1.0.0)
75
86
  multi_xml (>= 0.5.2)
@@ -81,16 +92,21 @@ GEM
81
92
  reline (>= 0.3.0)
82
93
  json (2.6.3)
83
94
  json (2.6.3-java)
95
+ mime-types (3.4.1)
96
+ mime-types-data (~> 3.2015)
97
+ mime-types-data (3.2023.0218.1)
84
98
  mini_mime (1.1.2)
85
99
  mini_portile2 (2.8.1)
86
100
  minitest (5.18.0)
87
101
  multi_json (1.15.0)
88
102
  multi_xml (0.6.0)
103
+ netrc (0.11.0)
89
104
  nio4r (2.5.8)
90
105
  nio4r (2.5.8-java)
91
106
  octokit (4.25.1)
92
107
  faraday (>= 1, < 3)
93
108
  sawyer (~> 0.9)
109
+ os (1.1.4)
94
110
  parallel (1.22.1)
95
111
  parser (3.2.1.1)
96
112
  ast (~> 2.4.1)
@@ -107,6 +123,11 @@ GEM
107
123
  regexp_parser (2.7.0)
108
124
  reline (0.3.3)
109
125
  io-console (~> 0.5)
126
+ rest-client (2.1.0)
127
+ http-accept (>= 1.7.0, < 2.0)
128
+ http-cookie (>= 1.0.2, < 2.0)
129
+ mime-types (>= 1.16, < 4.0)
130
+ netrc (~> 0.8)
110
131
  rexml (3.2.5)
111
132
  rspec (3.12.0)
112
133
  rspec-core (~> 3.12.0)
@@ -149,12 +170,25 @@ GEM
149
170
  faraday (>= 0.17.3, < 3)
150
171
  sqlite3 (1.6.2)
151
172
  mini_portile2 (~> 2.8.0)
173
+ sqlite3 (1.6.2-arm64-darwin)
152
174
  sqlite3 (1.6.2-x86_64-darwin)
153
175
  sqlite3 (1.6.2-x86_64-linux)
176
+ strings-ansi (0.2.0)
154
177
  timers (4.3.5)
155
178
  traces (0.9.1)
179
+ tty-cursor (0.7.1)
180
+ tty-progressbar (0.18.2)
181
+ strings-ansi (~> 0.2)
182
+ tty-cursor (~> 0.7)
183
+ tty-screen (~> 0.8)
184
+ unicode-display_width (>= 1.6, < 3.0)
185
+ tty-screen (0.8.1)
156
186
  tzinfo (2.0.6)
157
187
  concurrent-ruby (~> 1.0)
188
+ unf (0.1.4)
189
+ unf_ext
190
+ unf (0.1.4-java)
191
+ unf_ext (0.0.8.2)
158
192
  unicode-display_width (2.4.2)
159
193
  vcr (6.1.0)
160
194
  webmock (3.18.1)
@@ -163,6 +197,7 @@ GEM
163
197
  hashdiff (>= 0.4.0, < 2.0.0)
164
198
 
165
199
  PLATFORMS
200
+ arm64-darwin-22
166
201
  universal-java-11
167
202
  x86_64-darwin-21
168
203
  x86_64-darwin-22
@@ -170,12 +205,15 @@ PLATFORMS
170
205
 
171
206
  DEPENDENCIES
172
207
  activerecord (~> 7.0)
208
+ activesupport (~> 7.0)
173
209
  boxcars!
174
210
  debug (~> 1.1)
175
211
  dotenv (~> 2.8)
176
212
  faraday-retry (~> 2.0)
177
213
  github_changelog_generator (~> 1.16)
214
+ hnswlib (~> 0.8.1)
178
215
  rake (~> 13.0)
216
+ rest-client (~> 2.1)
179
217
  rspec (~> 3.2)
180
218
  rubocop (~> 1.21)
181
219
  rubocop-rake (~> 0.6.0)
data/README.md CHANGED
@@ -2,7 +2,6 @@
2
2
 
3
3
  <h4 align="center">
4
4
  <a href="https://www.boxcars.ai">Website</a> |
5
- <a href="https://www.boxcars.ai/roadmap">Roadmap</a> |
6
5
  <a href="https://www.boxcars.ai/blog">Blog</a> |
7
6
  <a href="https://github.com/BoxcarsAI/boxcars/wiki">Documentation</a>
8
7
  </h4>
@@ -18,10 +17,10 @@ This gem was inspired by the popular Python library Langchain. However, we wante
18
17
  ## Concepts
19
18
  All of these concepts are in a module named Boxcars:
20
19
 
21
- - Boxcar - an encapsulation that performs something of interest (such as search, math, SQL or an Active Record Query). A Boxcar can use an Engine (described below) to do its work.
20
+ - Boxcar - an encapsulation that performs something of interest (such as search, math, SQL, an Active Record Query, or an API call to a service). A Boxcar can use an Engine (described below) to do its work; if one is needed but not specified, the default Engine (`Boxcars.engine`) is used.
22
21
  - Train - Given a list of Boxcars and optionally an Engine, a Train breaks down a problem into pieces for individual Boxcars to solve. The individual results are then combined until a final answer is found. ZeroShot is the only current implementation of Train (but we are adding more soon), and you can either construct it directly or use `Boxcars::train` when you want to build a Train.
23
- - Prompt - used by an Engine to generate text results. Most of the Boxcars have built-in prompts, so you only need to worry about these if you are extending the system.
24
- - Engine - an entity that generates text from a Prompt. OpenAI's LLM text generator is the default Engine if no other is specified.
22
+ - Prompt - used by an Engine to generate text results. Our Boxcars have built-in prompts, but you have the flexibility to change or augment them if you so desire.
23
+ - Engine - an entity that generates text from a Prompt. OpenAI's LLM text generator is the default Engine if no other is specified, and you can override the default engine if so desired (`Boxcars.configuration.default_engine`).
25
24
 
26
25
  ## Security
27
26
  Currently, our system is designed for individuals who already possess administrative privileges for their project. It is likely possible to manipulate the system's prompts to carry out malicious actions, but if you already have administrative access, you can perform such actions without requiring boxcars in the first place.
@@ -90,14 +89,16 @@ You can change the default_engine with `Boxcars::configuration.default_engine =
90
89
 
91
90
  Here is what we have so far, but please put up a PR with your new ideas.
92
91
  - GoogleSearch: uses the SERP API to do searches
92
+ - WikipediaSearch: uses the Wikipedia API to do searches
93
93
  - Calculator: uses an Engine to generate ruby code to do math
94
94
  - SQL: given an ActiveRecord connection, it will generate and run SQL statements from a prompt.
95
95
  - ActiveRecord: given an ActiveRecord connection, it will generate and run ActiveRecord statements from a prompt.
96
+ - Swagger: given a Swagger (OpenAPI) file (YAML or JSON), it answers questions about, or runs requests against, the referenced service. See [here](https://github.com/BoxcarsAI/boxcars/blob/main/notebooks/swagger_examples.ipynb) for examples.
96
97
 
97
98
  ### Run a list of Boxcars
98
99
  ```ruby
99
100
  # run a Train for a calculator, and search using default Engine
100
- boxcars = [Boxcars::Calculator.new, Boxcars::Serp.new]
101
+ boxcars = [Boxcars::Calculator.new, Boxcars::GoogleSearch.new]
101
102
  train = Boxcars.train.new(boxcars: boxcars)
102
103
  train.run "What is pi times the square root of the average temperature in Austin TX in January?"
103
104
  ```
@@ -105,38 +106,40 @@ Produces:
105
106
  ```text
106
107
  > Entering Zero Shot#run
107
108
  What is pi times the square root of the average temperature in Austin TX in January?
108
- Thought: We need to find the average temperature in Austin TX in January and then multiply it by pi and the square root of that value. We can use a search engine to find the average temperature and a calculator to perform the multiplication.
109
+ Thought: We need to find the average temperature in Austin TX in January and then multiply it by pi and the square root of the average temperature. We can use a search engine to find the average temperature in Austin TX in January and a calculator to perform the multiplication.
109
110
  Question: Average temperature in Austin TX in January
110
- Answer: increase from 62°F to 64°F
111
- Observation: increase from 62°F to 64°F
112
- Thought: The average temperature in Austin TX in January is around 63°F.
111
+ Answer: January Weather in Austin Texas, United States. Daily high temperatures increase by 2°F, from 62°F to 64°F, rarely falling below 45°F or exceeding 76° ...
112
+ Observation: January Weather in Austin Texas, United States. Daily high temperatures increase by 2°F, from 62°F to 64°F, rarely falling below 45°F or exceeding 76° ...
113
+ Thought: We have found the average temperature in Austin TX in January, which is 64°F. Now we can use a calculator to perform the multiplication.
113
114
  > Entering Calculator#run
114
- pi * sqrt(63)
115
- RubyREPL: puts(Math::PI * Math.sqrt(63))
116
- Answer: 24.935618646198247
115
+ pi * sqrt(64)
116
+ RubyREPL: puts(Math::PI * Math.sqrt(64))
117
+ Answer: 25.132741228718345
117
118
 
118
- {"status":"ok","answer":"24.935618646198247","explanation":"Answer: 24.935618646198247","code":"puts(Math::PI * Math.sqrt(63))"}
119
+ {"status":"ok","answer":"25.132741228718345","explanation":"Answer: 25.132741228718345","code":"puts(Math::PI * Math.sqrt(64))"}
119
120
  < Exiting Calculator#run
120
- Observation: 24.935618646198247
121
- The result of pi times the square root of the average temperature in Austin TX in January is approximately 24.94.
121
+ Observation: 25.132741228718345
122
+ We have the final answer.
122
123
 
123
- Final Answer: 24.94
124
+ Final Answer: 25.132741228718345
124
125
 
125
126
  Next Actions:
126
127
  1. What is the average temperature in Austin TX in July?
127
- 2. What is the formula for calculating the area of a circle?
128
- 3. What is the value of pi to 10 decimal places?
128
+ 2. What is the value of pi to 10 decimal places?
129
+ 3. What is the square root of the average temperature in Miami FL in January?
129
130
  < Exiting Zero Shot#run
130
131
  ```
131
132
  ### More Examples
132
133
  See [this](https://github.com/BoxcarsAI/boxcars/blob/main/notebooks/boxcars_examples.ipynb) Jupyter Notebook for more examples.
133
134
 
135
+ For the new Swagger boxcar, see [this](https://github.com/BoxcarsAI/boxcars/blob/main/notebooks/swagger_examples.ipynb) Jupyter Notebook.
136
+
134
137
  Note, some folks that we talked to didn't know that you could run Ruby Jupyter notebooks. [You can](https://github.com/SciRuby/iruby).
135
138
 
136
139
  ### Logging
137
140
  If you use this in a Rails application, or configure `Boxcars.configuration.logger = your_logger`, logging will go to your log file.
138
141
 
139
- Also, if you set this flag: `Boxcars.configuration.lop_prompts = true`
142
+ Also, if you set this flag: `Boxcars.configuration.log_prompts = true`
140
143
  The actual prompts handed to the connected Engine will be logged. This is off by default because it is very wordy, but handy if you are debugging prompts.
141
144
 
142
145
  Otherwise, we print to standard out.
data/boxcars.gemspec CHANGED
@@ -37,6 +37,7 @@ Gem::Specification.new do |spec|
37
37
 
38
38
  # runtime dependencies
39
39
  spec.add_dependency "google_search_results", "~> 2.2"
40
+ spec.add_dependency "gpt4all", "~> 0.0.4"
40
41
  spec.add_dependency "ruby-openai", "~> 3.0"
41
42
 
42
43
  # For more information and examples about making a new gem, checkout our
@@ -3,6 +3,7 @@
3
3
  # Boxcars is a framework for running a series of tools to get an answer to a question.
4
4
  module Boxcars
5
5
  # A Boxcar that interprets a prompt and executes SQL code to get answers
6
+ # rubocop:disable Metrics/ClassLength
6
7
  class ActiveRecord < EngineBoxcar
7
8
  # the description of this engine boxcar
8
9
  ARDESC = "useful for when you need to query a database for an application named %<name>s."
@@ -21,7 +22,7 @@ module Boxcars
21
22
  @approval_callback = approval_callback
22
23
  @read_only = read_only.nil? ? !approval_callback : read_only
23
24
  @code_only = kwargs.delete(:code_only) || false
24
- kwargs[:name] ||= "Data"
25
+ kwargs[:name] ||= get_name
25
26
  kwargs[:description] ||= format(ARDESC, name: name)
26
27
  kwargs[:prompt] ||= my_prompt
27
28
  super(**kwargs)
@@ -34,6 +35,13 @@ module Boxcars
34
35
 
35
36
  private
36
37
 
38
# Derive a default boxcar name from the host Rails application, when there is one.
#
# Any failure while asking Rails for the name is logged and swallowed so that
# constructing the boxcar never fails just because the name could not be found.
#
# @return [String, nil] the name of the Rails application's parent module, or nil
#   when Rails is not loaded or the lookup raised.
def get_name
  return unless defined?(Rails)

  Rails.application.class.module_parent.name
rescue StandardError => e
  # Must go through the Boxcars module logger; a lowercase `boxcars` receiver is
  # undefined here and would raise NameError from inside this handler.
  Boxcars.error "Error getting rails name application name: #{e.message}", :red
  nil
end
44
+
37
45
  def read_only?
38
46
  read_only
39
47
  end
@@ -44,6 +52,7 @@ module Boxcars
44
52
 
45
53
  def check_models(models, exceptions)
46
54
  if models.is_a?(Array) && models.length.positive?
55
+ models.map { |m| m.is_a?(Class) ? m : m.constantize }
47
56
  @requested_models = models
48
57
  models.each do |m|
49
58
  raise ArgumentError, "model #{m} needs to be an Active Record model" unless m.ancestors.include?(::ActiveRecord::Base)
@@ -119,7 +128,10 @@ module Boxcars
119
128
  # @return [Object] The result of the code
120
129
  def eval_safe_wrapper(code)
121
130
  # if the code used ActiveRecord, we need to add :: in front of it to escape the module
122
- new_code = code.gsub(/(\W)ActiveRecord::/, '\1::ActiveRecord::')
131
+ new_code = code.gsub(/\b(ActiveRecord::)/, '::\1')
132
+
133
+ # sometimes the code will have a puts or print in it, which will miss. Remove them.
134
+ new_code = new_code.gsub(/\b(puts|print)\b/, '')
123
135
  proc do
124
136
  $SAFE = 4
125
137
  # rubocop:disable Security/Eval
@@ -146,7 +158,11 @@ module Boxcars
146
158
  def approved?(changes_code, code)
147
159
  # find out how many changes there are
148
160
  changes = change_count(changes_code)
149
- return true unless changes&.positive?
161
+ begin
162
+ return true unless changes&.positive?
163
+ rescue StandardError => e
164
+ Boscar.error "Error while computing change count: #{e.message}", :red
165
+ end
150
166
 
151
167
  Boxcars.debug "#{name}(Pending Changes): #{changes}", :yellow
152
168
  if read_only?
@@ -242,7 +258,7 @@ module Boxcars
242
258
  "Pay attention to use only the attribute names that you can see in the model description.\n",
243
259
  "Do not make up variable or attribute names, and do not share variables between the code in ARChanges and ARCode\n",
244
260
  "Be careful to not query for attributes that do not exist, and to use the format specified above.\n",
245
- "Finally, do not use print or puts in your code."
261
+ "Finally, try not to use print or puts in your code"
246
262
  ),
247
263
  user("Question: %<question>s")
248
264
  ].freeze
@@ -257,4 +273,5 @@ module Boxcars
257
273
  output_variables: [:answer])
258
274
  end
259
275
  end
276
+ # rubocop:enable Metrics/ClassLength
260
277
  end
@@ -0,0 +1,14 @@
1
+ # frozen_string_literal: true
2
+
3
module Boxcars
  module Embeddings
    # A piece of text together with a hash of metadata describing it.
    class Document
      attr_accessor :page_content, :metadata

      # @param fields [Hash] may carry :page_content and :metadata entries; a missing
      #   (or nil) entry falls back to an empty string / empty hash respectively.
      def initialize(fields = {})
        content = fields[:page_content]
        meta = fields[:metadata]

        @page_content = content || ""
        @metadata = meta || {}
      end
    end
  end
end
@@ -0,0 +1,50 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'openai'
4
+
5
module Boxcars
  module Embeddings
    # Generates one embedding vector per input text through an OpenAI client.
    #
    # NOTE(review): other code invokes this class as `EmbedViaOpenAI.call(...)`; no
    # class-level `call` is defined here, so it presumably comes from the included
    # Embeddings module -- confirm.
    class EmbedViaOpenAI
      include Embeddings

      attr_accessor :texts, :openai_connection, :model

      # @param texts [Array<String>] the texts to embed
      # @param openai_connection [OpenAI::Client] client used to request embeddings
      # @param model [String] name of the embedding model to request
      # @raise [Boxcars::ValueError] when texts is not an array of strings or the
      #   connection is not an OpenAI::Client
      def initialize(texts:, openai_connection:, model: 'text-embedding-ada-002')
        validate_params(texts, openai_connection)
        @texts = texts
        @openai_connection = openai_connection
        @model = model
      end

      # Request an embedding for every text, in order.
      #
      # @return [Array<Hash>] one `{ embedding:, dim: }` hash per text, where `dim`
      #   is the length of the returned embedding
      # @raise [Boxcars::ValueError] when a response carries no embedding
      def call
        texts.map do |text|
          embedding = embedding_with_retry(model: model, input: strip_new_lines(text))
          { embedding: embedding, dim: embedding.size }
        end
      end

      private

      def validate_params(texts, openai_connection)
        raise_error 'texts must be an array of strings' unless texts.is_a?(Array) && texts.all? { |text| text.is_a?(String) }
        raise_error 'openai_connection must be an OpenAI::Client' unless openai_connection.is_a?(OpenAI::Client)
      end

      # Performs a single embeddings request (despite the name, no retry is attempted)
      # and extracts the first embedding from the response.
      #
      # @param request [Hash] parameters forwarded to the client (:model and :input)
      # @return [Array] the embedding found under data[0]['embedding']
      # @raise [Boxcars::ValueError] when the response has no embedding, including
      #   the API's error message when one is present
      def embedding_with_retry(request)
        response = openai_connection.embeddings(parameters: request)
        first_item = Array(response['data']).first
        embedding = first_item && first_item['embedding']
        return embedding if embedding

        # An error payload has no 'data'; surface its message instead of failing
        # later with a NoMethodError on nil.
        error = response['error']
        detail = error.is_a?(Hash) ? error['message'] : error
        raise_error "OpenAI embeddings request failed: #{detail || 'no embedding returned'}"
      end

      # Newlines are replaced with spaces before the text is sent for embedding.
      def strip_new_lines(text)
        text.tr("\n", ' ')
      end

      def raise_error(message)
        raise ::Boxcars::ValueError, message
      end
    end
  end
end
@@ -0,0 +1,159 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'fileutils'
4
+ require 'hnswlib'
5
+ require 'json'
6
+
7
module Boxcars
  module Embeddings
    module Hnswlib
      # Builds (or reuses) the vector store used for hnswlib similarity search.
      #
      # When a rebuild is required, `call` reads the training files, splits them into
      # chunks, generates an embedding per chunk and hands the result to SaveToHnswlib.
      # In every case it then loads the saved index and the saved document JSON back
      # into memory and returns them.
      class BuildVectorStore
        include Embeddings

        # @param training_data_path [String] path to the training data; may be a glob pattern.
        # @param index_file_path [String] path of the hnswlib index file.
        # @param split_chunk_size [Integer] chunk size handed to SplitText. Default 2000.
        # @param json_doc_file_path [String, nil] JSON file holding the document text. When nil
        #   it is derived from the index file name (`.bin` replaced by `.json`, otherwise
        #   `.json` appended so it can never collide with the index file itself).
        # @param force_rebuild [Boolean] when true (the default) the index is rebuilt even if
        #   it already exists.
        def initialize(
          training_data_path:,
          index_file_path:,
          split_chunk_size: 2000,
          json_doc_file_path: nil,
          force_rebuild: true
        )
          @training_data_path = training_data_path
          @index_file_path = index_file_path
          @split_chunk_size = split_chunk_size
          @json_doc_file_path = json_doc_file_path || default_json_doc_file_path(index_file_path)
          @force_rebuild = force_rebuild
        end

        # Validate the configuration, rebuild the store if needed, then load it.
        #
        # @return [Hash] `{ vector_store:, document_embeddings: }` -- the loaded hnswlib index
        #   and the parsed contents of the JSON document file.
        # @raise [Boxcars::Error] on invalid parameters, when no text chunks are produced, or
        #   when a saved JSON file cannot be parsed.
        def call
          validate_params
          # Decide once: training files are only read and embedded when a rebuild happens.
          rebuild_vector_store if rebuild_required?
          load_hnsw
        end

        private

        attr_reader :training_data_path, :index_file_path, :split_chunk_size, :json_doc_file_path, :force_rebuild

        # "foo.bin" -> "foo.json"; any other name gets ".json" appended.
        def default_json_doc_file_path(index_path)
          "#{index_path.sub(/\.bin\z/, '')}.json"
        end

        # Location of the config file read back by load_hnsw.
        # NOTE(review): presumably written by SaveToHnswlib next to the index -- confirm.
        def hnswlib_config_path
          "#{File.dirname(index_file_path)}/hnswlib_config.json"
        end

        def validate_params
          # Strip glob stars so the parent directory of the pattern can be checked.
          training_data_dir = File.dirname(training_data_path.gsub(/\*{1,2}/, ''))
          raise_error('training_data_path parent directory must exist') unless File.directory?(training_data_dir)
          raise_error('No files found at the training_data_path pattern') if Dir.glob(training_data_path).empty?

          index_dir = File.dirname(index_file_path)
          raise_error('index_file_path parent directory must exist') unless File.directory?(index_dir)

          raise_error('split_chunk_size must be an integer') unless split_chunk_size.is_a?(Integer)
        end

        # A rebuild happens when forced, or when the index file or its config file is missing.
        def rebuild_required?
          force_rebuild || !File.exist?(index_file_path) || !File.exist?(hnswlib_config_path)
        end

        # Run the full load -> split -> embed -> save pipeline.
        def rebuild_vector_store
          documents = split_text_into_chunks(load_files)
          save_vector_store(generate_embeddings(documents))
        end

        # @return [Array<String>] the contents of every file matching training_data_path
        def load_files
          files = Dir.glob(training_data_path)
          raise_error "No files found at #{training_data_path}" if files.empty?

          data = files.map { |file| File.read(file) }
          puts "Added #{files.length} files to data. Splitting text into chunks..."
          data
        end

        # @param data [Array<String>] file contents
        # @return [Array] every chunk SplitText produced, in file order
        def split_text_into_chunks(data)
          data.flat_map do |text|
            Boxcars::Embeddings::SplitText.call(
              separator: "\n", chunk_size: split_chunk_size, chunk_overlap: 0, text: text
            )
          end
        end

        # @param documents [Array] the chunks to embed
        # @return [Hash] `{ document_embeddings:, dim: }` where each entry pairs a chunk with
        #   its embedding and positional doc_id
        def generate_embeddings(documents)
          # Without this guard an empty list fails below with NoMethodError on nil.
          raise_error('No text chunks were produced from the training data') if documents.empty?

          puts "Initializing Store..."
          openai_client = OpenAI::Client.new(access_token: ENV.fetch('OPENAI_API_KEY', nil))

          embeddings_with_dim = Boxcars::Embeddings::EmbedViaOpenAI.call(texts: documents, openai_connection: openai_client)

          document_embeddings = embeddings_with_dim.map.with_index do |item, index|
            { doc_id: index, embedding: item[:embedding], document: documents[index] }
          end

          { document_embeddings: document_embeddings, dim: embeddings_with_dim.first[:dim] }
        end

        def save_vector_store(embeddings_with_config)
          puts "Saving Vectorstore"
          Boxcars::Embeddings::Hnswlib::SaveToHnswlib.call(
            document_embeddings: embeddings_with_config[:document_embeddings],
            index_file_path: index_file_path,
            json_doc_file_path: json_doc_file_path,
            hnswlib_config: hnswlib_config(embeddings_with_config[:dim])
          )
          puts "VectorStore saved"
        end

        def hnswlib_config(dim)
          # dim: length of datum point vector that will be indexed.
          Boxcars::Embeddings::Hnswlib::HnswlibConfig.new(
            metric: "l2", max_item: 10_000, dim: dim
          )
        end

        # Load the saved index and document JSON from disk.
        def load_hnsw
          puts "Loading Hnswlib"

          json_config = parse_json_file(hnswlib_config_path)
          document_embeddings = parse_json_file(json_doc_file_path)

          search_index = ::Hnswlib::HierarchicalNSW.new(space: json_config[:metric], dim: json_config[:dim])
          search_index.load_index(index_file_path)

          { vector_store: search_index, document_embeddings: document_embeddings }
        end

        # @return [Object] the parsed JSON with symbolized keys, or [] when file_path is nil
        # @raise [Boxcars::Error] naming the offending file when it is not valid JSON
        def parse_json_file(file_path)
          return [] if file_path.nil?

          JSON.parse(File.read(file_path), symbolize_names: true)
        rescue JSON::ParserError => e
          raise_error("Error parsing #{file_path}: #{e.message}")
        end

        def raise_error(message)
          raise ::Boxcars::Error, message
        end
      end
    end
  end
end
@@ -0,0 +1,56 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'json'
4
+
5
module Boxcars
  module Embeddings
    module Hnswlib
      # Settings used for the hnswlib search index.
      #
      # reference: https://yoshoku.github.io/hnswlib.rb/doc/
      class HnswlibConfig
        # Maps a configured metric name to the hnswlib space name.
        METRIC_TO_SPACE = { 'l2' => 'l2', 'dot' => 'ip', 'cosine' => 'cosine' }.freeze

        attr_reader :metric, :max_item, :dim, :ef_construction, :max_outgoing_connection

        # `m` is kept for backward compatibility; it previously had no backing value
        # and always returned nil. It now mirrors max_outgoing_connection.
        alias m max_outgoing_connection

        # @param metric [String] The distance metric between vectors ('l2', 'dot', or 'cosine').
        #
        # @param max_item [Integer] The maximum number of items.
        #
        # @param dim [Integer] The length of each vector that will be indexed.
        #
        # @param ef_construction [Integer] The size of the dynamic list for the nearest neighbors.
        #   It controls the index time/accuracy trade-off.
        #
        # @param max_outgoing_connection [Integer] The maximum number of outgoing connections in the graph
        def initialize(
          metric: "l2",
          max_item: 10_000,
          dim: 2,
          ef_construction: 200,
          max_outgoing_connection: 16
        )
          @metric = metric
          @max_item = max_item
          @dim = dim
          @ef_construction = ef_construction
          @max_outgoing_connection = max_outgoing_connection
        end

        # @return [String] the hnswlib space for the configured metric: 'ip' for 'dot',
        #   'cosine' for 'cosine', and 'l2' for 'l2' or any unrecognized metric.
        def space
          METRIC_TO_SPACE.fetch(@metric, 'l2')
        end

        # @param args extra arguments forwarded to JSON.pretty_generate
        # @return [String] pretty-printed JSON holding all five settings
        def to_json(*args)
          JSON.pretty_generate(
            {
              metric: @metric,
              max_item: @max_item,
              dim: @dim,
              ef_construction: @ef_construction,
              max_outgoing_connection: @max_outgoing_connection
            },
            *args
          )
        end
      end
    end
  end
end