boxcars 0.2.5 → 0.2.8
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +39 -0
- data/Gemfile +6 -0
- data/Gemfile.lock +39 -1
- data/README.md +22 -19
- data/boxcars.gemspec +1 -0
- data/lib/boxcars/boxcar/active_record.rb +21 -4
- data/lib/boxcars/boxcar/embeddings/document.rb +14 -0
- data/lib/boxcars/boxcar/embeddings/embed_via_open_ai.rb +50 -0
- data/lib/boxcars/boxcar/embeddings/hnswlib/build_vector_store.rb +159 -0
- data/lib/boxcars/boxcar/embeddings/hnswlib/hnswlib_config.rb +56 -0
- data/lib/boxcars/boxcar/embeddings/hnswlib/hnswlib_search.rb +54 -0
- data/lib/boxcars/boxcar/embeddings/hnswlib/save_to_hnswlib.rb +80 -0
- data/lib/boxcars/boxcar/embeddings/similarity_search.rb +51 -0
- data/lib/boxcars/boxcar/embeddings/split_text.rb +104 -0
- data/lib/boxcars/boxcar/embeddings.rb +31 -0
- data/lib/boxcars/boxcar/engine_boxcar.rb +1 -1
- data/lib/boxcars/boxcar/google_search.rb +13 -5
- data/lib/boxcars/boxcar/sql.rb +4 -2
- data/lib/boxcars/boxcar/swagger.rb +80 -0
- data/lib/boxcars/boxcar/wikipedia_search.rb +39 -0
- data/lib/boxcars/boxcar.rb +6 -1
- data/lib/boxcars/embedding.rb +11 -0
- data/lib/boxcars/engine/gpt4all_eng.rb +56 -0
- data/lib/boxcars/engine.rb +1 -0
- data/lib/boxcars/ruby_repl.rb +1 -0
- data/lib/boxcars/version.rb +1 -1
- data/lib/boxcars.rb +1 -0
- metadata +29 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 69b70e1d02b1ec206438eaaf857a0495fe35ab01e64a265656fe21230675306f
|
4
|
+
data.tar.gz: 8681b9625a0684f1091eea7a4626964929b271370068c90b82dabeee4253d803
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: eb5c0c00f8fcdbbd6d8a1999d7544fc584c701fdf9a8a9c271fff6d9795f75ef9cab058fee2c6829808a764c892cc3e2f4e4a8717155d34d6514b46d744e632c
|
7
|
+
data.tar.gz: b8fb4ad34d7b93d47388f037d1d93e9e7245303740bc04d58d21942112ff97315e5dd31fdba77e275b52ddba85ca1055b897646ae1e606daee485583d50c52a6
|
data/CHANGELOG.md
CHANGED
@@ -1,5 +1,44 @@
|
|
1
1
|
# Changelog
|
2
2
|
|
3
|
+
## [Unreleased](https://github.com/BoxcarsAI/boxcars/tree/HEAD)
|
4
|
+
|
5
|
+
[Full Changelog](https://github.com/BoxcarsAI/boxcars/compare/v0.2.7...HEAD)
|
6
|
+
|
7
|
+
**Closed issues:**
|
8
|
+
|
9
|
+
- Getting the same verbosity as in the examples [\#54](https://github.com/BoxcarsAI/boxcars/issues/54)
|
10
|
+
|
11
|
+
**Merged pull requests:**
|
12
|
+
|
13
|
+
- Add Engine for Gpt4all [\#55](https://github.com/BoxcarsAI/boxcars/pull/55) ([francis](https://github.com/francis))
|
14
|
+
- update google search to return URL for result if present [\#53](https://github.com/BoxcarsAI/boxcars/pull/53) ([francis](https://github.com/francis))
|
15
|
+
- Draft: added gpt4all [\#49](https://github.com/BoxcarsAI/boxcars/pull/49) ([jaigouk](https://github.com/jaigouk))
|
16
|
+
- Embeddings with hnswlib [\#48](https://github.com/BoxcarsAI/boxcars/pull/48) ([jaigouk](https://github.com/jaigouk))
|
17
|
+
|
18
|
+
## [v0.2.7](https://github.com/BoxcarsAI/boxcars/tree/v0.2.7) (2023-04-13)
|
19
|
+
|
20
|
+
[Full Changelog](https://github.com/BoxcarsAI/boxcars/compare/v0.2.5...v0.2.7)
|
21
|
+
|
22
|
+
**Closed issues:**
|
23
|
+
|
24
|
+
- The class name in the sample code of BoxCar-Google-Search wiki has not been changed. [\#50](https://github.com/BoxcarsAI/boxcars/issues/50)
|
25
|
+
|
26
|
+
**Merged pull requests:**
|
27
|
+
|
28
|
+
- Add Swagger Boxcar [\#51](https://github.com/BoxcarsAI/boxcars/pull/51) ([francis](https://github.com/francis))
|
29
|
+
- Boxcars::SQL tables and except\_tables [\#47](https://github.com/BoxcarsAI/boxcars/pull/47) ([arihh](https://github.com/arihh))
|
30
|
+
- ActiveRecord updates and new Wikipedia Search boxcar [\#46](https://github.com/BoxcarsAI/boxcars/pull/46) ([francis](https://github.com/francis))
|
31
|
+
- Fix README.md log\_prompts settings [\#45](https://github.com/BoxcarsAI/boxcars/pull/45) ([arihh](https://github.com/arihh))
|
32
|
+
- Update README.md to use the GoogleSearch Boxcar [\#44](https://github.com/BoxcarsAI/boxcars/pull/44) ([stockandawe](https://github.com/stockandawe))
|
33
|
+
|
34
|
+
## [v0.2.5](https://github.com/BoxcarsAI/boxcars/tree/v0.2.5) (2023-03-30)
|
35
|
+
|
36
|
+
[Full Changelog](https://github.com/BoxcarsAI/boxcars/compare/v0.2.4...v0.2.5)
|
37
|
+
|
38
|
+
**Merged pull requests:**
|
39
|
+
|
40
|
+
- switch to safe level 4 for eval, and rerun tests [\#43](https://github.com/BoxcarsAI/boxcars/pull/43) ([francis](https://github.com/francis))
|
41
|
+
|
3
42
|
## [v0.2.4](https://github.com/BoxcarsAI/boxcars/tree/v0.2.4) (2023-03-28)
|
4
43
|
|
5
44
|
[Full Changelog](https://github.com/BoxcarsAI/boxcars/compare/v0.2.3...v0.2.4)
|
data/Gemfile
CHANGED
data/Gemfile.lock
CHANGED
@@ -1,8 +1,9 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
boxcars (0.2.
|
4
|
+
boxcars (0.2.8)
|
5
5
|
google_search_results (~> 2.2)
|
6
|
+
gpt4all (~> 0.0.4)
|
6
7
|
ruby-openai (~> 3.0)
|
7
8
|
|
8
9
|
GEM
|
@@ -49,6 +50,8 @@ GEM
|
|
49
50
|
irb (>= 1.5.0)
|
50
51
|
reline (>= 0.3.1)
|
51
52
|
diff-lcs (1.5.0)
|
53
|
+
domain_name (0.5.20190701)
|
54
|
+
unf (>= 0.0.5, < 1.0.0)
|
52
55
|
dotenv (2.8.1)
|
53
56
|
faraday (2.7.4)
|
54
57
|
faraday-net_http (>= 2.0, < 3.1)
|
@@ -69,7 +72,15 @@ GEM
|
|
69
72
|
rainbow (>= 2.2.1)
|
70
73
|
rake (>= 10.0)
|
71
74
|
google_search_results (2.2.0)
|
75
|
+
gpt4all (0.0.5)
|
76
|
+
faraday (~> 2.7)
|
77
|
+
os (~> 1.1)
|
78
|
+
tty-progressbar (~> 0.18.2)
|
72
79
|
hashdiff (1.0.1)
|
80
|
+
hnswlib (0.8.1)
|
81
|
+
http-accept (1.7.0)
|
82
|
+
http-cookie (1.0.5)
|
83
|
+
domain_name (~> 0.5)
|
73
84
|
httparty (0.21.0)
|
74
85
|
mini_mime (>= 1.0.0)
|
75
86
|
multi_xml (>= 0.5.2)
|
@@ -81,16 +92,21 @@ GEM
|
|
81
92
|
reline (>= 0.3.0)
|
82
93
|
json (2.6.3)
|
83
94
|
json (2.6.3-java)
|
95
|
+
mime-types (3.4.1)
|
96
|
+
mime-types-data (~> 3.2015)
|
97
|
+
mime-types-data (3.2023.0218.1)
|
84
98
|
mini_mime (1.1.2)
|
85
99
|
mini_portile2 (2.8.1)
|
86
100
|
minitest (5.18.0)
|
87
101
|
multi_json (1.15.0)
|
88
102
|
multi_xml (0.6.0)
|
103
|
+
netrc (0.11.0)
|
89
104
|
nio4r (2.5.8)
|
90
105
|
nio4r (2.5.8-java)
|
91
106
|
octokit (4.25.1)
|
92
107
|
faraday (>= 1, < 3)
|
93
108
|
sawyer (~> 0.9)
|
109
|
+
os (1.1.4)
|
94
110
|
parallel (1.22.1)
|
95
111
|
parser (3.2.1.1)
|
96
112
|
ast (~> 2.4.1)
|
@@ -107,6 +123,11 @@ GEM
|
|
107
123
|
regexp_parser (2.7.0)
|
108
124
|
reline (0.3.3)
|
109
125
|
io-console (~> 0.5)
|
126
|
+
rest-client (2.1.0)
|
127
|
+
http-accept (>= 1.7.0, < 2.0)
|
128
|
+
http-cookie (>= 1.0.2, < 2.0)
|
129
|
+
mime-types (>= 1.16, < 4.0)
|
130
|
+
netrc (~> 0.8)
|
110
131
|
rexml (3.2.5)
|
111
132
|
rspec (3.12.0)
|
112
133
|
rspec-core (~> 3.12.0)
|
@@ -149,12 +170,25 @@ GEM
|
|
149
170
|
faraday (>= 0.17.3, < 3)
|
150
171
|
sqlite3 (1.6.2)
|
151
172
|
mini_portile2 (~> 2.8.0)
|
173
|
+
sqlite3 (1.6.2-arm64-darwin)
|
152
174
|
sqlite3 (1.6.2-x86_64-darwin)
|
153
175
|
sqlite3 (1.6.2-x86_64-linux)
|
176
|
+
strings-ansi (0.2.0)
|
154
177
|
timers (4.3.5)
|
155
178
|
traces (0.9.1)
|
179
|
+
tty-cursor (0.7.1)
|
180
|
+
tty-progressbar (0.18.2)
|
181
|
+
strings-ansi (~> 0.2)
|
182
|
+
tty-cursor (~> 0.7)
|
183
|
+
tty-screen (~> 0.8)
|
184
|
+
unicode-display_width (>= 1.6, < 3.0)
|
185
|
+
tty-screen (0.8.1)
|
156
186
|
tzinfo (2.0.6)
|
157
187
|
concurrent-ruby (~> 1.0)
|
188
|
+
unf (0.1.4)
|
189
|
+
unf_ext
|
190
|
+
unf (0.1.4-java)
|
191
|
+
unf_ext (0.0.8.2)
|
158
192
|
unicode-display_width (2.4.2)
|
159
193
|
vcr (6.1.0)
|
160
194
|
webmock (3.18.1)
|
@@ -163,6 +197,7 @@ GEM
|
|
163
197
|
hashdiff (>= 0.4.0, < 2.0.0)
|
164
198
|
|
165
199
|
PLATFORMS
|
200
|
+
arm64-darwin-22
|
166
201
|
universal-java-11
|
167
202
|
x86_64-darwin-21
|
168
203
|
x86_64-darwin-22
|
@@ -170,12 +205,15 @@ PLATFORMS
|
|
170
205
|
|
171
206
|
DEPENDENCIES
|
172
207
|
activerecord (~> 7.0)
|
208
|
+
activesupport (~> 7.0)
|
173
209
|
boxcars!
|
174
210
|
debug (~> 1.1)
|
175
211
|
dotenv (~> 2.8)
|
176
212
|
faraday-retry (~> 2.0)
|
177
213
|
github_changelog_generator (~> 1.16)
|
214
|
+
hnswlib (~> 0.8.1)
|
178
215
|
rake (~> 13.0)
|
216
|
+
rest-client (~> 2.1)
|
179
217
|
rspec (~> 3.2)
|
180
218
|
rubocop (~> 1.21)
|
181
219
|
rubocop-rake (~> 0.6.0)
|
data/README.md
CHANGED
@@ -2,7 +2,6 @@
|
|
2
2
|
|
3
3
|
<h4 align="center">
|
4
4
|
<a href="https://www.boxcars.ai">Website</a> |
|
5
|
-
<a href="https://www.boxcars.ai/roadmap">Roadmap</a> |
|
6
5
|
<a href="https://www.boxcars.ai/blog">Blog</a> |
|
7
6
|
<a href="https://github.com/BoxcarsAI/boxcars/wiki">Documentation</a>
|
8
7
|
</h4>
|
@@ -18,10 +17,10 @@ This gem was inspired by the popular Python library Langchain. However, we wante
|
|
18
17
|
## Concepts
|
19
18
|
All of these concepts are in a module named Boxcars:
|
20
19
|
|
21
|
-
- Boxcar - an encapsulation that performs something of interest (such as search, math, SQL
|
20
|
+
- Boxcar - an encapsulation that performs something of interest (such as search, math, SQL, an Active Record Query, or an API call to a service). A Boxcar can use an Engine (described below) to do its work, and if not specified but needed, the default Engine is used `Boxcars.engine`.
|
22
21
|
- Train - Given a list of Boxcars and optionally an Engine, a Train breaks down a problem into pieces for individual Boxcars to solve. The individual results are then combined until a final answer is found. ZeroShot is the only current implementation of Train (but we are adding more soon), and you can either construct it directly or use `Boxcars::train` when you want to build a Train.
|
23
|
-
- Prompt - used by an Engine to generate text results.
|
24
|
-
- Engine - an entity that generates text from a Prompt. OpenAI's LLM text generator is the default Engine if no other is specified.
|
22
|
+
- Prompt - used by an Engine to generate text results. Our Boxcars have built-in prompts, but you have the flexibility to change or augment them if you so desire.
|
23
|
+
- Engine - an entity that generates text from a Prompt. OpenAI's LLM text generator is the default Engine if no other is specified, and you can override the default engine if so desired (`Boxcars.configuration.default_engine`).
|
25
24
|
|
26
25
|
## Security
|
27
26
|
Currently, our system is designed for individuals who already possess administrative privileges for their project. It is likely possible to manipulate the system's prompts to carry out malicious actions, but if you already have administrative access, you can perform such actions without requiring boxcars in the first place.
|
@@ -90,14 +89,16 @@ You can change the default_engine with `Boxcars::configuration.default_engine =
|
|
90
89
|
|
91
90
|
Here is what we have so far, but please put up a PR with your new ideas.
|
92
91
|
- GoogleSearch: uses the SERP API to do searches
|
92
|
+
- WikipediaSearch: uses the Wikipedia API to do searches
|
93
93
|
- Calculator: uses an Engine to generate ruby code to do math
|
94
94
|
- SQL: given an ActiveRecord connection, it will generate and run SQL statements from a prompt.
|
95
95
|
- ActiveRecord: given an ActiveRecord connection, it will generate and run ActiveRecord statements from a prompt.
|
96
|
+
- Swagger: given a Swagger Open API file (YAML or JSON), answer questions about or run against the referenced service. See [here](https://github.com/BoxcarsAI/boxcars/blob/main/notebooks/swagger_examples.ipynb) for examples.
|
96
97
|
|
97
98
|
### Run a list of Boxcars
|
98
99
|
```ruby
|
99
100
|
# run a Train for a calculator, and search using default Engine
|
100
|
-
boxcars = [Boxcars::Calculator.new, Boxcars::
|
101
|
+
boxcars = [Boxcars::Calculator.new, Boxcars::GoogleSearch.new]
|
101
102
|
train = Boxcars.train.new(boxcars: boxcars)
|
102
103
|
train.run "What is pi times the square root of the average temperature in Austin TX in January?"
|
103
104
|
```
|
@@ -105,38 +106,40 @@ Produces:
|
|
105
106
|
```text
|
106
107
|
> Entering Zero Shot#run
|
107
108
|
What is pi times the square root of the average temperature in Austin TX in January?
|
108
|
-
Thought: We need to find the average temperature in Austin TX in January and then multiply it by pi and the square root of
|
109
|
+
Thought: We need to find the average temperature in Austin TX in January and then multiply it by pi and the square root of the average temperature. We can use a search engine to find the average temperature in Austin TX in January and a calculator to perform the multiplication.
|
109
110
|
Question: Average temperature in Austin TX in January
|
110
|
-
Answer: increase from 62°F to 64°F
|
111
|
-
Observation: increase from 62°F to 64°F
|
112
|
-
Thought:
|
111
|
+
Answer: January Weather in Austin Texas, United States. Daily high temperatures increase by 2°F, from 62°F to 64°F, rarely falling below 45°F or exceeding 76° ...
|
112
|
+
Observation: January Weather in Austin Texas, United States. Daily high temperatures increase by 2°F, from 62°F to 64°F, rarely falling below 45°F or exceeding 76° ...
|
113
|
+
Thought: We have found the average temperature in Austin TX in January, which is 64°F. Now we can use a calculator to perform the multiplication.
|
113
114
|
> Entering Calculator#run
|
114
|
-
pi * sqrt(
|
115
|
-
RubyREPL: puts(Math::PI * Math.sqrt(
|
116
|
-
Answer:
|
115
|
+
pi * sqrt(64)
|
116
|
+
RubyREPL: puts(Math::PI * Math.sqrt(64))
|
117
|
+
Answer: 25.132741228718345
|
117
118
|
|
118
|
-
{"status":"ok","answer":"
|
119
|
+
{"status":"ok","answer":"25.132741228718345","explanation":"Answer: 25.132741228718345","code":"puts(Math::PI * Math.sqrt(64))"}
|
119
120
|
< Exiting Calculator#run
|
120
|
-
Observation:
|
121
|
-
|
121
|
+
Observation: 25.132741228718345
|
122
|
+
We have the final answer.
|
122
123
|
|
123
|
-
Final Answer:
|
124
|
+
Final Answer: 25.132741228718345
|
124
125
|
|
125
126
|
Next Actions:
|
126
127
|
1. What is the average temperature in Austin TX in July?
|
127
|
-
2. What is the
|
128
|
-
3. What is the
|
128
|
+
2. What is the value of pi to 10 decimal places?
|
129
|
+
3. What is the square root of the average temperature in Miami FL in January?
|
129
130
|
< Exiting Zero Shot#run
|
130
131
|
```
|
131
132
|
### More Examples
|
132
133
|
See [this](https://github.com/BoxcarsAI/boxcars/blob/main/notebooks/boxcars_examples.ipynb) Jupyter Notebook for more examples.
|
133
134
|
|
135
|
+
For the new Swagger boxcar, see [this](https://github.com/BoxcarsAI/boxcars/blob/main/notebooks/swagger_examples.ipynb) Jupyter Notebook.
|
136
|
+
|
134
137
|
Note, some folks that we talked to didn't know that you could run Ruby Jupyter notebooks. [You can](https://github.com/SciRuby/iruby).
|
135
138
|
|
136
139
|
### Logging
|
137
140
|
If you use this in a Rails application, or configure `Boxcars.configuration.logger = your_logger`, logging will go to your log file.
|
138
141
|
|
139
|
-
Also, if you set this flag: `Boxcars.configuration.
|
142
|
+
Also, if you set this flag: `Boxcars.configuration.log_prompts = true`
|
140
143
|
The actual prompts handed to the connected Engine will be logged. This is off by default because it is very wordy, but handy if you are debugging prompts.
|
141
144
|
|
142
145
|
Otherwise, we print to standard out.
|
data/boxcars.gemspec
CHANGED
@@ -37,6 +37,7 @@ Gem::Specification.new do |spec|
|
|
37
37
|
|
38
38
|
# runtime dependencies
|
39
39
|
spec.add_dependency "google_search_results", "~> 2.2"
|
40
|
+
spec.add_dependency "gpt4all", "~> 0.0.4"
|
40
41
|
spec.add_dependency "ruby-openai", "~> 3.0"
|
41
42
|
|
42
43
|
# For more information and examples about making a new gem, checkout our
|
@@ -3,6 +3,7 @@
|
|
3
3
|
# Boxcars is a framework for running a series of tools to get an answer to a question.
|
4
4
|
module Boxcars
|
5
5
|
# A Boxcar that interprets a prompt and executes SQL code to get answers
|
6
|
+
# rubocop:disable Metrics/ClassLength
|
6
7
|
class ActiveRecord < EngineBoxcar
|
7
8
|
# the description of this engine boxcar
|
8
9
|
ARDESC = "useful for when you need to query a database for an application named %<name>s."
|
@@ -21,7 +22,7 @@ module Boxcars
|
|
21
22
|
@approval_callback = approval_callback
|
22
23
|
@read_only = read_only.nil? ? !approval_callback : read_only
|
23
24
|
@code_only = kwargs.delete(:code_only) || false
|
24
|
-
kwargs[:name] ||=
|
25
|
+
kwargs[:name] ||= get_name
|
25
26
|
kwargs[:description] ||= format(ARDESC, name: name)
|
26
27
|
kwargs[:prompt] ||= my_prompt
|
27
28
|
super(**kwargs)
|
@@ -34,6 +35,13 @@ module Boxcars
|
|
34
35
|
|
35
36
|
private
|
36
37
|
|
38
|
+
# Work out a default name for this boxcar from the host Rails application.
#
# @return [String, nil] the name of the Rails application's parent module when Rails is loaded;
#   nil when Rails is not defined or when the lookup raises.
def get_name
  return Rails.application.class.module_parent.name if defined?(Rails)
rescue StandardError => e
  # Log through the Boxcars module; a lowercase `boxcars` receiver is not defined in this scope
  # and would raise NameError from inside the rescue handler.
  Boxcars.error "Error getting rails name application name: #{e.message}"
  nil
end
|
44
|
+
|
37
45
|
def read_only?
|
38
46
|
read_only
|
39
47
|
end
|
@@ -44,6 +52,7 @@ module Boxcars
|
|
44
52
|
|
45
53
|
def check_models(models, exceptions)
|
46
54
|
if models.is_a?(Array) && models.length.positive?
|
55
|
+
models.map { |m| m.is_a?(Class) ? m : m.constantize }
|
47
56
|
@requested_models = models
|
48
57
|
models.each do |m|
|
49
58
|
raise ArgumentError, "model #{m} needs to be an Active Record model" unless m.ancestors.include?(::ActiveRecord::Base)
|
@@ -119,7 +128,10 @@ module Boxcars
|
|
119
128
|
# @return [Object] The result of the code
|
120
129
|
def eval_safe_wrapper(code)
|
121
130
|
# if the code used ActiveRecord, we need to add :: in front of it to escape the module
|
122
|
-
new_code = code.gsub(
|
131
|
+
new_code = code.gsub(/\b(ActiveRecord::)/, '::\1')
|
132
|
+
|
133
|
+
# sometimes the code will have a puts or print in it, which will miss. Remove them.
|
134
|
+
new_code = new_code.gsub(/\b(puts|print)\b/, '')
|
123
135
|
proc do
|
124
136
|
$SAFE = 4
|
125
137
|
# rubocop:disable Security/Eval
|
@@ -146,7 +158,11 @@ module Boxcars
|
|
146
158
|
def approved?(changes_code, code)
|
147
159
|
# find out how many changes there are
|
148
160
|
changes = change_count(changes_code)
|
149
|
-
|
161
|
+
begin
|
162
|
+
return true unless changes&.positive?
|
163
|
+
rescue StandardError => e
|
164
|
+
Boscar.error "Error while computing change count: #{e.message}", :red
|
165
|
+
end
|
150
166
|
|
151
167
|
Boxcars.debug "#{name}(Pending Changes): #{changes}", :yellow
|
152
168
|
if read_only?
|
@@ -242,7 +258,7 @@ module Boxcars
|
|
242
258
|
"Pay attention to use only the attribute names that you can see in the model description.\n",
|
243
259
|
"Do not make up variable or attribute names, and do not share variables between the code in ARChanges and ARCode\n",
|
244
260
|
"Be careful to not query for attributes that do not exist, and to use the format specified above.\n",
|
245
|
-
"Finally,
|
261
|
+
"Finally, try not to use print or puts in your code"
|
246
262
|
),
|
247
263
|
user("Question: %<question>s")
|
248
264
|
].freeze
|
@@ -257,4 +273,5 @@ module Boxcars
|
|
257
273
|
output_variables: [:answer])
|
258
274
|
end
|
259
275
|
end
|
276
|
+
# rubocop:enable Metrics/ClassLength
|
260
277
|
end
|
@@ -0,0 +1,14 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Boxcars
|
4
|
+
module Embeddings
|
5
|
+
# A piece of text plus a hash of metadata describing it.
class Document
  attr_accessor :page_content
  attr_accessor :metadata

  # @param fields [Hash] may carry :page_content and :metadata; missing (or nil/false)
  #   entries fall back to an empty string and an empty hash respectively.
  def initialize(fields = {})
    content = fields[:page_content]
    meta = fields[:metadata]
    @page_content = content || ""
    @metadata = meta || {}
  end
end
|
13
|
+
end
|
14
|
+
end
|
@@ -0,0 +1,50 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'openai'
|
4
|
+
|
5
|
+
module Boxcars
|
6
|
+
module Embeddings
|
7
|
+
# Requests one embedding vector per input text through an OpenAI client.
class EmbedViaOpenAI
  include Embeddings

  attr_accessor :texts, :openai_connection, :model

  # @param texts [Array<String>] the texts to embed
  # @param openai_connection [OpenAI::Client] client used to issue the embeddings requests
  # @param model [String] embedding model name sent with every request
  # @raise [Boxcars::ValueError] when texts is not an array of strings or the connection
  #   is not an OpenAI::Client
  def initialize(texts:, openai_connection:, model: 'text-embedding-ada-002')
    validate_params(texts, openai_connection)
    @texts = texts
    @openai_connection = openai_connection
    @model = model
  end

  # Embed every text, one request per text.
  #
  # @return [Array<Hash>] one hash per text with :embedding (the vector) and :dim (its size)
  def call
    texts.map do |text|
      vector = embedding_with_retry(model: model, input: strip_new_lines(text))
      { embedding: vector, dim: vector.size }
    end
  end

  private

  # Reject anything that is not an array of strings paired with an OpenAI client.
  def validate_params(texts, openai_connection)
    strings_only = texts.is_a?(Array) && texts.all?(String)
    raise_error 'texts must be an array of strings' unless strings_only

    client_ok = openai_connection.is_a?(OpenAI::Client)
    raise_error 'openai_connection must be an OpenAI::Client' unless client_ok
  end

  # Issue a single embeddings request and pull the vector out of the response.
  # Note: despite the name, no retry is performed in this method.
  def embedding_with_retry(request)
    @openai_connection.embeddings(parameters: request)['data'][0]['embedding']
  end

  # Replace every newline with a single space before the text is sent.
  def strip_new_lines(text)
    text.tr("\n", ' ')
  end

  def raise_error(message)
    raise ::Boxcars::ValueError, message
  end
end
|
49
|
+
end
|
50
|
+
end
|
@@ -0,0 +1,159 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'fileutils'
|
4
|
+
require 'hnswlib'
|
5
|
+
require 'json'
|
6
|
+
|
7
|
+
module Boxcars
|
8
|
+
module Embeddings
|
9
|
+
module Hnswlib
|
10
|
+
# Builds the vector store used by the hnswlib similarity search.
#
# #call loads the training data, splits it into chunks, generates the embeddings, saves the
# vector store, and then loads the saved store into memory. For later use, the split documents
# are saved with their index numbers to a json file.
class BuildVectorStore
  include Embeddings

  # @param training_data_path [String] The path to the training data. Can be a glob pattern.
  # @param index_file_path [String] The path to the index file.
  # @param split_chunk_size [Integer] Passed to SplitText as chunk_size. default 2000
  # @param json_doc_file_path [String, nil] The json file containing the document text.
  #   If nil, the index file name is reused with its ".bin" suffix replaced by ".json".
  # @param force_rebuild [Boolean] If true, will rebuild the index even if it already exists.
  def initialize(
    training_data_path:,
    index_file_path:,
    split_chunk_size: 2000,
    json_doc_file_path: nil,
    force_rebuild: true
  )
    @training_data_path = training_data_path
    @index_file_path = index_file_path
    @split_chunk_size = split_chunk_size
    @json_doc_file_path = json_doc_file_path || index_file_path.gsub(/\.bin$/, '.json')
    @force_rebuild = force_rebuild
  end

  # Run the whole pipeline; the build steps are skipped when no rebuild is required.
  #
  # @return [Hash] :vector_store (the loaded search index) and :document_embeddings
  #   (the parsed contents of the json document file)
  # @raise [Boxcars::Error] when the parameters are invalid or a json file cannot be parsed
  def call
    validate_params
    data = load_files
    documents = split_text_into_chunks(data)
    embeddings_with_config = generate_embeddings(documents)
    save_vector_store(embeddings_with_config)
    load_hnsw
  end

  private

  attr_reader :training_data_path, :index_file_path, :split_chunk_size, :json_doc_file_path, :force_rebuild

  # Check that the directories exist, the glob matches something, and the chunk size is an Integer.
  def validate_params
    # strip glob stars so File.dirname sees a plain path
    training_data_dir = File.dirname(training_data_path.gsub(/\*{1,2}/, ''))
    raise_error('training_data_path parent directory must exist') unless File.directory?(training_data_dir)
    raise_error('No files found at the training_data_path pattern') if Dir.glob(training_data_path).empty?

    index_dir = File.dirname(index_file_path)
    raise_error('index_file_path parent directory must exist') unless File.directory?(index_dir)

    raise_error('split_chunk_size must be an integer') unless split_chunk_size.is_a?(Integer)
  end

  # Read every file matched by training_data_path.
  #
  # @return [Array<String>] the contents of each matched file
  def load_files
    files = Dir.glob(training_data_path)
    raise_error "No files found at #{training_data_path}" if files.empty?

    data = files.map { |file| File.read(file) }
    puts "Added #{files.length} files to data. Splitting text into chunks..."
    data
  end

  # Split each loaded text with SplitText; returns true (and does nothing) when no rebuild is needed.
  def split_text_into_chunks(data)
    return true unless rebuild_required?

    data.flat_map do |chunk|
      Boxcars::Embeddings::SplitText.call(
        separator: "\n", chunk_size: split_chunk_size, chunk_overlap: 0, text: chunk
      )
    end
  end

  # Location of the hnswlib config file: always next to the index file.
  def config_file_path
    "#{File.dirname(index_file_path)}/hnswlib_config.json"
  end

  # A rebuild is needed when the index or its config file is missing, or when it is forced.
  def rebuild_required?
    return true unless File.exist?(index_file_path)
    return true unless File.exist?(config_file_path)
    return true if force_rebuild

    false
  end

  # Embed the documents through OpenAI; returns true (and does nothing) when no rebuild is needed.
  #
  # @return [Hash, true] :document_embeddings (doc_id / embedding / document hashes) and :dim
  def generate_embeddings(documents)
    return true unless rebuild_required?

    puts "Initializing Store..."
    openai_client = OpenAI::Client.new(access_token: ENV.fetch('OPENAI_API_KEY', nil))

    embeddings_with_dim = Boxcars::Embeddings::EmbedViaOpenAI.call(texts: documents, openai_connection: openai_client)
    # without this guard an empty result fails below with NoMethodError on nil[:dim]
    raise_error('No embeddings were generated from the training data') if embeddings_with_dim.empty?

    document_embeddings = embeddings_with_dim.map.with_index do |item, index|
      { doc_id: index, embedding: item[:embedding], document: documents[index] }
    end

    { document_embeddings: document_embeddings, dim: embeddings_with_dim.first[:dim] }
  end

  # Hand the embeddings to SaveToHnswlib; returns true (and does nothing) when no rebuild is needed.
  def save_vector_store(embeddings_with_config)
    return true unless rebuild_required?

    puts "Saving Vectorstore"
    Boxcars::Embeddings::Hnswlib::SaveToHnswlib.call(
      document_embeddings: embeddings_with_config[:document_embeddings],
      index_file_path: index_file_path,
      json_doc_file_path: json_doc_file_path,
      hnswlib_config: hnswlib_config(embeddings_with_config[:dim])
    )
    puts "VectorStore saved"
  end

  def hnswlib_config(dim)
    # dim: length of datum point vector that will be indexed.
    Boxcars::Embeddings::Hnswlib::HnswlibConfig.new(
      metric: "l2", max_item: 10_000, dim: dim
    )
  end

  # Load the saved index and the json documents back into memory.
  def load_hnsw
    puts "Loading Hnswlib"

    json_config = parse_json_file(config_file_path)
    document_embeddings = parse_json_file(json_doc_file_path)

    search_index = ::Hnswlib::HierarchicalNSW.new(space: json_config[:metric], dim: json_config[:dim])
    search_index.load_index(index_file_path)

    { vector_store: search_index, document_embeddings: document_embeddings }
  end

  # Parse a json file with symbolized keys; a nil path yields an empty array.
  #
  # @raise [Boxcars::Error] when the file is not valid json
  def parse_json_file(file_path)
    return [] if file_path.nil?

    file_content = File.read(file_path)
    JSON.parse(file_content, symbolize_names: true)
  rescue JSON::ParserError => e
    # name the file that actually failed; this method parses both the config and the documents file
    raise_error("Error parsing #{file_path}: #{e.message}")
  end

  def raise_error(message)
    raise ::Boxcars::Error, message
  end
end
|
157
|
+
end
|
158
|
+
end
|
159
|
+
end
|
@@ -0,0 +1,56 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'json'
|
4
|
+
|
5
|
+
module Boxcars
|
6
|
+
module Embeddings
|
7
|
+
module Hnswlib
|
8
|
+
# Settings used for the hnswlib search index.
#
# reference: https://yoshoku.github.io/hnswlib.rb/doc/
class HnswlibConfig
  attr_reader :metric, :max_item, :dim, :ef_construction, :max_outgoing_connection

  # `m` was exposed as a reader but never assigned, so it always returned nil.
  # It is kept as an alias of max_outgoing_connection so existing callers get the stored value.
  alias m max_outgoing_connection

  # @param metric [String] The distance metric between vectors ('l2', 'dot', or 'cosine').
  #
  # @param max_item [Integer] The maximum number of items.
  #
  # @param dim [Integer] The length of the datum point vectors that will be indexed.
  #
  # @param ef_construction [Integer] The size of the dynamic list for the nearest neighbors.
  #   It controls the index time/accuracy trade-off.
  #
  # @param max_outgoing_connection [Integer] The maximum number of outgoing connections in the graph
  def initialize(
    metric: "l2",
    max_item: 10000,
    dim: 2,
    ef_construction: 200,
    max_outgoing_connection: 16
  )
    @metric = metric
    @max_item = max_item
    @dim = dim
    @ef_construction = ef_construction
    @max_outgoing_connection = max_outgoing_connection
  end

  # @return [String] 'ip' when the metric is 'dot', otherwise 'l2'
  # NOTE(review): 'cosine' also maps to 'l2' here - confirm that is intended.
  def space
    @metric == 'dot' ? 'ip' : 'l2'
  end

  # @return [String] pretty-printed json holding all five settings
  def to_json(*args)
    JSON.pretty_generate(
      {
        metric: @metric,
        max_item: @max_item,
        dim: @dim,
        ef_construction: @ef_construction,
        max_outgoing_connection: @max_outgoing_connection
      },
      *args
    )
  end
end
|
54
|
+
end
|
55
|
+
end
|
56
|
+
end
|