boxcars 0.2.11 → 0.2.12
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.env_sample +1 -0
- data/.rubocop.yml +16 -0
- data/CHANGELOG.md +12 -0
- data/Gemfile +12 -12
- data/Gemfile.lock +34 -28
- data/README.md +4 -1
- data/boxcars.gemspec +2 -2
- data/lib/boxcars/boxcar/active_record.rb +1 -1
- data/lib/boxcars/boxcar.rb +1 -0
- data/lib/boxcars/engine/openai.rb +8 -1
- data/lib/boxcars/vector_search.rb +66 -2
- data/lib/boxcars/vector_store/document.rb +3 -2
- data/lib/boxcars/vector_store/embed_via_open_ai.rb +2 -2
- data/lib/boxcars/vector_store/hnswlib/build_from_files.rb +100 -0
- data/lib/boxcars/vector_store/hnswlib/load_from_disk.rb +57 -0
- data/lib/boxcars/vector_store/hnswlib/save_to_hnswlib.rb +48 -38
- data/lib/boxcars/vector_store/hnswlib/search.rb +70 -0
- data/lib/boxcars/vector_store/in_memory/build_from_document_array.rb +51 -0
- data/lib/boxcars/vector_store/in_memory/build_from_files.rb +61 -0
- data/lib/boxcars/vector_store/in_memory/search.rb +29 -49
- data/lib/boxcars/vector_store/pgvector/build_from_array.rb +95 -0
- data/lib/boxcars/vector_store/pgvector/build_from_files.rb +97 -0
- data/lib/boxcars/vector_store/pgvector/save_to_database.rb +152 -0
- data/lib/boxcars/vector_store/pgvector/search.rb +144 -0
- data/lib/boxcars/vector_store/split_text.rb +2 -3
- data/lib/boxcars/vector_store.rb +73 -7
- data/lib/boxcars/version.rb +1 -1
- data/lib/boxcars.rb +1 -1
- metadata +14 -10
- data/lib/boxcars/vector_store/hnswlib/build_vector_store.rb +0 -157
- data/lib/boxcars/vector_store/hnswlib/hnswlib_config.rb +0 -56
- data/lib/boxcars/vector_store/hnswlib/hnswlib_search.rb +0 -54
- data/lib/boxcars/vector_store/in_memory/add_documents.rb +0 -67
- data/lib/boxcars/vector_store/similarity_search.rb +0 -55
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 69712266f9506d71ed3ad1fdcbfeef5a389bdbb7157d88e3f703f9a9b3ad8323
|
4
|
+
data.tar.gz: d7a5d0796d2963b737dc018c644042fe1e744ec7bd230f581367baf84df60f76
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 85876f5e1053bb8100795020c33da778a06668f9e3be856a8689d90d13728cef73e437ee6d5c0888b4a5483f698ee8288c061573a93fdff93559080e525c4254
|
7
|
+
data.tar.gz: 99e15b3fe0c5d5277c5ed123e5569bca1f1ddfca3a1b3ec054504b855bc7a005d6eb9a8f7ba71989d16ded297959fa09cddf7b31879ff37df78df5dfb21b3240
|
data/.env_sample
CHANGED
data/.rubocop.yml
CHANGED
@@ -3,6 +3,7 @@ require:
|
|
3
3
|
- rubocop-rake
|
4
4
|
|
5
5
|
AllCops:
|
6
|
+
TargetRubyVersion: 3
|
6
7
|
Exclude:
|
7
8
|
- 'bin/{rails,rake}'
|
8
9
|
- 'node_modules/**/*'
|
@@ -152,3 +153,18 @@ Style/SlicingWithRange:
|
|
152
153
|
|
153
154
|
Bundler/OrderedGems:
|
154
155
|
Enabled: false
|
156
|
+
|
157
|
+
RSpec/MultipleMemoizedHelpers:
|
158
|
+
Enabled: false
|
159
|
+
|
160
|
+
RSpec/PendingWithoutReason:
|
161
|
+
Enabled: false
|
162
|
+
|
163
|
+
RSpec/NestedGroups:
|
164
|
+
Enabled: false
|
165
|
+
|
166
|
+
RSpec/ExampleLength:
|
167
|
+
Enabled: false
|
168
|
+
|
169
|
+
RSpec/MultipleExpectations:
|
170
|
+
Enabled: false
|
data/CHANGELOG.md
CHANGED
@@ -1,5 +1,17 @@
|
|
1
1
|
# Changelog
|
2
2
|
|
3
|
+
## [v0.2.11](https://github.com/BoxcarsAI/boxcars/tree/v0.2.11) (2023-05-05)
|
4
|
+
|
5
|
+
[Full Changelog](https://github.com/BoxcarsAI/boxcars/compare/v0.2.10...v0.2.11)
|
6
|
+
|
7
|
+
**Closed issues:**
|
8
|
+
|
9
|
+
- Chore: move vector store to top level [\#67](https://github.com/BoxcarsAI/boxcars/issues/67)
|
10
|
+
|
11
|
+
**Merged pull requests:**
|
12
|
+
|
13
|
+
- Move vector store [\#69](https://github.com/BoxcarsAI/boxcars/pull/69) ([francis](https://github.com/francis))
|
14
|
+
|
3
15
|
## [v0.2.10](https://github.com/BoxcarsAI/boxcars/tree/v0.2.10) (2023-05-05)
|
4
16
|
|
5
17
|
[Full Changelog](https://github.com/BoxcarsAI/boxcars/compare/v0.2.9...v0.2.10)
|
data/Gemfile
CHANGED
@@ -7,18 +7,6 @@ gemspec
|
|
7
7
|
|
8
8
|
gem "rake", "~> 13.0"
|
9
9
|
|
10
|
-
gem "rspec", "~> 3.2"
|
11
|
-
|
12
|
-
gem "rubocop", "~> 1.21"
|
13
|
-
|
14
|
-
gem "vcr", "~> 6.1.0"
|
15
|
-
|
16
|
-
gem "webmock", "~> 3.18.1"
|
17
|
-
|
18
|
-
gem "rubocop-rake", "~> 0.6.0"
|
19
|
-
|
20
|
-
gem "rubocop-rspec", "~> 2.17"
|
21
|
-
|
22
10
|
gem "sqlite3", "~> 1.6"
|
23
11
|
|
24
12
|
gem "activerecord", "~> 7.0"
|
@@ -32,3 +20,15 @@ gem "activesupport", "~> 7.0"
|
|
32
20
|
gem "rest-client", "~> 2.1"
|
33
21
|
|
34
22
|
gem "hnswlib", "~> 0.8.1"
|
23
|
+
|
24
|
+
gem "pg", "~> 1.5", ">= 1.5.3"
|
25
|
+
gem "pgvector", "~> 0.2.0"
|
26
|
+
|
27
|
+
group :development, :test do
|
28
|
+
gem "rspec", "~> 3.2"
|
29
|
+
gem "rubocop", "~> 1.21"
|
30
|
+
gem "vcr", "~> 6.1.0"
|
31
|
+
gem "webmock", "~> 3.18.1"
|
32
|
+
gem "rubocop-rake", "~> 0.6.0"
|
33
|
+
gem "rubocop-rspec", "~> 2.17"
|
34
|
+
end
|
data/Gemfile.lock
CHANGED
@@ -1,10 +1,10 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
boxcars (0.2.
|
4
|
+
boxcars (0.2.12)
|
5
5
|
google_search_results (~> 2.2)
|
6
6
|
gpt4all (~> 0.0.4)
|
7
|
-
ruby-openai (~>
|
7
|
+
ruby-openai (~> 4.0)
|
8
8
|
|
9
9
|
GEM
|
10
10
|
remote: https://rubygems.org/
|
@@ -19,7 +19,7 @@ GEM
|
|
19
19
|
i18n (>= 1.6, < 2)
|
20
20
|
minitest (>= 5.1)
|
21
21
|
tzinfo (~> 2.0)
|
22
|
-
addressable (2.8.
|
22
|
+
addressable (2.8.4)
|
23
23
|
public_suffix (>= 2.0.2, < 6.0)
|
24
24
|
ast (2.4.2)
|
25
25
|
async (1.31.0)
|
@@ -34,7 +34,7 @@ GEM
|
|
34
34
|
protocol-http1 (~> 0.15.0)
|
35
35
|
protocol-http2 (~> 0.15.0)
|
36
36
|
traces (>= 0.8.0)
|
37
|
-
async-http-faraday (0.
|
37
|
+
async-http-faraday (0.12.0)
|
38
38
|
async-http (~> 0.42)
|
39
39
|
faraday
|
40
40
|
async-io (1.34.3)
|
@@ -46,7 +46,7 @@ GEM
|
|
46
46
|
fiber-local
|
47
47
|
crack (0.4.5)
|
48
48
|
rexml
|
49
|
-
debug (1.
|
49
|
+
debug (1.8.0)
|
50
50
|
irb (>= 1.5.0)
|
51
51
|
reline (>= 0.3.1)
|
52
52
|
diff-lcs (1.5.0)
|
@@ -56,8 +56,10 @@ GEM
|
|
56
56
|
faraday (2.7.4)
|
57
57
|
faraday-net_http (>= 2.0, < 3.1)
|
58
58
|
ruby2_keywords (>= 0.0.4)
|
59
|
-
faraday-http-cache (2.
|
59
|
+
faraday-http-cache (2.5.0)
|
60
60
|
faraday (>= 0.8)
|
61
|
+
faraday-multipart (1.0.4)
|
62
|
+
multipart-post (~> 2)
|
61
63
|
faraday-net_http (3.0.2)
|
62
64
|
faraday-retry (2.1.0)
|
63
65
|
faraday (~> 2.0)
|
@@ -81,35 +83,33 @@ GEM
|
|
81
83
|
http-accept (1.7.0)
|
82
84
|
http-cookie (1.0.5)
|
83
85
|
domain_name (~> 0.5)
|
84
|
-
|
85
|
-
mini_mime (>= 1.0.0)
|
86
|
-
multi_xml (>= 0.5.2)
|
87
|
-
i18n (1.12.0)
|
86
|
+
i18n (1.13.0)
|
88
87
|
concurrent-ruby (~> 1.0)
|
89
88
|
io-console (0.6.0)
|
90
89
|
io-console (0.6.0-java)
|
91
|
-
irb (1.6.
|
90
|
+
irb (1.6.4)
|
92
91
|
reline (>= 0.3.0)
|
93
92
|
json (2.6.3)
|
94
93
|
json (2.6.3-java)
|
95
94
|
mime-types (3.4.1)
|
96
95
|
mime-types-data (~> 3.2015)
|
97
96
|
mime-types-data (3.2023.0218.1)
|
98
|
-
|
99
|
-
mini_portile2 (2.8.1)
|
97
|
+
mini_portile2 (2.8.2)
|
100
98
|
minitest (5.18.0)
|
101
99
|
multi_json (1.15.0)
|
102
|
-
|
100
|
+
multipart-post (2.3.0)
|
103
101
|
netrc (0.11.0)
|
104
|
-
nio4r (2.5.
|
105
|
-
nio4r (2.5.
|
102
|
+
nio4r (2.5.9)
|
103
|
+
nio4r (2.5.9-java)
|
106
104
|
octokit (4.25.1)
|
107
105
|
faraday (>= 1, < 3)
|
108
106
|
sawyer (~> 0.9)
|
109
107
|
os (1.1.4)
|
110
|
-
parallel (1.
|
111
|
-
parser (3.2.
|
108
|
+
parallel (1.23.0)
|
109
|
+
parser (3.2.2.1)
|
112
110
|
ast (~> 2.4.1)
|
111
|
+
pg (1.5.3)
|
112
|
+
pgvector (0.2.0)
|
113
113
|
protocol-hpack (1.4.2)
|
114
114
|
protocol-http (0.24.1)
|
115
115
|
protocol-http1 (0.15.0)
|
@@ -120,7 +120,7 @@ GEM
|
|
120
120
|
public_suffix (5.0.1)
|
121
121
|
rainbow (3.1.1)
|
122
122
|
rake (13.0.6)
|
123
|
-
regexp_parser (2.
|
123
|
+
regexp_parser (2.8.0)
|
124
124
|
reline (0.3.3)
|
125
125
|
io-console (~> 0.5)
|
126
126
|
rest-client (2.1.0)
|
@@ -133,36 +133,40 @@ GEM
|
|
133
133
|
rspec-core (~> 3.12.0)
|
134
134
|
rspec-expectations (~> 3.12.0)
|
135
135
|
rspec-mocks (~> 3.12.0)
|
136
|
-
rspec-core (3.12.
|
136
|
+
rspec-core (3.12.2)
|
137
137
|
rspec-support (~> 3.12.0)
|
138
|
-
rspec-expectations (3.12.
|
138
|
+
rspec-expectations (3.12.3)
|
139
139
|
diff-lcs (>= 1.2.0, < 2.0)
|
140
140
|
rspec-support (~> 3.12.0)
|
141
141
|
rspec-mocks (3.12.5)
|
142
142
|
diff-lcs (>= 1.2.0, < 2.0)
|
143
143
|
rspec-support (~> 3.12.0)
|
144
144
|
rspec-support (3.12.0)
|
145
|
-
rubocop (1.
|
145
|
+
rubocop (1.50.2)
|
146
146
|
json (~> 2.3)
|
147
147
|
parallel (~> 1.10)
|
148
148
|
parser (>= 3.2.0.0)
|
149
149
|
rainbow (>= 2.2.2, < 4.0)
|
150
150
|
regexp_parser (>= 1.8, < 3.0)
|
151
151
|
rexml (>= 3.2.5, < 4.0)
|
152
|
-
rubocop-ast (>= 1.
|
152
|
+
rubocop-ast (>= 1.28.0, < 2.0)
|
153
153
|
ruby-progressbar (~> 1.7)
|
154
154
|
unicode-display_width (>= 2.4.0, < 3.0)
|
155
|
-
rubocop-ast (1.28.
|
155
|
+
rubocop-ast (1.28.1)
|
156
156
|
parser (>= 3.2.1.0)
|
157
|
-
rubocop-capybara (2.
|
157
|
+
rubocop-capybara (2.18.0)
|
158
158
|
rubocop (~> 1.41)
|
159
|
+
rubocop-factory_bot (2.22.0)
|
160
|
+
rubocop (~> 1.33)
|
159
161
|
rubocop-rake (0.6.0)
|
160
162
|
rubocop (~> 1.0)
|
161
|
-
rubocop-rspec (2.
|
163
|
+
rubocop-rspec (2.22.0)
|
162
164
|
rubocop (~> 1.33)
|
163
165
|
rubocop-capybara (~> 2.17)
|
164
|
-
|
165
|
-
|
166
|
+
rubocop-factory_bot (~> 2.22)
|
167
|
+
ruby-openai (4.0.0)
|
168
|
+
faraday (>= 1)
|
169
|
+
faraday-multipart (>= 1)
|
166
170
|
ruby-progressbar (1.13.0)
|
167
171
|
ruby2_keywords (0.0.5)
|
168
172
|
sawyer (0.9.2)
|
@@ -212,6 +216,8 @@ DEPENDENCIES
|
|
212
216
|
faraday-retry (~> 2.0)
|
213
217
|
github_changelog_generator (~> 1.16)
|
214
218
|
hnswlib (~> 0.8.1)
|
219
|
+
pg (~> 1.5, >= 1.5.3)
|
220
|
+
pgvector (~> 0.2.0)
|
215
221
|
rake (~> 13.0)
|
216
222
|
rest-client (~> 2.1)
|
217
223
|
rspec (~> 3.2)
|
data/README.md
CHANGED
@@ -21,6 +21,7 @@ All of these concepts are in a module named Boxcars:
|
|
21
21
|
- Train - Given a list of Boxcars and optionally an Engine, a Train breaks down a problem into pieces for individual Boxcars to solve. The individual results are then combined until a final answer is found. ZeroShot is the only current implementation of Train (but we are adding more soon), and you can either construct it directly or use `Boxcars::train` when you want to build a Train.
|
22
22
|
- Prompt - used by an Engine to generate text results. Our Boxcars have built-in prompts, but you have the flexibility to change or augment them if you so desire.
|
23
23
|
- Engine - an entity that generates text from a Prompt. OpenAI's LLM text generator is the default Engine if no other is specified, and you can override the default engine if so desired (`Boxcar.configuration.default_engine`).
|
24
|
+
- VectorStore - a place to store and query vectors.
|
24
25
|
|
25
26
|
## Security
|
26
27
|
Currently, our system is designed for individuals who already possess administrative privileges for their project. It is likely possible to manipulate the system's prompts to carry out malicious actions, but if you already have administrative access, you can perform such actions without requiring boxcars in the first place.
|
@@ -132,7 +133,9 @@ Next Actions:
|
|
132
133
|
### More Examples
|
133
134
|
See [this](https://github.com/BoxcarsAI/boxcars/blob/main/notebooks/boxcars_examples.ipynb) Jupyter Notebook for more examples.
|
134
135
|
|
135
|
-
For the
|
136
|
+
For the Swagger boxcar, see [this](https://github.com/BoxcarsAI/boxcars/blob/main/notebooks/swagger_examples.ipynb) Jupyter Notebook.
|
137
|
+
|
138
|
+
For simple vector storage and search, see [this](https://github.com/BoxcarsAI/boxcars/blob/main/notebooks/vector_store_examples.ipynb) Jupyter Notebook.
|
136
139
|
|
137
140
|
Note, some folks that we talked to didn't know that you could run Ruby Jupyter notebooks. [You can](https://github.com/SciRuby/iruby).
|
138
141
|
|
data/boxcars.gemspec
CHANGED
@@ -12,7 +12,7 @@ Gem::Specification.new do |spec|
|
|
12
12
|
spec.description = "You simply set an OpenAI key, give a number of Boxcars to a Train, and magic ensues when you run it."
|
13
13
|
spec.homepage = "https://github.com/BoxcarsAI/boxcars"
|
14
14
|
spec.license = "MIT"
|
15
|
-
spec.required_ruby_version = ">=
|
15
|
+
spec.required_ruby_version = ">= 3.0"
|
16
16
|
|
17
17
|
spec.metadata["homepage_uri"] = spec.homepage
|
18
18
|
spec.metadata["source_code_uri"] = spec.homepage
|
@@ -38,7 +38,7 @@ Gem::Specification.new do |spec|
|
|
38
38
|
# runtime dependencies
|
39
39
|
spec.add_dependency "google_search_results", "~> 2.2"
|
40
40
|
spec.add_dependency "gpt4all", "~> 0.0.4"
|
41
|
-
spec.add_dependency "ruby-openai", "~>
|
41
|
+
spec.add_dependency "ruby-openai", "~> 4.0"
|
42
42
|
|
43
43
|
# For more information and examples about making a new gem, checkout our
|
44
44
|
# guide at: https://bundler.io/guides/creating_gem.html
|
@@ -161,7 +161,7 @@ module Boxcars
|
|
161
161
|
begin
|
162
162
|
return true unless changes&.positive?
|
163
163
|
rescue StandardError => e
|
164
|
-
|
164
|
+
Boxcars.error "Error while computing change count: #{e.message}", :red
|
165
165
|
end
|
166
166
|
|
167
167
|
Boxcars.debug "#{name}(Pending Changes): #{changes}", :yellow
|
data/lib/boxcars/boxcar.rb
CHANGED
@@ -43,6 +43,10 @@ module Boxcars
|
|
43
43
|
::OpenAI::Client.new(access_token: access_token, organization_id: organization_id)
|
44
44
|
end
|
45
45
|
|
46
|
+
# Tells whether +model+ is one of the conversation-style models listed here.
# @param model [String] the model name to check.
# @return [Boolean] true only when the name exactly matches a listed model.
def conversation_model?(model)
  %w[gpt-3.5-turbo gpt-4].include?(model)
end
|
49
|
+
|
46
50
|
# Get an answer from the engine.
|
47
51
|
# @param prompt [String] The prompt to use when asking the engine.
|
48
52
|
# @param openai_access_token [String] The access token to use when asking the engine.
|
@@ -51,7 +55,7 @@ module Boxcars
|
|
51
55
|
def client(prompt:, inputs: {}, openai_access_token: nil, **kwargs)
|
52
56
|
clnt = Openai.open_ai_client(openai_access_token: openai_access_token)
|
53
57
|
params = open_ai_params.merge(kwargs)
|
54
|
-
if params[:model]
|
58
|
+
if conversation_model?(params[:model])
|
55
59
|
prompt = prompt.first if prompt.is_a?(Array)
|
56
60
|
params = prompt.as_messages(inputs).merge(params)
|
57
61
|
if Boxcars.configuration.log_prompts
|
@@ -71,6 +75,9 @@ module Boxcars
|
|
71
75
|
def run(question, **kwargs)
|
72
76
|
prompt = Prompt.new(template: question)
|
73
77
|
response = client(prompt: prompt, **kwargs)
|
78
|
+
raise Error, "OpenAI: No response from API" unless response
|
79
|
+
raise Error, "OpenAI: #{response['error']}" if response["error"]
|
80
|
+
|
74
81
|
answer = response["choices"].map { |c| c.dig("message", "content") || c["text"] }.join("\n").strip
|
75
82
|
puts answer
|
76
83
|
answer
|
@@ -3,8 +3,72 @@
|
|
3
3
|
# Boxcars is a framework for running a series of tools to get an answer to a question.
|
4
4
|
module Boxcars
|
5
5
|
# For Boxcars that use an engine to do their work.
|
6
|
-
class VectorSearch
|
7
|
-
|
6
|
+
# Runs a similarity search for a text query against a previously built vector store.
# The query is first converted to an embedding, then handed to the search class
# that matches the store's :type.
class VectorSearch
  # @param params [Hash] search configuration.
  # @option params [Hash] :vector_documents the store to search; its :type selects
  #   the backend (:hnswlib, :in_memory or :pgvector).
  # @option params [Symbol] :embedding_tool :openai (default) or :tensorflow.
  # @option params [Object] :openai_connection client passed to the embedding class.
  # @option params [String] :openai_access_token token used to build a default client
  #   when no :openai_connection is given.
  # @raise [Boxcars::ArgumentError] when the vector store type is missing or unsupported.
  def initialize(params)
    @vector_documents = params[:vector_documents]
    @embedding_tool = params[:embedding_tool] || :openai
    @vector_search_instance = vector_search_instance
    # The token must come from params; there is no openai_access_token method or local here.
    @openai_connection = params[:openai_connection] ||
                         default_connection(openai_access_token: params[:openai_access_token])
  end

  # Embed the query and search the configured vector store.
  # @param query [String] the text to search for; must be a non-empty String.
  # @param count [Integer] number of results requested from the backend.
  # @return whatever the backend search object's #call returns.
  # @raise [Boxcars::ArgumentError] for an invalid query or unsupported embedding tool.
  def call(query:, count: 1)
    validate_query(query)
    query_vector = convert_query_to_vector(query)
    @vector_search_instance.call(query_vector: query_vector, count: count)
  end

  private

  attr_reader :vector_documents, :embedding_tool, :openai_connection

  # Build the backend search object that matches the store type.
  def vector_search_instance
    # A missing store falls through to the "unsupported" error instead of a NoMethodError.
    store_type = vector_documents && vector_documents[:type]

    case store_type
    when :hnswlib
      Boxcars::VectorStore::Hnswlib::Search.new(vector_documents: vector_documents)
    when :in_memory
      Boxcars::VectorStore::InMemory::Search.new(vector_documents: vector_documents)
    when :pgvector
      Boxcars::VectorStore::Pgvector::Search.new(vector_documents: vector_documents)
    else
      raise_argument_error('Unsupported vector store provided')
    end
  end

  # Create an OpenAI client through the Openai engine helper.
  def default_connection(openai_access_token: nil)
    Openai.open_ai_client(openai_access_token: openai_access_token)
  end

  def validate_query(query)
    raise_argument_error('query must be a string') unless query.is_a?(String)
    raise_argument_error('query must not be empty') if query.empty?
  end

  # Ask the embedding class for the query's vector and return its :embedding entry.
  def convert_query_to_vector(query)
    tool = embeddings_method(embedding_tool)
    res = tool[:klass].call(texts: [query], client: tool[:client]).first
    res[:embedding]
  end

  # Map the embedding tool name to the class that computes embeddings and its client.
  def embeddings_method(embedding_tool)
    case embedding_tool
    when :openai
      { klass: Boxcars::VectorStore::EmbedViaOpenAI, client: openai_connection }
    when :tensorflow
      { klass: Boxcars::VectorStore::EmbedViaTensorflow, client: nil }
    else
      # Without this branch an unknown tool surfaced later as NoMethodError on nil.
      raise_argument_error("Unsupported embedding tool provided: #{embedding_tool.inspect}")
    end
  end

  def raise_argument_error(message)
    raise ::Boxcars::ArgumentError, message
  end
end
|
9
73
|
end
|
10
74
|
|
@@ -3,10 +3,11 @@
|
|
3
3
|
module Boxcars
|
4
4
|
module VectorStore
|
5
5
|
# A piece of text held in a vector store, together with its embedding and metadata.
class Document
  attr_accessor :content, :metadata, :embedding

  # @param fields [Hash] optional :content, :embedding and :metadata values;
  #   missing (or nil) entries default to "", [] and {} respectively.
  def initialize(fields = {})
    self.content = fields[:content] || ""
    self.embedding = fields[:embedding] || []
    self.metadata = fields[:metadata] || {}
  end
end
|
@@ -7,8 +7,6 @@ module Boxcars
|
|
7
7
|
class EmbedViaOpenAI
|
8
8
|
include VectorStore
|
9
9
|
|
10
|
-
attr_accessor :texts, :client, :model
|
11
|
-
|
12
10
|
def initialize(texts:, client:, model: 'text-embedding-ada-002')
|
13
11
|
validate_params(texts, client)
|
14
12
|
@texts = texts
|
@@ -28,6 +26,8 @@ module Boxcars
|
|
28
26
|
|
29
27
|
private
|
30
28
|
|
29
|
+
attr_accessor :texts, :client, :model
|
30
|
+
|
31
31
|
def validate_params(texts, client)
|
32
32
|
raise_error 'texts must be an array of strings' unless texts.is_a?(Array) && texts.all? { |text| text.is_a?(String) }
|
33
33
|
raise_error 'openai_connection must be an OpenAI::Client' unless client.is_a?(OpenAI::Client)
|
@@ -0,0 +1,100 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'fileutils'
|
4
|
+
require 'hnswlib'
|
5
|
+
require 'json'
|
6
|
+
|
7
|
+
module Boxcars
|
8
|
+
module VectorStore
|
9
|
+
module Hnswlib
|
10
|
+
# Builds the vector store used by the hnswlib similarity search.
# It loads the training data, splits it into chunks, generates the embeddings,
# collects them as Documents and saves the vector store through SaveToHnswlib.
# When a rebuild is not forced and the index file already exists, the existing
# store is loaded from disk instead.
class BuildFromFiles
  include VectorStore

  # @param params [Hash] build configuration.
  # @option params [String] :training_data_path glob pattern of the files to index.
  # @option params [String] :index_file_path where the hnswlib index file lives.
  # @option params [String] :json_doc_file_path where the documents JSON lives;
  #   defaults to the index path with a trailing ".bin" replaced by ".json".
  # @option params [Integer] :split_chunk_size chunk size for splitting (default 2000).
  # @option params [Boolean] :force_rebuild rebuild even if the index file exists (default true).
  # @raise [Boxcars::ArgumentError] presumably, via raise_argument_error (defined outside
  #   this class), when a path or the chunk size is invalid.
  def initialize(params)
    @split_chunk_size = params[:split_chunk_size] || 2000
    @training_data_path = File.absolute_path(params[:training_data_path])
    @index_file_path = File.absolute_path(params[:index_file_path])

    validate_params(@training_data_path, @index_file_path, split_chunk_size)

    @json_doc_file_path = absolute_json_doc_file_path(@index_file_path, params[:json_doc_file_path])
    # `params[:force_rebuild] || true` is always true, which made `force_rebuild: false`
    # impossible; only fall back to true when the caller did not supply a value.
    @force_rebuild = params[:force_rebuild].nil? ? true : params[:force_rebuild]
    @hnsw_vectors = []
  end

  # Build (or load) the vector store.
  # @return [Hash] { type: :hnswlib, vector_store: [Document, ...] } after a build;
  #   otherwise whatever LoadFromDisk.call returns.
  def call
    return load_existing_vector_store if !force_rebuild && File.exist?(index_file_path)

    puts "Building Hnswlib vector store..."
    data = load_data_files(training_data_path)
    texts = split_text_into_chunks(data)
    vectors = generate_vectors(texts)
    add_vectors(vectors, texts)
    save_vector_store

    {
      type: :hnswlib,
      vector_store: hnsw_vectors
    }
  end

  private

  attr_reader :training_data_path, :index_file_path, :split_chunk_size, :json_doc_file_path, :force_rebuild, :hnsw_vectors

  def validate_params(training_data_path, index_file_path, split_chunk_size)
    # Strip glob stars so the directory part of the pattern can be checked.
    training_data_dir = File.dirname(training_data_path.gsub(/\*{1,2}/, ''))

    raise_argument_error('training_data_path parent directory must exist') unless File.directory?(training_data_dir)
    raise_argument_error('No files found at the training_data_path pattern') if Dir.glob(training_data_path).empty?

    index_dir = File.dirname(index_file_path)

    raise_argument_error('index_file_path parent directory must exist') unless File.directory?(index_dir)
    raise_argument_error('split_chunk_size must be an integer') unless split_chunk_size.is_a?(Integer)
  end

  # Derive the JSON document path from the index path unless one was given.
  def absolute_json_doc_file_path(index_file_path, json_doc_file_path)
    # \z anchors at the very end of the string; only that one suffix is replaced.
    return index_file_path.sub(/\.bin\z/, '.json') unless json_doc_file_path

    File.absolute_path(json_doc_file_path)
  end

  # Wrap every generated vector and its source text in a Document and collect it.
  def add_vectors(vectors, texts)
    vectors.each_with_index do |vector, index|
      hnsw_vectors << Document.new(
        content: texts[index],
        embedding: vector[:embedding],
        metadata: {
          doc_id: index,
          dim: vector[:dim],
          metric: 'l2',
          max_item: 10_000,
          index_file_path: index_file_path,
          json_doc_file_path: json_doc_file_path
        }
      )
    end
  end

  def save_vector_store
    Boxcars::VectorStore::Hnswlib::SaveToHnswlib.call(hnsw_vectors)
  end

  def load_existing_vector_store
    Boxcars::VectorStore::Hnswlib::LoadFromDisk.call(
      index_file_path: index_file_path,
      json_doc_file_path: json_doc_file_path
    )
  end
end
|
98
|
+
end
|
99
|
+
end
|
100
|
+
end
|
@@ -0,0 +1,57 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'fileutils'
|
4
|
+
require 'hnswlib'
|
5
|
+
require 'json'
|
6
|
+
|
7
|
+
module Boxcars
|
8
|
+
module VectorStore
|
9
|
+
module Hnswlib
|
10
|
+
# Loads a previously saved hnswlib vector store: reads the documents JSON file
# and rebuilds the list of Documents.
class LoadFromDisk
  include VectorStore

  # @param params [Hash] load configuration.
  # @option params [String] :index_file_path path of the existing index file.
  # @option params [String] :json_doc_file_path path of the existing documents JSON file.
  # @raise [Boxcars::ArgumentError] presumably, via raise_argument_error (defined outside
  #   this class), when a path is not a String or does not exist.
  def initialize(params)
    validate_params(params[:index_file_path], params[:json_doc_file_path])

    @index_file_path = File.absolute_path(params[:index_file_path])
    @json_doc_file_path = File.absolute_path(params[:json_doc_file_path])
  end

  # @return [Hash] { type: :hnswlib, vector_store: [Document, ...] }
  def call
    vectors = parse_json_file(json_doc_file_path)
    hnsw_vectors = load_as_hnsw_vectors(vectors)

    {
      type: :hnswlib,
      vector_store: hnsw_vectors
    }
  end

  private

  attr_reader :index_file_path, :json_doc_file_path

  def validate_params(index_file_path, json_doc_file_path)
    raise_argument_error("index_file_path must be a string") unless index_file_path.is_a?(String)
    raise_argument_error("json_doc_file_path must be a string") unless json_doc_file_path.is_a?(String)

    raise_argument_error("index_file_path must exist") unless File.exist?(index_file_path)
    raise_argument_error("json_doc_file_path must exist") unless File.exist?(json_doc_file_path)
  end

  # Turn the parsed JSON entries into Documents, each stored at the array position
  # given by its own doc_id.
  # NOTE(review): assumes parse_json_file yields hashes with symbol keys
  # (:doc_id, :document, :embedding, :metadata) - confirm against the writer.
  def load_as_hnsw_vectors(vectors)
    vectors.each_with_object([]) do |vector, hnsw_vectors|
      # Use the current entry's doc_id; indexing by vectors.first made every
      # document overwrite the same slot.
      hnsw_vectors[vector[:doc_id].to_i] = Document.new(
        content: vector[:document],
        embedding: vector[:embedding],
        metadata: vector[:metadata]
      )
    end
  end
end
|
55
|
+
end
|
56
|
+
end
|
57
|
+
end
|