boxcars 0.2.12 → 0.2.14
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +30 -0
- data/Gemfile +4 -0
- data/Gemfile.lock +8 -13
- data/README.md +7 -4
- data/boxcars.gemspec +3 -6
- data/lib/boxcars/boxcar/active_record.rb +1 -1
- data/lib/boxcars/boxcar/engine_boxcar.rb +2 -2
- data/lib/boxcars/boxcar/sql.rb +1 -1
- data/lib/boxcars/boxcar/swagger.rb +1 -1
- data/lib/boxcars/boxcar/vector_answer.rb +71 -0
- data/lib/boxcars/boxcar.rb +1 -0
- data/lib/boxcars/train/zero_shot.rb +1 -1
- data/lib/boxcars/train.rb +1 -1
- data/lib/boxcars/vector_search.rb +29 -1
- data/lib/boxcars/vector_store/hnswlib/build_from_files.rb +24 -16
- data/lib/boxcars/vector_store/hnswlib/load_from_disk.rb +37 -11
- data/lib/boxcars/vector_store/hnswlib/search.rb +49 -13
- data/lib/boxcars/vector_store/in_memory/build_from_array.rb +75 -0
- data/lib/boxcars/vector_store/in_memory/build_from_files.rb +7 -0
- data/lib/boxcars/vector_store/in_memory/search.rb +4 -0
- data/lib/boxcars/vector_store/pgvector/build_from_array.rb +28 -15
- data/lib/boxcars/vector_store/pgvector/build_from_files.rb +11 -10
- data/lib/boxcars/vector_store/pgvector/save_to_database.rb +9 -11
- data/lib/boxcars/vector_store/pgvector/search.rb +24 -6
- data/lib/boxcars/vector_store.rb +7 -2
- data/lib/boxcars/version.rb +1 -1
- metadata +21 -34
- data/lib/boxcars/vector_store/in_memory/build_from_document_array.rb +0 -51
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 1039f86c58712c10143cc26438da14571013081bcff08aed3d9fcac4b1e84060
|
4
|
+
data.tar.gz: fca9f08855cae8e4e8a4171c043e92884e245582b049b4cc75bfa4d2cd98e51a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 5f24a578f4004d99a0d71c05f4a0ed520dda2e562d1327ec68bdf8682a927b741e870ed348a1a273319003f515521ef8a12c8778b70994764a945e31f906f807
|
7
|
+
data.tar.gz: be067a2ba1ba2e032a58d32f46071f5510a5ce0f15fbc89a06fe84f4fd854ec825a0ea0c786a90ba87c20bb1893ff59512170ac2e338080e25f85b2474b6fb64
|
data/CHANGELOG.md
CHANGED
@@ -1,5 +1,35 @@
|
|
1
1
|
# Changelog
|
2
2
|
|
3
|
+
## [v0.2.13](https://github.com/BoxcarsAI/boxcars/tree/v0.2.13) (2023-05-24)
|
4
|
+
|
5
|
+
[Full Changelog](https://github.com/BoxcarsAI/boxcars/compare/v0.2.12...v0.2.13)
|
6
|
+
|
7
|
+
**Closed issues:**
|
8
|
+
|
9
|
+
- Typo "Boscar.error" should be "Boxcars.error" [\#82](https://github.com/BoxcarsAI/boxcars/issues/82)
|
10
|
+
|
11
|
+
**Merged pull requests:**
|
12
|
+
|
13
|
+
- Add vector answer boxcar [\#79](https://github.com/BoxcarsAI/boxcars/pull/79) ([francis](https://github.com/francis))
|
14
|
+
|
15
|
+
## [v0.2.12](https://github.com/BoxcarsAI/boxcars/tree/v0.2.12) (2023-05-22)
|
16
|
+
|
17
|
+
[Full Changelog](https://github.com/BoxcarsAI/boxcars/compare/v0.2.11...v0.2.12)
|
18
|
+
|
19
|
+
**Closed issues:**
|
20
|
+
|
21
|
+
- GPT-4 support? [\#71](https://github.com/BoxcarsAI/boxcars/issues/71)
|
22
|
+
- add PgVector Vector Store [\#68](https://github.com/BoxcarsAI/boxcars/issues/68)
|
23
|
+
|
24
|
+
**Merged pull requests:**
|
25
|
+
|
26
|
+
- issue\_82 typo "Boscar" instead of "Boxcars" [\#83](https://github.com/BoxcarsAI/boxcars/pull/83) ([MadBomber](https://github.com/MadBomber))
|
27
|
+
- Update boxcars.rb config example [\#81](https://github.com/BoxcarsAI/boxcars/pull/81) ([nhorton](https://github.com/nhorton))
|
28
|
+
- Feature- added pgvector vector store [\#80](https://github.com/BoxcarsAI/boxcars/pull/80) ([jaigouk](https://github.com/jaigouk))
|
29
|
+
- drop support for pre ruby 3 version [\#75](https://github.com/BoxcarsAI/boxcars/pull/75) ([francis](https://github.com/francis))
|
30
|
+
- Chore - refine VectorSearch [\#74](https://github.com/BoxcarsAI/boxcars/pull/74) ([jaigouk](https://github.com/jaigouk))
|
31
|
+
- raise error if OpenAI API returns error or nil. closes \#71 [\#72](https://github.com/BoxcarsAI/boxcars/pull/72) ([francis](https://github.com/francis))
|
32
|
+
|
3
33
|
## [v0.2.11](https://github.com/BoxcarsAI/boxcars/tree/v0.2.11) (2023-05-05)
|
4
34
|
|
5
35
|
[Full Changelog](https://github.com/BoxcarsAI/boxcars/compare/v0.2.10...v0.2.11)
|
data/Gemfile
CHANGED
data/Gemfile.lock
CHANGED
@@ -1,10 +1,12 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
boxcars (0.2.12)
|
4
|
+
boxcars (0.2.14)
|
5
5
|
google_search_results (~> 2.2)
|
6
6
|
gpt4all (~> 0.0.4)
|
7
|
-
|
7
|
+
hnswlib (~> 0.8)
|
8
|
+
pgvector (~> 0.2)
|
9
|
+
ruby-openai (~> 4.1)
|
8
10
|
|
9
11
|
GEM
|
10
12
|
remote: https://rubygems.org/
|
@@ -86,11 +88,9 @@ GEM
|
|
86
88
|
i18n (1.13.0)
|
87
89
|
concurrent-ruby (~> 1.0)
|
88
90
|
io-console (0.6.0)
|
89
|
-
io-console (0.6.0-java)
|
90
91
|
irb (1.6.4)
|
91
92
|
reline (>= 0.3.0)
|
92
93
|
json (2.6.3)
|
93
|
-
json (2.6.3-java)
|
94
94
|
mime-types (3.4.1)
|
95
95
|
mime-types-data (~> 3.2015)
|
96
96
|
mime-types-data (3.2023.0218.1)
|
@@ -100,7 +100,6 @@ GEM
|
|
100
100
|
multipart-post (2.3.0)
|
101
101
|
netrc (0.11.0)
|
102
102
|
nio4r (2.5.9)
|
103
|
-
nio4r (2.5.9-java)
|
104
103
|
octokit (4.25.1)
|
105
104
|
faraday (>= 1, < 3)
|
106
105
|
sawyer (~> 0.9)
|
@@ -142,7 +141,7 @@ GEM
|
|
142
141
|
diff-lcs (>= 1.2.0, < 2.0)
|
143
142
|
rspec-support (~> 3.12.0)
|
144
143
|
rspec-support (3.12.0)
|
145
|
-
rubocop (1.
|
144
|
+
rubocop (1.51.0)
|
146
145
|
json (~> 2.3)
|
147
146
|
parallel (~> 1.10)
|
148
147
|
parser (>= 3.2.0.0)
|
@@ -156,7 +155,7 @@ GEM
|
|
156
155
|
parser (>= 3.2.1.0)
|
157
156
|
rubocop-capybara (2.18.0)
|
158
157
|
rubocop (~> 1.41)
|
159
|
-
rubocop-factory_bot (2.
|
158
|
+
rubocop-factory_bot (2.23.1)
|
160
159
|
rubocop (~> 1.33)
|
161
160
|
rubocop-rake (0.6.0)
|
162
161
|
rubocop (~> 1.0)
|
@@ -164,7 +163,7 @@ GEM
|
|
164
163
|
rubocop (~> 1.33)
|
165
164
|
rubocop-capybara (~> 2.17)
|
166
165
|
rubocop-factory_bot (~> 2.22)
|
167
|
-
ruby-openai (4.
|
166
|
+
ruby-openai (4.1.0)
|
168
167
|
faraday (>= 1)
|
169
168
|
faraday-multipart (>= 1)
|
170
169
|
ruby-progressbar (1.13.0)
|
@@ -172,11 +171,8 @@ GEM
|
|
172
171
|
sawyer (0.9.2)
|
173
172
|
addressable (>= 2.3.5)
|
174
173
|
faraday (>= 0.17.3, < 3)
|
175
|
-
sqlite3 (1.6.
|
174
|
+
sqlite3 (1.6.3)
|
176
175
|
mini_portile2 (~> 2.8.0)
|
177
|
-
sqlite3 (1.6.2-arm64-darwin)
|
178
|
-
sqlite3 (1.6.2-x86_64-darwin)
|
179
|
-
sqlite3 (1.6.2-x86_64-linux)
|
180
176
|
strings-ansi (0.2.0)
|
181
177
|
timers (4.3.5)
|
182
178
|
traces (0.9.1)
|
@@ -191,7 +187,6 @@ GEM
|
|
191
187
|
concurrent-ruby (~> 1.0)
|
192
188
|
unf (0.1.4)
|
193
189
|
unf_ext
|
194
|
-
unf (0.1.4-java)
|
195
190
|
unf_ext (0.0.8.2)
|
196
191
|
unicode-display_width (2.4.2)
|
197
192
|
vcr (6.1.0)
|
data/README.md
CHANGED
@@ -3,14 +3,14 @@
|
|
3
3
|
<h4 align="center">
|
4
4
|
<a href="https://www.boxcars.ai">Website</a> |
|
5
5
|
<a href="https://www.boxcars.ai/blog">Blog</a> |
|
6
|
-
<a href="https://github.com/BoxcarsAI/boxcars/wiki">Documentation</a>
|
6
|
+
<a href="https://github.com/BoxcarsAI/boxcars/wiki">Documentation</a>
|
7
7
|
</h4>
|
8
8
|
|
9
9
|
<p align="center">
|
10
10
|
<a href="https://github.com/BoxcarsAI/boxcars/blob/main/LICENSE.txt"><img src="https://img.shields.io/badge/license-MIT-informational" alt="License"></a>
|
11
11
|
</p>
|
12
12
|
|
13
|
-
Boxcars is a gem that enables you to create new systems with AI composability, using various concepts such as OpenAI, Search, SQL, Rails Active Record and more. This can even be extended with your concepts as well (including your concepts).
|
13
|
+
Boxcars is a gem that enables you to create new systems with AI composability, using various concepts such as OpenAI, Search, SQL, Rails Active Record, Vector Search and more. This can even be extended with your own concepts as well.
|
14
14
|
|
15
15
|
This gem was inspired by the popular Python library Langchain. However, we wanted to give it a Ruby spin and make it more user-friendly for beginners to get started.
|
16
16
|
|
@@ -57,6 +57,9 @@ require "boxcars"
|
|
57
57
|
Note: if you want to try out the examples below, run this command and then paste in the code segments of interest:
|
58
58
|
```bash
|
59
59
|
irb -r dotenv/load -r boxcars
|
60
|
+
|
61
|
+
# or, if you prefer to load the gem from the local repository
|
62
|
+
irb -r dotenv/load -r ./lib/boxcars
|
60
63
|
```
|
61
64
|
|
62
65
|
### Direct Boxcar Use
|
@@ -107,7 +110,7 @@ Produces:
|
|
107
110
|
```text
|
108
111
|
> Entering Zero Shot#run
|
109
112
|
What is pi times the square root of the average temperature in Austin TX in January?
|
110
|
-
Thought: We need to find the average temperature in Austin TX in January and then multiply it by pi and the square root of the average temperature. We can use a search engine to find the average temperature in Austin TX in January and a calculator to perform the multiplication.
|
113
|
+
Thought: We need to find the average temperature in Austin TX in January and then multiply it by pi and the square root of the average temperature. We can use a search engine to find the average temperature in Austin TX in January and a calculator to perform the multiplication.
|
111
114
|
Question: Average temperature in Austin TX in January
|
112
115
|
Answer: January Weather in Austin Texas, United States. Daily high temperatures increase by 2°F, from 62°F to 64°F, rarely falling below 45°F or exceeding 76° ...
|
113
116
|
Observation: January Weather in Austin Texas, United States. Daily high temperatures increase by 2°F, from 62°F to 64°F, rarely falling below 45°F or exceeding 76° ...
|
@@ -135,7 +138,7 @@ See [this](https://github.com/BoxcarsAI/boxcars/blob/main/notebooks/boxcars_exam
|
|
135
138
|
|
136
139
|
For the Swagger boxcar, see [this](https://github.com/BoxcarsAI/boxcars/blob/main/notebooks/swagger_examples.ipynb) Jupyter Notebook.
|
137
140
|
|
138
|
-
For simple vector storage and search, see [this](https://github.com/BoxcarsAI/boxcars/blob/main/notebooks/
|
141
|
+
For simple vector storage and search, see [this](https://github.com/BoxcarsAI/boxcars/blob/main/notebooks/vector_search_examples.ipynb) Jupyter Notebook.
|
139
142
|
|
140
143
|
Note, some folks that we talked to didn't know that you could run Ruby Jupyter notebooks. [You can](https://github.com/SciRuby/iruby).
|
141
144
|
|
data/boxcars.gemspec
CHANGED
@@ -30,15 +30,12 @@ Gem::Specification.new do |spec|
|
|
30
30
|
spec.executables = spec.files.grep(%r{\Aexe/}) { |f| File.basename(f) }
|
31
31
|
spec.require_paths = ["lib"]
|
32
32
|
|
33
|
-
# dev / test dependencies
|
34
|
-
spec.add_development_dependency "debug", "~> 1.1"
|
35
|
-
spec.add_development_dependency "dotenv", "~> 2.8"
|
36
|
-
spec.add_development_dependency "rspec", "~> 3.2"
|
37
|
-
|
38
33
|
# runtime dependencies
|
39
34
|
spec.add_dependency "google_search_results", "~> 2.2"
|
40
35
|
spec.add_dependency "gpt4all", "~> 0.0.4"
|
41
|
-
spec.add_dependency "
|
36
|
+
spec.add_dependency "hnswlib", "~> 0.8"
|
37
|
+
spec.add_dependency "pgvector", "~> 0.2"
|
38
|
+
spec.add_dependency "ruby-openai", "~> 4.1"
|
42
39
|
|
43
40
|
# For more information and examples about making a new gem, checkout our
|
44
41
|
# guide at: https://bundler.io/guides/creating_gem.html
|
@@ -114,14 +114,14 @@ module Boxcars
|
|
114
114
|
end
|
115
115
|
|
116
116
|
# @return Hash The additional variables for this boxcar.
|
117
|
-
def prediction_additional
|
117
|
+
def prediction_additional(_inputs)
|
118
118
|
{ stop: stop, top_k: top_k }
|
119
119
|
end
|
120
120
|
|
121
121
|
# @param inputs [Hash] The inputs to the boxcar.
|
122
122
|
# @return Hash The variables for this boxcar.
|
123
123
|
def prediction_variables(inputs)
|
124
|
-
prediction_additional.merge(inputs)
|
124
|
+
prediction_additional(inputs).merge(inputs)
|
125
125
|
end
|
126
126
|
|
127
127
|
# remove backticks or triple backticks from the code
|
data/lib/boxcars/boxcar/sql.rb
CHANGED
@@ -0,0 +1,71 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# Boxcars is a framework for running a series of tools to get an answer to a question.
|
4
|
+
module Boxcars
  # A Boxcar that answers a question using the content returned by a vector search.
  class VectorAnswer < EngineBoxcar
    # the description of this engine boxcar
    DESC = "useful for when you need to answer questions from vector search results."

    attr_reader :embeddings, :vector_documents, :search_content

    # @param embeddings [Hash] The vector embeddings to use for this boxcar.
    # @param vector_documents [Hash] The vector documents to use for this boxcar.
    # @param engine [Boxcars::Engine] The engine to use for this boxcar. Can be inherited from a train if nil.
    # @param prompt [Boxcars::Prompt] The prompt to use for this boxcar. Defaults to built-in prompt.
    # @param kwargs [Hash] Any other keyword arguments to pass to the parent class.
    def initialize(embeddings:, vector_documents:, engine: nil, prompt: nil, **kwargs)
      the_prompt = prompt || my_prompt
      @embeddings = embeddings
      @vector_documents = vector_documents
      kwargs[:stop] ||= ["```output"]
      kwargs[:name] ||= "VectorAnswer"
      kwargs[:description] ||= DESC
      super(engine: engine, prompt: the_prompt, **kwargs)
    end

    # Adds the vector search content for the question to the parent's additional variables.
    # @param inputs [Hash] The inputs to use for the prediction; :question is the search query.
    # @return [Hash] The additional variables for this boxcar.
    def prediction_additional(inputs)
      { search_content: get_search_content(inputs[:question]) }.merge super
    end

    private

    # @param results [Array, nil] The results from the vector search.
    # @return [String] The content of each result's document, separated by blank lines
    #   (an empty string when there are no results).
    def get_results_content(results)
      Array(results).map { |result| result[:document].content }.join("\n\n")
    end

    # Run a vector search for the question and remember the joined content in @search_content.
    # @param question [String] The question to search for.
    # @param count [Integer] The number of results to return.
    # @return [String] The content of the search results.
    def get_search_content(question, count: 1)
      search = Boxcars::VectorSearch.new(embeddings: embeddings, vector_documents: vector_documents)
      results = search.call query: question, count: count
      @search_content = get_results_content(results)
    end

    # our template
    CTEMPLATE = [
      syst("You are tasked with answering a question using these possibly relevant excerpts from a large volume of text:\n" \
           "```text\n%<search_content>s\n```\n\n",
           "Using the above, just answer the question as if you were answering directly."),
      user("%<question>s")
    ].freeze

    # The prompt to use for the engine (built once and memoized).
    def my_prompt
      @conversation ||= Conversation.new(lines: CTEMPLATE)
      @my_prompt ||= ConversationPrompt.new(
        conversation: @conversation,
        input_variables: [:question],
        other_inputs: [:search_content],
        output_variables: [:answer])
    end
  end
end
|
data/lib/boxcars/boxcar.rb
CHANGED
data/lib/boxcars/train.rb
CHANGED
@@ -69,7 +69,7 @@ module Boxcars
|
|
69
69
|
# @return [Boxcars::Action] Action specifying what boxcar to use.
|
70
70
|
def plan(intermediate_steps, **kwargs)
|
71
71
|
thoughts = construct_scratchpad(intermediate_steps)
|
72
|
-
full_inputs = prediction_additional.merge(kwargs).merge(agent_scratchpad: thoughts)
|
72
|
+
full_inputs = prediction_additional(kwargs).merge(kwargs).merge(agent_scratchpad: thoughts)
|
73
73
|
action = get_next_action(full_inputs)
|
74
74
|
return TrainFinish.new({ output: action.boxcar_input }, log: action.log) if action.boxcar == finish_boxcar_name
|
75
75
|
|
@@ -4,13 +4,41 @@
|
|
4
4
|
module Boxcars
|
5
5
|
# Converts a text query to a vector and searches the configured vector documents for the closest matches.
|
6
6
|
class VectorSearch
|
7
|
+
# initialize the vector search with the following parameters:
|
8
|
+
# @param params [Hash] A Hash containing the initial configuration.
|
9
|
+
# @option params [Hash] :vector_documents The vector documents to search.
|
10
|
+
# example:
|
11
|
+
# {
|
12
|
+
# type: :in_memory,
|
13
|
+
# vector_store: [
|
14
|
+
# Boxcars::VectorStore::Document.new(
|
15
|
+
# content: "hello",
|
16
|
+
# embedding: [0.1, 0.2, 0.3],
|
17
|
+
# metadata: { a: 1 }
|
18
|
+
# )
|
19
|
+
# ]
|
20
|
+
# }
|
7
21
|
def initialize(params)
|
8
22
|
@vector_documents = params[:vector_documents]
|
9
23
|
@embedding_tool = params[:embedding_tool] || :openai
|
10
24
|
@vector_search_instance = vector_search_instance
|
11
|
-
@openai_connection = params[:openai_connection] || default_connection(openai_access_token: openai_access_token)
|
25
|
+
@openai_connection = params[:openai_connection] || default_connection(openai_access_token: params[:openai_access_token])
|
12
26
|
end
|
13
27
|
|
28
|
+
# @param query [String] The query to search for.
|
29
|
+
# @param count [Integer] The number of results to return.
|
30
|
+
# @return [Array] array of hashes with :document and :distance keys
|
31
|
+
# @example
|
32
|
+
# [
|
33
|
+
# {
|
34
|
+
# document: Boxcars::VectorStore::Document.new(
|
35
|
+
# content: "hello",
|
36
|
+
# embedding: [0.1, 0.2, 0.3],
|
37
|
+
# metadata: { a: 1 }
|
38
|
+
# ),
|
39
|
+
# distance: 0.1
|
40
|
+
# }
|
41
|
+
# ]
|
14
42
|
def call(query:, count: 1)
|
15
43
|
validate_query(query)
|
16
44
|
query_vector = convert_query_to_vector(query)
|
@@ -16,13 +16,10 @@ module Boxcars
|
|
16
16
|
|
17
17
|
def initialize(params)
|
18
18
|
@split_chunk_size = params[:split_chunk_size] || 2000
|
19
|
-
@
|
20
|
-
|
19
|
+
@base_dir_path, @index_file_path, @json_doc_file_path =
|
20
|
+
validate_params(params[:training_data_path], params[:index_file_path], split_chunk_size)
|
21
21
|
|
22
|
-
|
23
|
-
|
24
|
-
@json_doc_file_path = absolute_json_doc_file_path(@index_file_path, params[:json_doc_file_path])
|
25
|
-
@force_rebuild = params[:force_rebuild] || true
|
22
|
+
@force_rebuild = params[:force_rebuild] || false
|
26
23
|
@hnsw_vectors = []
|
27
24
|
end
|
28
25
|
|
@@ -32,9 +29,13 @@ module Boxcars
|
|
32
29
|
else
|
33
30
|
puts "Building Hnswlib vector store..."
|
34
31
|
data = load_data_files(training_data_path)
|
32
|
+
Boxcars.debug("Loaded #{data.length} files from #{training_data_path}")
|
35
33
|
texts = split_text_into_chunks(data)
|
34
|
+
Boxcars.debug("Split #{data.length} files into #{texts.length} chunks")
|
36
35
|
vectors = generate_vectors(texts)
|
36
|
+
Boxcars.debug("Generated #{vectors.length} vectors")
|
37
37
|
add_vectors(vectors, texts)
|
38
|
+
Boxcars.debug("Added #{vectors.length} vectors to vector store")
|
38
39
|
save_vector_store
|
39
40
|
|
40
41
|
{
|
@@ -46,24 +47,29 @@ module Boxcars
|
|
46
47
|
|
47
48
|
private
|
48
49
|
|
49
|
-
attr_reader :training_data_path, :index_file_path, :
|
50
|
+
attr_reader :training_data_path, :index_file_path, :base_dir_path,
|
51
|
+
:split_chunk_size, :json_doc_file_path, :force_rebuild, :hnsw_vectors
|
50
52
|
|
51
53
|
def validate_params(training_data_path, index_file_path, split_chunk_size)
|
52
|
-
|
54
|
+
validate_string(training_data_path, 'training_data_path')
|
55
|
+
validate_string(index_file_path, 'index_file_path')
|
56
|
+
|
57
|
+
absolute_data_path = File.absolute_path(training_data_path)
|
58
|
+
base_data_dir_path = File.dirname(absolute_data_path.gsub(/\*{1,2}/, ''))
|
59
|
+
@training_data_path = training_data_path
|
53
60
|
|
54
|
-
raise_argument_error('training_data_path parent directory must exist') unless File.directory?(
|
55
|
-
raise_argument_error('No files found at the training_data_path pattern') if Dir.glob(
|
61
|
+
raise_argument_error('training_data_path parent directory must exist') unless File.directory?(base_data_dir_path)
|
62
|
+
raise_argument_error('No files found at the training_data_path pattern') if Dir.glob(absolute_data_path).empty?
|
56
63
|
|
57
|
-
|
64
|
+
absolute_index_path = File.absolute_path(index_file_path)
|
65
|
+
index_parent_dir = File.dirname(absolute_index_path)
|
58
66
|
|
59
|
-
raise_argument_error('index_file_path parent directory must exist') unless File.directory?(
|
67
|
+
raise_argument_error('index_file_path parent directory must exist') unless File.directory?(index_parent_dir)
|
60
68
|
raise_argument_error('split_chunk_size must be an integer') unless split_chunk_size.is_a?(Integer)
|
61
|
-
end
|
62
69
|
|
63
|
-
|
64
|
-
return index_file_path.gsub(/\.bin$/, '.json') unless json_doc_file_path
|
70
|
+
json_doc_file_path = index_file_path.gsub(/\.bin$/, '.json')
|
65
71
|
|
66
|
-
|
72
|
+
[index_parent_dir, index_file_path, json_doc_file_path]
|
67
73
|
end
|
68
74
|
|
69
75
|
def add_vectors(vectors, texts)
|
@@ -76,6 +82,7 @@ module Boxcars
|
|
76
82
|
dim: vector[:dim],
|
77
83
|
metric: 'l2',
|
78
84
|
max_item: 10000,
|
85
|
+
base_dir_path: base_dir_path,
|
79
86
|
index_file_path: index_file_path,
|
80
87
|
json_doc_file_path: json_doc_file_path
|
81
88
|
}
|
@@ -90,6 +97,7 @@ module Boxcars
|
|
90
97
|
|
91
98
|
def load_existing_vector_store
|
92
99
|
Boxcars::VectorStore::Hnswlib::LoadFromDisk.call(
|
100
|
+
base_dir_path: base_dir_path,
|
93
101
|
index_file_path: index_file_path,
|
94
102
|
json_doc_file_path: json_doc_file_path
|
95
103
|
)
|
@@ -10,11 +10,13 @@ module Boxcars
|
|
10
10
|
class LoadFromDisk
|
11
11
|
include VectorStore
|
12
12
|
|
13
|
+
# params:
|
14
|
+
# base_dir_path: string (absolute path to the directory containing the index_file_path and json_doc_file_path),
|
15
|
+
# index_file_path: string (relative path to the index file from the base_dir_path),
|
16
|
+
# json_doc_file_path: string (relative path to the json file from the base_dir_path)
|
13
17
|
def initialize(params)
|
14
|
-
|
15
|
-
|
16
|
-
@index_file_path = File.absolute_path(params[:index_file_path])
|
17
|
-
@json_doc_file_path = File.absolute_path(params[:json_doc_file_path])
|
18
|
+
@base_dir_path, @index_file_path, @json_doc_file_path =
|
19
|
+
validate_params(params)
|
18
20
|
end
|
19
21
|
|
20
22
|
def call
|
@@ -29,14 +31,34 @@ module Boxcars
|
|
29
31
|
|
30
32
|
private
|
31
33
|
|
32
|
-
attr_reader :index_file_path, :json_doc_file_path
|
34
|
+
attr_reader :base_dir_path, :index_file_path, :json_doc_file_path
|
35
|
+
|
36
|
+
# Normalizes and checks the index and JSON document paths taken from params.
#
# @param params [Hash] expects :base_dir_path, :index_file_path and :json_doc_file_path.
# @return [Array] base dir path, absolute index file path, absolute JSON doc file path.
def validate_params(params)
  root_dir = params[:base_dir_path]
  index_relative = remove_relative_path(params[:index_file_path])
  json_relative = remove_relative_path(params[:json_doc_file_path])

  # we omit base_dir validation in case of loading the data from other environments
  validate_string(index_relative, "index_file_path")
  validate_string(json_relative, "json_doc_file_path")

  [
    root_dir,
    validate_file_existence(root_dir, index_relative, "index_file_path"),
    validate_file_existence(root_dir, json_relative, "json_doc_file_path")
  ]
end
|
49
|
+
|
50
|
+
# Strips a leading "./" from a path.
#
# Non-string values (e.g. nil when the param is missing) are returned unchanged
# so the caller's own validation can report them instead of crashing here.
#
# @param path [String, nil] the path as supplied by the caller.
# @return [String, nil] the path without a leading "./".
def remove_relative_path(path)
  return path unless path.is_a?(String)

  path.delete_prefix('./')
end
|
53
|
+
|
54
|
+
def validate_file_existence(base_dir, file_path, name)
|
55
|
+
file =
|
56
|
+
base_dir.to_s.empty? ? file_path : File.join(base_dir, file_path)
|
57
|
+
complete_path = File.absolute_path(file)
|
33
58
|
|
34
|
-
|
35
|
-
raise_argument_error("index_file_path must be a string") unless index_file_path.is_a?(String)
|
36
|
-
raise_argument_error("json_doc_file_path must be a string") unless json_doc_file_path.is_a?(String)
|
59
|
+
raise raise_argument_error("#{name} does not exist at #{complete_path}") unless File.exist?(complete_path)
|
37
60
|
|
38
|
-
|
39
|
-
raise_argument_error("json_doc_file_path must exist") unless File.exist?(json_doc_file_path)
|
61
|
+
complete_path
|
40
62
|
end
|
41
63
|
|
42
64
|
def load_as_hnsw_vectors(vectors)
|
@@ -47,7 +69,11 @@ module Boxcars
|
|
47
69
|
embedding: vector[:embedding],
|
48
70
|
metadata: vector[:metadata]
|
49
71
|
)
|
50
|
-
|
72
|
+
if vector[:metadata][:doc_id]
|
73
|
+
hnsw_vectors[vector[:metadata][:doc_id]] = hnsw_vector
|
74
|
+
else
|
75
|
+
hnsw_vectors << hnsw_vector
|
76
|
+
end
|
51
77
|
end
|
52
78
|
hnsw_vectors
|
53
79
|
end
|
@@ -9,19 +9,35 @@ module Boxcars
|
|
9
9
|
class Search
|
10
10
|
include VectorStore
|
11
11
|
|
12
|
+
# initialize the vector store search with the following parameters:
|
13
|
+
# @param params [Hash] A Hash containing the initial configuration.
|
14
|
+
# example:
|
15
|
+
# {
|
16
|
+
# type: :hnswlib,
|
17
|
+
# vector_store: [
|
18
|
+
# Boxcars::VectorStore::Document.new(
|
19
|
+
# content: "hello",
|
20
|
+
# embedding: [0.1, 0.2, 0.3],
|
21
|
+
# metadata: { a: 1 }
|
22
|
+
# )
|
23
|
+
# ]
|
24
|
+
# }
|
12
25
|
def initialize(params)
|
13
|
-
validate_params(params[:vector_documents])
|
14
|
-
@
|
15
|
-
@search_index = load_index(
|
26
|
+
@vector_store = validate_params(params[:vector_documents])
|
27
|
+
@metadata, @index_file = validate_files(vector_store)
|
28
|
+
@search_index = load_index(metadata, index_file)
|
16
29
|
end
|
17
30
|
|
31
|
+
# @param query_vector [Array] The query vector to search for.
|
32
|
+
# @param count [Integer] The number of results to return.
|
33
|
+
# @return [Array] array of hashes with :document and :distance keys
|
18
34
|
def call(query_vector:, count: 1)
|
19
35
|
search(query_vector, count)
|
20
36
|
end
|
21
37
|
|
22
38
|
private
|
23
39
|
|
24
|
-
attr_reader :
|
40
|
+
attr_reader :vector_store, :index_file, :search_index, :metadata
|
25
41
|
|
26
42
|
def validate_params(vector_documents)
|
27
43
|
raise_argument_error('vector_documents is nil') unless vector_documents
|
@@ -34,27 +50,47 @@ module Boxcars
|
|
34
50
|
raise_arugment_error('vector_store must be an array of Document objects')
|
35
51
|
end
|
36
52
|
|
37
|
-
|
53
|
+
vector_documents[:vector_store]
|
38
54
|
end
|
39
55
|
|
40
|
-
def
|
41
|
-
|
42
|
-
|
56
|
+
# Reads the index metadata from the first document and locates the index file.
#
# @param vector_store [Array] documents; the first one's metadata describes the index.
# @return [Array] two elements: the metadata hash and the path of the index file.
# Failures are reported through raise_argument_error.
def validate_files(vector_store)
  # &. lets an empty store fall through to the metadata check below
  metadata = vector_store.first&.metadata
  raise_argument_error('metadata must be a hash') unless metadata.is_a?(Hash)
  raise_argument_error('metadata is empty') if metadata.empty?

  validate_string(metadata[:index_file_path], "index_file_path")
  validate_string(metadata[:json_doc_file_path], "json_doc_file_path")

  base_dir = metadata[:base_dir_path]
  recorded_path = metadata[:index_file_path]
  # use the recorded path when it exists as given; otherwise resolve it against the base dir
  index_file =
    if !recorded_path.to_s.empty? && File.exist?(recorded_path)
      recorded_path
    else
      File.join(base_dir.to_s, recorded_path.to_s)
    end

  raise_argument_error('index_file does not exist') unless File.exist?(index_file)

  [metadata, index_file]
end
|
77
|
+
|
78
|
+
def load_index(metadata, index_file)
|
44
79
|
search_index = ::Hnswlib::HierarchicalNSW.new(
|
45
80
|
space: metadata[:metric],
|
46
81
|
dim: metadata[:dim]
|
47
82
|
)
|
48
|
-
search_index.load_index(
|
49
|
-
@search_index = search_index
|
50
|
-
@vector_store = vector_documents[:vector_store]
|
51
|
-
|
83
|
+
search_index.load_index(index_file)
|
52
84
|
search_index
|
53
85
|
end
|
54
86
|
|
55
87
|
def search(query_vector, num_neighbors)
|
56
88
|
raw_results = search_index.search_knn(query_vector, num_neighbors)
|
57
|
-
|
89
|
+
|
90
|
+
raw_results.map { |doc_id, distance| lookup_embedding(doc_id, distance) }
|
91
|
+
.compact
|
92
|
+
.first(num_neighbors)
|
93
|
+
.sort_by { |result| result[:distance] }
|
58
94
|
rescue StandardError => e
|
59
95
|
raise_argument_error("Error searching for #{query_vector}: #{e.message}")
|
60
96
|
end
|
@@ -0,0 +1,75 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Boxcars
  module VectorStore
    module InMemory
      # Builds an in-memory vector store from an array of content/metadata hashes.
      class BuildFromArray
        include VectorStore

        # @param embedding_tool [Symbol] :openai or :tensorflow
        # @param input_array [Array<Hash>] array of hashes with :content and :metadata keys
        #   each hash item should have content and metadata
        #   [
        #     { content: "hello", metadata: { a: 1 } },
        #     { content: "hi", metadata: { a: 1 } },
        #     { content: "bye", metadata: { a: 1 } },
        #     { content: "what's this", metadata: { a: 1 } }
        #   ]
        def initialize(embedding_tool: :openai, input_array: nil)
          validate_params(embedding_tool, input_array)
          @embedding_tool = embedding_tool
          @input_array = input_array
          @memory_vectors = []
        end

        # Generates a vector for every input's content and wraps each input in a Document.
        # @return [Hash] type: :in_memory and vector_store: array of Boxcars::VectorStore::Document
        def call
          texts = input_array.map { |doc| doc[:content] }
          vectors = generate_vectors(texts)
          add_vectors(vectors, input_array)

          { type: :in_memory, vector_store: memory_vectors }
        end

        private

        attr_reader :input_array, :memory_vectors

        # Reports invalid arguments through raise_argument_error.
        def validate_params(embedding_tool, input_array)
          raise_argument_error('input_array is nil') unless input_array
          raise_argument_error('input_array must be an array') unless input_array.is_a?(Array)
          unless proper_document_array?(input_array)
            raise_argument_error('items in input_array needs to have content and metadata')
          end

          return if %i[openai tensorflow].include?(embedding_tool)

          raise_argument_error('embedding_tool is invalid')
        end

        # @return [Boolean] true when every item is a Hash with both :content and :metadata keys.
        #   Non-Hash items make this false rather than raising NoMethodError.
        def proper_document_array?(input_array)
          input_array.all? { |item| item.is_a?(Hash) && item.key?(:content) && item.key?(:metadata) }
        end

        # Pairs each vector with its input and appends a Document to memory_vectors;
        # the vector's :dim is merged into the document metadata.
        def add_vectors(vectors, input_array)
          vectors.zip(input_array).each do |vector, doc|
            memory_vectors << Document.new(
              content: doc[:content],
              embedding: vector[:embedding],
              metadata: doc[:metadata].merge(dim: vector[:dim])
            )
          end
        end
      end
    end
  end
end
|
@@ -6,6 +6,12 @@ module Boxcars
|
|
6
6
|
class BuildFromFiles
|
7
7
|
include VectorStore
|
8
8
|
|
9
|
+
# initialize the vector store with the following parameters:
|
10
|
+
# @param params [Hash] A Hash containing the initial configuration.
|
11
|
+
# @option params [Symbol] :embedding_tool The embedding tool to use.
|
12
|
+
# @option params [String] :training_data_path The path to the training data files.
|
13
|
+
# @option params [Integer] :split_chunk_size The number of characters to split the text into.
|
14
|
+
# @return [Hash] vector_store: array of hashes with :content, :metadata, and :embedding keys
|
9
15
|
def initialize(params)
|
10
16
|
@split_chunk_size = params[:split_chunk_size] || 2000
|
11
17
|
@training_data_path = File.absolute_path(params[:training_data_path])
|
@@ -15,6 +21,7 @@ module Boxcars
|
|
15
21
|
@memory_vectors = []
|
16
22
|
end
|
17
23
|
|
24
|
+
# @return [Hash] vector_store: array of hashes with :content, :metadata, and :embedding keys
|
18
25
|
def call
|
19
26
|
data = load_data_files(training_data_path)
|
20
27
|
texts = split_text_into_chunks(data)
|
@@ -6,6 +6,10 @@ module Boxcars
|
|
6
6
|
class Search
|
7
7
|
include VectorStore
|
8
8
|
|
9
|
+
# initialize the vector store InMemory::Search with the following parameters:
|
10
|
+
# @param params [Hash] A Hash containing the initial configuration.
|
11
|
+
# @option params [Hash] :vector_documents The vector documents to search.
|
12
|
+
# @option params [Hash] :vector_store The vector store to search.
|
9
13
|
def initialize(params)
|
10
14
|
validate_params(params[:vector_documents])
|
11
15
|
@vector_documents = params[:vector_documents]
|
@@ -7,15 +7,24 @@ module Boxcars
|
|
7
7
|
class BuildFromArray
|
8
8
|
include VectorStore
|
9
9
|
|
10
|
-
#
|
11
|
-
#
|
12
|
-
#
|
13
|
-
#
|
14
|
-
#
|
15
|
-
#
|
16
|
-
#
|
17
|
-
#
|
18
|
-
# }
|
10
|
+
# initialize the vector store with the following parameters:
|
11
|
+
#
|
12
|
+
# @param params [Hash] A Hash containing the initial configuration.
|
13
|
+
#
|
14
|
+
# @option params [Symbol] :embedding_tool The embedding tool to use. Must be provided.
|
15
|
+
# @option params [Array] :input_array The array of inputs to use for the embedding tool. Must be provided.
|
16
|
+
# each hash item should have content and metadata
|
17
|
+
# [
|
18
|
+
# { content: "hello", metadata: { a: 1 } },
|
19
|
+
# { content: "hi", metadata: { a: 1 } },
|
20
|
+
# { content: "bye", metadata: { a: 1 } },
|
21
|
+
# { content: "what's this", metadata: { a: 1 } }
|
22
|
+
# ]
|
23
|
+
# @option params [String] :database_url The URL of the database where embeddings are stored. Must be provided.
|
24
|
+
# @option params [String] :table_name The name of the database table where embeddings are stored. Must be provided.
|
25
|
+
# @option params [String] :embedding_column_name The name of the database column where embeddings are stored. Must be provided.
|
26
|
+
# @option params [String] :content_column_name The name of the database column where content is stored. Must be provided.
|
27
|
+
# @option params [String] :metadata_column_name The name of the database column where metadata is stored. Must be provided.
|
19
28
|
def initialize(params)
|
20
29
|
@embedding_tool = params[:embedding_tool] || :openai
|
21
30
|
|
@@ -31,10 +40,11 @@ module Boxcars
|
|
31
40
|
@pg_vectors = []
|
32
41
|
end
|
33
42
|
|
43
|
+
# @return [Hash] vector_store: array of hashes with :content, :metadata, and :embedding keys
|
34
44
|
def call
|
35
|
-
texts = input_array
|
45
|
+
texts = input_array.map { |doc| doc[:content] }
|
36
46
|
vectors = generate_vectors(texts)
|
37
|
-
add_vectors(vectors,
|
47
|
+
add_vectors(vectors, input_array)
|
38
48
|
documents = save_vector_store
|
39
49
|
|
40
50
|
{
|
@@ -51,15 +61,18 @@ module Boxcars
|
|
51
61
|
|
52
62
|
# Checks the constructor arguments before any embedding work is attempted.
#
# Every failure is reported through raise_argument_error (defined outside
# this block; presumably it raises the library's argument error).
#
# @param embedding_tool [Symbol] accepted values are :openai and :tensorflow
# @param input_array [Array] items must each expose :content and :metadata
# @return [nil] when every check passes
def validate_params(embedding_tool, input_array)
  raise_argument_error('input_array is nil') unless input_array
  raise_argument_error('input_array must be an array') unless input_array.is_a?(Array)
  raise_argument_error('items in input_array needs to have content and metadata') unless proper_input_array?(input_array)
  return if %i[openai tensorflow].include?(embedding_tool)

  # Only reached for an unsupported tool, so no further condition is needed.
  raise_argument_error('embedding_tool is invalid')
end
|
57
70
|
|
58
|
-
|
59
|
-
|
71
|
+
# Tells whether every item of input_array carries both a :content and a
# :metadata key.
#
# Items that do not respond to #key? (nil, strings, ...) make the check fail
# instead of raising NoMethodError, so the caller can report them as a
# validation failure.
#
# @param input_array [Array] the candidate items
# @return [Boolean] true when all items qualify (also true for an empty array)
def proper_input_array?(input_array)
  input_array.all? do |item|
    item.respond_to?(:key?) && item.key?(:content) && item.key?(:metadata)
  end
end
|
64
77
|
|
65
78
|
def add_vectors(vectors, texts)
|
@@ -10,15 +10,15 @@ module Boxcars
|
|
10
10
|
class BuildFromFiles
|
11
11
|
include VectorStore
|
12
12
|
|
13
|
-
#
|
14
|
-
#
|
15
|
-
#
|
16
|
-
#
|
17
|
-
#
|
18
|
-
#
|
19
|
-
#
|
20
|
-
#
|
21
|
-
#
|
13
|
+
# @param training_data_path [String] path to training data files
|
14
|
+
# @param split_chunk_size [Integer] number of characters to split the text into
|
15
|
+
# @param embedding_tool [Symbol] embedding tool to use
|
16
|
+
# @param database_url [String] database url
|
17
|
+
# @param table_name [String] table name
|
18
|
+
# @param embedding_column_name [String] embedding column name
|
19
|
+
# @param content_column_name [String] content column name
|
20
|
+
# @param metadata_column_name [String] metadata column name
|
21
|
+
# @return [Hash] vector_store: array of hashes with :content, :metadata, and :embedding keys
|
22
22
|
def initialize(params)
|
23
23
|
@split_chunk_size = params[:split_chunk_size] || 2000
|
24
24
|
@training_data_path = File.absolute_path(params[:training_data_path])
|
@@ -35,6 +35,7 @@ module Boxcars
|
|
35
35
|
@pg_vectors = []
|
36
36
|
end
|
37
37
|
|
38
|
+
# @return [Hash] vector_store: array of Boxcars::VectorStore::Document
|
38
39
|
def call
|
39
40
|
data = load_data_files(training_data_path)
|
40
41
|
texts = split_text_into_chunks(data)
|
@@ -57,7 +58,7 @@ module Boxcars
|
|
57
58
|
def validate_params(embedding_tool, training_data_path)
|
58
59
|
training_data_dir = File.dirname(training_data_path.gsub(/\*{1,2}/, ''))
|
59
60
|
|
60
|
-
raise_argument_error('training_data_path parent directory must exist') unless
|
61
|
+
raise_argument_error('training_data_path parent directory must exist') unless Dir.exist?(training_data_dir)
|
61
62
|
raise_argument_error('No files found at the training_data_path pattern') if Dir.glob(training_data_path).empty?
|
62
63
|
return if %i[openai tensorflow].include?(embedding_tool)
|
63
64
|
|
@@ -9,15 +9,14 @@ module Boxcars
|
|
9
9
|
class SaveToDatabase
|
10
10
|
include VectorStore
|
11
11
|
|
12
|
-
#
|
13
|
-
#
|
14
|
-
#
|
15
|
-
#
|
16
|
-
#
|
17
|
-
#
|
18
|
-
#
|
12
|
+
# @param pg_vectors [Array] array of Boxcars::VectorStore::Document
|
13
|
+
# @param database_url [String] database url
|
14
|
+
# @param table_name [String] table name
|
15
|
+
# @param embedding_column_name [String] embedding column name
|
16
|
+
# @param content_column_name [String] content column name
|
17
|
+
# @param metadata_column_name [String] metadata column name
|
18
|
+
# @return [Array] array of Boxcars::VectorStore::Document
|
19
19
|
def initialize(params)
|
20
|
-
@errors = []
|
21
20
|
validate_param_types(params)
|
22
21
|
@db_connection = test_db_params(params)
|
23
22
|
|
@@ -29,9 +28,8 @@ module Boxcars
|
|
29
28
|
@pg_vectors = params[:pg_vectors]
|
30
29
|
end
|
31
30
|
|
31
|
+
# @return [Array] array of Boxcars::VectorStore::Document
|
32
32
|
def call
|
33
|
-
return { success: false, error: errors } unless errors.empty?
|
34
|
-
|
35
33
|
add_vectors_to_database
|
36
34
|
end
|
37
35
|
|
@@ -39,7 +37,7 @@ module Boxcars
|
|
39
37
|
|
40
38
|
attr_reader :database_url, :pg_vectors, :db_connection, :table_name,
|
41
39
|
:embedding_column_name, :content_column_name,
|
42
|
-
:metadata_column_name
|
40
|
+
:metadata_column_name
|
43
41
|
|
44
42
|
def validate_param_types(params)
|
45
43
|
pg_vectors = params[:pg_vectors]
|
@@ -9,17 +9,21 @@ module Boxcars
|
|
9
9
|
class Search
|
10
10
|
include VectorStore
|
11
11
|
|
12
|
-
#
|
12
|
+
# initialize the vector store with the following parameters:
|
13
|
+
# @param params [Hash] A Hash containing the initial configuration.
|
14
|
+
# @option params [Hash] :vector_documents The vector documents to search.
|
15
|
+
# example:
|
13
16
|
# {
|
14
17
|
# type: :pgvector,
|
15
18
|
# vector_store: {
|
16
|
-
#
|
17
|
-
#
|
18
|
-
#
|
19
|
-
#
|
20
|
-
# metadata_column_name: metadata_column_name
|
19
|
+
# table_name: "vector_store",
|
20
|
+
# embedding_column_name: "embedding",
|
21
|
+
# content_column_name: "content",
|
22
|
+
# database_url: ENV['DATABASE_URL']
|
21
23
|
# }
|
22
24
|
# }
|
25
|
+
#
|
26
|
+
# @option params [Hash] :vector_store The vector store to search.
|
23
27
|
def initialize(params)
|
24
28
|
vector_store = validate_params(params)
|
25
29
|
db_url = validate_vector_store(vector_store)
|
@@ -28,6 +32,20 @@ module Boxcars
|
|
28
32
|
@vector_documents = params[:vector_documents]
|
29
33
|
end
|
30
34
|
|
35
|
+
# @param query_vector [Array] The query vector to search for.
|
36
|
+
# @param count [Integer] The number of results to return.
|
37
|
+
# @return [Array] array of hashes with :document and :distance keys
|
38
|
+
# @example
|
39
|
+
# [
|
40
|
+
# {
|
41
|
+
# document: Boxcars::VectorStore::Document.new(
|
42
|
+
# content: "hello",
|
43
|
+
# embedding: [0.1, 0.2, 0.3],
|
44
|
+
# metadata: { a: 1 }
|
45
|
+
# ),
|
46
|
+
# distance: 0.1
|
47
|
+
# }
|
48
|
+
# ]
|
31
49
|
def call(query_vector:, count: 1)
|
32
50
|
raise ::Boxcars::ArgumentError, 'query_vector is empty' if query_vector.empty?
|
33
51
|
|
data/lib/boxcars/vector_store.rb
CHANGED
@@ -54,7 +54,7 @@ module Boxcars
|
|
54
54
|
|
55
55
|
file_content = File.read(file_path)
|
56
56
|
JSON.parse(file_content, symbolize_names: true)
|
57
|
-
rescue JSON::ParserError => e
|
57
|
+
rescue JSON::ParserError, Errno::ENOENT => e
|
58
58
|
raise_argument_error("Error parsing #{file_path}: #{e.message}")
|
59
59
|
end
|
60
60
|
|
@@ -80,6 +80,11 @@ module Boxcars
|
|
80
80
|
end
|
81
81
|
docs
|
82
82
|
end
|
83
|
+
|
84
|
+
# Ensures value is a non-empty String.
#
# Failures are reported through raise_argument_error, which the rest of this
# file calls as a bare statement (its definition is not visible here;
# presumably it raises by itself), so it is not wrapped in another `raise`.
#
# @param value [Object] the value to check
# @param name [String] label interpolated into the error message
# @return [nil] when value passes both checks
def validate_string(value, name)
  raise_argument_error("#{name} must be a string") unless value.is_a?(String)
  raise_argument_error("#{name} is empty") if value.empty?
end
|
83
88
|
end
|
84
89
|
end
|
85
90
|
|
@@ -92,7 +97,7 @@ require_relative "vector_store/hnswlib/save_to_hnswlib"
|
|
92
97
|
require_relative "vector_store/hnswlib/build_from_files"
|
93
98
|
require_relative "vector_store/hnswlib/search"
|
94
99
|
require_relative "vector_store/in_memory/build_from_files"
|
95
|
-
require_relative "vector_store/in_memory/
|
100
|
+
require_relative "vector_store/in_memory/build_from_array"
|
96
101
|
require_relative "vector_store/in_memory/search"
|
97
102
|
require_relative "vector_store/pgvector/build_from_files"
|
98
103
|
require_relative "vector_store/pgvector/build_from_array"
|
data/lib/boxcars/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: boxcars
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.
|
4
|
+
version: 0.2.14
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Francis Sullivan
|
@@ -9,92 +9,78 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: exe
|
11
11
|
cert_chain: []
|
12
|
-
date: 2023-
|
12
|
+
date: 2023-06-06 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
|
-
name:
|
16
|
-
requirement: !ruby/object:Gem::Requirement
|
17
|
-
requirements:
|
18
|
-
- - "~>"
|
19
|
-
- !ruby/object:Gem::Version
|
20
|
-
version: '1.1'
|
21
|
-
type: :development
|
22
|
-
prerelease: false
|
23
|
-
version_requirements: !ruby/object:Gem::Requirement
|
24
|
-
requirements:
|
25
|
-
- - "~>"
|
26
|
-
- !ruby/object:Gem::Version
|
27
|
-
version: '1.1'
|
28
|
-
- !ruby/object:Gem::Dependency
|
29
|
-
name: dotenv
|
15
|
+
name: google_search_results
|
30
16
|
requirement: !ruby/object:Gem::Requirement
|
31
17
|
requirements:
|
32
18
|
- - "~>"
|
33
19
|
- !ruby/object:Gem::Version
|
34
|
-
version: '2.
|
35
|
-
type: :
|
20
|
+
version: '2.2'
|
21
|
+
type: :runtime
|
36
22
|
prerelease: false
|
37
23
|
version_requirements: !ruby/object:Gem::Requirement
|
38
24
|
requirements:
|
39
25
|
- - "~>"
|
40
26
|
- !ruby/object:Gem::Version
|
41
|
-
version: '2.
|
27
|
+
version: '2.2'
|
42
28
|
- !ruby/object:Gem::Dependency
|
43
|
-
name:
|
29
|
+
name: gpt4all
|
44
30
|
requirement: !ruby/object:Gem::Requirement
|
45
31
|
requirements:
|
46
32
|
- - "~>"
|
47
33
|
- !ruby/object:Gem::Version
|
48
|
-
version:
|
49
|
-
type: :
|
34
|
+
version: 0.0.4
|
35
|
+
type: :runtime
|
50
36
|
prerelease: false
|
51
37
|
version_requirements: !ruby/object:Gem::Requirement
|
52
38
|
requirements:
|
53
39
|
- - "~>"
|
54
40
|
- !ruby/object:Gem::Version
|
55
|
-
version:
|
41
|
+
version: 0.0.4
|
56
42
|
- !ruby/object:Gem::Dependency
|
57
|
-
name:
|
43
|
+
name: hnswlib
|
58
44
|
requirement: !ruby/object:Gem::Requirement
|
59
45
|
requirements:
|
60
46
|
- - "~>"
|
61
47
|
- !ruby/object:Gem::Version
|
62
|
-
version: '
|
48
|
+
version: '0.8'
|
63
49
|
type: :runtime
|
64
50
|
prerelease: false
|
65
51
|
version_requirements: !ruby/object:Gem::Requirement
|
66
52
|
requirements:
|
67
53
|
- - "~>"
|
68
54
|
- !ruby/object:Gem::Version
|
69
|
-
version: '
|
55
|
+
version: '0.8'
|
70
56
|
- !ruby/object:Gem::Dependency
|
71
|
-
name:
|
57
|
+
name: pgvector
|
72
58
|
requirement: !ruby/object:Gem::Requirement
|
73
59
|
requirements:
|
74
60
|
- - "~>"
|
75
61
|
- !ruby/object:Gem::Version
|
76
|
-
version: 0.
|
62
|
+
version: '0.2'
|
77
63
|
type: :runtime
|
78
64
|
prerelease: false
|
79
65
|
version_requirements: !ruby/object:Gem::Requirement
|
80
66
|
requirements:
|
81
67
|
- - "~>"
|
82
68
|
- !ruby/object:Gem::Version
|
83
|
-
version: 0.
|
69
|
+
version: '0.2'
|
84
70
|
- !ruby/object:Gem::Dependency
|
85
71
|
name: ruby-openai
|
86
72
|
requirement: !ruby/object:Gem::Requirement
|
87
73
|
requirements:
|
88
74
|
- - "~>"
|
89
75
|
- !ruby/object:Gem::Version
|
90
|
-
version: '4.
|
76
|
+
version: '4.1'
|
91
77
|
type: :runtime
|
92
78
|
prerelease: false
|
93
79
|
version_requirements: !ruby/object:Gem::Requirement
|
94
80
|
requirements:
|
95
81
|
- - "~>"
|
96
82
|
- !ruby/object:Gem::Version
|
97
|
-
version: '4.
|
83
|
+
version: '4.1'
|
98
84
|
description: You simply set an OpenAI key, give a number of Boxcars to a Train, and
|
99
85
|
magic ensues when you run it.
|
100
86
|
email:
|
@@ -124,6 +110,7 @@ files:
|
|
124
110
|
- lib/boxcars/boxcar/google_search.rb
|
125
111
|
- lib/boxcars/boxcar/sql.rb
|
126
112
|
- lib/boxcars/boxcar/swagger.rb
|
113
|
+
- lib/boxcars/boxcar/vector_answer.rb
|
127
114
|
- lib/boxcars/boxcar/wikipedia_search.rb
|
128
115
|
- lib/boxcars/conversation.rb
|
129
116
|
- lib/boxcars/conversation_prompt.rb
|
@@ -148,7 +135,7 @@ files:
|
|
148
135
|
- lib/boxcars/vector_store/hnswlib/load_from_disk.rb
|
149
136
|
- lib/boxcars/vector_store/hnswlib/save_to_hnswlib.rb
|
150
137
|
- lib/boxcars/vector_store/hnswlib/search.rb
|
151
|
-
- lib/boxcars/vector_store/in_memory/
|
138
|
+
- lib/boxcars/vector_store/in_memory/build_from_array.rb
|
152
139
|
- lib/boxcars/vector_store/in_memory/build_from_files.rb
|
153
140
|
- lib/boxcars/vector_store/in_memory/search.rb
|
154
141
|
- lib/boxcars/vector_store/pgvector/build_from_array.rb
|
@@ -180,7 +167,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
180
167
|
- !ruby/object:Gem::Version
|
181
168
|
version: '0'
|
182
169
|
requirements: []
|
183
|
-
rubygems_version: 3.
|
170
|
+
rubygems_version: 3.2.32
|
184
171
|
signing_key:
|
185
172
|
specification_version: 4
|
186
173
|
summary: Boxcars is a gem that enables you to create new systems with AI composability.
|
@@ -1,51 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
module Boxcars
|
4
|
-
module VectorStore
|
5
|
-
module InMemory
|
6
|
-
class BuildFromDocumentArray
|
7
|
-
include VectorStore
|
8
|
-
|
9
|
-
def initialize(embedding_tool: :openai, documents: nil)
|
10
|
-
validate_params(embedding_tool, documents)
|
11
|
-
@embedding_tool = embedding_tool
|
12
|
-
@documents = documents
|
13
|
-
@memory_vectors = []
|
14
|
-
end
|
15
|
-
|
16
|
-
def call
|
17
|
-
texts = documents
|
18
|
-
vectors = generate_vectors(texts)
|
19
|
-
add_vectors(vectors, documents)
|
20
|
-
{
|
21
|
-
type: :in_memory,
|
22
|
-
vector_store: memory_vectors
|
23
|
-
}
|
24
|
-
end
|
25
|
-
|
26
|
-
private
|
27
|
-
|
28
|
-
attr_reader :documents, :memory_vectors
|
29
|
-
|
30
|
-
def validate_params(embedding_tool, documents)
|
31
|
-
raise_argument_error('documents is nil') unless documents
|
32
|
-
return if %i[openai tensorflow].include?(embedding_tool)
|
33
|
-
|
34
|
-
raise_argument_error('embedding_tool is invalid')
|
35
|
-
end
|
36
|
-
|
37
|
-
# returns array of documents with vectors
|
38
|
-
def add_vectors(vectors, documents)
|
39
|
-
vectors.zip(documents).each do |vector, doc|
|
40
|
-
memory_vector = Document.new(
|
41
|
-
content: doc[:content],
|
42
|
-
embedding: vector[:embedding],
|
43
|
-
metadata: doc[:metadata].merge(dim: vector[:dim])
|
44
|
-
)
|
45
|
-
@memory_vectors << memory_vector
|
46
|
-
end
|
47
|
-
end
|
48
|
-
end
|
49
|
-
end
|
50
|
-
end
|
51
|
-
end
|