boxcars 0.2.13 → 0.2.14
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +30 -0
- data/Gemfile.lock +1 -8
- data/README.md +7 -4
- data/boxcars.gemspec +1 -1
- data/lib/boxcars/boxcar/vector_answer.rb +1 -1
- data/lib/boxcars/vector_search.rb +28 -0
- data/lib/boxcars/vector_store/hnswlib/build_from_files.rb +20 -16
- data/lib/boxcars/vector_store/hnswlib/load_from_disk.rb +37 -11
- data/lib/boxcars/vector_store/hnswlib/search.rb +49 -13
- data/lib/boxcars/vector_store/in_memory/build_from_array.rb +75 -0
- data/lib/boxcars/vector_store/in_memory/build_from_files.rb +7 -0
- data/lib/boxcars/vector_store/in_memory/search.rb +4 -0
- data/lib/boxcars/vector_store/pgvector/build_from_array.rb +28 -15
- data/lib/boxcars/vector_store/pgvector/build_from_files.rb +11 -10
- data/lib/boxcars/vector_store/pgvector/save_to_database.rb +9 -11
- data/lib/boxcars/vector_store/pgvector/search.rb +24 -6
- data/lib/boxcars/vector_store.rb +7 -2
- data/lib/boxcars/version.rb +1 -1
- metadata +9 -9
- data/lib/boxcars/vector_store/in_memory/build_from_document_array.rb +0 -51
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 1039f86c58712c10143cc26438da14571013081bcff08aed3d9fcac4b1e84060
|
4
|
+
data.tar.gz: fca9f08855cae8e4e8a4171c043e92884e245582b049b4cc75bfa4d2cd98e51a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 5f24a578f4004d99a0d71c05f4a0ed520dda2e562d1327ec68bdf8682a927b741e870ed348a1a273319003f515521ef8a12c8778b70994764a945e31f906f807
|
7
|
+
data.tar.gz: be067a2ba1ba2e032a58d32f46071f5510a5ce0f15fbc89a06fe84f4fd854ec825a0ea0c786a90ba87c20bb1893ff59512170ac2e338080e25f85b2474b6fb64
|
data/CHANGELOG.md
CHANGED
@@ -1,5 +1,35 @@
|
|
1
1
|
# Changelog
|
2
2
|
|
3
|
+
## [v0.2.13](https://github.com/BoxcarsAI/boxcars/tree/v0.2.13) (2023-05-24)
|
4
|
+
|
5
|
+
[Full Changelog](https://github.com/BoxcarsAI/boxcars/compare/v0.2.12...v0.2.13)
|
6
|
+
|
7
|
+
**Closed issues:**
|
8
|
+
|
9
|
+
- Typo "Boscar.error" should be "Boxcars.error" [\#82](https://github.com/BoxcarsAI/boxcars/issues/82)
|
10
|
+
|
11
|
+
**Merged pull requests:**
|
12
|
+
|
13
|
+
- Add vector answer boxcar [\#79](https://github.com/BoxcarsAI/boxcars/pull/79) ([francis](https://github.com/francis))
|
14
|
+
|
15
|
+
## [v0.2.12](https://github.com/BoxcarsAI/boxcars/tree/v0.2.12) (2023-05-22)
|
16
|
+
|
17
|
+
[Full Changelog](https://github.com/BoxcarsAI/boxcars/compare/v0.2.11...v0.2.12)
|
18
|
+
|
19
|
+
**Closed issues:**
|
20
|
+
|
21
|
+
- GPT-4 support? [\#71](https://github.com/BoxcarsAI/boxcars/issues/71)
|
22
|
+
- add PgVector Vector Store [\#68](https://github.com/BoxcarsAI/boxcars/issues/68)
|
23
|
+
|
24
|
+
**Merged pull requests:**
|
25
|
+
|
26
|
+
- issue\_82 typo "Boscar" instead of "Boxcars" [\#83](https://github.com/BoxcarsAI/boxcars/pull/83) ([MadBomber](https://github.com/MadBomber))
|
27
|
+
- Update boxcars.rb config example [\#81](https://github.com/BoxcarsAI/boxcars/pull/81) ([nhorton](https://github.com/nhorton))
|
28
|
+
- Feature- added pgvector vector store [\#80](https://github.com/BoxcarsAI/boxcars/pull/80) ([jaigouk](https://github.com/jaigouk))
|
29
|
+
- drop support for pre ruby 3 version [\#75](https://github.com/BoxcarsAI/boxcars/pull/75) ([francis](https://github.com/francis))
|
30
|
+
- Chore - refine VectorSearch [\#74](https://github.com/BoxcarsAI/boxcars/pull/74) ([jaigouk](https://github.com/jaigouk))
|
31
|
+
- raise error if OpenAI API returns error or nil. closes \#71 [\#72](https://github.com/BoxcarsAI/boxcars/pull/72) ([francis](https://github.com/francis))
|
32
|
+
|
3
33
|
## [v0.2.11](https://github.com/BoxcarsAI/boxcars/tree/v0.2.11) (2023-05-05)
|
4
34
|
|
5
35
|
[Full Changelog](https://github.com/BoxcarsAI/boxcars/compare/v0.2.10...v0.2.11)
|
data/Gemfile.lock
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
boxcars (0.2.
|
4
|
+
boxcars (0.2.14)
|
5
5
|
google_search_results (~> 2.2)
|
6
6
|
gpt4all (~> 0.0.4)
|
7
7
|
hnswlib (~> 0.8)
|
@@ -88,11 +88,9 @@ GEM
|
|
88
88
|
i18n (1.13.0)
|
89
89
|
concurrent-ruby (~> 1.0)
|
90
90
|
io-console (0.6.0)
|
91
|
-
io-console (0.6.0-java)
|
92
91
|
irb (1.6.4)
|
93
92
|
reline (>= 0.3.0)
|
94
93
|
json (2.6.3)
|
95
|
-
json (2.6.3-java)
|
96
94
|
mime-types (3.4.1)
|
97
95
|
mime-types-data (~> 3.2015)
|
98
96
|
mime-types-data (3.2023.0218.1)
|
@@ -102,7 +100,6 @@ GEM
|
|
102
100
|
multipart-post (2.3.0)
|
103
101
|
netrc (0.11.0)
|
104
102
|
nio4r (2.5.9)
|
105
|
-
nio4r (2.5.9-java)
|
106
103
|
octokit (4.25.1)
|
107
104
|
faraday (>= 1, < 3)
|
108
105
|
sawyer (~> 0.9)
|
@@ -176,9 +173,6 @@ GEM
|
|
176
173
|
faraday (>= 0.17.3, < 3)
|
177
174
|
sqlite3 (1.6.3)
|
178
175
|
mini_portile2 (~> 2.8.0)
|
179
|
-
sqlite3 (1.6.3-arm64-darwin)
|
180
|
-
sqlite3 (1.6.3-x86_64-darwin)
|
181
|
-
sqlite3 (1.6.3-x86_64-linux)
|
182
176
|
strings-ansi (0.2.0)
|
183
177
|
timers (4.3.5)
|
184
178
|
traces (0.9.1)
|
@@ -193,7 +187,6 @@ GEM
|
|
193
187
|
concurrent-ruby (~> 1.0)
|
194
188
|
unf (0.1.4)
|
195
189
|
unf_ext
|
196
|
-
unf (0.1.4-java)
|
197
190
|
unf_ext (0.0.8.2)
|
198
191
|
unicode-display_width (2.4.2)
|
199
192
|
vcr (6.1.0)
|
data/README.md
CHANGED
@@ -3,14 +3,14 @@
|
|
3
3
|
<h4 align="center">
|
4
4
|
<a href="https://www.boxcars.ai">Website</a> |
|
5
5
|
<a href="https://www.boxcars.ai/blog">Blog</a> |
|
6
|
-
<a href="https://github.com/BoxcarsAI/boxcars/wiki">Documentation</a>
|
6
|
+
<a href="https://github.com/BoxcarsAI/boxcars/wiki">Documentation</a>
|
7
7
|
</h4>
|
8
8
|
|
9
9
|
<p align="center">
|
10
10
|
<a href="https://github.com/BoxcarsAI/boxcars/blob/main/LICENSE.txt"><img src="https://img.shields.io/badge/license-MIT-informational" alt="License"></a>
|
11
11
|
</p>
|
12
12
|
|
13
|
-
Boxcars is a gem that enables you to create new systems with AI composability, using various concepts such as OpenAI, Search, SQL, Rails Active Record and more. This can even be extended with your concepts as well (including your concepts).
|
13
|
+
Boxcars is a gem that enables you to create new systems with AI composability, using various concepts such as OpenAI, Search, SQL, Rails Active Record, Vector Search and more. It can also be extended with your own concepts.
|
14
14
|
|
15
15
|
This gem was inspired by the popular Python library Langchain. However, we wanted to give it a Ruby spin and make it more user-friendly for beginners to get started.
|
16
16
|
|
@@ -57,6 +57,9 @@ require "boxcars"
|
|
57
57
|
Note: if you want to try out the examples below, run this command and then paste in the code segments of interest:
|
58
58
|
```bash
|
59
59
|
irb -r dotenv/load -r boxcars
|
60
|
+
|
61
|
+
# or if you prefer local repository
|
62
|
+
irb -r dotenv/load -r ./lib/boxcars
|
60
63
|
```
|
61
64
|
|
62
65
|
### Direct Boxcar Use
|
@@ -107,7 +110,7 @@ Produces:
|
|
107
110
|
```text
|
108
111
|
> Entering Zero Shot#run
|
109
112
|
What is pi times the square root of the average temperature in Austin TX in January?
|
110
|
-
Thought: We need to find the average temperature in Austin TX in January and then multiply it by pi and the square root of the average temperature. We can use a search engine to find the average temperature in Austin TX in January and a calculator to perform the multiplication.
|
113
|
+
Thought: We need to find the average temperature in Austin TX in January and then multiply it by pi and the square root of the average temperature. We can use a search engine to find the average temperature in Austin TX in January and a calculator to perform the multiplication.
|
111
114
|
Question: Average temperature in Austin TX in January
|
112
115
|
Answer: January Weather in Austin Texas, United States. Daily high temperatures increase by 2°F, from 62°F to 64°F, rarely falling below 45°F or exceeding 76° ...
|
113
116
|
Observation: January Weather in Austin Texas, United States. Daily high temperatures increase by 2°F, from 62°F to 64°F, rarely falling below 45°F or exceeding 76° ...
|
@@ -135,7 +138,7 @@ See [this](https://github.com/BoxcarsAI/boxcars/blob/main/notebooks/boxcars_exam
|
|
135
138
|
|
136
139
|
For the Swagger boxcar, see [this](https://github.com/BoxcarsAI/boxcars/blob/main/notebooks/swagger_examples.ipynb) Jupyter Notebook.
|
137
140
|
|
138
|
-
For simple vector storage and search, see [this](https://github.com/BoxcarsAI/boxcars/blob/main/notebooks/
|
141
|
+
For simple vector storage and search, see [this](https://github.com/BoxcarsAI/boxcars/blob/main/notebooks/vector_search_examples.ipynb) Jupyter Notebook.
|
139
142
|
|
140
143
|
Note, some folks that we talked to didn't know that you could run Ruby Jupyter notebooks. [You can](https://github.com/SciRuby/iruby).
|
141
144
|
|
data/boxcars.gemspec
CHANGED
@@ -34,8 +34,8 @@ Gem::Specification.new do |spec|
|
|
34
34
|
spec.add_dependency "google_search_results", "~> 2.2"
|
35
35
|
spec.add_dependency "gpt4all", "~> 0.0.4"
|
36
36
|
spec.add_dependency "hnswlib", "~> 0.8"
|
37
|
-
spec.add_dependency "ruby-openai", "~> 4.1"
|
38
37
|
spec.add_dependency "pgvector", "~> 0.2"
|
38
|
+
spec.add_dependency "ruby-openai", "~> 4.1"
|
39
39
|
|
40
40
|
# For more information and examples about making a new gem, checkout our
|
41
41
|
# guide at: https://bundler.io/guides/creating_gem.html
|
@@ -47,7 +47,7 @@ module Boxcars
|
|
47
47
|
def get_search_content(question, count: 1)
|
48
48
|
search = Boxcars::VectorSearch.new(embeddings: embeddings, vector_documents: vector_documents)
|
49
49
|
results = search.call query: question, count: count
|
50
|
-
@search_content =
|
50
|
+
@search_content = get_results_content(results)
|
51
51
|
end
|
52
52
|
|
53
53
|
# our template
|
@@ -4,6 +4,20 @@
|
|
4
4
|
module Boxcars
|
5
5
|
# For Boxcars that use an engine to do their work.
|
6
6
|
class VectorSearch
|
7
|
+
# initialize the vector search with the following parameters:
|
8
|
+
# @param params [Hash] A Hash containing the initial configuration.
|
9
|
+
# @option params [Hash] :vector_documents The vector documents to search.
|
10
|
+
# example:
|
11
|
+
# {
|
12
|
+
# type: :in_memory,
|
13
|
+
# vector_store: [
|
14
|
+
# Boxcars::VectorStore::Document.new(
|
15
|
+
# content: "hello",
|
16
|
+
# embedding: [0.1, 0.2, 0.3],
|
17
|
+
# metadata: { a: 1 }
|
18
|
+
# )
|
19
|
+
# ]
|
20
|
+
# }
|
7
21
|
def initialize(params)
|
8
22
|
@vector_documents = params[:vector_documents]
|
9
23
|
@embedding_tool = params[:embedding_tool] || :openai
|
@@ -11,6 +25,20 @@ module Boxcars
|
|
11
25
|
@openai_connection = params[:openai_connection] || default_connection(openai_access_token: params[:openai_access_token])
|
12
26
|
end
|
13
27
|
|
28
|
+
# @param query [String] The query to search for.
|
29
|
+
# @param count [Integer] The number of results to return.
|
30
|
+
# @return [Array] array of hashes with :document and :distance keys
|
31
|
+
# @example
|
32
|
+
# [
|
33
|
+
# {
|
34
|
+
# document: Boxcars::VectorStore::Document.new(
|
35
|
+
# content: "hello",
|
36
|
+
# embedding: [0.1, 0.2, 0.3],
|
37
|
+
# metadata: { a: 1 }
|
38
|
+
# ),
|
39
|
+
# distance: 0.1
|
40
|
+
# }
|
41
|
+
# ]
|
14
42
|
def call(query:, count: 1)
|
15
43
|
validate_query(query)
|
16
44
|
query_vector = convert_query_to_vector(query)
|
@@ -16,13 +16,10 @@ module Boxcars
|
|
16
16
|
|
17
17
|
def initialize(params)
|
18
18
|
@split_chunk_size = params[:split_chunk_size] || 2000
|
19
|
-
@
|
20
|
-
|
19
|
+
@base_dir_path, @index_file_path, @json_doc_file_path =
|
20
|
+
validate_params(params[:training_data_path], params[:index_file_path], split_chunk_size)
|
21
21
|
|
22
|
-
|
23
|
-
|
24
|
-
@json_doc_file_path = absolute_json_doc_file_path(@index_file_path, params[:json_doc_file_path])
|
25
|
-
@force_rebuild = params.key?(:force_rebuild) ? params[:force_rebuild] : true
|
22
|
+
@force_rebuild = params[:force_rebuild] || false
|
26
23
|
@hnsw_vectors = []
|
27
24
|
end
|
28
25
|
|
@@ -50,24 +47,29 @@ module Boxcars
|
|
50
47
|
|
51
48
|
private
|
52
49
|
|
53
|
-
attr_reader :training_data_path, :index_file_path, :
|
50
|
+
attr_reader :training_data_path, :index_file_path, :base_dir_path,
|
51
|
+
:split_chunk_size, :json_doc_file_path, :force_rebuild, :hnsw_vectors
|
54
52
|
|
55
53
|
def validate_params(training_data_path, index_file_path, split_chunk_size)
|
56
|
-
|
54
|
+
validate_string(training_data_path, 'training_data_path')
|
55
|
+
validate_string(index_file_path, 'index_file_path')
|
56
|
+
|
57
|
+
absolute_data_path = File.absolute_path(training_data_path)
|
58
|
+
base_data_dir_path = File.dirname(absolute_data_path.gsub(/\*{1,2}/, ''))
|
59
|
+
@training_data_path = training_data_path
|
57
60
|
|
58
|
-
raise_argument_error('training_data_path parent directory must exist') unless File.directory?(
|
59
|
-
raise_argument_error('No files found at the training_data_path pattern') if Dir.glob(
|
61
|
+
raise_argument_error('training_data_path parent directory must exist') unless File.directory?(base_data_dir_path)
|
62
|
+
raise_argument_error('No files found at the training_data_path pattern') if Dir.glob(absolute_data_path).empty?
|
60
63
|
|
61
|
-
|
64
|
+
absolute_index_path = File.absolute_path(index_file_path)
|
65
|
+
index_parent_dir = File.dirname(absolute_index_path)
|
62
66
|
|
63
|
-
raise_argument_error('index_file_path parent directory must exist') unless File.directory?(
|
67
|
+
raise_argument_error('index_file_path parent directory must exist') unless File.directory?(index_parent_dir)
|
64
68
|
raise_argument_error('split_chunk_size must be an integer') unless split_chunk_size.is_a?(Integer)
|
65
|
-
end
|
66
69
|
|
67
|
-
|
68
|
-
return index_file_path.gsub(/\.bin$/, '.json') unless json_doc_file_path
|
70
|
+
json_doc_file_path = index_file_path.gsub(/\.bin$/, '.json')
|
69
71
|
|
70
|
-
|
72
|
+
[index_parent_dir, index_file_path, json_doc_file_path]
|
71
73
|
end
|
72
74
|
|
73
75
|
def add_vectors(vectors, texts)
|
@@ -80,6 +82,7 @@ module Boxcars
|
|
80
82
|
dim: vector[:dim],
|
81
83
|
metric: 'l2',
|
82
84
|
max_item: 10000,
|
85
|
+
base_dir_path: base_dir_path,
|
83
86
|
index_file_path: index_file_path,
|
84
87
|
json_doc_file_path: json_doc_file_path
|
85
88
|
}
|
@@ -94,6 +97,7 @@ module Boxcars
|
|
94
97
|
|
95
98
|
def load_existing_vector_store
|
96
99
|
Boxcars::VectorStore::Hnswlib::LoadFromDisk.call(
|
100
|
+
base_dir_path: base_dir_path,
|
97
101
|
index_file_path: index_file_path,
|
98
102
|
json_doc_file_path: json_doc_file_path
|
99
103
|
)
|
@@ -10,11 +10,13 @@ module Boxcars
|
|
10
10
|
class LoadFromDisk
|
11
11
|
include VectorStore
|
12
12
|
|
13
|
+
# params:
|
14
|
+
# base_dir_path: string (absolute path to the directory containing the index_file_path and json_doc_file_path),
|
15
|
+
# index_file_path: string (relative path to the index file from the base_dir_path),
|
16
|
+
# json_doc_file_path: string (relative path to the json file from the base_dir_path)
|
13
17
|
def initialize(params)
|
14
|
-
|
15
|
-
|
16
|
-
@index_file_path = File.absolute_path(params[:index_file_path])
|
17
|
-
@json_doc_file_path = File.absolute_path(params[:json_doc_file_path])
|
18
|
+
@base_dir_path, @index_file_path, @json_doc_file_path =
|
19
|
+
validate_params(params)
|
18
20
|
end
|
19
21
|
|
20
22
|
def call
|
@@ -29,14 +31,34 @@ module Boxcars
|
|
29
31
|
|
30
32
|
private
|
31
33
|
|
32
|
-
attr_reader :index_file_path, :json_doc_file_path
|
34
|
+
attr_reader :base_dir_path, :index_file_path, :json_doc_file_path
|
35
|
+
|
36
|
+
def validate_params(params)
|
37
|
+
base_dir_path = params[:base_dir_path]
|
38
|
+
index_file_path = remove_relative_path(params[:index_file_path])
|
39
|
+
json_doc_file_path = remove_relative_path(params[:json_doc_file_path])
|
40
|
+
# we omit base_dir validation in case of loading the data from other environments
|
41
|
+
validate_string(index_file_path, "index_file_path")
|
42
|
+
validate_string(json_doc_file_path, "json_doc_file_path")
|
43
|
+
|
44
|
+
absolute_index_path = validate_file_existence(base_dir_path, index_file_path, "index_file_path")
|
45
|
+
abosolute_json_path = validate_file_existence(base_dir_path, json_doc_file_path, "json_doc_file_path")
|
46
|
+
|
47
|
+
[base_dir_path, absolute_index_path, abosolute_json_path]
|
48
|
+
end
|
49
|
+
|
50
|
+
def remove_relative_path(path)
|
51
|
+
path.start_with?('./') ? path[2..] : path
|
52
|
+
end
|
53
|
+
|
54
|
+
def validate_file_existence(base_dir, file_path, name)
|
55
|
+
file =
|
56
|
+
base_dir.to_s.empty? ? file_path : File.join(base_dir, file_path)
|
57
|
+
complete_path = File.absolute_path(file)
|
33
58
|
|
34
|
-
|
35
|
-
raise_argument_error("index_file_path must be a string") unless index_file_path.is_a?(String)
|
36
|
-
raise_argument_error("json_doc_file_path must be a string") unless json_doc_file_path.is_a?(String)
|
59
|
+
raise raise_argument_error("#{name} does not exist at #{complete_path}") unless File.exist?(complete_path)
|
37
60
|
|
38
|
-
|
39
|
-
raise_argument_error("json_doc_file_path must exist") unless File.exist?(json_doc_file_path)
|
61
|
+
complete_path
|
40
62
|
end
|
41
63
|
|
42
64
|
def load_as_hnsw_vectors(vectors)
|
@@ -47,7 +69,11 @@ module Boxcars
|
|
47
69
|
embedding: vector[:embedding],
|
48
70
|
metadata: vector[:metadata]
|
49
71
|
)
|
50
|
-
|
72
|
+
if vector[:metadata][:doc_id]
|
73
|
+
hnsw_vectors[vector[:metadata][:doc_id]] = hnsw_vector
|
74
|
+
else
|
75
|
+
hnsw_vectors << hnsw_vector
|
76
|
+
end
|
51
77
|
end
|
52
78
|
hnsw_vectors
|
53
79
|
end
|
@@ -9,19 +9,35 @@ module Boxcars
|
|
9
9
|
class Search
|
10
10
|
include VectorStore
|
11
11
|
|
12
|
+
# initialize the vector store search with the following parameters:
|
13
|
+
# @param params [Hash] A Hash containing the initial configuration.
|
14
|
+
# example:
|
15
|
+
# {
|
16
|
+
# type: :hnswlib,
|
17
|
+
# vector_store: [
|
18
|
+
# Boxcars::VectorStore::Document.new(
|
19
|
+
# content: "hello",
|
20
|
+
# embedding: [0.1, 0.2, 0.3],
|
21
|
+
# metadata: { a: 1 }
|
22
|
+
# )
|
23
|
+
# ]
|
24
|
+
# }
|
12
25
|
def initialize(params)
|
13
|
-
validate_params(params[:vector_documents])
|
14
|
-
@
|
15
|
-
@search_index = load_index(
|
26
|
+
@vector_store = validate_params(params[:vector_documents])
|
27
|
+
@metadata, @index_file = validate_files(vector_store)
|
28
|
+
@search_index = load_index(metadata, index_file)
|
16
29
|
end
|
17
30
|
|
31
|
+
# @param query_vector [Array] The query vector to search for.
|
32
|
+
# @param count [Integer] The number of results to return.
|
33
|
+
# @return [Array] array of hashes with :document and :distance keys
|
18
34
|
def call(query_vector:, count: 1)
|
19
35
|
search(query_vector, count)
|
20
36
|
end
|
21
37
|
|
22
38
|
private
|
23
39
|
|
24
|
-
attr_reader :
|
40
|
+
attr_reader :vector_store, :index_file, :search_index, :metadata
|
25
41
|
|
26
42
|
def validate_params(vector_documents)
|
27
43
|
raise_argument_error('vector_documents is nil') unless vector_documents
|
@@ -34,27 +50,47 @@ module Boxcars
|
|
34
50
|
raise_arugment_error('vector_store must be an array of Document objects')
|
35
51
|
end
|
36
52
|
|
37
|
-
|
53
|
+
vector_documents[:vector_store]
|
38
54
|
end
|
39
55
|
|
40
|
-
def
|
41
|
-
|
42
|
-
|
56
|
+
def validate_files(vector_store)
|
57
|
+
metadata = vector_store.first.metadata
|
58
|
+
raise_arugment_error('metadata must be a hash') unless metadata.is_a?(Hash)
|
59
|
+
raise_arugment_error('metadata is empty') if metadata.empty?
|
43
60
|
|
61
|
+
validate_string(metadata[:index_file_path], "index_file_path")
|
62
|
+
validate_string(metadata[:json_doc_file_path], "json_doc_file_path")
|
63
|
+
|
64
|
+
base_dir = metadata[:base_dir_path]
|
65
|
+
index_file_file_path = metadata[:index_file_path]
|
66
|
+
index_file =
|
67
|
+
if !index_file_file_path.to_s.empty? && File.exist?(index_file_file_path)
|
68
|
+
index_file_file_path
|
69
|
+
else
|
70
|
+
File.join(base_dir.to_s, index_file_file_path.to_s)
|
71
|
+
end
|
72
|
+
|
73
|
+
raise_argument_error('index_file does not exist') unless File.exist?(index_file)
|
74
|
+
|
75
|
+
[metadata, index_file]
|
76
|
+
end
|
77
|
+
|
78
|
+
def load_index(metadata, index_file)
|
44
79
|
search_index = ::Hnswlib::HierarchicalNSW.new(
|
45
80
|
space: metadata[:metric],
|
46
81
|
dim: metadata[:dim]
|
47
82
|
)
|
48
|
-
search_index.load_index(
|
49
|
-
@search_index = search_index
|
50
|
-
@vector_store = vector_documents[:vector_store]
|
51
|
-
|
83
|
+
search_index.load_index(index_file)
|
52
84
|
search_index
|
53
85
|
end
|
54
86
|
|
55
87
|
def search(query_vector, num_neighbors)
|
56
88
|
raw_results = search_index.search_knn(query_vector, num_neighbors)
|
57
|
-
|
89
|
+
|
90
|
+
raw_results.map { |doc_id, distance| lookup_embedding(doc_id, distance) }
|
91
|
+
.compact
|
92
|
+
.first(num_neighbors)
|
93
|
+
.sort_by { |result| result[:distance] }
|
58
94
|
rescue StandardError => e
|
59
95
|
raise_argument_error("Error searching for #{query_vector}: #{e.message}")
|
60
96
|
end
|
@@ -0,0 +1,75 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Boxcars
|
4
|
+
module VectorStore
|
5
|
+
module InMemory
|
6
|
+
class BuildFromArray
|
7
|
+
include VectorStore
|
8
|
+
|
9
|
+
# @param embedding_tool [Symbol] :openai or other embedding tools
|
10
|
+
# @param input_array [Array] array of hashes with :content and :metadata keys
|
11
|
+
# each hash item should have content and metadata
|
12
|
+
# [
|
13
|
+
# { content: "hello", metadata: { a: 1 } },
|
14
|
+
# { content: "hi", metadata: { a: 1 } },
|
15
|
+
# { content: "bye", metadata: { a: 1 } },
|
16
|
+
# { content: "what's this", metadata: { a: 1 } }
|
17
|
+
# ]
|
18
|
+
# @return [Hash] vector_store: array of hashes with :content, :metadata, and :embedding keys
|
19
|
+
def initialize(embedding_tool: :openai, input_array: nil)
|
20
|
+
validate_params(embedding_tool, input_array)
|
21
|
+
@embedding_tool = embedding_tool
|
22
|
+
@input_array = input_array
|
23
|
+
@memory_vectors = []
|
24
|
+
end
|
25
|
+
|
26
|
+
# @return [Hash] vector_store: array of Boxcars::VectorStore::Document
|
27
|
+
def call
|
28
|
+
texts = input_array.map { |doc| doc[:content] }
|
29
|
+
vectors = generate_vectors(texts)
|
30
|
+
add_vectors(vectors, input_array)
|
31
|
+
|
32
|
+
{
|
33
|
+
type: :in_memory,
|
34
|
+
vector_store: memory_vectors
|
35
|
+
}
|
36
|
+
end
|
37
|
+
|
38
|
+
private
|
39
|
+
|
40
|
+
attr_reader :input_array, :memory_vectors
|
41
|
+
|
42
|
+
def validate_params(embedding_tool, input_array)
|
43
|
+
raise_argument_error('input_array is nil') unless input_array
|
44
|
+
raise_argument_error('input_array must be an array') unless input_array.is_a?(Array)
|
45
|
+
unless proper_document_array?(input_array)
|
46
|
+
raise_argument_error('items in input_array needs to have content and metadata')
|
47
|
+
end
|
48
|
+
|
49
|
+
return if %i[openai tensorflow].include?(embedding_tool)
|
50
|
+
|
51
|
+
raise_argument_error('embedding_tool is invalid')
|
52
|
+
end
|
53
|
+
|
54
|
+
def proper_document_array?(input_array)
|
55
|
+
return false unless
|
56
|
+
input_array.all? { |hash| hash.key?(:content) && hash.key?(:metadata) }
|
57
|
+
|
58
|
+
true
|
59
|
+
end
|
60
|
+
|
61
|
+
# returns array of documents with vectors
|
62
|
+
def add_vectors(vectors, input_array)
|
63
|
+
vectors.zip(input_array).each do |vector, doc|
|
64
|
+
memory_vector = Document.new(
|
65
|
+
content: doc[:content],
|
66
|
+
embedding: vector[:embedding],
|
67
|
+
metadata: doc[:metadata].merge(dim: vector[:dim])
|
68
|
+
)
|
69
|
+
@memory_vectors << memory_vector
|
70
|
+
end
|
71
|
+
end
|
72
|
+
end
|
73
|
+
end
|
74
|
+
end
|
75
|
+
end
|
@@ -6,6 +6,12 @@ module Boxcars
|
|
6
6
|
class BuildFromFiles
|
7
7
|
include VectorStore
|
8
8
|
|
9
|
+
# initialize the vector store with the following parameters:
|
10
|
+
# @param params [Hash] A Hash containing the initial configuration.
|
11
|
+
# @option params [Symbol] :embedding_tool The embedding tool to use.
|
12
|
+
# @option params [String] :training_data_path The path to the training data files.
|
13
|
+
# @option params [Integer] :split_chunk_size The number of characters to split the text into.
|
14
|
+
# @return [Hash] vector_store: array of hashes with :content, :metadata, and :embedding keys
|
9
15
|
def initialize(params)
|
10
16
|
@split_chunk_size = params[:split_chunk_size] || 2000
|
11
17
|
@training_data_path = File.absolute_path(params[:training_data_path])
|
@@ -15,6 +21,7 @@ module Boxcars
|
|
15
21
|
@memory_vectors = []
|
16
22
|
end
|
17
23
|
|
24
|
+
# @return [Hash] vector_store: array of hashes with :content, :metadata, and :embedding keys
|
18
25
|
def call
|
19
26
|
data = load_data_files(training_data_path)
|
20
27
|
texts = split_text_into_chunks(data)
|
@@ -6,6 +6,10 @@ module Boxcars
|
|
6
6
|
class Search
|
7
7
|
include VectorStore
|
8
8
|
|
9
|
+
# initialize the vector store InMemory::Search with the following parameters:
|
10
|
+
# @param params [Hash] A Hash containing the initial configuration.
|
11
|
+
# @option params [Hash] :vector_documents The vector documents to search.
|
12
|
+
# @option params [Hash] :vector_store The vector store to search.
|
9
13
|
def initialize(params)
|
10
14
|
validate_params(params[:vector_documents])
|
11
15
|
@vector_documents = params[:vector_documents]
|
@@ -7,15 +7,24 @@ module Boxcars
|
|
7
7
|
class BuildFromArray
|
8
8
|
include VectorStore
|
9
9
|
|
10
|
-
#
|
11
|
-
#
|
12
|
-
#
|
13
|
-
#
|
14
|
-
#
|
15
|
-
#
|
16
|
-
#
|
17
|
-
#
|
18
|
-
# }
|
10
|
+
# initialize the vector store with the following parameters:
|
11
|
+
#
|
12
|
+
# @param params [Hash] A Hash containing the initial configuration.
|
13
|
+
#
|
14
|
+
# @option params [Symbol] :embedding_tool The embedding tool to use. Must be provided.
|
15
|
+
# @option params [Array] :input_array The array of inputs to use for the embedding tool. Must be provided.
|
16
|
+
# each hash item should have content and metadata
|
17
|
+
# [
|
18
|
+
# { content: "hello", metadata: { a: 1 } },
|
19
|
+
# { content: "hi", metadata: { a: 1 } },
|
20
|
+
# { content: "bye", metadata: { a: 1 } },
|
21
|
+
# { content: "what's this", metadata: { a: 1 } }
|
22
|
+
# ]
|
23
|
+
# @option params [String] :database_url The URL of the database where embeddings are stored. Must be provided.
|
24
|
+
# @option params [String] :table_name The name of the database table where embeddings are stored. Must be provided.
|
25
|
+
# @option params [String] :embedding_column_name The name of the database column where embeddings are stored. Must be provided.
|
26
|
+
# @option params [String] :content_column_name The name of the database column where content is stored. Must be provided.
|
27
|
+
# @option params [String] :metadata_column_name The name of the database column where metadata is stored. Must be provided.
|
19
28
|
def initialize(params)
|
20
29
|
@embedding_tool = params[:embedding_tool] || :openai
|
21
30
|
|
@@ -31,10 +40,11 @@ module Boxcars
|
|
31
40
|
@pg_vectors = []
|
32
41
|
end
|
33
42
|
|
43
|
+
# @return [Hash] vector_store: array of hashes with :content, :metadata, and :embedding keys
|
34
44
|
def call
|
35
|
-
texts = input_array
|
45
|
+
texts = input_array.map { |doc| doc[:content] }
|
36
46
|
vectors = generate_vectors(texts)
|
37
|
-
add_vectors(vectors,
|
47
|
+
add_vectors(vectors, input_array)
|
38
48
|
documents = save_vector_store
|
39
49
|
|
40
50
|
{
|
@@ -51,15 +61,18 @@ module Boxcars
|
|
51
61
|
|
52
62
|
def validate_params(embedding_tool, input_array)
|
53
63
|
raise_argument_error('input_array is nil') unless input_array
|
64
|
+
raise_argument_error('input_array must be an array') unless input_array.is_a?(Array)
|
65
|
+
raise_argument_error('items in input_array needs to have content and metadata') unless proper_input_array?(input_array)
|
54
66
|
return if %i[openai tensorflow].include?(embedding_tool)
|
55
67
|
|
56
68
|
raise_argument_error('embedding_tool is invalid') unless %i[openai tensorflow].include?(embedding_tool)
|
69
|
+
end
|
57
70
|
|
58
|
-
|
59
|
-
|
71
|
+
def proper_input_array?(input_array)
|
72
|
+
return false unless
|
73
|
+
input_array.all? { |hash| hash.key?(:content) && hash.key?(:metadata) }
|
60
74
|
|
61
|
-
|
62
|
-
end
|
75
|
+
true
|
63
76
|
end
|
64
77
|
|
65
78
|
def add_vectors(vectors, texts)
|
@@ -10,15 +10,15 @@ module Boxcars
|
|
10
10
|
class BuildFromFiles
|
11
11
|
include VectorStore
|
12
12
|
|
13
|
-
#
|
14
|
-
#
|
15
|
-
#
|
16
|
-
#
|
17
|
-
#
|
18
|
-
#
|
19
|
-
#
|
20
|
-
#
|
21
|
-
#
|
13
|
+
# @param training_data_path [String] path to training data files
|
14
|
+
# @param split_chunk_size [Integer] number of characters to split the text into
|
15
|
+
# @param embedding_tool [Symbol] embedding tool to use
|
16
|
+
# @param database_url [String] database url
|
17
|
+
# @param table_name [String] table name
|
18
|
+
# @param embedding_column_name [String] embedding column name
|
19
|
+
# @param content_column_name [String] content column name
|
20
|
+
# @param metadata_column_name [String] metadata column name
|
21
|
+
# @return [Hash] vector_store: array of hashes with :content, :metadata, and :embedding keys
|
22
22
|
def initialize(params)
|
23
23
|
@split_chunk_size = params[:split_chunk_size] || 2000
|
24
24
|
@training_data_path = File.absolute_path(params[:training_data_path])
|
@@ -35,6 +35,7 @@ module Boxcars
|
|
35
35
|
@pg_vectors = []
|
36
36
|
end
|
37
37
|
|
38
|
+
# @return [Hash] vector_store: array of Boxcars::VectorStore::Document
|
38
39
|
def call
|
39
40
|
data = load_data_files(training_data_path)
|
40
41
|
texts = split_text_into_chunks(data)
|
@@ -57,7 +58,7 @@ module Boxcars
|
|
57
58
|
def validate_params(embedding_tool, training_data_path)
|
58
59
|
training_data_dir = File.dirname(training_data_path.gsub(/\*{1,2}/, ''))
|
59
60
|
|
60
|
-
raise_argument_error('training_data_path parent directory must exist') unless
|
61
|
+
raise_argument_error('training_data_path parent directory must exist') unless Dir.exist?(training_data_dir)
|
61
62
|
raise_argument_error('No files found at the training_data_path pattern') if Dir.glob(training_data_path).empty?
|
62
63
|
return if %i[openai tensorflow].include?(embedding_tool)
|
63
64
|
|
@@ -9,15 +9,14 @@ module Boxcars
|
|
9
9
|
class SaveToDatabase
|
10
10
|
include VectorStore
|
11
11
|
|
12
|
-
#
|
13
|
-
#
|
14
|
-
#
|
15
|
-
#
|
16
|
-
#
|
17
|
-
#
|
18
|
-
#
|
12
|
+
# @param pg_vectors [Array] array of Boxcars::VectorStore::Document
|
13
|
+
# @param database_url [String] database url
|
14
|
+
# @param table_name [String] table name
|
15
|
+
# @param embedding_column_name [String] embedding column name
|
16
|
+
# @param content_column_name [String] content column name
|
17
|
+
# @param metadata_column_name [String] metadata column name
|
18
|
+
# @return [Array] array of Boxcars::VectorStore::Document
|
19
19
|
def initialize(params)
|
20
|
-
@errors = []
|
21
20
|
validate_param_types(params)
|
22
21
|
@db_connection = test_db_params(params)
|
23
22
|
|
@@ -29,9 +28,8 @@ module Boxcars
|
|
29
28
|
@pg_vectors = params[:pg_vectors]
|
30
29
|
end
|
31
30
|
|
31
|
+
# @return [Array] array of Boxcars::VectorStore::Document
|
32
32
|
def call
|
33
|
-
return { success: false, error: errors } unless errors.empty?
|
34
|
-
|
35
33
|
add_vectors_to_database
|
36
34
|
end
|
37
35
|
|
@@ -39,7 +37,7 @@ module Boxcars
|
|
39
37
|
|
40
38
|
attr_reader :database_url, :pg_vectors, :db_connection, :table_name,
|
41
39
|
:embedding_column_name, :content_column_name,
|
42
|
-
:metadata_column_name
|
40
|
+
:metadata_column_name
|
43
41
|
|
44
42
|
def validate_param_types(params)
|
45
43
|
pg_vectors = params[:pg_vectors]
|
@@ -9,17 +9,21 @@ module Boxcars
|
|
9
9
|
class Search
|
10
10
|
include VectorStore
|
11
11
|
|
12
|
-
#
|
12
|
+
# initialize the vector store with the following parameters:
|
13
|
+
# @param params [Hash] A Hash containing the initial configuration.
|
14
|
+
# @option params [Hash] :vector_documents The vector documents to search.
|
15
|
+
# example:
|
13
16
|
# {
|
14
17
|
# type: :pgvector,
|
15
18
|
# vector_store: {
|
16
|
-
#
|
17
|
-
#
|
18
|
-
#
|
19
|
-
#
|
20
|
-
# metadata_column_name: metadata_column_name
|
19
|
+
# table_name: "vector_store",
|
20
|
+
# embedding_column_name: "embedding",
|
21
|
+
# content_column_name: "content",
|
22
|
+
# database_url: ENV['DATABASE_URL']
|
21
23
|
# }
|
22
24
|
# }
|
25
|
+
#
|
26
|
+
# @option params [Hash] :vector_store The vector store to search.
|
23
27
|
def initialize(params)
|
24
28
|
vector_store = validate_params(params)
|
25
29
|
db_url = validate_vector_store(vector_store)
|
@@ -28,6 +32,20 @@ module Boxcars
|
|
28
32
|
@vector_documents = params[:vector_documents]
|
29
33
|
end
|
30
34
|
|
35
|
+
# @param query_vector [Array] The query vector to search for.
|
36
|
+
# @param count [Integer] The number of results to return.
|
37
|
+
# @return [Array] array of hashes with :document and :distance keys
|
38
|
+
# @example
|
39
|
+
# [
|
40
|
+
# {
|
41
|
+
# document: Boxcars::VectorStore::Document.new(
|
42
|
+
# content: "hello",
|
43
|
+
# embedding: [0.1, 0.2, 0.3],
|
44
|
+
# metadata: { a: 1 }
|
45
|
+
# ),
|
46
|
+
# distance: 0.1
|
47
|
+
# }
|
48
|
+
# ]
|
31
49
|
def call(query_vector:, count: 1)
|
32
50
|
raise ::Boxcars::ArgumentError, 'query_vector is empty' if query_vector.empty?
|
33
51
|
|
data/lib/boxcars/vector_store.rb
CHANGED
@@ -54,7 +54,7 @@ module Boxcars
|
|
54
54
|
|
55
55
|
file_content = File.read(file_path)
|
56
56
|
JSON.parse(file_content, symbolize_names: true)
|
57
|
-
rescue JSON::ParserError => e
|
57
|
+
rescue JSON::ParserError, Errno::ENOENT => e
|
58
58
|
raise_argument_error("Error parsing #{file_path}: #{e.message}")
|
59
59
|
end
|
60
60
|
|
@@ -80,6 +80,11 @@ module Boxcars
|
|
80
80
|
end
|
81
81
|
docs
|
82
82
|
end
|
83
|
+
|
84
|
+
def validate_string(value, name)
|
85
|
+
raise raise_argument_error("#{name} must be a string") unless value.is_a?(String)
|
86
|
+
raise raise_argument_error("#{name} is empty") if value.empty?
|
87
|
+
end
|
83
88
|
end
|
84
89
|
end
|
85
90
|
|
@@ -92,7 +97,7 @@ require_relative "vector_store/hnswlib/save_to_hnswlib"
|
|
92
97
|
require_relative "vector_store/hnswlib/build_from_files"
|
93
98
|
require_relative "vector_store/hnswlib/search"
|
94
99
|
require_relative "vector_store/in_memory/build_from_files"
|
95
|
-
require_relative "vector_store/in_memory/build_from_document_array"
|
100
|
+
require_relative "vector_store/in_memory/build_from_array"
|
96
101
|
require_relative "vector_store/in_memory/search"
|
97
102
|
require_relative "vector_store/pgvector/build_from_files"
|
98
103
|
require_relative "vector_store/pgvector/build_from_array"
|
data/lib/boxcars/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: boxcars
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.13
|
4
|
+
version: 0.2.14
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Francis Sullivan
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: exe
|
11
11
|
cert_chain: []
|
12
|
-
date: 2023-
|
12
|
+
date: 2023-06-06 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: google_search_results
|
@@ -54,33 +54,33 @@ dependencies:
|
|
54
54
|
- !ruby/object:Gem::Version
|
55
55
|
version: '0.8'
|
56
56
|
- !ruby/object:Gem::Dependency
|
57
|
-
name:
|
57
|
+
name: pgvector
|
58
58
|
requirement: !ruby/object:Gem::Requirement
|
59
59
|
requirements:
|
60
60
|
- - "~>"
|
61
61
|
- !ruby/object:Gem::Version
|
62
|
-
version: '
|
62
|
+
version: '0.2'
|
63
63
|
type: :runtime
|
64
64
|
prerelease: false
|
65
65
|
version_requirements: !ruby/object:Gem::Requirement
|
66
66
|
requirements:
|
67
67
|
- - "~>"
|
68
68
|
- !ruby/object:Gem::Version
|
69
|
-
version: '
|
69
|
+
version: '0.2'
|
70
70
|
- !ruby/object:Gem::Dependency
|
71
|
-
name:
|
71
|
+
name: ruby-openai
|
72
72
|
requirement: !ruby/object:Gem::Requirement
|
73
73
|
requirements:
|
74
74
|
- - "~>"
|
75
75
|
- !ruby/object:Gem::Version
|
76
|
-
version: '
|
76
|
+
version: '4.1'
|
77
77
|
type: :runtime
|
78
78
|
prerelease: false
|
79
79
|
version_requirements: !ruby/object:Gem::Requirement
|
80
80
|
requirements:
|
81
81
|
- - "~>"
|
82
82
|
- !ruby/object:Gem::Version
|
83
|
-
version: '
|
83
|
+
version: '4.1'
|
84
84
|
description: You simply set an OpenAI key, give a number of Boxcars to a Train, and
|
85
85
|
magic ensues when you run it.
|
86
86
|
email:
|
@@ -135,7 +135,7 @@ files:
|
|
135
135
|
- lib/boxcars/vector_store/hnswlib/load_from_disk.rb
|
136
136
|
- lib/boxcars/vector_store/hnswlib/save_to_hnswlib.rb
|
137
137
|
- lib/boxcars/vector_store/hnswlib/search.rb
|
138
|
-
- lib/boxcars/vector_store/in_memory/build_from_document_array.rb
|
138
|
+
- lib/boxcars/vector_store/in_memory/build_from_array.rb
|
139
139
|
- lib/boxcars/vector_store/in_memory/build_from_files.rb
|
140
140
|
- lib/boxcars/vector_store/in_memory/search.rb
|
141
141
|
- lib/boxcars/vector_store/pgvector/build_from_array.rb
|
@@ -1,51 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
module Boxcars
|
4
|
-
module VectorStore
|
5
|
-
module InMemory
|
6
|
-
class BuildFromDocumentArray
|
7
|
-
include VectorStore
|
8
|
-
|
9
|
-
def initialize(embedding_tool: :openai, documents: nil)
|
10
|
-
validate_params(embedding_tool, documents)
|
11
|
-
@embedding_tool = embedding_tool
|
12
|
-
@documents = documents
|
13
|
-
@memory_vectors = []
|
14
|
-
end
|
15
|
-
|
16
|
-
def call
|
17
|
-
texts = documents
|
18
|
-
vectors = generate_vectors(texts)
|
19
|
-
add_vectors(vectors, documents)
|
20
|
-
{
|
21
|
-
type: :in_memory,
|
22
|
-
vector_store: memory_vectors
|
23
|
-
}
|
24
|
-
end
|
25
|
-
|
26
|
-
private
|
27
|
-
|
28
|
-
attr_reader :documents, :memory_vectors
|
29
|
-
|
30
|
-
def validate_params(embedding_tool, documents)
|
31
|
-
raise_argument_error('documents is nil') unless documents
|
32
|
-
return if %i[openai tensorflow].include?(embedding_tool)
|
33
|
-
|
34
|
-
raise_argument_error('embedding_tool is invalid')
|
35
|
-
end
|
36
|
-
|
37
|
-
# returns array of documents with vectors
|
38
|
-
def add_vectors(vectors, documents)
|
39
|
-
vectors.zip(documents).each do |vector, doc|
|
40
|
-
memory_vector = Document.new(
|
41
|
-
content: doc[:content],
|
42
|
-
embedding: vector[:embedding],
|
43
|
-
metadata: doc[:metadata].merge(dim: vector[:dim])
|
44
|
-
)
|
45
|
-
@memory_vectors << memory_vector
|
46
|
-
end
|
47
|
-
end
|
48
|
-
end
|
49
|
-
end
|
50
|
-
end
|
51
|
-
end
|