boxcars 0.2.11 → 0.2.12
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.env_sample +1 -0
- data/.rubocop.yml +16 -0
- data/CHANGELOG.md +12 -0
- data/Gemfile +12 -12
- data/Gemfile.lock +34 -28
- data/README.md +4 -1
- data/boxcars.gemspec +2 -2
- data/lib/boxcars/boxcar/active_record.rb +1 -1
- data/lib/boxcars/boxcar.rb +1 -0
- data/lib/boxcars/engine/openai.rb +8 -1
- data/lib/boxcars/vector_search.rb +66 -2
- data/lib/boxcars/vector_store/document.rb +3 -2
- data/lib/boxcars/vector_store/embed_via_open_ai.rb +2 -2
- data/lib/boxcars/vector_store/hnswlib/build_from_files.rb +100 -0
- data/lib/boxcars/vector_store/hnswlib/load_from_disk.rb +57 -0
- data/lib/boxcars/vector_store/hnswlib/save_to_hnswlib.rb +48 -38
- data/lib/boxcars/vector_store/hnswlib/search.rb +70 -0
- data/lib/boxcars/vector_store/in_memory/build_from_document_array.rb +51 -0
- data/lib/boxcars/vector_store/in_memory/build_from_files.rb +61 -0
- data/lib/boxcars/vector_store/in_memory/search.rb +29 -49
- data/lib/boxcars/vector_store/pgvector/build_from_array.rb +95 -0
- data/lib/boxcars/vector_store/pgvector/build_from_files.rb +97 -0
- data/lib/boxcars/vector_store/pgvector/save_to_database.rb +152 -0
- data/lib/boxcars/vector_store/pgvector/search.rb +144 -0
- data/lib/boxcars/vector_store/split_text.rb +2 -3
- data/lib/boxcars/vector_store.rb +73 -7
- data/lib/boxcars/version.rb +1 -1
- data/lib/boxcars.rb +1 -1
- metadata +14 -10
- data/lib/boxcars/vector_store/hnswlib/build_vector_store.rb +0 -157
- data/lib/boxcars/vector_store/hnswlib/hnswlib_config.rb +0 -56
- data/lib/boxcars/vector_store/hnswlib/hnswlib_search.rb +0 -54
- data/lib/boxcars/vector_store/in_memory/add_documents.rb +0 -67
- data/lib/boxcars/vector_store/similarity_search.rb +0 -55
@@ -0,0 +1,152 @@
|
|
1
|
+
# frozen_string_literal: true

require 'pg'
require 'pgvector'

module Boxcars
  module VectorStore
    module Pgvector
      # Persists Boxcars::VectorStore::Document embeddings into a pgvector-enabled
      # PostgreSQL table. Documents whose metadata carries an :id are upserted
      # (ON CONFLICT (id) DO UPDATE); the rest are plain-inserted.
      class SaveToDatabase
        include VectorStore

        # params = {
        #   pg_vectors: pg_vectors,                        # Array<Boxcars::VectorStore::Document>
        #   database_url: db_url,                          # String, e.g. postgres://...
        #   table_name: table_name,                        # String
        #   embedding_column_name: embedding_column_name,  # String
        #   content_column_name: content_column_name,      # String
        #   metadata_column_name: metadata_column_name     # String
        # }
        # @raise [Boxcars::ArgumentError] when params are malformed or the
        #   database / vector extension / table / columns are not available.
        def initialize(params)
          @errors = []
          validate_param_types(params)
          @db_connection = test_db_params(params)

          @table_name = params[:table_name]
          @content_column_name = params[:content_column_name]
          @embedding_column_name = params[:embedding_column_name]
          @metadata_column_name = params[:metadata_column_name]

          @pg_vectors = params[:pg_vectors]
        end

        # Saves every document to the database.
        # @return [Hash] { success: false, error: [...] } when validation
        #   recorded errors; otherwise the result of the insert loop.
        def call
          return { success: false, error: errors } unless errors.empty?

          add_vectors_to_database
        end

        private

        attr_reader :database_url, :pg_vectors, :db_connection, :table_name,
                    :embedding_column_name, :content_column_name,
                    :metadata_column_name, :errors

        # Validates shape of the input params; raises on the first problem.
        def validate_param_types(params)
          pg_vectors = params[:pg_vectors]

          raise_argument_error('pg_vectors must be an array') unless pg_vectors.is_a?(Array)
          raise_argument_error('missing data') if pg_vectors.empty?
          raise_argument_error('invalid vector_store') unless valid_vector_store?(pg_vectors)
          @database_url = params[:database_url]
          raise_argument_error('missing database_url argument') if @database_url.to_s.empty?
        end

        # True when every element is a Boxcars::VectorStore::Document.
        def valid_vector_store?(pg_vectors)
          pg_vectors.all? do |doc|
            doc.is_a?(Boxcars::VectorStore::Document)
          end
        rescue TypeError => e
          raise_argument_error(e.message)
        end

        # Opens the connection, verifies the schema, and registers the pgvector
        # type maps so Ruby arrays round-trip as vector columns.
        # @return [PG::Connection] a validated, type-mapped connection
        def test_db_params(params)
          conn = ::PG::Connection.new(@database_url)

          check_db_connection(conn)
          check_vector_extension(conn)
          check_table_exists(conn, params[:table_name])
          check_column_exists(conn, params)

          registry = PG::BasicTypeRegistry.new.define_default_types
          ::Pgvector::PG.register_vector(registry)
          conn.type_map_for_queries = PG::BasicTypeMapForQueries.new(conn, registry: registry)
          conn.type_map_for_results = PG::BasicTypeMapForResults.new(conn, registry: registry)
          conn
        rescue PG::Error, NameError => e
          raise_argument_error(e.message)
        end

        def check_db_connection(conn)
          return if conn.status == PG::CONNECTION_OK

          raise_argument_error("PostgreSQL connection is not ok")
        end

        def check_vector_extension(conn)
          return if conn.exec("SELECT 1 FROM pg_extension WHERE extname = 'vector'").any?

          raise_argument_error("PostgreSQL 'vector' extension is not installed")
        end

        def check_table_exists(conn, table_name)
          table_exists = conn.exec_params(
            "SELECT EXISTS (SELECT 1 FROM information_schema.tables WHERE table_name = $1)", [table_name]
          ).getvalue(0, 0) == "t"
          return if table_exists

          raise_argument_error("Table '#{table_name}' does not exist")
        end

        # Verifies the embedding, content, and metadata columns all exist.
        def check_column_exists(conn, params)
          column_names = %i[embedding_column_name content_column_name metadata_column_name]
          table_name = params[:table_name]

          column_names.each do |target|
            column_name = params[target]
            column_exists = conn.exec_params(
              "SELECT EXISTS (SELECT 1 FROM information_schema.columns WHERE table_name = $1 AND column_name = $2)",
              [table_name, column_name]
            ).getvalue(0, 0) == "t"
            next if column_exists

            raise_argument_error("Column '#{column_name}' does not exist in table '#{table_name}'")
          end
        end

        # Inserts (or upserts, when metadata[:id] is present) each document.
        def add_vectors_to_database
          # Identifiers cannot be bound as $n parameters, so quote them with
          # quote_ident to close the SQL-injection hole the raw interpolation
          # left open. Quoting is safe here: check_table_exists /
          # check_column_exists already required the exact stored names.
          table = db_connection.quote_ident(table_name.to_s)
          embedding_col = db_connection.quote_ident(embedding_column_name.to_s)
          content_col = db_connection.quote_ident(content_column_name.to_s)
          metadata_col = db_connection.quote_ident(metadata_column_name.to_s)

          pg_vectors.each do |document|
            embedding = document.embedding.map(&:to_f)
            content = document.content
            metadata = document.metadata.to_json

            if (id = document.metadata[:id])
              sql = <<-SQL
                INSERT INTO #{table} (id, #{embedding_col}, #{content_col}, #{metadata_col})
                VALUES ($1, $2, $3, $4)
                ON CONFLICT (id) DO UPDATE
                SET #{embedding_col} = EXCLUDED.#{embedding_col},
                    #{content_col} = EXCLUDED.#{content_col},
                    #{metadata_col} = EXCLUDED.#{metadata_col}
              SQL
              # Values are bound separately from the command text, so they
              # cannot be interpreted as SQL.
              db_connection.exec_params(sql, [id, embedding, content, metadata])
            else
              sql = <<-SQL
                INSERT INTO #{table} (#{embedding_col}, #{content_col}, #{metadata_col})
                VALUES ($1, $2, $3)
              SQL
              db_connection.exec_params(sql, [embedding, content, metadata])
            end
          end
        rescue PG::Error => e
          raise_argument_error(e.message)
        end
      end
    end
  end
end
|
@@ -0,0 +1,144 @@
|
|
1
|
+
# frozen_string_literal: true

require 'pg'
require 'json'

module Boxcars
  module VectorStore
    module Pgvector
      # Nearest-neighbor search over a pgvector-enabled PostgreSQL table using
      # the `<->` (L2 distance) operator.
      class Search
        include VectorStore

        # required params:
        # {
        #   type: :pgvector,
        #   vector_store: {
        #     database_url: database_url,
        #     table_name: table_name,
        #     embedding_column_name: embedding_column_name,
        #     content_column_name: content_column_name,
        #     metadata_column_name: metadata_column_name
        #   }
        # }
        # @raise [Boxcars::ArgumentError] when params are malformed or the
        #   database / table / columns are not available.
        def initialize(params)
          vector_store = validate_params(params)
          db_url = validate_vector_store(vector_store)
          @db_connection = test_db(db_url)

          @vector_documents = params[:vector_documents]
        end

        # Runs the similarity search.
        # @param query_vector [Array<Numeric>] the embedding to search near
        # @param count [Integer] number of neighbors to return
        # @return [Array<Hash>] [{ document: Document, distance: Float }, ...]
        def call(query_vector:, count: 1)
          raise ::Boxcars::ArgumentError, 'query_vector is empty' if query_vector.empty?

          search(query_vector, count)
        end

        private

        attr_reader :vector_documents, :vector_store, :db_connection,
                    :table_name, :embedding_column_name, :content_column_name

        # BUG FIX: the original called the undefined `raise_arugment_error`,
        # so malformed params raised NoMethodError instead of
        # Boxcars::ArgumentError. All calls corrected to raise_argument_error.
        def validate_params(params)
          @vector_documents = params[:vector_documents]

          raise_argument_error('vector_documents is nil') unless vector_documents
          raise_argument_error('vector_documents must be a hash') unless vector_documents.is_a?(Hash)
          raise_argument_error('type must be pgvector') unless vector_documents[:type] == :pgvector

          @vector_store = vector_documents[:vector_store]
          @vector_store
        end

        # Validates the nested vector_store config and returns the database URL.
        def validate_vector_store(vector_store)
          raise_argument_error('vector_store is nil') unless vector_store
          raise_argument_error('vector_store must be a hash') unless vector_store.is_a?(Hash)
          raise_argument_error('vector_store must have a table_name') unless vector_store[:table_name]
          raise_argument_error('vector_store must have a embedding_column_name') unless vector_store[:embedding_column_name]
          raise_argument_error('vector_store must have a content_column_name') unless vector_store[:content_column_name]
          raise_argument_error('missing DATABASE_URL') unless vector_store[:database_url]

          vector_store[:database_url]
        end

        # Connects and verifies the schema before any query runs.
        # @return [PG::Connection]
        def test_db(db_url)
          conn = ::PG::Connection.new(db_url)

          check_db_connection(conn)
          check_vector_extension(conn)
          check_table_exists(conn, vector_store[:table_name])
          check_column_exists(conn)

          @table_name = vector_store[:table_name]
          @embedding_column_name = vector_store[:embedding_column_name]
          @content_column_name = vector_store[:content_column_name]

          conn
        rescue PG::Error, NameError => e
          # PG::UndefinedTable is a subclass of PG::Error, so listing it
          # separately (as the original did) was redundant.
          raise_argument_error(e.message)
        end

        def check_db_connection(conn)
          return if conn.status == PG::CONNECTION_OK

          raise_argument_error("PostgreSQL connection is not ok")
        end

        def check_vector_extension(conn)
          return if conn.exec("SELECT 1 FROM pg_extension WHERE extname = 'vector'").any?

          raise_argument_error("PostgreSQL 'vector' extension is not installed")
        end

        def check_table_exists(conn, table_name)
          table_exists = conn.exec_params(
            "SELECT EXISTS (SELECT 1 FROM information_schema.tables WHERE table_name = $1)", [table_name]
          ).getvalue(0, 0) == "t"
          return if table_exists

          raise_argument_error("Table '#{table_name}' does not exist")
        end

        # Verifies the embedding and content columns exist.
        def check_column_exists(conn)
          column_names = %i[embedding_column_name content_column_name]
          table_name = vector_store[:table_name]

          column_names.each do |target|
            column_name = vector_store[target]
            column_exists = conn.exec_params(
              "SELECT EXISTS (SELECT 1 FROM information_schema.columns WHERE table_name = $1 AND column_name = $2)",
              [table_name, column_name]
            ).getvalue(0, 0) == "t"
            next if column_exists

            raise_argument_error("Column '#{column_name}' does not exist in table '#{table_name}'")
          end
        end

        # Executes the KNN query and wraps each row in a Document + distance.
        def search(query_vector, num_neighbors)
          # LIMIT cannot be a bound parameter position here, so force it to an
          # integer; Integer() raises on anything non-numeric.
          limit = Integer(num_neighbors)
          # Identifiers cannot be bound as $n parameters; quote them to rule
          # out SQL injection via table/column names (names were already
          # verified against information_schema in test_db).
          table = db_connection.quote_ident(table_name.to_s)
          embedding_col = db_connection.quote_ident(embedding_column_name.to_s)

          sql = <<-SQL
            SELECT *, #{embedding_col} <-> $1 AS distance FROM #{table}
            ORDER BY #{embedding_col} <-> $1
            LIMIT #{limit}
          SQL
          result = db_connection.exec_params(sql, [query_vector.to_s]).to_a
          return [] if result.empty?

          result.map { |hash| hash.transform_keys(&:to_sym) }
                .map do |item|
            {
              document: Boxcars::VectorStore::Document.new(
                content: item[:content],
                embedding: JSON.parse(item[:embedding]),
                metadata: JSON.parse(item[:metadata], symbolize_names: true)
              ),
              distance: item[:distance].to_f
            }
          end
        rescue StandardError => e
          raise_argument_error("Error searching for #{query_vector}: #{e.message}")
        end
      end
    end
  end
end
|
@@ -6,14 +6,11 @@ module Boxcars
|
|
6
6
|
class SplitText
|
7
7
|
include VectorStore
|
8
8
|
|
9
|
-
attr_reader :separator, :chunk_size, :chunk_overlap, :text
|
10
|
-
|
11
9
|
# @param separator [String] The string to use to split the text.
|
12
10
|
# @param chunk_size [Integer] The size of each chunk.
|
13
11
|
# @param chunk_overlap [Integer] The amount of overlap between chunks.
|
14
12
|
# @param text [String] The text to split.
|
15
13
|
def initialize(separator: "Search", chunk_size: 7, chunk_overlap: 3, text: "")
|
16
|
-
# require 'debugger'; debugger
|
17
14
|
validate_params(separator, chunk_size, chunk_overlap, text)
|
18
15
|
|
19
16
|
@separator = separator
|
@@ -31,6 +28,8 @@ module Boxcars
|
|
31
28
|
|
32
29
|
private
|
33
30
|
|
31
|
+
attr_reader :separator, :chunk_size, :chunk_overlap, :text
|
32
|
+
|
34
33
|
def validate_params(separator, chunk_size, chunk_overlap, text)
|
35
34
|
raise_error("separator must be a string") unless separator.is_a?(String)
|
36
35
|
raise_error("chunk_size must be an integer") unless chunk_size.is_a?(Integer)
|
data/lib/boxcars/vector_store.rb
CHANGED
@@ -13,10 +13,72 @@ module Boxcars
|
|
13
13
|
|
14
14
|
def self.included(base)
|
15
15
|
base.extend(ClassMethods)
|
16
|
+
end
|
17
|
+
|
18
|
+
private
|
19
|
+
|
20
|
+
attr_reader :embedding_tool
|
21
|
+
|
22
|
+
def generate_vectors(texts)
|
23
|
+
@embedding_tool = embedding_tool || :openai
|
24
|
+
|
25
|
+
embeddings_method[:klass]
|
26
|
+
.call(
|
27
|
+
texts: texts, client: embeddings_method[:client]
|
28
|
+
)
|
29
|
+
.map { |item| item.transform_keys(&:to_sym) }
|
30
|
+
end
|
31
|
+
|
32
|
+
def embeddings_method
|
33
|
+
case @embedding_tool
|
34
|
+
when :openai
|
35
|
+
{ klass: Boxcars::VectorStore::EmbedViaOpenAI, client: openai_client }
|
36
|
+
when :tensorflow
|
37
|
+
{ klass: Boxcars::VectorStore::EmbedViaTensorflow, client: nil }
|
38
|
+
end
|
39
|
+
end
|
40
|
+
|
41
|
+
# Get the OpenAI client
|
42
|
+
# @param openai_access_token [String] the OpenAI access token
|
43
|
+
# @return [OpenAI::Client]
|
44
|
+
def openai_client(openai_access_token: nil)
|
45
|
+
@openai_client ||= Openai.open_ai_client(openai_access_token: openai_access_token)
|
46
|
+
end
|
47
|
+
|
48
|
+
def raise_argument_error(message)
|
49
|
+
raise ::Boxcars::ArgumentError, message
|
50
|
+
end
|
51
|
+
|
52
|
+
def parse_json_file(file_path)
|
53
|
+
return [] if file_path.nil?
|
54
|
+
|
55
|
+
file_content = File.read(file_path)
|
56
|
+
JSON.parse(file_content, symbolize_names: true)
|
57
|
+
rescue JSON::ParserError => e
|
58
|
+
raise_argument_error("Error parsing #{file_path}: #{e.message}")
|
59
|
+
end
|
60
|
+
|
61
|
+
def load_data_files(training_data_path)
|
62
|
+
data = []
|
63
|
+
files = Dir.glob(training_data_path)
|
64
|
+
raise_error "No files found at #{training_data_path}" if files.empty?
|
65
|
+
|
66
|
+
files.each do |file|
|
67
|
+
data << File.read(file)
|
68
|
+
end
|
69
|
+
puts "Added #{files.length} files to data. Splitting text into chunks..."
|
70
|
+
data
|
71
|
+
end
|
16
72
|
|
17
|
-
|
18
|
-
|
73
|
+
def split_text_into_chunks(data)
|
74
|
+
docs = []
|
75
|
+
data.each do |chunk|
|
76
|
+
doc_output = Boxcars::VectorStore::SplitText.call(
|
77
|
+
separator: "\n", chunk_size: split_chunk_size, chunk_overlap: 0, text: chunk
|
78
|
+
)
|
79
|
+
docs.concat(doc_output)
|
19
80
|
end
|
81
|
+
docs
|
20
82
|
end
|
21
83
|
end
|
22
84
|
end
|
@@ -25,10 +87,14 @@ require_relative "vector_store/document"
|
|
25
87
|
require_relative "vector_store/embed_via_open_ai"
|
26
88
|
require_relative "vector_store/embed_via_tensorflow"
|
27
89
|
require_relative "vector_store/split_text"
|
28
|
-
require_relative "vector_store/
|
29
|
-
require_relative "vector_store/hnswlib/hnswlib_config"
|
90
|
+
require_relative "vector_store/hnswlib/load_from_disk"
|
30
91
|
require_relative "vector_store/hnswlib/save_to_hnswlib"
|
31
|
-
require_relative "vector_store/hnswlib/
|
32
|
-
require_relative "vector_store/hnswlib/
|
33
|
-
require_relative "vector_store/in_memory/
|
92
|
+
require_relative "vector_store/hnswlib/build_from_files"
|
93
|
+
require_relative "vector_store/hnswlib/search"
|
94
|
+
require_relative "vector_store/in_memory/build_from_files"
|
95
|
+
require_relative "vector_store/in_memory/build_from_document_array"
|
34
96
|
require_relative "vector_store/in_memory/search"
|
97
|
+
require_relative "vector_store/pgvector/build_from_files"
|
98
|
+
require_relative "vector_store/pgvector/build_from_array"
|
99
|
+
require_relative "vector_store/pgvector/save_to_database"
|
100
|
+
require_relative "vector_store/pgvector/search"
|
data/lib/boxcars/version.rb
CHANGED
data/lib/boxcars.rb
CHANGED
@@ -58,7 +58,7 @@ module Boxcars
|
|
58
58
|
# override with kwargs if present
|
59
59
|
kwargs[key]
|
60
60
|
elsif (provided_val = instance_variable_get("@#{key}"))
|
61
|
-
# use saved value if present. Set using Boxcars
|
61
|
+
# use saved value if present. Set using Boxcars.configuration.the_key = "abcde"
|
62
62
|
provided_val
|
63
63
|
else
|
64
64
|
# otherwise, dig out of the environment
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: boxcars
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.
|
4
|
+
version: 0.2.12
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Francis Sullivan
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: exe
|
11
11
|
cert_chain: []
|
12
|
-
date: 2023-05-
|
12
|
+
date: 2023-05-22 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: debug
|
@@ -87,14 +87,14 @@ dependencies:
|
|
87
87
|
requirements:
|
88
88
|
- - "~>"
|
89
89
|
- !ruby/object:Gem::Version
|
90
|
-
version: '
|
90
|
+
version: '4.0'
|
91
91
|
type: :runtime
|
92
92
|
prerelease: false
|
93
93
|
version_requirements: !ruby/object:Gem::Requirement
|
94
94
|
requirements:
|
95
95
|
- - "~>"
|
96
96
|
- !ruby/object:Gem::Version
|
97
|
-
version: '
|
97
|
+
version: '4.0'
|
98
98
|
description: You simply set an OpenAI key, give a number of Boxcars to a Train, and
|
99
99
|
magic ensues when you run it.
|
100
100
|
email:
|
@@ -144,13 +144,17 @@ files:
|
|
144
144
|
- lib/boxcars/vector_store/document.rb
|
145
145
|
- lib/boxcars/vector_store/embed_via_open_ai.rb
|
146
146
|
- lib/boxcars/vector_store/embed_via_tensorflow.rb
|
147
|
-
- lib/boxcars/vector_store/hnswlib/
|
148
|
-
- lib/boxcars/vector_store/hnswlib/
|
149
|
-
- lib/boxcars/vector_store/hnswlib/hnswlib_search.rb
|
147
|
+
- lib/boxcars/vector_store/hnswlib/build_from_files.rb
|
148
|
+
- lib/boxcars/vector_store/hnswlib/load_from_disk.rb
|
150
149
|
- lib/boxcars/vector_store/hnswlib/save_to_hnswlib.rb
|
151
|
-
- lib/boxcars/vector_store/
|
150
|
+
- lib/boxcars/vector_store/hnswlib/search.rb
|
151
|
+
- lib/boxcars/vector_store/in_memory/build_from_document_array.rb
|
152
|
+
- lib/boxcars/vector_store/in_memory/build_from_files.rb
|
152
153
|
- lib/boxcars/vector_store/in_memory/search.rb
|
153
|
-
- lib/boxcars/vector_store/
|
154
|
+
- lib/boxcars/vector_store/pgvector/build_from_array.rb
|
155
|
+
- lib/boxcars/vector_store/pgvector/build_from_files.rb
|
156
|
+
- lib/boxcars/vector_store/pgvector/save_to_database.rb
|
157
|
+
- lib/boxcars/vector_store/pgvector/search.rb
|
154
158
|
- lib/boxcars/vector_store/split_text.rb
|
155
159
|
- lib/boxcars/version.rb
|
156
160
|
homepage: https://github.com/BoxcarsAI/boxcars
|
@@ -169,7 +173,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
169
173
|
requirements:
|
170
174
|
- - ">="
|
171
175
|
- !ruby/object:Gem::Version
|
172
|
-
version:
|
176
|
+
version: '3.0'
|
173
177
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
174
178
|
requirements:
|
175
179
|
- - ">="
|
@@ -1,157 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
require 'fileutils'
|
4
|
-
require 'hnswlib'
|
5
|
-
require 'json'
|
6
|
-
|
7
|
-
module Boxcars
|
8
|
-
module VectorStore
|
9
|
-
module Hnswlib
|
10
|
-
class BuildVectorStore
|
11
|
-
include VectorStore
|
12
|
-
|
13
|
-
# This class is responsible for building the vector store for the hnswlib similarity search.
|
14
|
-
# It will load the training data, generate the embeddings, and save the vector store.
|
15
|
-
# It will also load the vector store into memory.
|
16
|
-
# For later use, it will save the splitted document with index numbers to a json file.
|
17
|
-
#
|
18
|
-
# @param training_data_path [String] The path to the training data. Can be a glob pattern.
|
19
|
-
# @param index_file_path [String] The path to the index file.
|
20
|
-
# @param split_chunk_size [Integer] The number of documents to split the text into. default 2000
|
21
|
-
# @option json_doc_file_path [String]. The json file containing the document text.
|
22
|
-
# if nil, it will reuse index file name.
|
23
|
-
# @option force_rebuild [Boolean] Optional. If true, will rebuild the index even if it already exists.
|
24
|
-
def initialize(
|
25
|
-
training_data_path:,
|
26
|
-
index_file_path:,
|
27
|
-
split_chunk_size: 2000,
|
28
|
-
json_doc_file_path: nil,
|
29
|
-
force_rebuild: true
|
30
|
-
)
|
31
|
-
@training_data_path = training_data_path
|
32
|
-
@index_file_path = index_file_path
|
33
|
-
@split_chunk_size = split_chunk_size
|
34
|
-
@json_doc_file_path = json_doc_file_path || index_file_path.gsub(/\.bin$/, '.json')
|
35
|
-
@force_rebuild = force_rebuild
|
36
|
-
end
|
37
|
-
|
38
|
-
def call
|
39
|
-
validate_params
|
40
|
-
data = load_files
|
41
|
-
documents = split_text_into_chunks(data)
|
42
|
-
embeddings_with_config = generate_embeddings(documents)
|
43
|
-
save_vector_store(embeddings_with_config)
|
44
|
-
load_hnsw
|
45
|
-
end
|
46
|
-
|
47
|
-
private
|
48
|
-
|
49
|
-
attr_reader :training_data_path, :index_file_path, :split_chunk_size, :json_doc_file_path, :force_rebuild
|
50
|
-
|
51
|
-
def validate_params
|
52
|
-
training_data_dir = File.dirname(training_data_path.gsub(/\*{1,2}/, ''))
|
53
|
-
raise_error('training_data_path parent directory must exist') unless File.directory?(training_data_dir)
|
54
|
-
raise_error('No files found at the training_data_path pattern') if Dir.glob(training_data_path).empty?
|
55
|
-
|
56
|
-
index_dir = File.dirname(index_file_path)
|
57
|
-
raise_error('index_file_path parent directory must exist') unless File.directory?(index_dir)
|
58
|
-
|
59
|
-
raise_error('split_chunk_size must be an integer') unless split_chunk_size.is_a?(Integer)
|
60
|
-
end
|
61
|
-
|
62
|
-
def load_files
|
63
|
-
data = []
|
64
|
-
files = Dir.glob(training_data_path)
|
65
|
-
raise_error "No files found at #{training_data_path}" if files.empty?
|
66
|
-
|
67
|
-
files.each do |file|
|
68
|
-
data << File.read(file)
|
69
|
-
end
|
70
|
-
puts "Added #{files.length} files to data. Splitting text into chunks..."
|
71
|
-
data
|
72
|
-
end
|
73
|
-
|
74
|
-
def split_text_into_chunks(data)
|
75
|
-
return true unless rebuild_required?
|
76
|
-
|
77
|
-
docs = []
|
78
|
-
data.each do |chunk|
|
79
|
-
doc_output = Boxcars::VectorStore::SplitText.call(
|
80
|
-
separator: "\n", chunk_size: split_chunk_size, chunk_overlap: 0, text: chunk
|
81
|
-
)
|
82
|
-
docs.concat(doc_output)
|
83
|
-
end
|
84
|
-
docs
|
85
|
-
end
|
86
|
-
|
87
|
-
def rebuild_required?
|
88
|
-
hnswlib_config_json = "#{File.dirname(index_file_path)}/hnswlib_config.json"
|
89
|
-
return true unless File.exist?(index_file_path)
|
90
|
-
return true if File.exist?(index_file_path) && !File.exist?(hnswlib_config_json)
|
91
|
-
return true if force_rebuild
|
92
|
-
|
93
|
-
false
|
94
|
-
end
|
95
|
-
|
96
|
-
def generate_embeddings(documents)
|
97
|
-
return true unless rebuild_required?
|
98
|
-
|
99
|
-
puts "Initializing Store..."
|
100
|
-
openai_client = Openai.open_ai_client
|
101
|
-
embeddings_with_dim = Boxcars::VectorStore::EmbedViaOpenAI.call(texts: documents, client: openai_client)
|
102
|
-
document_embeddings = embeddings_with_dim.map.with_index do |item, index|
|
103
|
-
{ doc_id: index, embedding: item[:embedding], document: documents[index] }
|
104
|
-
end
|
105
|
-
|
106
|
-
{ document_embeddings: document_embeddings, dim: embeddings_with_dim.first[:dim] }
|
107
|
-
end
|
108
|
-
|
109
|
-
def save_vector_store(embeddings_with_config)
|
110
|
-
return true unless rebuild_required?
|
111
|
-
|
112
|
-
puts "Saving Vectorstore"
|
113
|
-
Boxcars::VectorStore::Hnswlib::SaveToHnswlib.call(
|
114
|
-
document_embeddings: embeddings_with_config[:document_embeddings],
|
115
|
-
index_file_path: index_file_path,
|
116
|
-
json_doc_file_path: json_doc_file_path,
|
117
|
-
hnswlib_config: hnswlib_config(embeddings_with_config[:dim])
|
118
|
-
)
|
119
|
-
puts "VectorStore saved"
|
120
|
-
end
|
121
|
-
|
122
|
-
def hnswlib_config(dim)
|
123
|
-
# dim: length of datum point vector that will be indexed.
|
124
|
-
Boxcars::VectorStore::Hnswlib::HnswlibConfig.new(
|
125
|
-
metric: "l2", max_item: 10000, dim: dim
|
126
|
-
)
|
127
|
-
end
|
128
|
-
|
129
|
-
def load_hnsw
|
130
|
-
puts "Loading Hnswlib"
|
131
|
-
|
132
|
-
config_file = "#{File.dirname(index_file_path)}/hnswlib_config.json"
|
133
|
-
json_config = parse_json_file(config_file)
|
134
|
-
document_embeddings = parse_json_file(json_doc_file_path)
|
135
|
-
|
136
|
-
search_index = ::Hnswlib::HierarchicalNSW.new(space: json_config[:metric], dim: json_config[:dim])
|
137
|
-
search_index.load_index(index_file_path)
|
138
|
-
|
139
|
-
{ vector_store: search_index, document_embeddings: document_embeddings }
|
140
|
-
end
|
141
|
-
|
142
|
-
def parse_json_file(file_path)
|
143
|
-
return [] if file_path.nil?
|
144
|
-
|
145
|
-
file_content = File.read(file_path)
|
146
|
-
JSON.parse(file_content, symbolize_names: true)
|
147
|
-
rescue JSON::ParserError => e
|
148
|
-
raise_error("Error parsing hnswlib_config.json: #{e.message}")
|
149
|
-
end
|
150
|
-
|
151
|
-
def raise_error(message)
|
152
|
-
raise ::Boxcars::Error, message
|
153
|
-
end
|
154
|
-
end
|
155
|
-
end
|
156
|
-
end
|
157
|
-
end
|