boxcars 0.2.11 → 0.2.12
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.env_sample +1 -0
- data/.rubocop.yml +16 -0
- data/CHANGELOG.md +12 -0
- data/Gemfile +12 -12
- data/Gemfile.lock +34 -28
- data/README.md +4 -1
- data/boxcars.gemspec +2 -2
- data/lib/boxcars/boxcar/active_record.rb +1 -1
- data/lib/boxcars/boxcar.rb +1 -0
- data/lib/boxcars/engine/openai.rb +8 -1
- data/lib/boxcars/vector_search.rb +66 -2
- data/lib/boxcars/vector_store/document.rb +3 -2
- data/lib/boxcars/vector_store/embed_via_open_ai.rb +2 -2
- data/lib/boxcars/vector_store/hnswlib/build_from_files.rb +100 -0
- data/lib/boxcars/vector_store/hnswlib/load_from_disk.rb +57 -0
- data/lib/boxcars/vector_store/hnswlib/save_to_hnswlib.rb +48 -38
- data/lib/boxcars/vector_store/hnswlib/search.rb +70 -0
- data/lib/boxcars/vector_store/in_memory/build_from_document_array.rb +51 -0
- data/lib/boxcars/vector_store/in_memory/build_from_files.rb +61 -0
- data/lib/boxcars/vector_store/in_memory/search.rb +29 -49
- data/lib/boxcars/vector_store/pgvector/build_from_array.rb +95 -0
- data/lib/boxcars/vector_store/pgvector/build_from_files.rb +97 -0
- data/lib/boxcars/vector_store/pgvector/save_to_database.rb +152 -0
- data/lib/boxcars/vector_store/pgvector/search.rb +144 -0
- data/lib/boxcars/vector_store/split_text.rb +2 -3
- data/lib/boxcars/vector_store.rb +73 -7
- data/lib/boxcars/version.rb +1 -1
- data/lib/boxcars.rb +1 -1
- metadata +14 -10
- data/lib/boxcars/vector_store/hnswlib/build_vector_store.rb +0 -157
- data/lib/boxcars/vector_store/hnswlib/hnswlib_config.rb +0 -56
- data/lib/boxcars/vector_store/hnswlib/hnswlib_search.rb +0 -54
- data/lib/boxcars/vector_store/in_memory/add_documents.rb +0 -67
- data/lib/boxcars/vector_store/similarity_search.rb +0 -55
@@ -12,67 +12,77 @@ module Boxcars
|
|
12
12
|
|
13
13
|
# @param document_embeddings [Array] An array of hashes containing the document id, document text, and embedding.
|
14
14
|
# @param index_file_path [String] The path to the index file.
|
15
|
-
# @param hnswlib_config [Boxcars::VectorStore::Hnswlib::Config] The config object for the hnswlib index.
|
16
15
|
# @option json_doc_file_path [String] Optional. The path to the json file containing the document text.
|
17
|
-
def initialize(
|
18
|
-
@
|
19
|
-
|
20
|
-
@json_doc_file_path = json_doc_file_path || index_file_path.gsub(/\.bin$/, '.json')
|
21
|
-
|
22
|
-
@hnswlib_config = hnswlib_config
|
23
|
-
@index = ::Hnswlib::HnswIndex.new(
|
24
|
-
n_features: hnswlib_config.dim,
|
25
|
-
max_item: hnswlib_config.max_item,
|
26
|
-
metric: hnswlib_config.metric
|
27
|
-
)
|
28
|
-
end
|
16
|
+
# Prepares an empty hnswlib index configured from the first vector's metadata.
#
# @param hnsw_vectors_array [Array] items responding to +embedding+, +content+
#   and +metadata+. The first item's metadata Hash supplies :index_file_path and
#   :dim, plus the optional :json_doc_file_path, :metric and :max_item.
#
# Validation failures are reported by validate_params.
def initialize(hnsw_vectors_array)
  # Only an Array is asked for its first element; anything else is left for
  # validate_params to reject instead of failing on a missing +first+ method.
  @metadata = hnsw_vectors_array.first&.metadata if hnsw_vectors_array.is_a?(Array)
  validate_params(hnsw_vectors_array, metadata)

  @vectors = hnsw_vectors_array
  @index_file_path = metadata[:index_file_path]
  # Without an explicit JSON path, reuse the index path with .bin swapped for .json.
  @json_doc_file_path = metadata[:json_doc_file_path] || @index_file_path.gsub(/\.bin$/, '.json')

  @metric = metadata[:metric] || "l2"
  @dim = metadata[:dim]
  @max_item = metadata[:max_item] || 10_000

  @index = ::Hnswlib::HierarchicalNSW.new(space: @metric, dim: @dim)
  @index.init_index(max_elements: @max_item)
end
|
39
34
|
|
35
|
+
# Adds every vector to the index, then persists the index and the document
# records. Returns whatever write_files returns.
def call
  write_files(index, add_vectors_to_index)
end
|
42
39
|
|
43
40
|
private
|
44
41
|
|
45
|
-
|
46
|
-
FileUtils.mkdir_p(File.dirname(json_doc_file_path))
|
47
|
-
File.write(json_doc_file_path, document_texts.to_json)
|
42
|
+
attr_reader :metadata, :index, :vectors, :index_file_path, :json_doc_file_path, :metric, :dim, :max_item
|
48
43
|
|
49
|
-
|
50
|
-
|
44
|
+
# Checks the constructor input before any index is created.
#
# @param hnsw_vectors_array [Array] the vectors handed to the constructor.
# @param metadata [Hash, nil] metadata taken from the first vector.
#
# Every failure is reported through raise_argument_error.
def validate_params(hnsw_vectors_array, metadata)
  raise_argument_error('argument must be an array') unless hnsw_vectors_array.is_a?(Array)
  raise_argument_error('missing data') if hnsw_vectors_array.empty?
  # nil and empty metadata are both unusable; test nil first so +empty?+ is
  # never sent to nil.
  raise_argument_error('missing metadata') if metadata.nil? || metadata.empty?

  # Presence is checked before type so a missing dim gets its own message.
  raise_argument_error('missing dim') unless metadata[:dim]
  raise_argument_error("dim must be an integer") unless metadata[:dim].is_a?(Integer)
  raise_argument_error('missing index_file_path') unless metadata[:index_file_path]

  check_parent_directory(metadata[:index_file_path])
  check_parent_directory(metadata[:json_doc_file_path])
end
|
54
56
|
|
55
|
-
|
57
|
+
# Registers each vector's embedding in the index under its :doc_id and returns
# one record per vector (doc_id, embedding, document text, metadata).
def add_vectors_to_index
  vectors.map do |vector|
    doc_id = vector.metadata[:doc_id]
    index.add_point(vector.embedding, doc_id)

    {
      doc_id: doc_id,
      embedding: vector.embedding,
      document: vector.content,
      metadata: vector.metadata
    }
  end
end
|
72
|
+
|
73
|
+
# Writes the document records as JSON, then saves the index itself.
# The parent directory of each target file is created first.
def write_files(index, document_texts)
  json_dir = File.dirname(json_doc_file_path)
  FileUtils.mkdir_p(json_dir)
  File.write(json_doc_file_path, document_texts.to_json)

  index_dir = File.dirname(index_file_path)
  FileUtils.mkdir_p(index_dir)
  index.save_index(index_file_path)
end
|
66
80
|
|
67
81
|
# Reports an argument error when +path+ is given but its parent directory
# does not exist. A nil path is accepted without any check.
def check_parent_directory(path)
  if path && !File.directory?(File.dirname(path))
    raise_argument_error('parent directory must exist')
  end
end
|
77
87
|
end
|
78
88
|
end
|
@@ -0,0 +1,70 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'hnswlib'
|
4
|
+
require 'json'
|
5
|
+
|
6
|
+
module Boxcars
  module VectorStore
    module Hnswlib
      # Nearest-neighbour lookup against an hnswlib index that was saved to disk.
      #
      # The index settings (:index_file_path, :dim, :metric) are read from the
      # metadata of the first document in the supplied vector store.
      class Search
        include VectorStore

        # @param params [Hash] must carry :vector_documents, a Hash shaped like
        #   { type: :hnswlib, vector_store: [Document, ...] }.
        def initialize(params)
          validate_params(params[:vector_documents])
          @vector_documents = params[:vector_documents]
          @search_index = load_index(params[:vector_documents])
        end

        # @param query_vector [Array] embedding to search with.
        # @param count [Integer] number of neighbours to request (default 1).
        # @return [Array<Hash>] { document:, distance: } entries; ids with no
        #   matching entry in the vector store are dropped.
        def call(query_vector:, count: 1)
          search(query_vector, count)
        end

        private

        attr_reader :vector_documents, :vector_store, :json_doc, :search_index, :metadata

        # Rejects anything that is not a non-empty hnswlib vector store made of
        # Document objects. Returns true when the input is acceptable.
        def validate_params(vector_documents)
          raise_argument_error('vector_documents is nil') unless vector_documents
          raise_argument_error('vector_documents must be a hash') unless vector_documents.is_a?(Hash)
          raise_argument_error('type must be hnswlib') unless vector_documents[:type] == :hnswlib
          raise_argument_error('vector_store is nil') unless vector_documents[:vector_store]
          raise_argument_error('vector_store must be an array') unless vector_documents[:vector_store].is_a?(Array)
          # load_index reads the first document's metadata, so an empty store
          # cannot be searched.
          raise_argument_error('vector_store is empty') if vector_documents[:vector_store].empty?

          unless vector_documents[:vector_store].all? { |doc| doc.is_a?(Document) }
            raise_argument_error('vector_store must be an array of Document objects')
          end

          true
        end

        # Builds the index object from the first document's metadata and loads
        # the saved index file into it.
        def load_index(vector_documents)
          @metadata = vector_documents[:vector_store].first.metadata
          @json_doc = @metadata[:json_doc_file_path]
          @vector_store = vector_documents[:vector_store]

          @search_index = ::Hnswlib::HierarchicalNSW.new(
            # Falls back to "l2" when the metadata names no metric.
            space: metadata[:metric] || "l2",
            dim: metadata[:dim]
          )
          @search_index.load_index(metadata[:index_file_path])
          @search_index
        end

        # Any failure while searching is re-reported as an argument error.
        def search(query_vector, num_neighbors)
          raw_results = search_index.search_knn(query_vector, num_neighbors)
          # NOTE(review): this assumes search_knn yields [doc_id, distance] pairs;
          # confirm against the return shape documented by the hnswlib gem.
          raw_results.map { |doc_id, distance| lookup_embedding(doc_id, distance) }.compact
        rescue StandardError => e
          raise_argument_error("Error searching for #{query_vector}: #{e.message}")
        end

        # Pairs the stored document at position +doc_id+ with its distance, or
        # returns nil when the vector store has nothing at that position.
        def lookup_embedding(doc_id, distance)
          return unless vector_store[doc_id]

          { document: vector_store[doc_id], distance: distance }
        end
      end
    end
  end
end
|
@@ -0,0 +1,51 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Boxcars
  module VectorStore
    module InMemory
      # Builds an in-memory vector store from an array of document hashes.
      class BuildFromDocumentArray
        include VectorStore

        # @param embedding_tool [Symbol] :openai or :tensorflow.
        # @param documents [Array<Hash>] hashes read through :content and :metadata.
        def initialize(embedding_tool: :openai, documents: nil)
          validate_params(embedding_tool, documents)
          @embedding_tool = embedding_tool
          @documents = documents
          @memory_vectors = []
        end

        # Generates one vector per document and wraps each pair in a Document.
        #
        # @return [Hash] { type: :in_memory, vector_store: Array<Document> }
        def call
          vectors = generate_vectors(documents)
          add_vectors(vectors, documents)

          {
            type: :in_memory,
            vector_store: memory_vectors
          }
        end

        private

        attr_reader :documents, :embedding_tool, :memory_vectors

        def validate_params(embedding_tool, documents)
          raise_argument_error('documents is nil') unless documents
          return if %i[openai tensorflow].include?(embedding_tool)

          raise_argument_error('embedding_tool is invalid')
        end

        # Appends one Document per (vector, document) pair to memory_vectors.
        # The vector's :dim is merged into the document's metadata; a document
        # without :metadata gets a metadata Hash holding only :dim.
        def add_vectors(vectors, documents)
          raise_argument_error("vectors are nil") unless vectors
          # zip would otherwise silently drop documents or pair a vector with nil.
          raise_argument_error("vectors and documents are not the same size") unless vectors.size == documents.size

          vectors.zip(documents).each do |vector, doc|
            memory_vectors << Document.new(
              content: doc[:content],
              embedding: vector[:embedding],
              metadata: (doc[:metadata] || {}).merge(dim: vector[:dim])
            )
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,61 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Boxcars
  module VectorStore
    module InMemory
      # Builds an in-memory vector store from the files matched by a glob pattern.
      class BuildFromFiles
        include VectorStore

        # @param params [Hash]
        #   :training_data_path [String] glob pattern of the files to load (required).
        #   :split_chunk_size [Integer] defaults to 2000.
        #   :embedding_tool [Symbol] :openai (default) or :tensorflow.
        def initialize(params)
          # File.absolute_path(nil) would fail with a TypeError; report the
          # missing parameter as an argument error instead.
          raise_argument_error('training_data_path is nil') unless params[:training_data_path]

          @split_chunk_size = params[:split_chunk_size] || 2000
          @training_data_path = File.absolute_path(params[:training_data_path])
          @embedding_tool = params[:embedding_tool] || :openai

          validate_params(embedding_tool, training_data_path)
          @memory_vectors = []
        end

        # Loads the files, splits them into chunks, generates one vector per
        # chunk and wraps each pair in a Document.
        #
        # @return [Hash] { type: :in_memory, vector_store: Array<Document> }
        def call
          data = load_data_files(training_data_path)
          texts = split_text_into_chunks(data)
          vectors = generate_vectors(texts)
          add_vectors(vectors, texts)

          {
            type: :in_memory,
            vector_store: memory_vectors
          }
        end

        private

        attr_reader :split_chunk_size, :training_data_path, :embedding_tool, :memory_vectors

        def validate_params(embedding_tool, training_data_path)
          # Strip glob stars so the directory part of the pattern can be checked.
          training_data_dir = File.dirname(training_data_path.gsub(/\*{1,2}/, ''))

          raise_argument_error('training_data_path parent directory must exist') unless File.directory?(training_data_dir)
          raise_argument_error('No files found at the training_data_path pattern') if Dir.glob(training_data_path).empty?

          return if %i[openai tensorflow].include?(embedding_tool)

          raise_argument_error('embedding_tool is invalid')
        end

        # Appends one Document per vector; the vector's position is its :doc_id.
        def add_vectors(vectors, texts)
          vectors.each_with_index do |vector, index|
            memory_vectors << Document.new(
              content: texts[index],
              embedding: vector[:embedding],
              metadata: {
                doc_id: index,
                training_data_path: training_data_path
              }
            )
          end
        end
      end
    end
  end
end
|
@@ -1,72 +1,52 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
-
# require 'openai'
|
4
|
-
#
|
5
|
-
# documents = [
|
6
|
-
# { page_content: "hello", metadata: { a: 1 } },
|
7
|
-
# { page_content: "hi", metadata: { a: 1 } },
|
8
|
-
# { page_content: "bye", metadata: { a: 1 } },
|
9
|
-
# { page_content: "what's this", metadata: { a: 1 } },
|
10
|
-
# ]
|
11
|
-
#
|
12
|
-
# vector_documents = Boxcars::VectorStore::InMemory::AddDocuments.call(embedding_tool: :openai, documents: documents)
|
13
|
-
#
|
14
|
-
# result = Boxcars::VectorStore::InMemory::Search.call(vecotr_documents: vector_documents, query: "hello")
|
15
|
-
#
|
16
|
-
# expect(result).to eq(Boxcars::VectorStore::Document.new({ page_content: "hello", metadata: { a: 1 } }))
|
17
|
-
|
18
3
|
module Boxcars
|
19
4
|
module VectorStore
|
20
5
|
module InMemory
|
21
6
|
class Search
|
22
7
|
include VectorStore
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
@
|
27
|
-
@embedding_tool = embedding_tool
|
8
|
+
|
9
|
+
# @param params [Hash] its :vector_documents entry is validated and then kept.
def initialize(params)
  candidate = params[:vector_documents]
  validate_params(candidate)
  @vector_documents = candidate
end
|
29
13
|
|
30
|
-
def call
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
similarity: cosine_similarity(query_vector, doc[:vector])
|
35
|
-
}
|
36
|
-
end
|
37
|
-
results.min_by { |result| -result[:similarity] }[:document]
|
14
|
+
# Searches the stored documents with +query_vector+.
#
# @param query_vector the embedding to compare against; must not be empty.
# @param count [Integer] number of results requested (default 1).
def call(query_vector:, count: 1)
  if query_vector.empty?
    raise ::Boxcars::ArgumentError, 'query_vector is empty'
  end

  search(query_vector, count)
end
|
39
19
|
|
40
20
|
private
|
41
21
|
|
42
|
-
|
43
|
-
raise ::Boxcars::ArgumentError, 'query is empty' if query.to_s.empty?
|
44
|
-
raise ::Boxcars::ArgumentError, 'embedding_tool is invalid' unless %i[openai tensorflow].include?(embedding_tool)
|
22
|
+
attr_reader :vector_documents
|
45
23
|
|
46
|
-
|
47
|
-
|
48
|
-
end
|
49
|
-
raise ::Boxcars::ArgumentError, "vector_documents is not valid"
|
50
|
-
end
|
51
|
-
end
|
24
|
+
# Raises ::Boxcars::ArgumentError unless valid_vector_store? accepts the input.
def validate_params(vector_documents)
  raise ::Boxcars::ArgumentError, "vector_documents is not valid" unless valid_vector_store?(vector_documents)
end
|
58
29
|
|
59
|
-
def
|
60
|
-
|
30
|
+
# Truthy only for an :in_memory store whose :vector_store is an Array made up
# entirely of Boxcars::VectorStore::Document objects.
def valid_vector_store?(vector_documents)
  return vector_documents unless vector_documents
  return false unless vector_documents[:type] == :in_memory

  store = vector_documents[:vector_store]
  return false unless store.is_a?(Array)

  store.all? { |doc| doc.is_a?(Boxcars::VectorStore::Document) }
end
|
62
37
|
|
63
|
-
def
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
38
|
+
# Scores every stored document against +query_vector+ with cosine_similarity
# and returns the +num_neighbors+ best matches, most similar first.
#
# @return [Array<Hash>] entries of { document:, similarity: }
#
# Any failure while scoring is re-reported through raise_argument_error.
def search(query_vector, num_neighbors)
  scored = vector_documents[:vector_store].map do |doc|
    {
      document: doc,
      similarity: cosine_similarity(query_vector, doc.embedding)
    }
  end

  # Negating the score sorts from highest to lowest similarity.
  scored.sort_by { |result| -result[:similarity] }
        .first(num_neighbors)
rescue StandardError => e
  raise_argument_error("Error searching for #{query_vector}: #{e.message}")
end
|
71
51
|
|
72
52
|
def cosine_similarity(vector1, vector2)
|
@@ -0,0 +1,95 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Boxcars
  module VectorStore
    # install pgvector: https://github.com/pgvector/pgvector#installation-notes
    module Pgvector
      # Embeds an array of { content:, metadata: } hashes and stores the
      # resulting documents through SaveToDatabase.
      class BuildFromArray
        include VectorStore

        # params = {
        #   embedding_tool: embedding_tool,
        #   input_array: input_array,
        #   database_url: db_url,
        #   table_name: table_name,
        #   embedding_column_name: embedding_column_name,
        #   content_column_name: content_column_name,
        #   metadata_column_name: metadata_column_name
        # }
        def initialize(params)
          @embedding_tool = params[:embedding_tool] || :openai

          validate_params(embedding_tool, params[:input_array])

          @database_url = params[:database_url]
          @table_name = params[:table_name]
          @embedding_column_name = params[:embedding_column_name]
          @content_column_name = params[:content_column_name]
          @metadata_column_name = params[:metadata_column_name]

          @input_array = params[:input_array]
          @pg_vectors = []
        end

        # @return [Hash] { type: :pgvector, vector_store: <SaveToDatabase result> }
        def call
          texts = input_array
          vectors = generate_vectors(texts)
          add_vectors(vectors, texts)
          documents = save_vector_store

          {
            type: :pgvector,
            vector_store: documents
          }
        end

        private

        attr_reader :input_array, :embedding_tool, :pg_vectors, :database_url,
                    :table_name, :embedding_column_name, :content_column_name,
                    :metadata_column_name

        # Checks the embedding tool and then every input item. The item check
        # runs for valid tools too, rather than being skipped by an early return.
        def validate_params(embedding_tool, input_array)
          raise_argument_error('input_array is nil') unless input_array
          raise_argument_error('embedding_tool is invalid') unless %i[openai tensorflow].include?(embedding_tool)

          return if input_array.all? { |item| item.is_a?(Hash) && item.key?(:content) && item.key?(:metadata) }

          raise_argument_error('input_array items must be hashes with :content and :metadata keys')
        end

        # Appends one Document per (vector, item) pair to pg_vectors.
        def add_vectors(vectors, texts)
          raise_argument_error("vectors are nil") unless vectors
          raise_argument_error("vectors and texts are not the same size") unless vectors.size == texts.size

          vectors.zip(texts) do |vector, doc|
            pg_vectors << Document.new(
              content: doc[:content],
              embedding: vector[:embedding],
              metadata: doc[:metadata]
            )
          end
        end

        # Hands the documents to SaveToDatabase and returns its result; a falsy
        # result is reported as an argument error.
        def save_vector_store
          result = Boxcars::VectorStore::Pgvector::SaveToDatabase.call(
            pg_vectors: pg_vectors,
            database_url: database_url,
            table_name: table_name,
            embedding_column_name: embedding_column_name,
            content_column_name: content_column_name,
            metadata_column_name: metadata_column_name
          )
          raise_argument_error('Error saving vector store to database.') unless result

          result
        end
      end
    end
  end
end
|
@@ -0,0 +1,97 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'pgvector'
|
4
|
+
require 'fileutils'
|
5
|
+
require 'json'
|
6
|
+
|
7
|
+
module Boxcars
  module VectorStore
    module Pgvector
      # Embeds the files matched by a glob pattern and stores the resulting
      # documents through SaveToDatabase.
      class BuildFromFiles
        include VectorStore

        # params = {
        #   training_data_path: training_data_path,
        #   split_chunk_size: 200,
        #   embedding_tool: embedding_tool,
        #   database_url: db_url,
        #   table_name: table_name,
        #   embedding_column_name: embedding_column_name,
        #   content_column_name: content_column_name,
        #   metadata_column_name: metadata_column_name
        # }
        def initialize(params)
          # File.absolute_path(nil) would fail with a TypeError; report the
          # missing parameter as an argument error instead.
          raise_argument_error('training_data_path is nil') unless params[:training_data_path]

          @split_chunk_size = params[:split_chunk_size] || 2000
          @training_data_path = File.absolute_path(params[:training_data_path])
          @embedding_tool = params[:embedding_tool] || :openai

          validate_params(embedding_tool, training_data_path)

          @database_url = params[:database_url]
          @table_name = params[:table_name]
          @embedding_column_name = params[:embedding_column_name]
          @content_column_name = params[:content_column_name]
          @metadata_column_name = params[:metadata_column_name]

          @pg_vectors = []
        end

        # @return [Hash] { type: :pgvector, vector_store: <SaveToDatabase result> }
        def call
          data = load_data_files(training_data_path)
          texts = split_text_into_chunks(data)
          embeddings = generate_vectors(texts)
          add_vectors(embeddings, texts)
          documents = save_vector_store

          {
            type: :pgvector,
            vector_store: documents
          }
        end

        private

        attr_reader :split_chunk_size, :training_data_path, :embedding_tool, :database_url,
                    :table_name, :embedding_column_name, :content_column_name,
                    :metadata_column_name, :pg_vectors

        def validate_params(embedding_tool, training_data_path)
          # Strip glob stars so the directory part of the pattern can be checked.
          training_data_dir = File.dirname(training_data_path.gsub(/\*{1,2}/, ''))

          raise_argument_error('training_data_path parent directory must exist') unless File.directory?(training_data_dir)
          raise_argument_error('No files found at the training_data_path pattern') if Dir.glob(training_data_path).empty?
          return if %i[openai tensorflow].include?(embedding_tool)

          raise_argument_error('embedding_tool is invalid')
        end

        # Appends one Document per vector; the vector's position is its :doc_id.
        def add_vectors(vectors, texts)
          vectors.each_with_index do |vector, index|
            pg_vectors << Document.new(
              content: texts[index],
              embedding: vector[:embedding],
              metadata: {
                doc_id: index,
                training_data_path: training_data_path
              }
            )
          end
        end

        # Hands the documents to SaveToDatabase and returns its result; a falsy
        # result is reported as an argument error.
        def save_vector_store
          result = Boxcars::VectorStore::Pgvector::SaveToDatabase.call(
            pg_vectors: pg_vectors,
            database_url: database_url,
            table_name: table_name,
            embedding_column_name: embedding_column_name,
            content_column_name: content_column_name,
            metadata_column_name: metadata_column_name
          )
          raise_argument_error('Error saving vector store to database.') unless result

          result
        end
      end
    end
  end
end
|