boxcars 0.2.11 → 0.2.13

Sign up to get free protection for your applications and to get access to all the features.
Files changed (41) hide show
  1. checksums.yaml +4 -4
  2. data/.env_sample +1 -0
  3. data/.rubocop.yml +16 -0
  4. data/CHANGELOG.md +12 -0
  5. data/Gemfile +15 -11
  6. data/Gemfile.lock +40 -32
  7. data/README.md +4 -1
  8. data/boxcars.gemspec +4 -7
  9. data/lib/boxcars/boxcar/active_record.rb +2 -2
  10. data/lib/boxcars/boxcar/engine_boxcar.rb +2 -2
  11. data/lib/boxcars/boxcar/sql.rb +1 -1
  12. data/lib/boxcars/boxcar/swagger.rb +1 -1
  13. data/lib/boxcars/boxcar/vector_answer.rb +71 -0
  14. data/lib/boxcars/boxcar.rb +2 -0
  15. data/lib/boxcars/engine/openai.rb +8 -1
  16. data/lib/boxcars/train/zero_shot.rb +1 -1
  17. data/lib/boxcars/train.rb +1 -1
  18. data/lib/boxcars/vector_search.rb +66 -2
  19. data/lib/boxcars/vector_store/document.rb +3 -2
  20. data/lib/boxcars/vector_store/embed_via_open_ai.rb +2 -2
  21. data/lib/boxcars/vector_store/hnswlib/build_from_files.rb +104 -0
  22. data/lib/boxcars/vector_store/hnswlib/load_from_disk.rb +57 -0
  23. data/lib/boxcars/vector_store/hnswlib/save_to_hnswlib.rb +48 -38
  24. data/lib/boxcars/vector_store/hnswlib/search.rb +70 -0
  25. data/lib/boxcars/vector_store/in_memory/build_from_document_array.rb +51 -0
  26. data/lib/boxcars/vector_store/in_memory/build_from_files.rb +61 -0
  27. data/lib/boxcars/vector_store/in_memory/search.rb +29 -49
  28. data/lib/boxcars/vector_store/pgvector/build_from_array.rb +95 -0
  29. data/lib/boxcars/vector_store/pgvector/build_from_files.rb +97 -0
  30. data/lib/boxcars/vector_store/pgvector/save_to_database.rb +152 -0
  31. data/lib/boxcars/vector_store/pgvector/search.rb +144 -0
  32. data/lib/boxcars/vector_store/split_text.rb +2 -3
  33. data/lib/boxcars/vector_store.rb +73 -7
  34. data/lib/boxcars/version.rb +1 -1
  35. data/lib/boxcars.rb +1 -1
  36. metadata +31 -40
  37. data/lib/boxcars/vector_store/hnswlib/build_vector_store.rb +0 -157
  38. data/lib/boxcars/vector_store/hnswlib/hnswlib_config.rb +0 -56
  39. data/lib/boxcars/vector_store/hnswlib/hnswlib_search.rb +0 -54
  40. data/lib/boxcars/vector_store/in_memory/add_documents.rb +0 -67
  41. data/lib/boxcars/vector_store/similarity_search.rb +0 -55
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: boxcars
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.11
4
+ version: 0.2.13
5
5
  platform: ruby
6
6
  authors:
7
7
  - Francis Sullivan
@@ -9,92 +9,78 @@ authors:
9
9
  autorequire:
10
10
  bindir: exe
11
11
  cert_chain: []
12
- date: 2023-05-05 00:00:00.000000000 Z
12
+ date: 2023-05-24 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
- name: debug
16
- requirement: !ruby/object:Gem::Requirement
17
- requirements:
18
- - - "~>"
19
- - !ruby/object:Gem::Version
20
- version: '1.1'
21
- type: :development
22
- prerelease: false
23
- version_requirements: !ruby/object:Gem::Requirement
24
- requirements:
25
- - - "~>"
26
- - !ruby/object:Gem::Version
27
- version: '1.1'
28
- - !ruby/object:Gem::Dependency
29
- name: dotenv
15
+ name: google_search_results
30
16
  requirement: !ruby/object:Gem::Requirement
31
17
  requirements:
32
18
  - - "~>"
33
19
  - !ruby/object:Gem::Version
34
- version: '2.8'
35
- type: :development
20
+ version: '2.2'
21
+ type: :runtime
36
22
  prerelease: false
37
23
  version_requirements: !ruby/object:Gem::Requirement
38
24
  requirements:
39
25
  - - "~>"
40
26
  - !ruby/object:Gem::Version
41
- version: '2.8'
27
+ version: '2.2'
42
28
  - !ruby/object:Gem::Dependency
43
- name: rspec
29
+ name: gpt4all
44
30
  requirement: !ruby/object:Gem::Requirement
45
31
  requirements:
46
32
  - - "~>"
47
33
  - !ruby/object:Gem::Version
48
- version: '3.2'
49
- type: :development
34
+ version: 0.0.4
35
+ type: :runtime
50
36
  prerelease: false
51
37
  version_requirements: !ruby/object:Gem::Requirement
52
38
  requirements:
53
39
  - - "~>"
54
40
  - !ruby/object:Gem::Version
55
- version: '3.2'
41
+ version: 0.0.4
56
42
  - !ruby/object:Gem::Dependency
57
- name: google_search_results
43
+ name: hnswlib
58
44
  requirement: !ruby/object:Gem::Requirement
59
45
  requirements:
60
46
  - - "~>"
61
47
  - !ruby/object:Gem::Version
62
- version: '2.2'
48
+ version: '0.8'
63
49
  type: :runtime
64
50
  prerelease: false
65
51
  version_requirements: !ruby/object:Gem::Requirement
66
52
  requirements:
67
53
  - - "~>"
68
54
  - !ruby/object:Gem::Version
69
- version: '2.2'
55
+ version: '0.8'
70
56
  - !ruby/object:Gem::Dependency
71
- name: gpt4all
57
+ name: ruby-openai
72
58
  requirement: !ruby/object:Gem::Requirement
73
59
  requirements:
74
60
  - - "~>"
75
61
  - !ruby/object:Gem::Version
76
- version: 0.0.4
62
+ version: '4.1'
77
63
  type: :runtime
78
64
  prerelease: false
79
65
  version_requirements: !ruby/object:Gem::Requirement
80
66
  requirements:
81
67
  - - "~>"
82
68
  - !ruby/object:Gem::Version
83
- version: 0.0.4
69
+ version: '4.1'
84
70
  - !ruby/object:Gem::Dependency
85
- name: ruby-openai
71
+ name: pgvector
86
72
  requirement: !ruby/object:Gem::Requirement
87
73
  requirements:
88
74
  - - "~>"
89
75
  - !ruby/object:Gem::Version
90
- version: '3.0'
76
+ version: '0.2'
91
77
  type: :runtime
92
78
  prerelease: false
93
79
  version_requirements: !ruby/object:Gem::Requirement
94
80
  requirements:
95
81
  - - "~>"
96
82
  - !ruby/object:Gem::Version
97
- version: '3.0'
83
+ version: '0.2'
98
84
  description: You simply set an OpenAI key, give a number of Boxcars to a Train, and
99
85
  magic ensues when you run it.
100
86
  email:
@@ -124,6 +110,7 @@ files:
124
110
  - lib/boxcars/boxcar/google_search.rb
125
111
  - lib/boxcars/boxcar/sql.rb
126
112
  - lib/boxcars/boxcar/swagger.rb
113
+ - lib/boxcars/boxcar/vector_answer.rb
127
114
  - lib/boxcars/boxcar/wikipedia_search.rb
128
115
  - lib/boxcars/conversation.rb
129
116
  - lib/boxcars/conversation_prompt.rb
@@ -144,13 +131,17 @@ files:
144
131
  - lib/boxcars/vector_store/document.rb
145
132
  - lib/boxcars/vector_store/embed_via_open_ai.rb
146
133
  - lib/boxcars/vector_store/embed_via_tensorflow.rb
147
- - lib/boxcars/vector_store/hnswlib/build_vector_store.rb
148
- - lib/boxcars/vector_store/hnswlib/hnswlib_config.rb
149
- - lib/boxcars/vector_store/hnswlib/hnswlib_search.rb
134
+ - lib/boxcars/vector_store/hnswlib/build_from_files.rb
135
+ - lib/boxcars/vector_store/hnswlib/load_from_disk.rb
150
136
  - lib/boxcars/vector_store/hnswlib/save_to_hnswlib.rb
151
- - lib/boxcars/vector_store/in_memory/add_documents.rb
137
+ - lib/boxcars/vector_store/hnswlib/search.rb
138
+ - lib/boxcars/vector_store/in_memory/build_from_document_array.rb
139
+ - lib/boxcars/vector_store/in_memory/build_from_files.rb
152
140
  - lib/boxcars/vector_store/in_memory/search.rb
153
- - lib/boxcars/vector_store/similarity_search.rb
141
+ - lib/boxcars/vector_store/pgvector/build_from_array.rb
142
+ - lib/boxcars/vector_store/pgvector/build_from_files.rb
143
+ - lib/boxcars/vector_store/pgvector/save_to_database.rb
144
+ - lib/boxcars/vector_store/pgvector/search.rb
154
145
  - lib/boxcars/vector_store/split_text.rb
155
146
  - lib/boxcars/version.rb
156
147
  homepage: https://github.com/BoxcarsAI/boxcars
@@ -169,14 +160,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
169
160
  requirements:
170
161
  - - ">="
171
162
  - !ruby/object:Gem::Version
172
- version: 2.6.0
163
+ version: '3.0'
173
164
  required_rubygems_version: !ruby/object:Gem::Requirement
174
165
  requirements:
175
166
  - - ">="
176
167
  - !ruby/object:Gem::Version
177
168
  version: '0'
178
169
  requirements: []
179
- rubygems_version: 3.4.10
170
+ rubygems_version: 3.2.32
180
171
  signing_key:
181
172
  specification_version: 4
182
173
  summary: Boxcars is a gem that enables you to create new systems with AI composability.
@@ -1,157 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'fileutils'
4
- require 'hnswlib'
5
- require 'json'
6
-
7
- module Boxcars
8
- module VectorStore
9
- module Hnswlib
10
- class BuildVectorStore
11
- include VectorStore
12
-
13
- # This class is responsible for building the vector store for the hnswlib similarity search.
14
- # It will load the training data, generate the embeddings, and save the vector store.
15
- # It will also load the vector store into memory.
16
- # For later use, it will save the splitted document with index numbers to a json file.
17
- #
18
- # @param training_data_path [String] The path to the training data. Can be a glob pattern.
19
- # @param index_file_path [String] The path to the index file.
20
- # @param split_chunk_size [Integer] The number of documents to split the text into. default 2000
21
- # @option json_doc_file_path [String]. The json file containing the document text.
22
- # if nil, it will reuse index file name.
23
- # @option force_rebuild [Boolean] Optional. If true, will rebuild the index even if it already exists.
24
- def initialize(
25
- training_data_path:,
26
- index_file_path:,
27
- split_chunk_size: 2000,
28
- json_doc_file_path: nil,
29
- force_rebuild: true
30
- )
31
- @training_data_path = training_data_path
32
- @index_file_path = index_file_path
33
- @split_chunk_size = split_chunk_size
34
- @json_doc_file_path = json_doc_file_path || index_file_path.gsub(/\.bin$/, '.json')
35
- @force_rebuild = force_rebuild
36
- end
37
-
38
- def call
39
- validate_params
40
- data = load_files
41
- documents = split_text_into_chunks(data)
42
- embeddings_with_config = generate_embeddings(documents)
43
- save_vector_store(embeddings_with_config)
44
- load_hnsw
45
- end
46
-
47
- private
48
-
49
- attr_reader :training_data_path, :index_file_path, :split_chunk_size, :json_doc_file_path, :force_rebuild
50
-
51
- def validate_params
52
- training_data_dir = File.dirname(training_data_path.gsub(/\*{1,2}/, ''))
53
- raise_error('training_data_path parent directory must exist') unless File.directory?(training_data_dir)
54
- raise_error('No files found at the training_data_path pattern') if Dir.glob(training_data_path).empty?
55
-
56
- index_dir = File.dirname(index_file_path)
57
- raise_error('index_file_path parent directory must exist') unless File.directory?(index_dir)
58
-
59
- raise_error('split_chunk_size must be an integer') unless split_chunk_size.is_a?(Integer)
60
- end
61
-
62
- def load_files
63
- data = []
64
- files = Dir.glob(training_data_path)
65
- raise_error "No files found at #{training_data_path}" if files.empty?
66
-
67
- files.each do |file|
68
- data << File.read(file)
69
- end
70
- puts "Added #{files.length} files to data. Splitting text into chunks..."
71
- data
72
- end
73
-
74
- def split_text_into_chunks(data)
75
- return true unless rebuild_required?
76
-
77
- docs = []
78
- data.each do |chunk|
79
- doc_output = Boxcars::VectorStore::SplitText.call(
80
- separator: "\n", chunk_size: split_chunk_size, chunk_overlap: 0, text: chunk
81
- )
82
- docs.concat(doc_output)
83
- end
84
- docs
85
- end
86
-
87
- def rebuild_required?
88
- hnswlib_config_json = "#{File.dirname(index_file_path)}/hnswlib_config.json"
89
- return true unless File.exist?(index_file_path)
90
- return true if File.exist?(index_file_path) && !File.exist?(hnswlib_config_json)
91
- return true if force_rebuild
92
-
93
- false
94
- end
95
-
96
- def generate_embeddings(documents)
97
- return true unless rebuild_required?
98
-
99
- puts "Initializing Store..."
100
- openai_client = Openai.open_ai_client
101
- embeddings_with_dim = Boxcars::VectorStore::EmbedViaOpenAI.call(texts: documents, client: openai_client)
102
- document_embeddings = embeddings_with_dim.map.with_index do |item, index|
103
- { doc_id: index, embedding: item[:embedding], document: documents[index] }
104
- end
105
-
106
- { document_embeddings: document_embeddings, dim: embeddings_with_dim.first[:dim] }
107
- end
108
-
109
- def save_vector_store(embeddings_with_config)
110
- return true unless rebuild_required?
111
-
112
- puts "Saving Vectorstore"
113
- Boxcars::VectorStore::Hnswlib::SaveToHnswlib.call(
114
- document_embeddings: embeddings_with_config[:document_embeddings],
115
- index_file_path: index_file_path,
116
- json_doc_file_path: json_doc_file_path,
117
- hnswlib_config: hnswlib_config(embeddings_with_config[:dim])
118
- )
119
- puts "VectorStore saved"
120
- end
121
-
122
- def hnswlib_config(dim)
123
- # dim: length of datum point vector that will be indexed.
124
- Boxcars::VectorStore::Hnswlib::HnswlibConfig.new(
125
- metric: "l2", max_item: 10000, dim: dim
126
- )
127
- end
128
-
129
- def load_hnsw
130
- puts "Loading Hnswlib"
131
-
132
- config_file = "#{File.dirname(index_file_path)}/hnswlib_config.json"
133
- json_config = parse_json_file(config_file)
134
- document_embeddings = parse_json_file(json_doc_file_path)
135
-
136
- search_index = ::Hnswlib::HierarchicalNSW.new(space: json_config[:metric], dim: json_config[:dim])
137
- search_index.load_index(index_file_path)
138
-
139
- { vector_store: search_index, document_embeddings: document_embeddings }
140
- end
141
-
142
- def parse_json_file(file_path)
143
- return [] if file_path.nil?
144
-
145
- file_content = File.read(file_path)
146
- JSON.parse(file_content, symbolize_names: true)
147
- rescue JSON::ParserError => e
148
- raise_error("Error parsing hnswlib_config.json: #{e.message}")
149
- end
150
-
151
- def raise_error(message)
152
- raise ::Boxcars::Error, message
153
- end
154
- end
155
- end
156
- end
157
- end
@@ -1,56 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'json'
4
-
5
- module Boxcars
6
- module VectorStore
7
- module Hnswlib
8
- class HnswlibConfig
9
- attr_reader :metric, :max_item, :dim, :ef_construction, :m
10
-
11
- # used for search index.
12
- #
13
- # @param max_item [Integer] The maximum number of items.
14
- #
15
- # @param metric [String] The distance metric between vectors ('l2', 'dot', or 'cosine').
16
- #
17
- # @param ef_construction [Integer] The size of the dynamic list for the nearest neighbors.
18
- # It controls the index time/accuracy trade-off.
19
- #
20
- # @param max_outgoing_connection [Integer] The maximum number of outgoing connections in the graph
21
- #
22
- # reference: https://yoshoku.github.io/hnswlib.rb/doc/
23
- def initialize(
24
- metric: "l2",
25
- max_item: 10000,
26
- dim: 2,
27
- ef_construction: 200,
28
- max_outgoing_connection: 16
29
- )
30
- @metric = metric
31
- @max_item = max_item
32
- @dim = dim
33
- @ef_construction = ef_construction
34
- @max_outgoing_connection = max_outgoing_connection
35
- end
36
-
37
- def space
38
- @metric == 'dot' ? 'ip' : 'l2'
39
- end
40
-
41
- def to_json(*args)
42
- JSON.pretty_generate(
43
- {
44
- metric: @metric,
45
- max_item: @max_item,
46
- dim: @dim,
47
- ef_construction: @ef_construction,
48
- max_outgoing_connection: @max_outgoing_connection
49
- },
50
- *args
51
- )
52
- end
53
- end
54
- end
55
- end
56
- end
@@ -1,54 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'hnswlib'
4
- require 'json'
5
-
6
- module Boxcars
7
- module VectorStore
8
- module Hnswlib
9
- class HnswlibSearch
10
- def initialize(vector_store:, options: {})
11
- validate_params(vector_store)
12
- @vector_store = vector_store
13
- @json_doc_path = options[:json_doc_path]
14
- @num_neighbors = options[:num_neighbors] || 1
15
- end
16
-
17
- def call(query)
18
- search(query)
19
- end
20
-
21
- private
22
-
23
- attr_reader :json_doc_path, :vector_store, :num_neighbors
24
-
25
- def validate_params(vector_store)
26
- raise_error 'vector_store must be an Hnswlib::HierarchicalNSW' unless vector_store.is_a?(::Hnswlib::HierarchicalNSW)
27
- end
28
-
29
- def search(query)
30
- raw_results = vector_store.search_knn(query, num_neighbors)
31
- raw_results.map { |doc_id, distance| lookup_embedding2(doc_id, distance) }.compact
32
- end
33
-
34
- def lookup_embedding2(doc_id, distance)
35
- embedding_data = parsed_data.find { |embedding| embedding[:doc_id] == doc_id }
36
- return unless embedding_data
37
-
38
- { document: embedding_data[:document], distance: distance }
39
- end
40
-
41
- def parsed_data
42
- @parsed_data ||= JSON.parse(
43
- File.read(json_doc_path),
44
- symbolize_names: true
45
- )
46
- end
47
-
48
- def raise_error(message)
49
- raise ::Boxcars::ArgumentError, message
50
- end
51
- end
52
- end
53
- end
54
- end
@@ -1,67 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- module Boxcars
4
- module VectorStore
5
- module InMemory
6
- MemoryVector = Struct.new(:content, :embedding, :metadatax)
7
-
8
- class AddDocuments
9
- include VectorStore
10
-
11
- def initialize(embedding_tool: :openai, documents: nil)
12
- validate_params(embedding_tool, documents)
13
- @embedding_tool = embedding_tool
14
- @documents = documents
15
- @memory_vectors = []
16
- end
17
-
18
- def call
19
- texts = @documents.map { |doc| doc[:page_content] }
20
- vectors = generate_vectors(texts)
21
- add_vectors(vectors, @documents)
22
- @memory_vectors
23
- end
24
-
25
- private
26
-
27
- def validate_params(embedding_tool, documents)
28
- raise ::Boxcars::ArgumentError, 'documents is nil' unless documents
29
- return if %i[openai tensorflow].include?(embedding_tool)
30
-
31
- raise ::Boxcars::ArgumentError, 'embedding_tool is invalid'
32
- end
33
-
34
- # returns array of documents with vectors
35
- def add_vectors(vectors, documents)
36
- vectors.zip(documents).each do |vector, doc|
37
- memory_vector = MemoryVector.new(doc[:page_content], vector, doc[:metadata])
38
- @memory_vectors << memory_vector
39
- end
40
- end
41
-
42
- def generate_vectors(texts)
43
- embeddings_method[:klass].call(
44
- texts: texts, client: embeddings_method[:client]
45
- )
46
- end
47
-
48
- def embeddings_method
49
- @embeddings_method ||=
50
- case @embedding_tool
51
- when :openai
52
- { klass: Boxcars::VectorStore::EmbedViaOpenAI, client: openai_client }
53
- when :tensorflow
54
- { klass: Boxcars::VectorStore::EmbedViaTensorflow, client: nil }
55
- end
56
- end
57
-
58
- # Get the OpenAI client
59
- # @param openai_access_token [String] the OpenAI access token
60
- # @return [OpenAI::Client]
61
- def openai_client(openai_access_token: nil)
62
- @openai_client ||= Openai.open_ai_client(openai_access_token: openai_access_token)
63
- end
64
- end
65
- end
66
- end
67
- end
@@ -1,55 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require 'hnswlib'
4
-
5
- module Boxcars
6
- module VectorStore
7
- class SimilaritySearch
8
- def initialize(embeddings:, vector_store:, openai_connection: nil, openai_access_token: nil)
9
- @embeddings = embeddings
10
- @vector_store = vector_store
11
- @similarity_search_instance = create_similarity_search_instance
12
- @openai_connection = openai_connection || default_connection(openai_access_token: openai_access_token)
13
- end
14
-
15
- def call(query:)
16
- validate_query(query)
17
- query_vector = convert_query_to_vector(query)
18
- @similarity_search_instance.call(query_vector)
19
- end
20
-
21
- private
22
-
23
- attr_reader :embeddings, :vector_store, :openai_connection
24
-
25
- def default_connection(openai_access_token: nil)
26
- Openai.open_ai_client(openai_access_token: openai_access_token)
27
- end
28
-
29
- def validate_query(query)
30
- raise_error 'query must be a string' unless query.is_a?(String)
31
- raise_error 'query must not be empty' if query.empty?
32
- end
33
-
34
- def convert_query_to_vector(query)
35
- Boxcars::VectorStore::EmbedViaOpenAI.call(texts: [query], client: openai_connection).first[:embedding]
36
- end
37
-
38
- def create_similarity_search_instance
39
- case vector_store
40
- when ::Hnswlib::HierarchicalNSW
41
- Boxcars::VectorStore::Hnswlib::HnswlibSearch.new(
42
- vector_store: vector_store,
43
- options: { json_doc_path: embeddings, num_neighbors: 2 }
44
- )
45
- else
46
- raise_error 'Unsupported vector store provided'
47
- end
48
- end
49
-
50
- def raise_error(message)
51
- raise ArgumentError, message
52
- end
53
- end
54
- end
55
- end