boxcars 0.7.7 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +6 -3
  3. data/.ruby-version +1 -1
  4. data/Gemfile +3 -13
  5. data/Gemfile.lock +29 -25
  6. data/POSTHOG_TEST_README.md +118 -0
  7. data/README.md +305 -0
  8. data/boxcars.gemspec +1 -2
  9. data/lib/boxcars/boxcar/active_record.rb +9 -10
  10. data/lib/boxcars/boxcar/calculator.rb +2 -2
  11. data/lib/boxcars/boxcar/engine_boxcar.rb +4 -4
  12. data/lib/boxcars/boxcar/google_search.rb +2 -2
  13. data/lib/boxcars/boxcar/json_engine_boxcar.rb +1 -1
  14. data/lib/boxcars/boxcar/ruby_calculator.rb +1 -1
  15. data/lib/boxcars/boxcar/sql_base.rb +4 -4
  16. data/lib/boxcars/boxcar/swagger.rb +3 -3
  17. data/lib/boxcars/boxcar/vector_answer.rb +3 -3
  18. data/lib/boxcars/boxcar/xml_engine_boxcar.rb +1 -1
  19. data/lib/boxcars/boxcar.rb +6 -6
  20. data/lib/boxcars/conversation_prompt.rb +3 -3
  21. data/lib/boxcars/engine/anthropic.rb +121 -23
  22. data/lib/boxcars/engine/cerebras.rb +2 -2
  23. data/lib/boxcars/engine/cohere.rb +135 -9
  24. data/lib/boxcars/engine/gemini_ai.rb +151 -76
  25. data/lib/boxcars/engine/google.rb +2 -2
  26. data/lib/boxcars/engine/gpt4all_eng.rb +92 -34
  27. data/lib/boxcars/engine/groq.rb +124 -73
  28. data/lib/boxcars/engine/intelligence_base.rb +52 -17
  29. data/lib/boxcars/engine/ollama.rb +127 -47
  30. data/lib/boxcars/engine/openai.rb +186 -103
  31. data/lib/boxcars/engine/perplexityai.rb +116 -136
  32. data/lib/boxcars/engine/together.rb +2 -2
  33. data/lib/boxcars/engine/unified_observability.rb +430 -0
  34. data/lib/boxcars/engine.rb +4 -3
  35. data/lib/boxcars/engines.rb +74 -0
  36. data/lib/boxcars/observability.rb +44 -0
  37. data/lib/boxcars/observability_backend.rb +17 -0
  38. data/lib/boxcars/observability_backends/multi_backend.rb +42 -0
  39. data/lib/boxcars/observability_backends/posthog_backend.rb +89 -0
  40. data/lib/boxcars/observation.rb +8 -8
  41. data/lib/boxcars/prompt.rb +16 -4
  42. data/lib/boxcars/result.rb +7 -12
  43. data/lib/boxcars/ruby_repl.rb +1 -1
  44. data/lib/boxcars/train/train_action.rb +1 -1
  45. data/lib/boxcars/train/xml_train.rb +3 -3
  46. data/lib/boxcars/train/xml_zero_shot.rb +1 -1
  47. data/lib/boxcars/train/zero_shot.rb +3 -3
  48. data/lib/boxcars/train.rb +1 -1
  49. data/lib/boxcars/vector_search.rb +5 -5
  50. data/lib/boxcars/vector_store/pgvector/build_from_array.rb +116 -88
  51. data/lib/boxcars/vector_store/pgvector/build_from_files.rb +106 -80
  52. data/lib/boxcars/vector_store/pgvector/save_to_database.rb +148 -122
  53. data/lib/boxcars/vector_store/pgvector/search.rb +157 -131
  54. data/lib/boxcars/vector_store.rb +4 -4
  55. data/lib/boxcars/version.rb +1 -1
  56. data/lib/boxcars.rb +31 -20
  57. metadata +11 -21
@@ -1,96 +1,122 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- require 'pgvector'
4
- require 'fileutils'
5
- require 'json'
6
-
7
- module Boxcars
8
- module VectorStore
9
- module Pgvector
10
- class BuildFromFiles
11
- include VectorStore
12
-
13
- # @param training_data_path [String] path to training data files
14
- # @param split_chunk_size [Integer] number of characters to split the text into
15
- # @param embedding_tool [Symbol] embedding tool to use
16
- # @param database_url [String] database url
17
- # @param table_name [String] table name
18
- # @param embedding_column_name [String] embedding column name
19
- # @param content_column_name [String] content column name
20
- # @param metadata_column_name [String] metadata column name
21
- # @return [Hash] vector_store: array of hashes with :content, :metadata, and :embedding keys
22
- def initialize(params)
23
- @split_chunk_size = params[:split_chunk_size] || 2000
24
- @training_data_path = File.absolute_path(params[:training_data_path])
25
- @embedding_tool = params[:embedding_tool] || :openai
26
-
27
- validate_params(embedding_tool, training_data_path)
28
-
29
- @database_url = params[:database_url]
30
- @table_name = params[:table_name]
31
- @embedding_column_name = params[:embedding_column_name]
32
- @content_column_name = params[:content_column_name]
33
- @metadata_column_name = params[:metadata_column_name]
34
-
35
- @pg_vectors = []
36
- end
3
+ if Gem.loaded_specs.key?('pgvector') && Gem.loaded_specs.key?('pg')
4
+ require 'pgvector'
5
+ require 'fileutils'
6
+ require 'json'
7
+
8
+ module Boxcars
9
+ module VectorStore
10
+ module Pgvector
11
+ class BuildFromFiles
12
+ include VectorStore
13
+
14
+ # @param training_data_path [String] path to training data files
15
+ # @param split_chunk_size [Integer] number of characters to split the text into
16
+ # @param embedding_tool [Symbol] embedding tool to use
17
+ # @param database_url [String] database url
18
+ # @param table_name [String] table name
19
+ # @param embedding_column_name [String] embedding column name
20
+ # @param content_column_name [String] content column name
21
+ # @param metadata_column_name [String] metadata column name
22
+ # @return [Hash] vector_store: array of hashes with :content, :metadata, and :embedding keys
23
+ def initialize(params)
24
+ @split_chunk_size = params[:split_chunk_size] || 2000
25
+ @training_data_path = File.absolute_path(params[:training_data_path])
26
+ @embedding_tool = params[:embedding_tool] || :openai
27
+
28
+ validate_params(embedding_tool, training_data_path)
29
+
30
+ @database_url = params[:database_url]
31
+ @table_name = params[:table_name]
32
+ @embedding_column_name = params[:embedding_column_name]
33
+ @content_column_name = params[:content_column_name]
34
+ @metadata_column_name = params[:metadata_column_name]
35
+
36
+ @pg_vectors = []
37
+ end
37
38
 
38
- # @return [Hash] vector_store: array of Inventor::VectorStore::Document
39
- def call
40
- data = load_data_files(training_data_path)
41
- texts = split_text_into_chunks(data)
42
- embeddings = generate_vectors(texts)
43
- add_vectors(embeddings, texts)
44
- documents = save_vector_store
45
-
46
- {
47
- type: :pgvector,
48
- vector_store: documents
49
- }
50
- end
39
+ # @return [Hash] vector_store: array of Inventor::VectorStore::Document
40
+ def call
41
+ data = load_data_files(training_data_path)
42
+ texts = split_text_into_chunks(data)
43
+ embeddings = generate_vectors(texts)
44
+ add_vectors(embeddings, texts)
45
+ documents = save_vector_store
46
+
47
+ {
48
+ type: :pgvector,
49
+ vector_store: documents
50
+ }
51
+ end
51
52
 
52
- private
53
+ private
53
54
 
54
- attr_reader :split_chunk_size, :training_data_path, :embedding_tool, :database_url,
55
- :table_name, :embedding_column_name, :content_column_name,
56
- :metadata_column_name, :pg_vectors
55
+ attr_reader :split_chunk_size, :training_data_path, :embedding_tool, :database_url,
56
+ :table_name, :embedding_column_name, :content_column_name,
57
+ :metadata_column_name, :pg_vectors
57
58
 
58
- def validate_params(embedding_tool, training_data_path)
59
- training_data_dir = File.dirname(training_data_path.gsub(/\*{1,2}/, ''))
59
+ def validate_params(embedding_tool, training_data_path)
60
+ training_data_dir = File.dirname(training_data_path.gsub(/\*{1,2}/, ''))
60
61
 
61
- raise_argument_error('training_data_path parent directory must exist') unless Dir.exist?(training_data_dir)
62
- raise_argument_error('No files found at the training_data_path pattern') if Dir.glob(training_data_path).empty?
63
- return if %i[openai tensorflow].include?(embedding_tool)
62
+ raise_argument_error('training_data_path parent directory must exist') unless Dir.exist?(training_data_dir)
63
+ raise_argument_error('No files found at the training_data_path pattern') if Dir.glob(training_data_path).empty?
64
+ return if %i[openai tensorflow].include?(embedding_tool)
64
65
 
65
- raise_argument_error('embedding_tool is invalid')
66
- end
66
+ raise_argument_error('embedding_tool is invalid')
67
+ end
68
+
69
+ def add_vectors(vectors, texts)
70
+ vectors.map.with_index do |vector, index|
71
+ pg_vector = Document.new(
72
+ content: texts[index],
73
+ embedding: vector[:embedding],
74
+ metadata: {
75
+ doc_id: index,
76
+ training_data_path: training_data_path
77
+ }
78
+ )
79
+ pg_vectors << pg_vector
80
+ end
81
+ end
67
82
 
68
- def add_vectors(vectors, texts)
69
- vectors.map.with_index do |vector, index|
70
- pg_vector = Document.new(
71
- content: texts[index],
72
- embedding: vector[:embedding],
73
- metadata: {
74
- doc_id: index,
75
- training_data_path: training_data_path
76
- }
83
+ def save_vector_store
84
+ result = Boxcars::VectorStore::Pgvector::SaveToDatabase.call(
85
+ pg_vectors: pg_vectors,
86
+ database_url: database_url,
87
+ table_name: table_name,
88
+ embedding_column_name: embedding_column_name,
89
+ content_column_name: content_column_name,
90
+ metadata_column_name: metadata_column_name
77
91
  )
78
- pg_vectors << pg_vector
92
+ raise_argument_error('Error saving vector store to database.') unless result
93
+
94
+ result
79
95
  end
80
96
  end
97
+ end
98
+ end
99
+ end
100
+ else
101
+ # Define placeholder modules/classes that raise an error if pgvector is not available
102
+ module Boxcars
103
+ module VectorStore
104
+ module Pgvector
105
+ class PgvectorNotAvailableError < StandardError
106
+ DEFAULT_MESSAGE = "The 'pgvector' and 'pg' gems are required. Please add them to your Gemfile."
107
+ def initialize(message = DEFAULT_MESSAGE)
108
+ super
109
+ end
110
+ end
111
+
112
+ class BuildFromFiles
113
+ def initialize(*_args)
114
+ raise PgvectorNotAvailableError
115
+ end
81
116
 
82
- def save_vector_store
83
- result = Boxcars::VectorStore::Pgvector::SaveToDatabase.call(
84
- pg_vectors: pg_vectors,
85
- database_url: database_url,
86
- table_name: table_name,
87
- embedding_column_name: embedding_column_name,
88
- content_column_name: content_column_name,
89
- metadata_column_name: metadata_column_name
90
- )
91
- raise_argument_error('Error saving vector store to database.') unless result
92
-
93
- result
117
+ def call(*_args)
118
+ raise PgvectorNotAvailableError
119
+ end
94
120
  end
95
121
  end
96
122
  end
@@ -1,148 +1,174 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- require 'pg'
4
- require 'pgvector'
5
-
6
- module Boxcars
7
- module VectorStore
8
- module Pgvector
9
- class SaveToDatabase
10
- include VectorStore
11
-
12
- # @param pg_vectors [Array] array of Boxcars::VectorStore::Document
13
- # @param database_url [String] database url
14
- # @param table_name [String] table name
15
- # @param embedding_column_name [String] embedding column name
16
- # @param content_column_name [String] content column name
17
- # @param metadata_column_name [String] metadata column name
18
- # @return [Array] array of Boxcars::VectorStore::Document
19
- def initialize(params)
20
- validate_param_types(params)
21
- @db_connection = test_db_params(params)
22
-
23
- @table_name = params[:table_name]
24
- @content_column_name = params[:content_column_name]
25
- @embedding_column_name = params[:embedding_column_name]
26
- @metadata_column_name = params[:metadata_column_name]
27
-
28
- @pg_vectors = params[:pg_vectors]
29
- end
3
+ if Gem.loaded_specs.key?('pgvector') && Gem.loaded_specs.key?('pg')
4
+ require 'pg'
5
+ require 'pgvector'
6
+
7
+ module Boxcars
8
+ module VectorStore
9
+ module Pgvector
10
+ class SaveToDatabase
11
+ include VectorStore
12
+
13
+ # @param pg_vectors [Array] array of Boxcars::VectorStore::Document
14
+ # @param database_url [String] database url
15
+ # @param table_name [String] table name
16
+ # @param embedding_column_name [String] embedding column name
17
+ # @param content_column_name [String] content column name
18
+ # @param metadata_column_name [String] metadata column name
19
+ # @return [Array] array of Boxcars::VectorStore::Document
20
+ def initialize(params)
21
+ validate_param_types(params)
22
+ @db_connection = test_db_params(params)
23
+
24
+ @table_name = params[:table_name]
25
+ @content_column_name = params[:content_column_name]
26
+ @embedding_column_name = params[:embedding_column_name]
27
+ @metadata_column_name = params[:metadata_column_name]
28
+
29
+ @pg_vectors = params[:pg_vectors]
30
+ end
30
31
 
31
- # @return [Array] array of Boxcars::VectorStore::Document
32
- def call
33
- add_vectors_to_database
34
- end
32
+ # @return [Array] array of Boxcars::VectorStore::Document
33
+ def call
34
+ add_vectors_to_database
35
+ end
35
36
 
36
- private
37
+ private
37
38
 
38
- attr_reader :database_url, :pg_vectors, :db_connection, :table_name,
39
- :embedding_column_name, :content_column_name,
40
- :metadata_column_name
39
+ attr_reader :database_url, :pg_vectors, :db_connection, :table_name,
40
+ :embedding_column_name, :content_column_name,
41
+ :metadata_column_name
41
42
 
42
- def validate_param_types(params)
43
- pg_vectors = params[:pg_vectors]
43
+ def validate_param_types(params)
44
+ pg_vectors = params[:pg_vectors]
44
45
 
45
- raise_argument_error('pg_vectors must be an array') unless pg_vectors.is_a?(Array)
46
- raise_argument_error('missing data') if pg_vectors.empty?
47
- raise_argument_error('invalid vector_store') unless valid_vector_store?(pg_vectors)
48
- @database_url = params[:database_url]
49
- raise_argument_error('missing database_url argument') if @database_url.to_s.empty?
50
- end
46
+ raise_argument_error('pg_vectors must be an array') unless pg_vectors.is_a?(Array)
47
+ raise_argument_error('missing data') if pg_vectors.empty?
48
+ raise_argument_error('invalid vector_store') unless valid_vector_store?(pg_vectors)
49
+ @database_url = params[:database_url]
50
+ raise_argument_error('missing database_url argument') if @database_url.to_s.empty?
51
+ end
51
52
 
52
- def valid_vector_store?(pg_vectors)
53
- pg_vectors.all? do |doc|
54
- doc.is_a?(Boxcars::VectorStore::Document)
53
+ def valid_vector_store?(pg_vectors)
54
+ pg_vectors.all? do |doc|
55
+ doc.is_a?(Boxcars::VectorStore::Document)
56
+ end
57
+ rescue TypeError => e
58
+ raise_argument_error(e.message)
55
59
  end
56
- rescue TypeError => e
57
- raise_argument_error(e.message)
58
- end
59
60
 
60
- def test_db_params(params)
61
- conn = ::PG::Connection.new(@database_url)
62
-
63
- check_db_connection(conn)
64
- check_vector_extension(conn)
65
- check_table_exists(conn, params[:table_name])
66
- check_column_exists(conn, params)
67
-
68
- registry = PG::BasicTypeRegistry.new.define_default_types
69
- ::Pgvector::PG.register_vector(registry)
70
- conn.type_map_for_queries = PG::BasicTypeMapForQueries.new(conn, registry: registry)
71
- conn.type_map_for_results = PG::BasicTypeMapForResults.new(conn, registry: registry)
72
- conn
73
- rescue PG::Error, NameError => e
74
- raise_argument_error(e.message)
75
- end
61
+ def test_db_params(params)
62
+ conn = ::PG::Connection.new(@database_url)
63
+
64
+ check_db_connection(conn)
65
+ check_vector_extension(conn)
66
+ check_table_exists(conn, params[:table_name])
67
+ check_column_exists(conn, params)
68
+
69
+ registry = PG::BasicTypeRegistry.new.define_default_types
70
+ ::Pgvector::PG.register_vector(registry)
71
+ conn.type_map_for_queries = PG::BasicTypeMapForQueries.new(conn, registry: registry)
72
+ conn.type_map_for_results = PG::BasicTypeMapForResults.new(conn, registry: registry)
73
+ conn
74
+ rescue PG::Error, NameError => e
75
+ raise_argument_error(e.message)
76
+ end
76
77
 
77
- def check_db_connection(conn)
78
- return if conn.status == PG::CONNECTION_OK
78
+ def check_db_connection(conn)
79
+ return if conn.status == PG::CONNECTION_OK
79
80
 
80
- raise_argument_error("PostgreSQL connection is not ok")
81
- end
81
+ raise_argument_error("PostgreSQL connection is not ok")
82
+ end
82
83
 
83
- def check_vector_extension(conn)
84
- return if conn.exec("SELECT 1 FROM pg_extension WHERE extname = 'vector'").any?
84
+ def check_vector_extension(conn)
85
+ return if conn.exec("SELECT 1 FROM pg_extension WHERE extname = 'vector'").any?
85
86
 
86
- raise_argument_error("PostgreSQL 'vector' extension is not installed")
87
- end
87
+ raise_argument_error("PostgreSQL 'vector' extension is not installed")
88
+ end
89
+
90
+ def check_table_exists(conn, table_name)
91
+ table_exists = conn.exec_params(
92
+ "SELECT EXISTS (SELECT 1 FROM information_schema.tables WHERE table_name = $1)", [table_name]
93
+ ).getvalue(0, 0) == "t"
94
+ return if table_exists
88
95
 
89
- def check_table_exists(conn, table_name)
90
- table_exists = conn.exec_params(
91
- "SELECT EXISTS (SELECT 1 FROM information_schema.tables WHERE table_name = $1)", [table_name]
92
- ).getvalue(0, 0) == "t"
93
- return if table_exists
96
+ raise_argument_error("Table '#{table_name}' does not exist")
97
+ end
94
98
 
95
- raise_argument_error("Table '#{table_name}' does not exist")
96
- end
99
+ def check_column_exists(conn, params)
100
+ column_names = %i[embedding_column_name content_column_name metadata_column_name]
101
+ table_name = params[:table_name]
97
102
 
98
- def check_column_exists(conn, params)
99
- column_names = %i[embedding_column_name content_column_name metadata_column_name]
100
- table_name = params[:table_name]
103
+ column_names.each do |target|
104
+ column_name = params[target]
105
+ column_exists = conn.exec_params(
106
+ "SELECT EXISTS (SELECT 1 FROM information_schema.columns WHERE table_name = $1 AND column_name = $2)",
107
+ [table_name, column_name]
108
+ ).getvalue(0, 0) == "t"
109
+ next if column_exists
101
110
 
102
- column_names.each do |target|
103
- column_name = params[target]
104
- column_exists = conn.exec_params(
105
- "SELECT EXISTS (SELECT 1 FROM information_schema.columns WHERE table_name = $1 AND column_name = $2)",
106
- [table_name, column_name]
107
- ).getvalue(0, 0) == "t"
108
- next if column_exists
111
+ raise_argument_error("Column '#{column_name}' does not exist in table '#{table_name}'")
112
+ end
113
+ end
109
114
 
110
- raise_argument_error("Column '#{column_name}' does not exist in table '#{table_name}'")
115
+ def add_vectors_to_database
116
+ pg_vectors.each do |document|
117
+ embedding = document.embedding.map(&:to_f)
118
+ content = document.content
119
+ metadata = document.metadata.to_json
120
+
121
+ if document.metadata[:id]
122
+ id = document.metadata[:id]
123
+ # directly inserting table_name, embedding_column_name, and content_column_name
124
+ # into the SQL command. If these values are coming from an untrusted source,
125
+ # there is a risk of SQL injection
126
+ sql = <<-SQL
127
+ INSERT INTO #{table_name} (id, #{embedding_column_name}, #{content_column_name}, #{metadata_column_name})
128
+ VALUES ($1, $2, $3, $4)
129
+ ON CONFLICT (id) DO UPDATE
130
+ SET #{embedding_column_name} = EXCLUDED.#{embedding_column_name},
131
+ #{content_column_name} = EXCLUDED.#{content_column_name},
132
+ #{metadata_column_name} = EXCLUDED.#{metadata_column_name}
133
+ SQL
134
+ # parameters are given separately from the SQL command,
135
+ # there's no risk of them being interpreted as part of the command.
136
+ db_connection.exec_params(sql, [id, embedding, content, metadata])
137
+ else
138
+ sql = <<-SQL
139
+ INSERT INTO #{table_name} (#{embedding_column_name}, #{content_column_name}, #{metadata_column_name})
140
+ VALUES ($1, $2, $3)
141
+ SQL
142
+ db_connection.exec_params(sql, [embedding, content, metadata])
143
+ end
144
+ end
145
+ rescue PG::Error => e
146
+ raise_argument_error(e.message)
147
+ end
148
+ end
149
+ end
150
+ end
151
+ end
152
+ else
153
+ # Define placeholder modules/classes that raise an error if pgvector is not available
154
+ module Boxcars
155
+ module VectorStore
156
+ module Pgvector
157
+ class PgvectorNotAvailableError < StandardError
158
+ DEFAULT_MESSAGE = "The 'pgvector' and 'pg' gems are required. Please add them to your Gemfile."
159
+ def initialize(message = DEFAULT_MESSAGE)
160
+ super
111
161
  end
112
162
  end
113
163
 
114
- def add_vectors_to_database
115
- pg_vectors.each do |document|
116
- embedding = document.embedding.map(&:to_f)
117
- content = document.content
118
- metadata = document.metadata.to_json
119
-
120
- if document.metadata[:id]
121
- id = document.metadata[:id]
122
- # directly inserting table_name, embedding_column_name, and content_column_name
123
- # into the SQL command. If these values are coming from an untrusted source,
124
- # there is a risk of SQL injection
125
- sql = <<-SQL
126
- INSERT INTO #{table_name} (id, #{embedding_column_name}, #{content_column_name}, #{metadata_column_name})
127
- VALUES ($1, $2, $3, $4)
128
- ON CONFLICT (id) DO UPDATE
129
- SET #{embedding_column_name} = EXCLUDED.#{embedding_column_name},
130
- #{content_column_name} = EXCLUDED.#{content_column_name},
131
- #{metadata_column_name} = EXCLUDED.#{metadata_column_name}
132
- SQL
133
- # parameters are given separately from the SQL command,
134
- # there's no risk of them being interpreted as part of the command.
135
- db_connection.exec_params(sql, [id, embedding, content, metadata])
136
- else
137
- sql = <<-SQL
138
- INSERT INTO #{table_name} (#{embedding_column_name}, #{content_column_name}, #{metadata_column_name})
139
- VALUES ($1, $2, $3)
140
- SQL
141
- db_connection.exec_params(sql, [embedding, content, metadata])
142
- end
164
+ class SaveToDatabase
165
+ def initialize(*_args)
166
+ raise PgvectorNotAvailableError
167
+ end
168
+
169
+ def call(*_args)
170
+ raise PgvectorNotAvailableError
143
171
  end
144
- rescue PG::Error => e
145
- raise_argument_error(e.message)
146
172
  end
147
173
  end
148
174
  end