bx_builder_chain 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57) hide show
  1. checksums.yaml +7 -0
  2. data/.rspec +3 -0
  3. data/.rubocop.yml +13 -0
  4. data/CHANGELOG.md +5 -0
  5. data/Gemfile +22 -0
  6. data/Gemfile.lock +120 -0
  7. data/README.md +74 -0
  8. data/Rakefile +12 -0
  9. data/bx_builder_chain.gemspec +35 -0
  10. data/lib/bx_builder_chain/chunker/recursive_text.rb +38 -0
  11. data/lib/bx_builder_chain/chunker/text.rb +38 -0
  12. data/lib/bx_builder_chain/configuration.rb +21 -0
  13. data/lib/bx_builder_chain/data.rb +28 -0
  14. data/lib/bx_builder_chain/dependency_helper.rb +22 -0
  15. data/lib/bx_builder_chain/llm/base.rb +64 -0
  16. data/lib/bx_builder_chain/llm/open_ai.rb +191 -0
  17. data/lib/bx_builder_chain/loader.rb +144 -0
  18. data/lib/bx_builder_chain/processors/base.rb +21 -0
  19. data/lib/bx_builder_chain/processors/csv.rb +27 -0
  20. data/lib/bx_builder_chain/processors/docx.rb +25 -0
  21. data/lib/bx_builder_chain/processors/html.rb +29 -0
  22. data/lib/bx_builder_chain/processors/json.rb +17 -0
  23. data/lib/bx_builder_chain/processors/pdf.rb +26 -0
  24. data/lib/bx_builder_chain/processors/text.rb +17 -0
  25. data/lib/bx_builder_chain/processors/xlsx.rb +31 -0
  26. data/lib/bx_builder_chain/utils/token_data/cl100k_base.tiktoken +100256 -0
  27. data/lib/bx_builder_chain/utils/token_length/base_validator.rb +45 -0
  28. data/lib/bx_builder_chain/utils/token_length/open_ai_validator.rb +70 -0
  29. data/lib/bx_builder_chain/utils/tokenization/byte_pair_encoding.rb +72 -0
  30. data/lib/bx_builder_chain/utils/tokenization/open_ai_encodings.rb +44 -0
  31. data/lib/bx_builder_chain/vectorsearch/base.rb +160 -0
  32. data/lib/bx_builder_chain/vectorsearch/pgvector.rb +228 -0
  33. data/lib/bx_builder_chain/version.rb +5 -0
  34. data/lib/bx_builder_chain.rb +38 -0
  35. data/lib/generators/bx_builder_chain/install_generator.rb +42 -0
  36. data/lib/generators/bx_builder_chain/templates/app/admin/bx_builder_chain_document.rb +65 -0
  37. data/lib/generators/bx_builder_chain/templates/app/controllers/bx_builder_chain/documents_controller.rb +65 -0
  38. data/lib/generators/bx_builder_chain/templates/app/controllers/bx_builder_chain/questions_controller.rb +33 -0
  39. data/lib/generators/bx_builder_chain/templates/app/controllers/bx_builder_chain/test_controller.rb +10 -0
  40. data/lib/generators/bx_builder_chain/templates/app/models/bx_builder_chain/document.rb +26 -0
  41. data/lib/generators/bx_builder_chain/templates/app/models/bx_builder_chain/document_chunk.rb +9 -0
  42. data/lib/generators/bx_builder_chain/templates/app/models/bx_builder_chain/embedding.rb +9 -0
  43. data/lib/generators/bx_builder_chain/templates/app/services/bx_builder_chain/document_upload_service.rb +47 -0
  44. data/lib/generators/bx_builder_chain/templates/app/services/bx_builder_chain/question_asking_service.rb +35 -0
  45. data/lib/generators/bx_builder_chain/templates/app/views/bx_builder_chain/test/form.html.erb +164 -0
  46. data/lib/generators/bx_builder_chain/templates/app/workers/bx_builder_chain/document_processor_worker.rb +32 -0
  47. data/lib/generators/bx_builder_chain/templates/initializer.rb +12 -0
  48. data/lib/generators/bx_builder_chain/templates/migration.rb +33 -0
  49. data/lib/pgvector/pg/binary_decoder/vector.rb +14 -0
  50. data/lib/pgvector/pg/text_decoder/vector.rb +12 -0
  51. data/lib/pgvector/pg.rb +10 -0
  52. data/lib/pgvector.rb +11 -0
  53. data/lib/sequel/plugins/pgvector/class_methods.rb +47 -0
  54. data/lib/sequel/plugins/pgvector/instance_methods.rb +34 -0
  55. data/lib/sequel/plugins/pgvector.rb +12 -0
  56. data/sig/bx_langchain_chat.rbs +4 -0
  57. metadata +238 -0
@@ -0,0 +1,47 @@
1
module Sequel
  module Plugins
    module Pgvector
      # Model-level methods contributed by the pgvector plugin.
      module ClassMethods
        # Hash of registered vector columns, keyed by column symbol.
        attr_accessor :vector_columns

        # Builds a dataset ordered by vector distance between +column+ and +value+.
        #
        # Each row gains a +neighbor_distance+ column, rows whose +column+ is
        # NULL are excluded, and results are ordered by the raw operator result.
        #
        # @param column [Symbol] vector column to compare against
        # @param value [String, Object] query vector; anything that is not
        #   already a String is passed through +::Pgvector.encode+
        # @param distance [#to_s] one of "inner_product", "cosine", "euclidean"
        # @param threshold [Numeric, nil] when given, keeps only rows whose
        #   +neighbor_distance+ is strictly below this value
        # @return the resulting dataset
        # @raise [ArgumentError] if +distance+ is not one of the supported names
        def nearest_neighbors(column, value, distance:, threshold: nil)
          encoded = value.is_a?(String) ? value : ::Pgvector.encode(value)
          quoted_column = dataset.quote_identifier(column)
          metric = distance.to_s

          operator = { "inner_product" => "<#>", "cosine" => "<=>", "euclidean" => "<->" }.fetch(metric) do
            raise ArgumentError, "Invalid distance: #{metric}"
          end

          ordering_sql = "#{quoted_column} #{operator} ?"

          # For inner_product the operator result is multiplied by -1 before it
          # is exposed as neighbor_distance; ordering still uses the raw result.
          distance_sql = metric == "inner_product" ? "(#{ordering_sql}) * -1" : ordering_sql

          neighbors = select_append(Sequel.lit("#{distance_sql} AS neighbor_distance", encoded))
          neighbors = neighbors.exclude(column => nil)
          neighbors = neighbors.order(Sequel.lit(ordering_sql, encoded))

          return neighbors unless threshold

          # NOTE(review): for inner_product this compares the sign-flipped value
          # with "<", which may not match the "maximum distance" meaning the
          # other metrics get -- confirm the intended semantics.
          neighbors.where(Sequel.lit("(#{distance_sql}) < ?", encoded, threshold))
        end

        # Ask Sequel to copy @vector_columns to subclasses using :dup.
        Plugins.inherited_instance_variables(self, :@vector_columns => :dup)
      end
    end
  end
end
@@ -0,0 +1,34 @@
1
module Sequel
  module Plugins
    module Pgvector
      # Instance-level methods mixed into models that use the pgvector plugin.
      #
      # Expects the model class to respond to +vector_columns+ (a Hash keyed by
      # column symbol) and to the class-level +nearest_neighbors+.
      module InstanceMethods
        # Builds a nearest-neighbour query around this record's own vector.
        #
        # @param column [Symbol, String] name of a registered vector column
        # @param options [Hash] keyword options forwarded unchanged to the
        #   class-level +nearest_neighbors+
        # @return the class-level result with this record excluded by primary key
        # @raise [ArgumentError] if +column+ is not a registered vector column
        def nearest_neighbors(column, **options)
          column = column.to_sym
          # Important: validate against the registered columns before using
          # the name to read a value.
          raise ArgumentError, "Invalid column" unless self.class.vector_columns[column]

          self.class
            .nearest_neighbors(column, self[column], **options)
            .exclude(primary_key => self[primary_key])
        end

        # Assigns a column value, encoding vectors for registered vector columns.
        #
        # Strings are stored as given. nil is stored as nil so an unset vector
        # is never handed to the encoder.
        #
        # @param k [Symbol, String] column name
        # @param v [Object] value to store
        def []=(k, v)
          if self.class.vector_columns.key?(k.to_sym) && !v.nil? && !v.is_a?(String)
            super(k, ::Pgvector.encode(v))
          else
            super
          end
        end

        # Reads a column value, decoding it for registered vector columns.
        #
        # @param k [Symbol, String] column name
        # @return [Object, nil] the decoded vector for a vector column, nil when
        #   that column holds no value, or the raw value for any other column
        def [](k)
          return super unless self.class.vector_columns.key?(k.to_sym)

          stored = super
          # An unset vector column has nothing to decode; return nil instead of
          # passing nil to the decoder.
          stored.nil? ? nil : ::Pgvector.decode(stored)
        end
      end
    end
  end
end
34
+
@@ -0,0 +1,12 @@
1
module Sequel
  module Plugins
    module Pgvector
      # Registers vector columns on a model.
      #
      # Ensures the model has a +vector_columns+ hash (keeping any existing
      # one) and adds an empty options hash for every given column, keyed by
      # the column name as a symbol.
      #
      # @param model [Object] object exposing a +vector_columns+ accessor
      # @param columns [Array<Symbol, String>] column names to register
      # @return [Array] the +columns+ that were passed in
      def self.configure(model, *columns)
        registry = (model.vector_columns ||= {})
        columns.each { |name| registry[name.to_sym] = {} }
      end
    end
  end
end
@@ -0,0 +1,4 @@
1
# Type signatures for the BxBuilderChain namespace.
# RBS writing guide: https://github.com/ruby/rbs#guides
module BxBuilderChain
  # Declared as a String constant.
  VERSION: String
end
metadata ADDED
@@ -0,0 +1,238 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: bx_builder_chain
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Paul Ketelle
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2023-08-31 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: zeitwerk
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - '='
18
+ - !ruby/object:Gem::Version
19
+ version: 2.6.11
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - '='
25
+ - !ruby/object:Gem::Version
26
+ version: 2.6.11
27
+ - !ruby/object:Gem::Dependency
28
+ name: baran
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - '='
32
+ - !ruby/object:Gem::Version
33
+ version: 0.1.7
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - '='
39
+ - !ruby/object:Gem::Version
40
+ version: 0.1.7
41
+ - !ruby/object:Gem::Dependency
42
+ name: sequel
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '5.71'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '5.71'
55
+ - !ruby/object:Gem::Dependency
56
+ name: pg
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: 1.5.3
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: 1.5.3
69
+ - !ruby/object:Gem::Dependency
70
+ name: dotenv
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '2.8'
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: '2.8'
83
+ - !ruby/object:Gem::Dependency
84
+ name: ruby-openai
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - "~>"
88
+ - !ruby/object:Gem::Version
89
+ version: 5.1.0
90
+ type: :runtime
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - "~>"
95
+ - !ruby/object:Gem::Version
96
+ version: 5.1.0
97
+ - !ruby/object:Gem::Dependency
98
+ name: pdf-reader
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - "~>"
102
+ - !ruby/object:Gem::Version
103
+ version: 2.11.0
104
+ type: :runtime
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - "~>"
109
+ - !ruby/object:Gem::Version
110
+ version: 2.11.0
111
+ - !ruby/object:Gem::Dependency
112
+ name: nokogiri
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - "~>"
116
+ - !ruby/object:Gem::Version
117
+ version: '1.8'
118
+ type: :runtime
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - "~>"
123
+ - !ruby/object:Gem::Version
124
+ version: '1.8'
125
+ - !ruby/object:Gem::Dependency
126
+ name: docx
127
+ requirement: !ruby/object:Gem::Requirement
128
+ requirements:
129
+ - - "~>"
130
+ - !ruby/object:Gem::Version
131
+ version: 0.8.0
132
+ type: :runtime
133
+ prerelease: false
134
+ version_requirements: !ruby/object:Gem::Requirement
135
+ requirements:
136
+ - - "~>"
137
+ - !ruby/object:Gem::Version
138
+ version: 0.8.0
139
+ - !ruby/object:Gem::Dependency
140
+ name: roo
141
+ requirement: !ruby/object:Gem::Requirement
142
+ requirements:
143
+ - - "~>"
144
+ - !ruby/object:Gem::Version
145
+ version: 2.8.3
146
+ type: :runtime
147
+ prerelease: false
148
+ version_requirements: !ruby/object:Gem::Requirement
149
+ requirements:
150
+ - - "~>"
151
+ - !ruby/object:Gem::Version
152
+ version: 2.8.3
153
+ description: Write a longer description or delete this line.
154
+ email:
155
+ - paul.ketelle@builder.ai
156
+ executables: []
157
+ extensions: []
158
+ extra_rdoc_files: []
159
+ files:
160
+ - ".rspec"
161
+ - ".rubocop.yml"
162
+ - CHANGELOG.md
163
+ - Gemfile
164
+ - Gemfile.lock
165
+ - README.md
166
+ - Rakefile
167
+ - bx_builder_chain.gemspec
168
+ - lib/bx_builder_chain.rb
169
+ - lib/bx_builder_chain/chunker/recursive_text.rb
170
+ - lib/bx_builder_chain/chunker/text.rb
171
+ - lib/bx_builder_chain/configuration.rb
172
+ - lib/bx_builder_chain/data.rb
173
+ - lib/bx_builder_chain/dependency_helper.rb
174
+ - lib/bx_builder_chain/llm/base.rb
175
+ - lib/bx_builder_chain/llm/open_ai.rb
176
+ - lib/bx_builder_chain/loader.rb
177
+ - lib/bx_builder_chain/processors/base.rb
178
+ - lib/bx_builder_chain/processors/csv.rb
179
+ - lib/bx_builder_chain/processors/docx.rb
180
+ - lib/bx_builder_chain/processors/html.rb
181
+ - lib/bx_builder_chain/processors/json.rb
182
+ - lib/bx_builder_chain/processors/pdf.rb
183
+ - lib/bx_builder_chain/processors/text.rb
184
+ - lib/bx_builder_chain/processors/xlsx.rb
185
+ - lib/bx_builder_chain/utils/token_data/cl100k_base.tiktoken
186
+ - lib/bx_builder_chain/utils/token_length/base_validator.rb
187
+ - lib/bx_builder_chain/utils/token_length/open_ai_validator.rb
188
+ - lib/bx_builder_chain/utils/tokenization/byte_pair_encoding.rb
189
+ - lib/bx_builder_chain/utils/tokenization/open_ai_encodings.rb
190
+ - lib/bx_builder_chain/vectorsearch/base.rb
191
+ - lib/bx_builder_chain/vectorsearch/pgvector.rb
192
+ - lib/bx_builder_chain/version.rb
193
+ - lib/generators/bx_builder_chain/install_generator.rb
194
+ - lib/generators/bx_builder_chain/templates/app/admin/bx_builder_chain_document.rb
195
+ - lib/generators/bx_builder_chain/templates/app/controllers/bx_builder_chain/documents_controller.rb
196
+ - lib/generators/bx_builder_chain/templates/app/controllers/bx_builder_chain/questions_controller.rb
197
+ - lib/generators/bx_builder_chain/templates/app/controllers/bx_builder_chain/test_controller.rb
198
+ - lib/generators/bx_builder_chain/templates/app/models/bx_builder_chain/document.rb
199
+ - lib/generators/bx_builder_chain/templates/app/models/bx_builder_chain/document_chunk.rb
200
+ - lib/generators/bx_builder_chain/templates/app/models/bx_builder_chain/embedding.rb
201
+ - lib/generators/bx_builder_chain/templates/app/services/bx_builder_chain/document_upload_service.rb
202
+ - lib/generators/bx_builder_chain/templates/app/services/bx_builder_chain/question_asking_service.rb
203
+ - lib/generators/bx_builder_chain/templates/app/views/bx_builder_chain/test/form.html.erb
204
+ - lib/generators/bx_builder_chain/templates/app/workers/bx_builder_chain/document_processor_worker.rb
205
+ - lib/generators/bx_builder_chain/templates/initializer.rb
206
+ - lib/generators/bx_builder_chain/templates/migration.rb
207
+ - lib/pgvector.rb
208
+ - lib/pgvector/pg.rb
209
+ - lib/pgvector/pg/binary_decoder/vector.rb
210
+ - lib/pgvector/pg/text_decoder/vector.rb
211
+ - lib/sequel/plugins/pgvector.rb
212
+ - lib/sequel/plugins/pgvector/class_methods.rb
213
+ - lib/sequel/plugins/pgvector/instance_methods.rb
214
+ - sig/bx_langchain_chat.rbs
215
+ homepage:
216
+ licenses:
217
+ - MIT
218
+ metadata: {}
219
+ post_install_message:
220
+ rdoc_options: []
221
+ require_paths:
222
+ - lib
223
+ required_ruby_version: !ruby/object:Gem::Requirement
224
+ requirements:
225
+ - - ">="
226
+ - !ruby/object:Gem::Version
227
+ version: 2.6.0
228
+ required_rubygems_version: !ruby/object:Gem::Requirement
229
+ requirements:
230
+ - - ">="
231
+ - !ruby/object:Gem::Version
232
+ version: '0'
233
+ requirements: []
234
+ rubygems_version: 3.1.0
235
+ signing_key:
236
+ specification_version: 4
237
+ summary: Write a short summary, because RubyGems requires one.
238
+ test_files: []