vecsearch 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 23bcc9e270a754cd56fbc2013084719f6567c9f284fe0504e2e0bec666035c67
4
+ data.tar.gz: 4ddb00bc0c604bf1a7f3fc47561804e97e894df4382c0611105cdea24c4a50d8
5
+ SHA512:
6
+ metadata.gz: ab92719d0d0c8f3e14e002758c01b73493b4b4c578de81658e8bd952dd0a0eec1217d2061c4c5020a1246c6d74d5094ea12978d82661aec3ff97388cb8defad9
7
+ data.tar.gz: 23b554f0bdd20cfaea4f954dc989c3b01d52f174598d69d450de71a2bd69f5c720427d3b01389901a4ee9af4d9c6178fa82b5b2edd891ea332376deb52a0abc4
data/.rubocop.yml ADDED
@@ -0,0 +1,13 @@
1
+ AllCops:
2
+ TargetRubyVersion: 2.6
3
+
4
+ Style/StringLiterals:
5
+ Enabled: true
6
+ EnforcedStyle: double_quotes
7
+
8
+ Style/StringLiteralsInInterpolation:
9
+ Enabled: true
10
+ EnforcedStyle: double_quotes
11
+
12
+ Layout/LineLength:
13
+ Max: 120
data/Gemfile ADDED
@@ -0,0 +1,10 @@
1
+ # frozen_string_literal: true
2
+
3
+ source "https://rubygems.org"
4
+
5
+ # Specify your gem's dependencies in vecsearch.gemspec
6
+ gemspec
7
+
8
+ gem "rake", "~> 13.0"
9
+
10
+ gem "rubocop", "~> 1.21"
data/Gemfile.lock ADDED
@@ -0,0 +1,56 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ vecsearch (0.1.0)
5
+ faiss
6
+ ffi
7
+
8
+ GEM
9
+ remote: https://rubygems.org/
10
+ specs:
11
+ ast (2.4.2)
12
+ base64 (0.1.1)
13
+ faiss (0.3.0)
14
+ numo-narray
15
+ rice (>= 4.0.2)
16
+ ffi (1.16.3)
17
+ json (2.6.3)
18
+ language_server-protocol (3.17.0.3)
19
+ numo-narray (0.9.2.1)
20
+ parallel (1.23.0)
21
+ parser (3.2.2.4)
22
+ ast (~> 2.4.1)
23
+ racc
24
+ racc (1.7.1)
25
+ rainbow (3.1.1)
26
+ rake (13.0.6)
27
+ regexp_parser (2.8.1)
28
+ rexml (3.2.6)
29
+ rice (4.1.0)
30
+ rubocop (1.56.4)
31
+ base64 (~> 0.1.1)
32
+ json (~> 2.3)
33
+ language_server-protocol (>= 3.17.0)
34
+ parallel (~> 1.10)
35
+ parser (>= 3.2.2.3)
36
+ rainbow (>= 2.2.2, < 4.0)
37
+ regexp_parser (>= 1.8, < 3.0)
38
+ rexml (>= 3.2.5, < 4.0)
39
+ rubocop-ast (>= 1.28.1, < 2.0)
40
+ ruby-progressbar (~> 1.7)
41
+ unicode-display_width (>= 2.4.0, < 3.0)
42
+ rubocop-ast (1.29.0)
43
+ parser (>= 3.2.1.0)
44
+ ruby-progressbar (1.13.0)
45
+ unicode-display_width (2.5.0)
46
+
47
+ PLATFORMS
48
+ arm64-darwin-22
49
+
50
+ DEPENDENCIES
51
+ rake (~> 13.0)
52
+ rubocop (~> 1.21)
53
+ vecsearch!
54
+
55
+ BUNDLED WITH
56
+ 2.4.10
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2023 Burke Libbey
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,38 @@
1
+ # Vecsearch
2
+
3
+ Vecsearch is an all-in-one vector search library for Ruby. It embeds text with a 4-bit (Q4_1) quantization of
4
+ [gte-tiny](https://huggingface.co/TaylorAI/gte-tiny), run through [bert.cpp](https://github.com/skeskinen/bert.cpp) (a
5
+ [GGML](https://ggml.ai/) implementation of [BERT](https://arxiv.org/abs/1810.04805)) called via
6
+ [FFI](https://github.com/ffi/ffi), and stores the embeddings in an in-process [FAISS](https://github.com/facebookresearch/faiss) index.
7
+
8
+ Vecsearch embeds pre-built dynamic libraries for `libbert` and `libggml`, as well as a quantized model checkpoint for
9
+ gte-tiny (total size: 14MB).
10
+
11
+ Currently only ARM64 macOS is supported, purely because I haven't bothered to build other dylibs yet. There is nothing
12
+ difficult about this.
13
+
14
+ ## Usage
15
+
16
+ ```ruby
17
+ require 'vecsearch'
18
+
19
+ vs = Vecsearch.new
20
+ vs << "sharks with freaking laser beams"
21
+ vs << "hello"
22
+ vs << "the sky is green"
23
+
24
+ puts(vs.nearest("hey there")) # => "hello"
25
+ ```
26
+ ## Bugs
27
+
28
+ Yes
29
+
30
+ ## Limitations / TODO
31
+
32
+ * I haven't got the mean-pooling part of gte-tiny working. It seems to work well
33
+ enough without it but we should do that and assert that ours generates
34
+ approximately the same embedding as the canonical model.
35
+ * Batching looks unimplemented in bert.cpp; it would be nice for prefilling the
36
+ index.
37
+ * Add more builds for platforms other than darwin/arm64.
38
+ * Probably add a way to fetch an unquantized model, maybe other models entirely?
data/Rakefile ADDED
@@ -0,0 +1,8 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "bundler/gem_tasks"
4
+ require "rubocop/rake_task"
5
+
6
+ RuboCop::RakeTask.new
7
+
8
+ task default: :rubocop
data/demo.rb ADDED
@@ -0,0 +1,15 @@
1
+ require 'vecsearch'
2
+
3
# Demo: index three short records, print the record nearest to a query,
# then report the average latency of repeated identical queries.
vsi = Vecsearch.new
vsi << "sharks with freaking laser beams"
vsi << "hello"
vsi << "the sky is green"

puts("nearest record to 'hey there': #{vsi.nearest("hey there")}")

# Number of timed queries.
N = 1000

# Use the monotonic clock for the measurement: unlike Time.now it cannot jump
# when the wall clock is adjusted while the benchmark is running.
started = Process.clock_gettime(Process::CLOCK_MONOTONIC)
N.times { vsi.nearest("hey there") }
elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - started

# elapsed is in seconds; dividing by N / 1000.0 yields milliseconds per query.
puts "#{(elapsed / (N / 1000.0)).round(1)} ms per query"
15
+
data/dev.yml ADDED
@@ -0,0 +1,3 @@
1
+ up:
2
+ - ruby: 3.2.2
3
+ - bundler
@@ -0,0 +1,91 @@
1
+ require 'ffi'
2
+ # require 'narray'
3
+
4
class Vecsearch
  # Text embedder that drives the vendored bert.cpp dynamic library through
  # FFI, using the quantized gte-tiny checkpoint shipped next to it.
  class GTETiny
    # Directory two levels above this file's directory that holds the dynamic
    # library and the model checkpoint referenced below.
    VENDOR = File.expand_path('../../vendor', __dir__)

    # Binding to the C library's fflush, needed to drain C-level stdio buffers
    # that Ruby's own IO#flush does not touch.
    module CStdio
      extend FFI::Library
      ffi_lib 'c'

      attach_function :fflush, [:pointer], :int
    end

    # Bindings to the three libbert entry points this class calls.
    module Bert
      extend FFI::Library
      ffi_lib File.expand_path('libbert.dylib', VENDOR)

      attach_function :bert_load_from_file, [:string], :pointer
      attach_function :bert_n_embd, [:pointer], :int
      attach_function :bert_encode_batch, [:pointer, :int, :int, :int, :pointer, :pointer], :void
    end

    # Default model checkpoint.
    GTE_BIN = File.expand_path('gte-tiny-q4_1.ggml.bin', VENDOR)
    # Passed as the third argument of bert_encode_batch.
    # NOTE(review): presumably the native batch/token limit -- confirm against bert.h.
    MAX_TOKENS = 512

    # Loads the model and caches its embedding width.
    #
    # @param fname [String] path to a GGML model file
    # @raise [ArgumentError] if the native loader hands back a NULL context
    def initialize(fname = GTE_BIN)
      suppress_streams do
        @ctx = Bert.bert_load_from_file(fname)
        # Fail loudly here instead of passing a NULL context to later native calls.
        raise ArgumentError, "could not load model from #{fname}" if @ctx.null?

        @n_embd = Bert.bert_n_embd(@ctx)
        # Kept from the original implementation.
        # NOTE(review): presumably gives native output time to drain -- confirm it is still needed.
        sleep(0.1)
      end
    end

    # Runs the block with the process's standard output redirected to the null
    # device, then restores it. Used to hide what the native loader prints.
    #
    # @yield the work to run while stdout is silenced
    # @return the value of the block
    def suppress_streams
      saved_stdout = STDOUT.dup
      begin
        STDOUT.reopen(File::NULL, "w")
        STDOUT.sync = true
        yield
      ensure
        CStdio.fflush(nil) # Regular STDOUT.flush doesn't do it.
        STDOUT.reopen(saved_stdout)
        # The duplicate is only needed for the restore; close it so repeated
        # calls do not leak a file descriptor each time.
        saved_stdout.close
      end
    end

    # Embeds a single sentence.
    #
    # Mean pooling is not implemented yet (see the commented-out helper at the
    # bottom of this class), so the raw result of encode_batch is returned.
    #
    # @param sentence [String] text to embed
    # @param n_threads [Integer] thread count forwarded to the native encoder
    # @return [Array<Array<Float>>] one-element array holding the embedding
    def encode(sentence, n_threads: 1)
      encode_batch([sentence], n_threads: n_threads)
    end

    # Embeds several strings with one native call.
    #
    # @param input [Array<String>] texts to embed
    # @param n_threads [Integer] thread count forwarded to the native encoder
    # @return [Array<Array<Float>>] one array of @n_embd floats per input, in order
    def encode_batch(input, n_threads: 1)
      return [] if input.empty?

      # C strings for every input, plus a C array of pointers to them.
      text_ptrs = input.map { |str| FFI::MemoryPointer.from_string(str) }
      text_ptrs_ptr = FFI::MemoryPointer.new(:pointer, text_ptrs.length)
      text_ptrs_ptr.write_array_of_pointer(text_ptrs)

      # One float buffer per input, plus a C array of pointers to those buffers.
      out_ptrs = input.map { FFI::MemoryPointer.new(:float, @n_embd) }
      out_ptrs_ptr = FFI::MemoryPointer.new(:pointer, out_ptrs.length)
      out_ptrs_ptr.write_array_of_pointer(out_ptrs)

      Bert.bert_encode_batch(@ctx, n_threads, MAX_TOKENS, input.length, text_ptrs_ptr, out_ptrs_ptr)

      # Copy the native buffers into plain Ruby arrays.
      out_ptrs.map { |ptr| ptr.read_array_of_float(@n_embd) }
    end

    # def mean_pooling(token_embeddings, attention_mask)
    #   token_embeddings_na = NArray.to_na(token_embeddings)
    #   attention_mask_na = NArray.to_na(attention_mask)

    #   input_mask_expanded = attention_mask_na.expand_dims(-1).repeat(token_embeddings_na.shape[-1], -1)

    #   sentence_embeddings = (token_embeddings_na * input_mask_expanded).sum(1) / input_mask_expanded.sum(1).clip(1e-9)

    #   sentence_embeddings.to_a
    # end
  end
end
@@ -0,0 +1,3 @@
1
class Vecsearch
  # Version of the vecsearch gem. Frozen so the shared constant cannot be
  # mutated in place by any code that reads it.
  VERSION = "0.1.0".freeze
end
data/lib/vecsearch.rb ADDED
@@ -0,0 +1,27 @@
1
+ require 'vecsearch/version'
2
+ require 'faiss'
3
+
4
# In-process text search: each record is embedded with GTETiny and stored in
# a flat L2 FAISS index, and queries return the stored texts nearest to the
# query's embedding.
class Vecsearch
  autoload(:GTETiny, 'vecsearch/gte_tiny')

  # Width of the vectors held by the FAISS index.
  EMBEDDING_DIM = 384

  # @param records [Enumerable<String>] texts to index immediately
  def initialize(records = [])
    @faiss = Faiss::IndexFlatL2.new(EMBEDDING_DIM)
    @gte = GTETiny.new
    @texts = []
    records.each { |rec| self << rec }
  end

  # Embeds +str+ and adds it to the index.
  #
  # @param str [String] text to index
  # @return [Vecsearch] self, so insertions can be chained
  def <<(str)
    emb = @gte.encode(str)
    # Insert into the index first: if that raises, @texts must not have grown,
    # otherwise index positions and texts would no longer line up.
    @faiss.add(emb)
    @texts << str
    self
  end

  # @param str [String] query text
  # @return [String, nil] the single closest record, or nil when nothing is indexed
  def nearest(str)
    query(str, 1).first
  end

  # Returns up to +n+ indexed texts closest to +str+, nearest first.
  #
  # @param str [String] query text
  # @param n [Integer] maximum number of results
  # @return [Array<String>] at most n records; fewer when the index is smaller
  def query(str, n)
    return [] if @texts.empty? || n < 1

    emb = @gte.encode(str)
    _scores, indexes = @faiss.search(emb, n)
    # FAISS pads the label row with -1 when it has fewer than n neighbours;
    # drop those so they are not read as "last element" by Array#[].
    indexes.to_a.flatten.reject(&:negative?).map { |idx| @texts[idx] }
  end
end
data/vecsearch.gemspec ADDED
@@ -0,0 +1,34 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "lib/vecsearch/version"
4
+
5
# Gem specification for vecsearch: identity, packaged files and runtime
# dependencies.
Gem::Specification.new do |spec|
  spec.name = "vecsearch"
  spec.version = Vecsearch::VERSION
  spec.authors = ["Burke Libbey"]
  spec.email = ["burke.libbey@shopify.com"]

  spec.summary = "All-in-one simple vector search class for ruby."
  spec.description = "All-in-one simple vector search class for ruby."
  spec.homepage = "https://github.com/shopify/vecsearch"
  spec.license = "MIT"
  spec.required_ruby_version = ">= 2.6.0"

  spec.metadata["homepage_uri"] = spec.homepage
  spec.metadata["source_code_uri"] = spec.homepage

  # Specify which files should be added to the gem when it is released.
  # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
  # The gemspec itself and anything under the listed prefixes is left out.
  # NOTE(review): no platform is set here, so the gem is published for the
  # default platform -- confirm that is intended for the bundled native libraries.
  spec.files = Dir.chdir(__dir__) do
    `git ls-files -z`.split("\x0").reject do |f|
      (File.expand_path(f) == __FILE__) || f.start_with?(*%w[bin/ test/ spec/ features/ .git .circleci appveyor])
    end
  end
  spec.bindir = "exe"
  spec.executables = spec.files.grep(%r{\Aexe/}) { |f| File.basename(f) }
  spec.require_paths = ["lib"]

  # Runtime dependencies (double-quoted to match the rest of this file).
  spec.add_dependency "ffi"
  spec.add_dependency "faiss"
  # spec.add_dependency "narray" # for mean pooling
end
Binary file
Binary file
Binary file
metadata ADDED
@@ -0,0 +1,88 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: vecsearch
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Burke Libbey
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2023-10-06 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: ffi
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: faiss
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ description: All-in-one simple vector search class for ruby.
42
+ email:
43
+ - burke.libbey@shopify.com
44
+ executables: []
45
+ extensions: []
46
+ extra_rdoc_files: []
47
+ files:
48
+ - ".rubocop.yml"
49
+ - Gemfile
50
+ - Gemfile.lock
51
+ - LICENSE.txt
52
+ - README.md
53
+ - Rakefile
54
+ - demo.rb
55
+ - dev.yml
56
+ - lib/vecsearch.rb
57
+ - lib/vecsearch/gte_tiny.rb
58
+ - lib/vecsearch/version.rb
59
+ - vecsearch.gemspec
60
+ - vendor/gte-tiny-q4_1.ggml.bin
61
+ - vendor/libbert.dylib
62
+ - vendor/libggml.dylib
63
+ homepage: https://github.com/shopify/vecsearch
64
+ licenses:
65
+ - MIT
66
+ metadata:
67
+ homepage_uri: https://github.com/shopify/vecsearch
68
+ source_code_uri: https://github.com/shopify/vecsearch
69
+ post_install_message:
70
+ rdoc_options: []
71
+ require_paths:
72
+ - lib
73
+ required_ruby_version: !ruby/object:Gem::Requirement
74
+ requirements:
75
+ - - ">="
76
+ - !ruby/object:Gem::Version
77
+ version: 2.6.0
78
+ required_rubygems_version: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '0'
83
+ requirements: []
84
+ rubygems_version: 3.4.10
85
+ signing_key:
86
+ specification_version: 4
87
+ summary: All-in-one simple vector search class for ruby.
88
+ test_files: []