vecsearch 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.rubocop.yml +13 -0
- data/Gemfile +10 -0
- data/Gemfile.lock +56 -0
- data/LICENSE.txt +21 -0
- data/README.md +38 -0
- data/Rakefile +8 -0
- data/demo.rb +15 -0
- data/dev.yml +3 -0
- data/lib/vecsearch/gte_tiny.rb +91 -0
- data/lib/vecsearch/version.rb +3 -0
- data/lib/vecsearch.rb +27 -0
- data/vecsearch.gemspec +34 -0
- data/vendor/gte-tiny-q4_1.ggml.bin +0 -0
- data/vendor/libbert.dylib +0 -0
- data/vendor/libggml.dylib +0 -0
- metadata +88 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 23bcc9e270a754cd56fbc2013084719f6567c9f284fe0504e2e0bec666035c67
|
4
|
+
data.tar.gz: 4ddb00bc0c604bf1a7f3fc47561804e97e894df4382c0611105cdea24c4a50d8
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: ab92719d0d0c8f3e14e002758c01b73493b4b4c578de81658e8bd952dd0a0eec1217d2061c4c5020a1246c6d74d5094ea12978d82661aec3ff97388cb8defad9
|
7
|
+
data.tar.gz: 23b554f0bdd20cfaea4f954dc989c3b01d52f174598d69d450de71a2bd69f5c720427d3b01389901a4ee9af4d9c6178fa82b5b2edd891ea332376deb52a0abc4
|
data/.rubocop.yml
ADDED
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,56 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
vecsearch (0.1.0)
|
5
|
+
faiss
|
6
|
+
ffi
|
7
|
+
|
8
|
+
GEM
|
9
|
+
remote: https://rubygems.org/
|
10
|
+
specs:
|
11
|
+
ast (2.4.2)
|
12
|
+
base64 (0.1.1)
|
13
|
+
faiss (0.3.0)
|
14
|
+
numo-narray
|
15
|
+
rice (>= 4.0.2)
|
16
|
+
ffi (1.16.3)
|
17
|
+
json (2.6.3)
|
18
|
+
language_server-protocol (3.17.0.3)
|
19
|
+
numo-narray (0.9.2.1)
|
20
|
+
parallel (1.23.0)
|
21
|
+
parser (3.2.2.4)
|
22
|
+
ast (~> 2.4.1)
|
23
|
+
racc
|
24
|
+
racc (1.7.1)
|
25
|
+
rainbow (3.1.1)
|
26
|
+
rake (13.0.6)
|
27
|
+
regexp_parser (2.8.1)
|
28
|
+
rexml (3.2.6)
|
29
|
+
rice (4.1.0)
|
30
|
+
rubocop (1.56.4)
|
31
|
+
base64 (~> 0.1.1)
|
32
|
+
json (~> 2.3)
|
33
|
+
language_server-protocol (>= 3.17.0)
|
34
|
+
parallel (~> 1.10)
|
35
|
+
parser (>= 3.2.2.3)
|
36
|
+
rainbow (>= 2.2.2, < 4.0)
|
37
|
+
regexp_parser (>= 1.8, < 3.0)
|
38
|
+
rexml (>= 3.2.5, < 4.0)
|
39
|
+
rubocop-ast (>= 1.28.1, < 2.0)
|
40
|
+
ruby-progressbar (~> 1.7)
|
41
|
+
unicode-display_width (>= 2.4.0, < 3.0)
|
42
|
+
rubocop-ast (1.29.0)
|
43
|
+
parser (>= 3.2.1.0)
|
44
|
+
ruby-progressbar (1.13.0)
|
45
|
+
unicode-display_width (2.5.0)
|
46
|
+
|
47
|
+
PLATFORMS
|
48
|
+
arm64-darwin-22
|
49
|
+
|
50
|
+
DEPENDENCIES
|
51
|
+
rake (~> 13.0)
|
52
|
+
rubocop (~> 1.21)
|
53
|
+
vecsearch!
|
54
|
+
|
55
|
+
BUNDLED WITH
|
56
|
+
2.4.10
|
data/LICENSE.txt
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
The MIT License (MIT)
|
2
|
+
|
3
|
+
Copyright (c) 2023 Burke Libbey
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in
|
13
|
+
all copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
21
|
+
THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,38 @@
|
|
1
|
+
# Vecsearch
|
2
|
+
|
3
|
+
Vecsearch is an all-in-one vector search library for ruby that uses a 4-bit (Q4_1) quantization of
|
4
|
+
[gte-tiny](https://huggingface.co/TaylorAI/gte-tiny) by using [bert.cpp](https://github.com/skeskinen/bert.cpp) (a
|
5
|
+
[GGML](https://ggml.ai/) implementation of [BERT](https://arxiv.org/abs/1810.04805) via
|
6
|
+
[FFI](https://github.com/ffi/ffi)), and an in-process [FAISS](https://github.com/facebookresearch/faiss) index.
|
7
|
+
|
8
|
+
Vecsearch embeds pre-built dynamic libraries for `libbert` and `libggml`, as well as a quantized model checkpoint for
|
9
|
+
gte-tiny (total size: 14MB).
|
10
|
+
|
11
|
+
Currently only ARM64 macOS is supported, purely because I haven't bothered to build other dylibs yet. There is nothing
|
12
|
+
difficult about this.
|
13
|
+
|
14
|
+
## Usage
|
15
|
+
|
16
|
+
```ruby
|
17
|
+
require 'vecsearch'
|
18
|
+
|
19
|
+
vs = Vecsearch.new
|
20
|
+
vs << "sharks with freaking laser beams"
|
21
|
+
vs << "hello"
|
22
|
+
vs << "the sky is green"
|
23
|
+
|
24
|
+
puts(vs.nearest("hey there")) # => "hello"
|
25
|
+
```
|
26
|
+
## Bugs
|
27
|
+
|
28
|
+
Yes
|
29
|
+
|
30
|
+
## Limitations / TODO
|
31
|
+
|
32
|
+
* I haven't got the mean-pooling part of gte-tiny working. It seems to work well
|
33
|
+
enough without it but we should do that and assert that ours generates
|
34
|
+
approximately the same embedding as the canonical model.
|
35
|
+
* Batching looks unimplemented in bert.cpp; it would be nice for prefilling the
|
36
|
+
index.
|
37
|
+
* Add more builds for platforms other than darwin/arm64.
|
38
|
+
* Probably add a way to fetch an unquantized model, maybe other models entirely?
|
data/Rakefile
ADDED
data/demo.rb
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
require 'vecsearch'
|
2
|
+
|
3
|
+
# Demo: index three strings, print the nearest match for a query, then time
# repeated lookups of the same query.
vsi = Vecsearch.new
vsi << "sharks with freaking laser beams"
vsi << "hello"
vsi << "the sky is green"

puts("nearest record to 'hey there': #{vsi.nearest("hey there")}")

# Number of timed queries.
N = 1000

# Use the monotonic clock so a wall-clock adjustment cannot skew the result.
started = Process.clock_gettime(Process::CLOCK_MONOTONIC)
N.times { vsi.nearest("hey there") }
elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - started

# elapsed is in seconds for N queries; convert to milliseconds per query.
puts "#{(elapsed / (N/1000.0)).round(1)} ms per query"
|
15
|
+
|
data/dev.yml
ADDED
data/lib/vecsearch/gte_tiny.rb
ADDED
@@ -0,0 +1,91 @@
|
|
1
|
+
require 'ffi'
|
2
|
+
# require 'narray'
|
3
|
+
|
4
|
+
class Vecsearch
  # Text encoder that calls the vendored `libbert.dylib` through FFI, using
  # the bundled `gte-tiny-q4_1.ggml.bin` model file by default.
  class GTETiny
    # Directory containing the vendored dynamic libraries and model file.
    VENDOR = File.expand_path('../../vendor', __dir__)

    # Minimal libc binding; only `fflush` is needed, to flush C-level stdio.
    module CStdio
      extend FFI::Library
      ffi_lib 'c'

      attach_function :fflush, [:pointer], :int
    end

    # Bindings for the three libbert functions this class calls.
    module Bert
      extend FFI::Library
      ffi_lib File.expand_path('libbert.dylib', VENDOR)

      attach_function :bert_load_from_file, [:string], :pointer
      attach_function :bert_n_embd, [:pointer], :int
      attach_function :bert_encode_batch, [:pointer, :int, :int, :int, :pointer, :pointer], :void
    end

    # Default model file shipped with the gem.
    GTE_BIN = File.expand_path('gte-tiny-q4_1.ggml.bin', VENDOR)
    # Passed as the third argument of every `bert_encode_batch` call.
    MAX_TOKENS = 512

    # Loads the model and records its embedding width.
    #
    # @param fname [String] path to a model file (defaults to GTE_BIN)
    # @raise [ArgumentError] if the native loader returns a null context
    def initialize(fname=GTE_BIN)
      suppress_streams do
        @ctx = Bert.bert_load_from_file(fname)
        # A null context would otherwise be handed straight to bert_n_embd.
        raise ArgumentError, "failed to load model file: #{fname}" if @ctx.null?

        @n_embd = Bert.bert_n_embd(@ctx)
        # NOTE(review): the reason for this pause is not evident from the code;
        # presumably it lets native output drain while stdout is redirected.
        # Confirm before removing.
        sleep(0.1)
      end
    end

    # Runs the block with STDOUT redirected to /dev/null, then restores it.
    # Only STDOUT is redirected; STDERR is left alone.
    #
    # @yield the work whose stdout output should be discarded
    # @return [Object] the block's return value
    def suppress_streams
      saved_stdout = STDOUT.dup
      begin
        STDOUT.reopen("/dev/null", "w")
        STDOUT.sync = true
        yield
      ensure
        CStdio.fflush(nil) # Regular STDOUT.flush doesn't do it.
        STDOUT.reopen(saved_stdout)
        # The duplicate is no longer needed once STDOUT points back at it;
        # closing it avoids leaking one descriptor per call.
        saved_stdout.close
      end
    end

    # Encodes a single sentence.
    #
    # Mean pooling is not applied yet (see the commented-out `mean_pooling`
    # below); the raw output of `encode_batch` is returned as-is.
    #
    # @param sentence [String] text to encode
    # @param n_threads [Integer] thread count forwarded to the native encoder
    # @return [Array<Array<Float>>] a one-element array holding the embedding
    def encode(sentence, n_threads: 1)
      encode_batch([sentence], n_threads: n_threads)
    end

    # Encodes several strings with one native call.
    #
    # @param input [Array<String>] texts to encode
    # @param n_threads [Integer] thread count forwarded to the native encoder
    # @return [Array<Array<Float>>] one array of `@n_embd` floats per input
    def encode_batch(input, n_threads: 1)
      return [] if input.empty?

      # C strings for each input, plus a C array of pointers to them. The
      # Ruby-side arrays keep the underlying memory alive during the call.
      input_ptrs = input.map { |str| FFI::MemoryPointer.from_string(str) }
      input_ptrs_ptr = FFI::MemoryPointer.new(:pointer, input_ptrs.length)
      input_ptrs_ptr.write_array_of_pointer(input_ptrs)

      # One float buffer of @n_embd entries per input, plus the pointer array.
      output_ptrs = input.map { FFI::MemoryPointer.new(:float, @n_embd) }
      output_ptrs_ptr = FFI::MemoryPointer.new(:pointer, output_ptrs.length)
      output_ptrs_ptr.write_array_of_pointer(output_ptrs)

      Bert.bert_encode_batch(@ctx, n_threads, MAX_TOKENS, input.length, input_ptrs_ptr, output_ptrs_ptr)

      # Copy the native buffers back into plain Ruby arrays.
      output_ptrs.map { |ptr| ptr.read_array_of_float(@n_embd) }
    end

    # def mean_pooling(token_embeddings, attention_mask)
    #   token_embeddings_na = NArray.to_na(token_embeddings)
    #   attention_mask_na = NArray.to_na(attention_mask)

    #   input_mask_expanded = attention_mask_na.expand_dims(-1).repeat(token_embeddings_na.shape[-1], -1)

    #   sentence_embeddings = (token_embeddings_na * input_mask_expanded).sum(1) / input_mask_expanded.sum(1).clip(1e-9)

    #   sentence_embeddings.to_a
    # end
  end
end
|
data/lib/vecsearch.rb
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
require 'vecsearch/version'
|
2
|
+
require 'faiss'
|
3
|
+
|
4
|
+
# Nearest-neighbour search over strings: each string is embedded with GTETiny
# and stored in an in-process FAISS flat L2 index.
class Vecsearch
  autoload(:GTETiny, 'vecsearch/gte_tiny')

  # Vector width the FAISS index is created with.
  EMBEDDING_DIM = 384

  # @param records [Enumerable<String>] strings to index immediately
  def initialize(records=[])
    @faiss = Faiss::IndexFlatL2.new(EMBEDDING_DIM)
    @gte = GTETiny.new
    @texts = []
    records.each { |rec| self << rec }
  end

  # Embeds +str+ and adds it to the index.
  #
  # @param str [String] text to index
  # @return [Vecsearch] self, so calls can be chained
  def <<(str)
    emb = @gte.encode(str)
    # Add to the index first so @texts never gets ahead of it if add raises.
    @faiss.add(emb)
    @texts << str
    self
  end

  # @param str [String] query text
  # @return [String, nil] the closest indexed string, or nil if there is none
  def nearest(str) = query(str, 1)[0]

  # Returns up to +n+ indexed strings in the order the index reports them.
  #
  # @param str [String] query text
  # @param n [Integer] maximum number of results
  # @return [Array<String>]
  def query(str, n)
    emb = @gte.encode(str)
    _scores, indexes = @faiss.search(emb, n)
    # FAISS pads missing hits with id -1; without the filter, @texts[-1]
    # would silently return the last stored text.
    indexes.to_a.flatten.reject(&:negative?).map { |idx| @texts[idx] }
  end
end
|
data/vecsearch.gemspec
ADDED
@@ -0,0 +1,34 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative "lib/vecsearch/version"
|
4
|
+
|
5
|
+
# Gem packaging metadata for vecsearch.
Gem::Specification.new do |spec|
  spec.name = "vecsearch"
  spec.version = Vecsearch::VERSION
  spec.authors = ["Burke Libbey"]
  spec.email = ["burke.libbey@shopify.com"]

  spec.summary = "All-in-one simple vector search class for ruby."
  spec.description = "All-in-one simple vector search class for ruby."
  spec.homepage = "https://github.com/shopify/vecsearch"
  spec.license = "MIT"
  # lib/vecsearch.rb uses an endless method definition (`def nearest(str) = ...`),
  # which is only valid syntax on Ruby 3.0 and later.
  spec.required_ruby_version = ">= 3.0.0"

  spec.metadata["homepage_uri"] = spec.homepage
  spec.metadata["source_code_uri"] = spec.homepage

  # Specify which files should be added to the gem when it is released.
  # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
  spec.files = Dir.chdir(__dir__) do
    `git ls-files -z`.split("\x0").reject do |f|
      # Leave out this gemspec itself and anything under tooling/test prefixes.
      (File.expand_path(f) == __FILE__) || f.start_with?(*%w[bin/ test/ spec/ features/ .git .circleci appveyor])
    end
  end
  spec.bindir = "exe"
  spec.executables = spec.files.grep(%r{\Aexe/}) { |f| File.basename(f) }
  spec.require_paths = ["lib"]

  spec.add_dependency 'ffi'
  spec.add_dependency 'faiss'
  # spec.add_dependency 'narray' # for mean pooling
end
|
Binary file
|
Binary file
|
Binary file
|
metadata
ADDED
@@ -0,0 +1,88 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: vecsearch
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Burke Libbey
|
8
|
+
autorequire:
|
9
|
+
bindir: exe
|
10
|
+
cert_chain: []
|
11
|
+
date: 2023-10-06 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: ffi
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ">="
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ">="
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: faiss
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
description: All-in-one simple vector search class for ruby.
|
42
|
+
email:
|
43
|
+
- burke.libbey@shopify.com
|
44
|
+
executables: []
|
45
|
+
extensions: []
|
46
|
+
extra_rdoc_files: []
|
47
|
+
files:
|
48
|
+
- ".rubocop.yml"
|
49
|
+
- Gemfile
|
50
|
+
- Gemfile.lock
|
51
|
+
- LICENSE.txt
|
52
|
+
- README.md
|
53
|
+
- Rakefile
|
54
|
+
- demo.rb
|
55
|
+
- dev.yml
|
56
|
+
- lib/vecsearch.rb
|
57
|
+
- lib/vecsearch/gte_tiny.rb
|
58
|
+
- lib/vecsearch/version.rb
|
59
|
+
- vecsearch.gemspec
|
60
|
+
- vendor/gte-tiny-q4_1.ggml.bin
|
61
|
+
- vendor/libbert.dylib
|
62
|
+
- vendor/libggml.dylib
|
63
|
+
homepage: https://github.com/shopify/vecsearch
|
64
|
+
licenses:
|
65
|
+
- MIT
|
66
|
+
metadata:
|
67
|
+
homepage_uri: https://github.com/shopify/vecsearch
|
68
|
+
source_code_uri: https://github.com/shopify/vecsearch
|
69
|
+
post_install_message:
|
70
|
+
rdoc_options: []
|
71
|
+
require_paths:
|
72
|
+
- lib
|
73
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
74
|
+
requirements:
|
75
|
+
- - ">="
|
76
|
+
- !ruby/object:Gem::Version
|
77
|
+
version: 2.6.0
|
78
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - ">="
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0'
|
83
|
+
requirements: []
|
84
|
+
rubygems_version: 3.4.10
|
85
|
+
signing_key:
|
86
|
+
specification_version: 4
|
87
|
+
summary: All-in-one simple vector search class for ruby.
|
88
|
+
test_files: []
|