siglip2 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 68ada5506440969bdf35a88d2cfbd21e0b9fc3b8ebce65939b5d6f4d0630c66f
4
+ data.tar.gz: bd1df33b7d429de31464466e8c8de36205febc54a75124bafce6761f769053c4
5
+ SHA512:
6
+ metadata.gz: 8aca7baf91b51fe000817053a23068cbc26594ea2a69e1e1d0f1fd8978babe02eec9fe7ff38edeb51df9dc43764c2be70c191787a3ffed3f5103963419a8d220
7
+ data.tar.gz: f0caa17a32ce58ef6f4acdcb3b51e67bfa7cde86b0c0a5a548fb30d0e2ca30ff7c533638af32449219fe5ce0b5556d1feaf79a756ebd4801ad163bbb2d5b9678
data/.mise.toml ADDED
@@ -0,0 +1,2 @@
1
+ [tools]
2
+ ruby = "3.3.10"
data/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 Krzysztof Hasiński
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,203 @@
1
+ # SigLIP2-rb
2
+
3
+ [![CI](https://github.com/khasinski/siglip2-rb/actions/workflows/ci.yml/badge.svg)](https://github.com/khasinski/siglip2-rb/actions/workflows/ci.yml)
4
+ [![Gem Version](https://badge.fury.io/rb/siglip2.svg)](https://rubygems.org/gems/siglip2)
5
+
6
+ Ruby implementation of Google's SigLIP2 (Sigmoid Loss for Language Image Pre-Training 2) for creating text and image embeddings. Uses ONNX models from HuggingFace [onnx-community](https://huggingface.co/onnx-community).
7
+
8
+ ## What is this for?
9
+
10
+ SigLIP2 creates numerical representations (embeddings) of images and text in the same vector space. This means you can directly compare text with images using cosine similarity.
11
+
12
+ **Common use cases:**
13
+
14
+ - **Image search** - find images matching a text query without manual tagging
15
+ - **Content moderation** - detect unwanted content by comparing against text descriptions ("violence", "nudity", etc.)
16
+ - **Image clustering** - group similar images by comparing their embeddings
17
+ - **Duplicate detection** - find near-duplicate images in large collections
18
+ - **Auto-tagging** - assign labels to images by finding best matching text descriptions
19
+
20
+ ## Installation
21
+
22
+ Add this line to your application's Gemfile:
23
+
24
+ ```ruby
25
+ gem 'siglip2'
26
+ ```
27
+
28
+ Or install directly:
29
+
30
+ ```bash
31
+ gem install siglip2
32
+ ```
33
+
34
+ ### Requirements
35
+
36
+ - Ruby >= 3.0.0
37
+ - ImageMagick (for image processing)
38
+
39
+ ## Usage
40
+
41
+ ### Ruby API
42
+
43
+ ```ruby
44
+ require 'siglip2'
45
+
46
+ # Create model with default settings (base-patch16-224)
47
+ model = Siglip2::Model.new
48
+
49
+ # Or specify a different model and quantization
50
+ model = Siglip2::Model.new(
51
+ model_name: "large-patch16-256",
52
+ quantization: "int8"
53
+ )
54
+
55
+ # Encode text
56
+ text_embedding = model.encode_text("a photo of a cat")
57
+
58
+ # Encode image
59
+ image_embedding = model.encode_image("cat.jpg")
60
+
61
+ # Calculate similarity
62
+ score = model.similarity("a photo of a cat", "cat.jpg")
63
+ puts "Similarity: #{score}"
64
+
65
+ # Batch similarity
66
+ texts = ["a cat", "a dog", "a car"]
67
+ images = ["image1.jpg", "image2.jpg"]
68
+ scores = model.batch_similarity(texts, images)
69
+ ```
70
+
71
+ ### CLI Tools
72
+
73
+ #### Embed text
74
+
75
+ ```bash
76
+ siglip2-embed-text "a photo of a cat"
77
+ siglip2-embed-text -m large-patch16-256 "a photo of a cat"
78
+ siglip2-embed-text -q int8 "a photo of a cat"
79
+ siglip2-embed-text -f csv "a photo of a cat"
80
+ ```
81
+
82
+ #### Embed image
83
+
84
+ ```bash
85
+ siglip2-embed-image cat.jpg
86
+ siglip2-embed-image -m large-patch16-256 cat.jpg
87
+ siglip2-embed-image -q int8 cat.jpg
88
+ ```
89
+
90
+ #### Calculate similarity
91
+
92
+ ```bash
93
+ siglip2-similarity "a photo of a cat" cat.jpg
94
+ ```
95
+
96
+ #### List available models
97
+
98
+ ```bash
99
+ siglip2-embed-text -l
100
+ ```
101
+
102
+ #### List quantization options
103
+
104
+ ```bash
105
+ siglip2-embed-text -L
106
+ ```
107
+
108
+ ## Use Case Examples
109
+
110
+ ### Image Search
111
+
112
+ ```ruby
113
+ model = Siglip2::Model.new
114
+
115
+ # Pre-compute embeddings for all images (store in database)
116
+ image_embeddings = images.map { |path| [path, model.encode_image(path)] }
117
+
118
+ # Search by text query (`dot_product` is a small helper you define, e.g. `a.zip(b).sum { |x, y| x * y }`)
119
+ query_embedding = model.encode_text("sunset over mountains")
120
+ results = image_embeddings
121
+ .map { |path, emb| [path, dot_product(query_embedding, emb)] }
122
+ .sort_by { |_, score| -score }
123
+ .first(10)
124
+ ```
125
+
126
+ ### Content Moderation
127
+
128
+ ```ruby
129
+ model = Siglip2::Model.new
130
+
131
+ # Define unwanted content categories
132
+ categories = ["violence", "gore", "nudity", "drugs"]
133
+ category_embeddings = categories.map { |c| model.encode_text(c) }
134
+
135
+ # Check uploaded image
136
+ image_emb = model.encode_image(uploaded_file)
137
+ scores = category_embeddings.map { |ce| dot_product(image_emb, ce) }
138
+
139
+ if scores.max > 0.25 # threshold
140
+ flag_for_review(uploaded_file)
141
+ end
142
+ ```
143
+
144
+ ### Auto-tagging
145
+
146
+ ```ruby
147
+ model = Siglip2::Model.new
148
+
149
+ tags = ["cat", "dog", "car", "landscape", "portrait", "food"]
150
+ tag_embeddings = tags.map { |t| [t, model.encode_text("a photo of #{t}")] }
151
+
152
+ image_emb = model.encode_image("photo.jpg")
153
+ matched_tags = tag_embeddings
154
+ .map { |tag, emb| [tag, dot_product(image_emb, emb)] }
155
+ .select { |_, score| score > 0.2 }
156
+ .map(&:first)
157
+ # => ["cat"]
158
+ ```
159
+
160
+ ## Available Models
161
+
162
+ | Model | Image Size | Description |
163
+ |-------|------------|-------------|
164
+ | `base-patch16-224` | 224x224 | Default, smallest |
165
+ | `base-patch16-256` | 256x256 | |
166
+ | `base-patch16-384` | 384x384 | |
167
+ | `base-patch16-512` | 512x512 | |
168
+ | `base-patch32-256` | 256x256 | Larger patch size |
169
+ | `base-patch16-naflex` | 224x224 | Flexible resolution |
170
+ | `large-patch16-256` | 256x256 | Larger model |
171
+ | `large-patch16-384` | 384x384 | |
172
+ | `large-patch16-512` | 512x512 | |
173
+ | `giant-opt-patch16-256` | 256x256 | Optimized giant |
174
+ | `giant-opt-patch16-384` | 384x384 | |
175
+ | `so400m-patch14-224` | 224x224 | 400M parameters |
176
+ | `so400m-patch14-384` | 384x384 | |
177
+ | `so400m-patch16-256` | 256x256 | |
178
+ | `so400m-patch16-384` | 384x384 | |
179
+ | `so400m-patch16-512` | 512x512 | |
180
+
181
+ ## Quantization Options
182
+
183
+ | Option | Description |
184
+ |--------|-------------|
185
+ | `fp32` | Full precision (default) |
186
+ | `fp16` | Half precision |
187
+ | `int8` | 8-bit integer |
188
+ | `uint8` | Unsigned 8-bit integer |
189
+ | `q4` | 4-bit quantization |
190
+ | `q4f16` | 4-bit with fp16 |
191
+ | `bnb4` | BitsAndBytes 4-bit |
192
+
193
+ ## Model Storage
194
+
195
+ Models are automatically downloaded on first use and stored in `~/.siglip2_models/`. You can change this location:
196
+
197
+ ```ruby
198
+ Siglip2.models_dir = "/path/to/models"
199
+ ```
200
+
201
+ ## License
202
+
203
+ MIT License
data/Rakefile ADDED
@@ -0,0 +1,8 @@
1
# frozen_string_literal: true

require "bundler/gem_tasks"
require "rspec/core/rake_task"

# `rake spec` runs the RSpec suite; it is also the default task.
RSpec::Core::RakeTask.new(:spec)

task default: :spec
@@ -0,0 +1,76 @@
1
#!/usr/bin/env ruby
# frozen_string_literal: true

# CLI: prints the embedding vector for an image file.
#
#   siglip2-embed-image [options] IMAGE_PATH

require "siglip2"
require "optparse"
require "json"

options = {
  model: Siglip2::DEFAULT_MODEL,
  quantization: Siglip2::DEFAULT_QUANTIZATION,
  format: "json"
}

parser = OptionParser.new do |opts|
  opts.banner = "Usage: siglip2-embed-image [options] IMAGE_PATH"

  opts.on("-m", "--model MODEL", "Model name (default: #{Siglip2::DEFAULT_MODEL})") do |m|
    options[:model] = m
  end

  opts.on("-q", "--quantization QUANT", "Quantization (default: #{Siglip2::DEFAULT_QUANTIZATION})") do |q|
    options[:quantization] = q
  end

  opts.on("-f", "--format FORMAT", "Output format: json, csv (default: json)") do |f|
    options[:format] = f
  end

  opts.on("-l", "--list-models", "List available models") do
    puts "Available models:"
    Siglip2.list_models.each { |m| puts "  #{m}" }
    exit
  end

  opts.on("-L", "--list-quantizations", "List available quantizations") do
    puts "Available quantizations:"
    Siglip2.list_quantizations.each { |q| puts "  #{q}" }
    exit
  end

  opts.on("-h", "--help", "Show this help message") do
    puts opts
    exit
  end
end

parser.parse!

if ARGV.empty?
  puts parser
  exit 1
end

image_path = ARGV[0]

unless File.exist?(image_path)
  warn "Error: Image file not found: #{image_path}"
  exit 1
end

begin
  model = Siglip2::Model.new(model_name: options[:model], quantization: options[:quantization])
  embedding = model.encode_image(image_path)

  case options[:format]
  when "json"
    puts JSON.generate(embedding)
  when "csv"
    puts embedding.join(",")
  else
    # An unrecognized format used to fall back to JSON silently; keep the
    # fallback (stdout output is unchanged) but tell the user on stderr.
    warn "Warning: unknown format '#{options[:format]}', defaulting to json"
    puts JSON.generate(embedding)
  end
rescue Siglip2::Error => e
  warn "Error: #{e.message}"
  exit 1
end
@@ -0,0 +1,71 @@
1
#!/usr/bin/env ruby
# frozen_string_literal: true

# CLI: prints the embedding vector for a text prompt.
# All positional arguments are joined with spaces to form the text.
#
#   siglip2-embed-text [options] TEXT

require "siglip2"
require "optparse"
require "json"

options = {
  model: Siglip2::DEFAULT_MODEL,
  quantization: Siglip2::DEFAULT_QUANTIZATION,
  format: "json"
}

parser = OptionParser.new do |opts|
  opts.banner = "Usage: siglip2-embed-text [options] TEXT"

  opts.on("-m", "--model MODEL", "Model name (default: #{Siglip2::DEFAULT_MODEL})") do |m|
    options[:model] = m
  end

  opts.on("-q", "--quantization QUANT", "Quantization (default: #{Siglip2::DEFAULT_QUANTIZATION})") do |q|
    options[:quantization] = q
  end

  opts.on("-f", "--format FORMAT", "Output format: json, csv (default: json)") do |f|
    options[:format] = f
  end

  opts.on("-l", "--list-models", "List available models") do
    puts "Available models:"
    Siglip2.list_models.each { |m| puts "  #{m}" }
    exit
  end

  opts.on("-L", "--list-quantizations", "List available quantizations") do
    puts "Available quantizations:"
    Siglip2.list_quantizations.each { |q| puts "  #{q}" }
    exit
  end

  opts.on("-h", "--help", "Show this help message") do
    puts opts
    exit
  end
end

parser.parse!

if ARGV.empty?
  puts parser
  exit 1
end

text = ARGV.join(" ")

begin
  model = Siglip2::Model.new(model_name: options[:model], quantization: options[:quantization])
  embedding = model.encode_text(text)

  case options[:format]
  when "json"
    puts JSON.generate(embedding)
  when "csv"
    puts embedding.join(",")
  else
    # An unrecognized format used to fall back to JSON silently; keep the
    # fallback (stdout output is unchanged) but tell the user on stderr.
    warn "Warning: unknown format '#{options[:format]}', defaulting to json"
    puts JSON.generate(embedding)
  end
rescue Siglip2::Error => e
  warn "Error: #{e.message}"
  exit 1
end
@@ -0,0 +1,59 @@
1
#!/usr/bin/env ruby
# frozen_string_literal: true

# CLI: prints the similarity score between a text prompt and an image.
#
#   siglip2-similarity [options] TEXT IMAGE_PATH

require "siglip2"
require "optparse"
require "json"

options = {
  model: Siglip2::DEFAULT_MODEL,
  quantization: Siglip2::DEFAULT_QUANTIZATION
}

parser = OptionParser.new do |opts|
  opts.banner = "Usage: siglip2-similarity [options] TEXT IMAGE_PATH"

  opts.on("-m", "--model MODEL", "Model name (default: #{Siglip2::DEFAULT_MODEL})") do |value|
    options[:model] = value
  end

  opts.on("-q", "--quantization QUANT", "Quantization (default: #{Siglip2::DEFAULT_QUANTIZATION})") do |value|
    options[:quantization] = value
  end

  opts.on("-l", "--list-models", "List available models") do
    puts "Available models:"
    Siglip2.list_models.each { |name| puts "  #{name}" }
    exit
  end

  opts.on("-h", "--help", "Show this help message") do
    puts opts
    exit
  end
end

parser.parse!

# Both positional arguments are required.
if ARGV.length < 2
  puts parser
  exit 1
end

text, image_path = ARGV

unless File.exist?(image_path)
  warn "Error: Image file not found: #{image_path}"
  exit 1
end

begin
  model = Siglip2::Model.new(model_name: options[:model], quantization: options[:quantization])
  score = model.similarity(text, image_path)

  puts "Similarity score: #{score}"
rescue Siglip2::Error => e
  warn "Error: #{e.message}"
  exit 1
end
@@ -0,0 +1,76 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "mini_magick"
4
+ require "numo/narray"
5
+
6
module Siglip2
  # Converts an image file into the [1, 3, H, W] float tensor layout
  # expected by the SigLIP2 vision encoder.
  class ImagePreprocessor
    # Normalization constants from preprocessor_config.json:
    # mean = std = 0.5 per channel, which maps pixels from [0, 1] to [-1, 1].
    MEAN = [0.5, 0.5, 0.5].freeze
    STD = [0.5, 0.5, 0.5].freeze

    # Target edge length; images are force-resized to size x size.
    attr_reader :size

    def initialize(size: 224)
      @size = size
    end

    # Full pipeline: load/resize -> float tensor -> normalize -> batch of 1.
    # Returns nested Ruby arrays shaped [1, 3, size, size].
    def preprocess(image_path)
      resized = load_and_resize(image_path)
      add_batch_dimension(normalize(image_to_tensor(resized)))
    end

    private

    # Opens the image, converts it to PNG/sRGB and force-resizes it to
    # size x size (the trailing "!" ignores the original aspect ratio).
    def load_and_resize(image_path)
      MiniMagick::Image.open(image_path).tap do |img|
        img.format("png")
        img.resize("#{@size}x#{@size}!")
        img.colorspace("sRGB")
      end
    end

    # Builds a [C, H, W] Numo::SFloat tensor with values rescaled to [0, 1].
    def image_to_tensor(image)
      rows = image.get_pixels
      height = rows.length
      width = rows[0].length

      floats = rows.flatten.map(&:to_f)
      hwc = Numo::SFloat.cast(floats).reshape(height, width, 3) / 255.0

      # Reorder [H, W, C] -> [C, H, W] for the ONNX model input layout.
      hwc.transpose(2, 0, 1)
    end

    # Applies (x - mean) / std per channel; with mean = std = 0.5 this
    # is equivalent to 2x - 1.
    def normalize(tensor)
      out = Numo::SFloat.zeros(tensor.shape)
      MEAN.each_index do |c|
        out[c, true, true] = (tensor[c, true, true] - MEAN[c]) / STD[c]
      end
      out
    end

    # Wraps the tensor in a nested Ruby array with a leading batch axis
    # of 1, the input format onnxruntime accepts.
    def add_batch_dimension(tensor)
      [tensor.to_a]
    end
  end
end
@@ -0,0 +1,124 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "onnxruntime"
4
+ require "tokenizers"
5
+
6
module Siglip2
  # Wraps the ONNX text and vision encoders for one SigLIP2 variant and
  # exposes embedding and similarity helpers. Model files are downloaded
  # on first use via Siglip2.download_models.
  class Model
    attr_reader :model_name, :quantization, :model_path

    # Square input resolution expected by each model variant.
    IMAGE_SIZES = {
      "base-patch16-224" => 224,
      "base-patch16-256" => 256,
      "base-patch16-384" => 384,
      "base-patch16-512" => 512,
      "base-patch32-256" => 256,
      "base-patch16-naflex" => 224,
      "large-patch16-256" => 256,
      "large-patch16-384" => 384,
      "large-patch16-512" => 512,
      "giant-opt-patch16-256" => 256,
      "giant-opt-patch16-384" => 384,
      "so400m-patch14-224" => 224,
      "so400m-patch14-384" => 384,
      "so400m-patch16-256" => 256,
      "so400m-patch16-384" => 384,
      "so400m-patch16-512" => 512
    }.freeze

    # Ensures the model files are present locally, then prepares the
    # image preprocessor for the variant's input resolution.
    def initialize(model_name: Siglip2::DEFAULT_MODEL, quantization: Siglip2::DEFAULT_QUANTIZATION)
      @model_name = model_name
      @quantization = quantization
      @model_path = Siglip2.model_path(model_name, quantization: quantization)

      Siglip2.download_models(model_name, quantization: quantization) unless
        Siglip2.models_exist?(model_name, quantization: quantization)

      @image_size = IMAGE_SIZES.fetch(model_name, 224)
      @image_preprocessor = ImagePreprocessor.new(size: @image_size)
    end

    # Returns the L2-normalized text embedding as an Array of Floats
    # (taken from the model's "pooler_output").
    def encode_text(text)
      outputs = text_model.predict({ "input_ids" => tokenize(text) })
      normalize_embeddings(outputs["pooler_output"].flatten)
    end

    # Returns the L2-normalized image embedding as an Array of Floats
    # (taken from the model's "pooler_output").
    def encode_image(image_path)
      outputs = vision_model.predict({ "pixel_values" => @image_preprocessor.preprocess(image_path) })
      normalize_embeddings(outputs["pooler_output"].flatten)
    end

    # Similarity between one text and one image. Embeddings are unit-norm,
    # so the dot product equals the cosine similarity.
    def similarity(text, image_path)
      dot_product(encode_text(text), encode_image(image_path))
    end

    # Pairwise scores: one row per text, one column per image.
    def batch_similarity(texts, image_paths)
      text_embeddings = texts.map { |text| encode_text(text) }
      image_embeddings = image_paths.map { |path| encode_image(path) }

      text_embeddings.map do |text_embedding|
        image_embeddings.map { |image_embedding| dot_product(text_embedding, image_embedding) }
      end
    end

    private

    # Tokenizer loaded lazily from the downloaded tokenizer.json.
    def tokenizer
      @tokenizer ||= Tokenizers.from_file(File.join(@model_path, "tokenizer.json"))
    end

    # Text encoder ONNX session, loaded lazily.
    def text_model
      @text_model ||= OnnxRuntime::Model.new(File.join(@model_path, "text_model.onnx"))
    end

    # Vision encoder ONNX session, loaded lazily.
    def vision_model
      @vision_model ||= OnnxRuntime::Model.new(File.join(@model_path, "vision_model.onnx"))
    end

    # Lowercases the input (SigLIP2 uses a lowercasing Gemma tokenizer),
    # encodes it, then truncates/pads to a fixed 64 ids (0 = pad token).
    # Returns a batch-of-one 2D array of ids.
    def tokenize(text)
      max_length = 64

      ids = tokenizer.encode(text.downcase).ids
      ids = ids.first(max_length)
      ids += Array.new(max_length - ids.length, 0) if ids.length < max_length

      [ids]
    end

    # Scales a vector to unit L2 norm; a zero vector is returned unchanged.
    def normalize_embeddings(embeddings)
      norm = Math.sqrt(embeddings.sum { |x| x * x })
      return embeddings if norm == 0

      embeddings.map { |x| x / norm }
    end

    # Inner product of two equal-length vectors.
    def dot_product(a, b)
      a.each_index.sum { |i| a[i] * b[i] }
    end
  end
end
@@ -0,0 +1,5 @@
1
# frozen_string_literal: true

module Siglip2
  # Gem version, following semantic versioning.
  VERSION = "1.0.0"
end
data/lib/siglip2.rb ADDED
@@ -0,0 +1,154 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "siglip2/version"
4
+ require_relative "siglip2/model"
5
+ require_relative "siglip2/image_preprocessor"
6
+
7
+ require "net/http"
8
+ require "uri"
9
+ require "fileutils"
10
+
11
+ module Siglip2
12
+ class Error < StandardError; end
13
+
14
+ # Available models from onnx-community on HuggingFace
15
+ AVAILABLE_MODELS = {
16
+ # Base models
17
+ "base-patch16-224" => "onnx-community/siglip2-base-patch16-224-ONNX",
18
+ "base-patch16-256" => "onnx-community/siglip2-base-patch16-256-ONNX",
19
+ "base-patch16-384" => "onnx-community/siglip2-base-patch16-384-ONNX",
20
+ "base-patch16-512" => "onnx-community/siglip2-base-patch16-512-ONNX",
21
+ "base-patch32-256" => "onnx-community/siglip2-base-patch32-256-ONNX",
22
+ "base-patch16-naflex" => "onnx-community/siglip2-base-patch16-naflex-ONNX",
23
+
24
+ # Large models
25
+ "large-patch16-256" => "onnx-community/siglip2-large-patch16-256-ONNX",
26
+ "large-patch16-384" => "onnx-community/siglip2-large-patch16-384-ONNX",
27
+ "large-patch16-512" => "onnx-community/siglip2-large-patch16-512-ONNX",
28
+
29
+ # Giant optimized models
30
+ "giant-opt-patch16-256" => "onnx-community/siglip2-giant-opt-patch16-256-ONNX",
31
+ "giant-opt-patch16-384" => "onnx-community/siglip2-giant-opt-patch16-384-ONNX",
32
+
33
+ # SO400M models
34
+ "so400m-patch14-224" => "onnx-community/siglip2-so400m-patch14-224-ONNX",
35
+ "so400m-patch14-384" => "onnx-community/siglip2-so400m-patch14-384-ONNX",
36
+ "so400m-patch16-256" => "onnx-community/siglip2-so400m-patch16-256-ONNX",
37
+ "so400m-patch16-384" => "onnx-community/siglip2-so400m-patch16-384-ONNX",
38
+ "so400m-patch16-512" => "onnx-community/siglip2-so400m-patch16-512-ONNX"
39
+ }.freeze
40
+
41
+ DEFAULT_MODEL = "base-patch16-224"
42
+
43
+ # Model quantization options
44
+ QUANTIZATION_OPTIONS = %w[
45
+ fp32
46
+ fp16
47
+ int8
48
+ uint8
49
+ q4
50
+ q4f16
51
+ bnb4
52
+ ].freeze
53
+
54
+ DEFAULT_QUANTIZATION = "fp32"
55
+
56
+ class << self
57
+ def models_dir
58
+ @models_dir ||= File.join(Dir.home, ".siglip2_models")
59
+ end
60
+
61
+ def models_dir=(path)
62
+ @models_dir = path
63
+ end
64
+
65
+ def model_path(model_name, quantization: DEFAULT_QUANTIZATION)
66
+ raise Error, "Unknown model: #{model_name}" unless AVAILABLE_MODELS.key?(model_name)
67
+ raise Error, "Unknown quantization: #{quantization}" unless QUANTIZATION_OPTIONS.include?(quantization)
68
+
69
+ File.join(models_dir, model_name, quantization)
70
+ end
71
+
72
+ def models_exist?(model_name, quantization: DEFAULT_QUANTIZATION)
73
+ path = model_path(model_name, quantization: quantization)
74
+ File.exist?(File.join(path, "vision_model.onnx")) &&
75
+ File.exist?(File.join(path, "text_model.onnx")) &&
76
+ File.exist?(File.join(path, "tokenizer.json"))
77
+ end
78
+
79
+ def download_models(model_name, quantization: DEFAULT_QUANTIZATION)
80
+ raise Error, "Unknown model: #{model_name}" unless AVAILABLE_MODELS.key?(model_name)
81
+
82
+ repo = AVAILABLE_MODELS[model_name]
83
+ path = model_path(model_name, quantization: quantization)
84
+ FileUtils.mkdir_p(path)
85
+
86
+ # Determine file suffix based on quantization
87
+ suffix = quantization_suffix(quantization)
88
+
89
+ files = {
90
+ "vision_model.onnx" => "onnx/vision_model#{suffix}.onnx",
91
+ "text_model.onnx" => "onnx/text_model#{suffix}.onnx",
92
+ "tokenizer.json" => "tokenizer.json"
93
+ }
94
+
95
+ files.each do |local_name, remote_path|
96
+ local_path = File.join(path, local_name)
97
+ next if File.exist?(local_path)
98
+
99
+ url = "https://huggingface.co/#{repo}/resolve/main/#{remote_path}"
100
+ puts "Downloading #{local_name} from #{url}..."
101
+ download_file(url, local_path)
102
+ end
103
+ end
104
+
105
+ def list_models
106
+ AVAILABLE_MODELS.keys
107
+ end
108
+
109
+ def list_quantizations
110
+ QUANTIZATION_OPTIONS
111
+ end
112
+
113
+ private
114
+
115
+ def quantization_suffix(quantization)
116
+ case quantization
117
+ when "fp32" then ""
118
+ when "fp16" then "_fp16"
119
+ when "int8" then "_int8"
120
+ when "uint8" then "_uint8"
121
+ when "q4" then "_q4"
122
+ when "q4f16" then "_q4f16"
123
+ when "bnb4" then "_bnb4"
124
+ else ""
125
+ end
126
+ end
127
+
128
+ def download_file(url, path, redirect_limit = 10)
129
+ raise Error, "Too many HTTP redirects" if redirect_limit == 0
130
+
131
+ uri = URI.parse(url)
132
+ http = Net::HTTP.new(uri.host, uri.port)
133
+ http.use_ssl = uri.scheme == "https"
134
+ http.read_timeout = 300
135
+
136
+ request = Net::HTTP::Get.new(uri)
137
+
138
+ http.request(request) do |response|
139
+ case response
140
+ when Net::HTTPSuccess
141
+ File.open(path, "wb") do |file|
142
+ response.read_body do |chunk|
143
+ file.write(chunk)
144
+ end
145
+ end
146
+ when Net::HTTPRedirection
147
+ download_file(response["location"], path, redirect_limit - 1)
148
+ else
149
+ raise Error, "Failed to download #{url}: #{response.code} #{response.message}"
150
+ end
151
+ end
152
+ end
153
+ end
154
+ end
metadata ADDED
@@ -0,0 +1,131 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: siglip2
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0
5
+ platform: ruby
6
+ authors:
7
+ - Krzysztof Hasiński
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2026-01-16 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: onnxruntime
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '0.9'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '0.9'
27
+ - !ruby/object:Gem::Dependency
28
+ name: net-http
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '0.6'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '0.6'
41
+ - !ruby/object:Gem::Dependency
42
+ name: numo-narray
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '0.9'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '0.9'
55
+ - !ruby/object:Gem::Dependency
56
+ name: mini_magick
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: '5.0'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: '5.0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: tokenizers
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '0.5'
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: '0.5'
83
+ description: Ruby implementation of Google's SigLIP2 model for creating text and image
84
+ embeddings. Uses ONNX models from HuggingFace onnx-community.
85
+ email:
86
+ - krzysztof.hasinski@gmail.com
87
+ executables:
88
+ - siglip2-embed-image
89
+ - siglip2-embed-text
90
+ - siglip2-similarity
91
+ extensions: []
92
+ extra_rdoc_files: []
93
+ files:
94
+ - ".mise.toml"
95
+ - LICENSE
96
+ - README.md
97
+ - Rakefile
98
+ - exe/siglip2-embed-image
99
+ - exe/siglip2-embed-text
100
+ - exe/siglip2-similarity
101
+ - lib/siglip2.rb
102
+ - lib/siglip2/image_preprocessor.rb
103
+ - lib/siglip2/model.rb
104
+ - lib/siglip2/version.rb
105
+ homepage: https://github.com/khasinski/siglip2-rb
106
+ licenses:
107
+ - MIT
108
+ metadata:
109
+ source_code_uri: https://github.com/khasinski/siglip2-rb
110
+ changelog_uri: https://github.com/khasinski/siglip2-rb/blob/main/CHANGELOG.md
111
+ rubygems_mfa_required: 'true'
112
+ post_install_message:
113
+ rdoc_options: []
114
+ require_paths:
115
+ - lib
116
+ required_ruby_version: !ruby/object:Gem::Requirement
117
+ requirements:
118
+ - - ">="
119
+ - !ruby/object:Gem::Version
120
+ version: 3.0.0
121
+ required_rubygems_version: !ruby/object:Gem::Requirement
122
+ requirements:
123
+ - - ">="
124
+ - !ruby/object:Gem::Version
125
+ version: '0'
126
+ requirements: []
127
+ rubygems_version: 3.5.22
128
+ signing_key:
129
+ specification_version: 4
130
+ summary: Google SigLIP2 embeddings using ONNX models
131
+ test_files: []