clip-rb 1.0.2 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +50 -0
- data/README.md +22 -1
- data/UPGRADING.md +103 -0
- data/exe/clip-embed-image +35 -5
- data/exe/clip-embed-text +35 -5
- data/lib/clip/image_preprocessor.rb +26 -19
- data/lib/clip/model.rb +12 -5
- data/lib/clip/multilingual_model.rb +77 -0
- data/lib/clip/tokenizer.rb +11 -3
- data/lib/clip/version.rb +1 -1
- data/lib/clip-rb.rb +2 -0
- data/lib/clip.rb +95 -32
- metadata +28 -11
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: d6eff0db629a49c59e9c26a4ec877094ac84803e6d086bd8ba29d7c58648dc48
+  data.tar.gz: 3227054c9a2c3d74ab81e13d76ee4bc58451715b6e1ed708d3f7cbb1c497014e
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: c80c6fd029598a4000319a250191c202ab2fbdc3e636c0e6025325b11984d883740c3c2aa439f11baef25c33b413945becfb1be9171a4d2efefcc6ced729b01e
+  data.tar.gz: cdab5e8c1beebf3f1d55f9e8f2810fb087b822ba01c3493c7660bbb7653d2409b1e41a4dfc79010ba43855278193044d3fc0d66e0eb6a5d0a5a86d960d92640b
data/CHANGELOG.md
ADDED
@@ -0,0 +1,50 @@
+# Changelog
+
+All notable changes to this project will be documented in this file.
+
+The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
+and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+
+## [2.0.0] - 2024-12-13
+
+### Added
+- `Clip.similarity` method for calculating cosine similarity between embeddings
+- `Clip.normalize` method for L2 normalization of embedding vectors
+- `Clip::ImagePreprocessor::InvalidImageError` for better error handling
+- Image format validation with supported formats: jpg, jpeg, png, gif, bmp, webp, tiff
+- HTTP timeout (5 minutes default) for model downloads
+- `Clip::DownloadError` custom exception for download failures
+- Thread safety for lazy-loaded ONNX models using Mutex
+- CLI tools now support a `--json` flag for JSON output
+- CLI tools now support a `--multilingual` flag to use the multilingual model
+- CLI tools now have a `--help` flag with usage information
+
+### Fixed
+- Resource leak in tokenizer: GzipReader is now properly closed after reading the BPE vocabulary
+- `basic_clean` method now properly implements HTML entity decoding and Unicode normalization
+- Inconsistent hash key types in MultilingualModel ONNX predict calls
+- Module-level `attr_accessor :logger` now works correctly
+- Changelog URL in gemspec now points to the correct path
+- Multilingual tokenizer now downloads correctly (workaround for a tokenizers gem bug)
+- HTTP redirects with relative URLs are now handled correctly
+
+### Changed
+- **Breaking:** Removed `add_batch_dimension` method from ImagePreprocessor
+- **Breaking:** MultilingualModel tokenizer is now downloaded automatically instead of using `Tokenizers.from_pretrained`
+- Model downloads now skip files that already exist
+- CLI tools use OptionParser for proper argument handling
+
+## [1.1.0] - 2024-12-10
+
+### Added
+- XLM Roberta model for multilingual text embedding support
+- `Clip::MultilingualModel` class for multilingual CLIP
+
+## [1.0.0] - 2024-12-01
+
+### Added
+- Initial release
+- OpenAI CLIP ViT-B-32 model support
+- Text and image embedding generation
+- Automatic model downloading from Hugging Face
+- CLI tools: `clip-embed-text` and `clip-embed-image`
data/README.md
CHANGED
@@ -21,6 +21,7 @@ See [neighbor gem](https://github.com/ankane/neighbor) to learn more about vecto
 
 - Ruby 3.0.0 or later
 - ONNX CLIP models (downloaded automatically on first use)
+- XLM Roberta CLIP model (for multilingual support)
 
 ---
 
@@ -43,7 +44,9 @@ gem install clip-rb
 ```ruby
 require 'clip'
 
-
+# This will download the models on first use (default path is .clip_models)
+# If you don't want this behavior you can pass the path to the models as an argument.
+clip = Clip::Model.new
 
 text_embedding = clip.encode_text("a photo of a cat")
 # => [0.15546110272407532, 0.07329428941011429, ...]
@@ -54,6 +57,24 @@ image_embedding = clip.encode_image("test/fixtures/test.jpg")
 
 💡 Tip: Use cosine similarity for KNN vector search when comparing embeddings!
 
+## Multilingual text embeddings
+
+Since the original CLIP only supports English embeddings, this gem also supports multilingual text embeddings using the XLM Roberta model.
+
+```ruby
+require 'clip'
+
+# This will download the models on first use (default path is .clip_models/multilingual)
+# If you don't want this behavior you can pass the path to the models as an argument.
+clip = Clip::MultilingualModel.new
+
+text_embedding = clip.encode_text("una foto de un gato")
+# => [0.15546110272407532, 0.07329428941011429, ...]
+
+image_embedding = clip.encode_image("test/fixtures/test.jpg")
+# => [0.22115306556224823, 0.19343754649162292, ...]
+```
+
 ## CLI
 
 Additionally you can fetch embeddings by calling:
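
For reference, the two README snippets above compose naturally with the `Clip.similarity` helper introduced in 2.0.0. A minimal sketch, reusing the caption and fixture path from the README:

```ruby
require "clip"

# Models are fetched into .clip_models on first use, as noted above.
clip = Clip::Model.new

text_embedding  = clip.encode_text("a photo of a cat")
image_embedding = clip.encode_image("test/fixtures/test.jpg")

# Cosine similarity between the text and image embeddings; higher means a closer match.
puts Clip.similarity(text_embedding, image_embedding)
```
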
data/UPGRADING.md
ADDED
@@ -0,0 +1,103 @@
+# Upgrading Guide
+
+## Upgrading from 1.x to 2.0
+
+### Breaking Changes
+
+#### 1. ImagePreprocessor: `add_batch_dimension` method removed
+
+The `add_batch_dimension` method was removed from `Clip::ImagePreprocessor` because it was misleadingly named - it didn't actually add a batch dimension.
+
+**Before (1.x):**
+```ruby
+preprocessor = Clip::ImagePreprocessor.new
+tensor = preprocessor.preprocess(image_path)
+# tensor shape was [3, 224, 224] despite method name suggesting [1, 3, 224, 224]
+```
+
+**After (2.0):**
+```ruby
+preprocessor = Clip::ImagePreprocessor.new
+tensor = preprocessor.preprocess(image_path)
+# tensor shape is [3, 224, 224] - same behavior, clearer code
+```
+
+If you were calling `add_batch_dimension` directly (unlikely since it was private), you'll need to remove those calls.
+
+#### 2. MultilingualModel: Tokenizer loading changed
+
+The `MultilingualModel` no longer uses `Tokenizers.from_pretrained` due to a bug in the tokenizers gem. Instead, it downloads the tokenizer.json file directly and loads it from disk.
+
+**Before (1.x):**
+```ruby
+model = Clip::MultilingualModel.new(
+  tokenizer: Tokenizers.from_pretrained("M-CLIP/XLM-Roberta-Large-Vit-B-32")
+)
+```
+
+**After (2.0):**
+```ruby
+# Tokenizer is downloaded automatically - no need to specify
+model = Clip::MultilingualModel.new
+
+# Or provide a custom tokenizer loaded from file
+model = Clip::MultilingualModel.new(
+  tokenizer: Tokenizers::Tokenizer.from_file("/path/to/tokenizer.json")
+)
+```
+
+### New Features
+
+#### Similarity and Normalization Helpers
+
+```ruby
+# Calculate cosine similarity between embeddings
+similarity = Clip.similarity(embedding1, embedding2)
+
+# Normalize embeddings to unit length
+normalized = Clip.normalize(embedding)
+```
+
+#### Image Validation
+
+Images are now validated before processing:
+
+```ruby
+begin
+  model.encode_image("invalid.xyz")
+rescue Clip::ImagePreprocessor::InvalidImageError => e
+  puts e.message # "Unsupported image format: xyz. Supported: jpg, jpeg, png, gif, bmp, webp, tiff"
+end
+```
+
+#### CLI Improvements
+
+```bash
+# JSON output for piping
+clip-embed-text --json "a photo of a cat"
+
+# Use multilingual model
+clip-embed-text --multilingual "une photo d'un chat"
+
+# Help
+clip-embed-text --help
+```
+
+#### Thread Safety
+
+Both `Model` and `MultilingualModel` now use mutex locks for thread-safe lazy loading of ONNX models.
+
+#### Download Improvements
+
+- HTTP timeout of 5 minutes (configurable)
+- Downloads skip files that already exist
+- Relative redirects handled correctly
+- Custom `Clip::DownloadError` exception
+
+### Migration Checklist
+
+- [ ] Remove any calls to `add_batch_dimension` (if applicable)
+- [ ] Update custom tokenizer initialization for `MultilingualModel` to use `from_file` instead of `from_pretrained`
+- [ ] Consider using new `Clip.similarity` and `Clip.normalize` helpers
+- [ ] Update error handling to catch `Clip::ImagePreprocessor::InvalidImageError`
+- [ ] Update error handling to catch `Clip::DownloadError`
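
The last two checklist items above concern the new exception classes. A minimal sketch of what that rescue logic might look like; the file path is illustrative:

```ruby
require "clip"

begin
  # Instantiating a model may download ONNX files; network or HTTP failures
  # now raise Clip::DownloadError rather than a bare RuntimeError.
  clip = Clip::Model.new
  clip.encode_image("photos/cat.heic") # raises InvalidImageError (.heic is unsupported)
rescue Clip::DownloadError => e
  warn "Model download failed: #{e.message}"
rescue Clip::ImagePreprocessor::InvalidImageError => e
  warn "Bad input image: #{e.message}"
end
```
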
data/exe/clip-embed-image
CHANGED
@@ -1,16 +1,46 @@
 #!/usr/bin/env ruby
+# frozen_string_literal: true
 
 require_relative "../lib/clip"
+require "json"
+require "optparse"
 
+options = { format: :ruby }
 
-
-
+OptionParser.new do |opts|
+  opts.banner = "Usage: clip-embed-image [options] <image_file>"
+
+  opts.on("-j", "--json", "Output as JSON") do
+    options[:format] = :json
+  end
+
+  opts.on("-m", "--multilingual", "Use multilingual model") do
+    options[:multilingual] = true
+  end
+
+  opts.on("-h", "--help", "Show this help") do
+    puts opts
+    exit
+  end
+end.parse!
+
+if ARGV.empty?
+  puts "Usage: clip-embed-image [options] <image_file>"
+  puts "Run 'clip-embed-image --help' for options"
   exit 1
 end
 
 begin
-
-
-
+  model = options[:multilingual] ? Clip::MultilingualModel.new : Clip::Model.new
+  embedding = model.encode_image(ARGV[0])
+
+  case options[:format]
+  when :json
+    puts JSON.generate(embedding)
+  else
+    puts embedding.inspect
+  end
+rescue StandardError => e
+  warn "Error: #{e.message}"
   exit 1
 end
data/exe/clip-embed-text
CHANGED
@@ -1,16 +1,46 @@
 #!/usr/bin/env ruby
+# frozen_string_literal: true
 
 require_relative "../lib/clip"
+require "json"
+require "optparse"
 
+options = { format: :ruby }
 
-
-
+OptionParser.new do |opts|
+  opts.banner = "Usage: clip-embed-text [options] <text>"
+
+  opts.on("-j", "--json", "Output as JSON") do
+    options[:format] = :json
+  end
+
+  opts.on("-m", "--multilingual", "Use multilingual model") do
+    options[:multilingual] = true
+  end
+
+  opts.on("-h", "--help", "Show this help") do
+    puts opts
+    exit
+  end
+end.parse!
+
+if ARGV.empty?
+  puts "Usage: clip-embed-text [options] <text>"
+  puts "Run 'clip-embed-text --help' for options"
   exit 1
 end
 
 begin
-
-
-
+  model = options[:multilingual] ? Clip::MultilingualModel.new : Clip::Model.new
+  embedding = model.encode_text(ARGV[0])
+
+  case options[:format]
+  when :json
+    puts JSON.generate(embedding)
+  else
+    puts embedding.inspect
+  end
+rescue StandardError => e
+  warn "Error: #{e.message}"
   exit 1
 end
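
With the new `--json` flag, the executables above can also be driven from another Ruby process and their output parsed back into an array. A sketch, assuming the gem's executables are on PATH:

```ruby
require "json"
require "open3"

# Ask the CLI for a JSON embedding and parse it.
stdout, status = Open3.capture2("clip-embed-text", "--json", "a photo of a cat")
raise "clip-embed-text failed" unless status.success?

embedding = JSON.parse(stdout)
puts embedding.length
```
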
data/lib/clip/image_preprocessor.rb
CHANGED
@@ -1,31 +1,48 @@
+# frozen_string_literal: true
+
 require "mini_magick"
 require "numo/narray"
 
 module Clip
   class ImagePreprocessor
     # CLIP's expected image normalization parameters
-    MEAN = Numo::DFloat[
-    STD = Numo::DFloat[
+    MEAN = Numo::DFloat[0.48145466, 0.4578275, 0.40821073]
+    STD = Numo::DFloat[0.26862954, 0.26130258, 0.27577711]
+    SUPPORTED_FORMATS = %w[jpg jpeg png gif bmp webp tiff].freeze
+
+    class InvalidImageError < StandardError; end
 
     def initialize(target_size: 224)
      @target_size = target_size
     end
 
-    # Preprocess the image and return a tensor with shape [
+    # Preprocess the image and return a tensor with shape [3, 224, 224]
     def preprocess(image_path)
+      validate_image!(image_path)
      image = load_and_resize(image_path)
      tensor = image_to_tensor(image)
-
-      add_batch_dimension(normalized)
+      normalize(tensor)
     end
 
     private
 
+    # Validate that the image file exists and has a supported format
+    def validate_image!(image_path)
+      path = image_path.is_a?(File) ? image_path.path : image_path.to_s
+
+      raise InvalidImageError, "Image file not found: #{path}" unless File.exist?(path)
+
+      extension = File.extname(path).delete(".").downcase
+      return if SUPPORTED_FORMATS.include?(extension)
+
+      raise InvalidImageError, "Unsupported image format: #{extension}. Supported: #{SUPPORTED_FORMATS.join(', ')}"
+    end
+
     # Load image, convert to RGB, and resize to target size
     def load_and_resize(image_path)
      image = MiniMagick::Image.open(image_path)
-      image.format "png"
-      image
+      image.format "png"
+      image.combine_options do |c|
        c.resize "#{@target_size}x#{@target_size}!"
        c.quality 100
        c.colorspace "RGB"
@@ -33,30 +50,20 @@ module Clip
      image
     end
 
-    # Convert the image to a normalized
+    # Convert the image to a normalized tensor with shape [3, 224, 224]
     def image_to_tensor(image)
-      pixels = image.get_pixels
-      # Convert to Numo::NArray and reshape
+      pixels = image.get_pixels
      pixel_array = Numo::UInt8.asarray(pixels).cast_to(Numo::DFloat)
-      # Reshape to [height, width, channels]
      pixel_array = pixel_array.reshape(@target_size, @target_size, 3)
-      # Transpose to [channels, height, width]
      pixel_array = pixel_array.transpose(2, 0, 1)
-      # Normalize to [0, 1]
      pixel_array / 255.0
     end
 
     # Apply CLIP normalization: (x - mean) / std
     def normalize(tensor)
-      # Expand mean and std to match tensor shape
      mean = MEAN.reshape(3, 1, 1)
      std = STD.reshape(3, 1, 1)
      (tensor - mean) / std
     end
-
-    # Add batch dimension: [1, 3, 224, 224]
-    def add_batch_dimension(tensor)
-      tensor.reshape(3, @target_size, @target_size)
-    end
   end
 end
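
The removal of `add_batch_dimension` is easier to see with a concrete check: `preprocess` returns a plain `[3, 224, 224]` Numo array, and the batch dimension is added at the call site by wrapping the converted array once more (compare the `predict` calls in `model.rb` below). A sketch reusing the README's fixture path:

```ruby
require "clip"

preprocessor = Clip::ImagePreprocessor.new
tensor = preprocessor.preprocess("test/fixtures/test.jpg")

p tensor.shape            # => [3, 224, 224]

# Wrapping the converted array in an outer array is what produces the
# [1, 3, 224, 224] batch the ONNX models expect.
batched = [tensor.to_a]
p [batched.length, batched.first.length]  # => [1, 3]
```
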
data/lib/clip/model.rb
CHANGED
@@ -1,3 +1,5 @@
+# frozen_string_literal: true
+
 require "onnxruntime"
 
 module Clip
@@ -15,24 +17,29 @@ module Clip
      Clip.download_models(download_dir) if download_models && !Clip.models_exist?(textual_model_path: textual_model_path, visual_model_path: visual_model_path)
      @tokenizer = tokenizer
      @image_preprocessor = image_preprocessor
+      @model_mutex = Mutex.new
     end
 
     def encode_text(text)
      tokens = tokenizer.encode(text)
-      text_model.predict({ input
+      text_model.predict({ "input" => [tokens] })["output"].first
     end
 
     def encode_image(image)
-
-      image_model.predict({ input
+      image_tensor = image_preprocessor.preprocess(image).to_a
+      image_model.predict({ "input" => [image_tensor] })["output"].first
     end
 
     def text_model
-      @
+      @model_mutex.synchronize do
+        @text_model ||= OnnxRuntime::Model.new(textual_model_path)
+      end
     end
 
     def image_model
-      @
+      @model_mutex.synchronize do
+        @image_model ||= OnnxRuntime::Model.new(visual_model_path)
+      end
     end
 
     private
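
The `@model_mutex` introduced above guards the lazy `||=` construction of the ONNX sessions. A small sketch of the concurrent use it is meant to protect; the prompts are arbitrary:

```ruby
require "clip"

clip = Clip::Model.new

# Several threads may call encode_text before the textual model has been
# built; the mutex ensures OnnxRuntime::Model.new runs only once.
threads = ["a cat", "a dog", "a bird"].map do |prompt|
  Thread.new { clip.encode_text(prompt) }
end

puts threads.map(&:value).map(&:length).inspect
```
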
data/lib/clip/multilingual_model.rb
ADDED
@@ -0,0 +1,77 @@
+# frozen_string_literal: true
+
+require "onnxruntime"
+require "tokenizers"
+
+module Clip
+  class MultilingualModel
+    TOKENIZER_FILENAME = "tokenizer.json"
+
+    def initialize(
+      textual_model_path: ".clip_models/multilingual/textual.onnx",
+      visual_model_path: ".clip_models/multilingual/visual.onnx",
+      tokenizer: nil,
+      image_preprocessor: Clip::ImagePreprocessor.new,
+      download_models: true,
+      download_dir: ".clip_models/multilingual"
+    )
+      @textual_model_path = textual_model_path
+      @visual_model_path = visual_model_path
+      @download_dir = download_dir
+
+      if download_models
+        Clip.download_models(download_dir, Clip::MULTILINGUAL_MODELS) unless Clip.models_exist?(textual_model_path: textual_model_path, visual_model_path: visual_model_path)
+        download_tokenizer unless tokenizer
+      end
+
+      @tokenizer = tokenizer || load_tokenizer
+      @image_preprocessor = image_preprocessor
+      @model_mutex = Mutex.new
+    end
+
+    def encode_text(text)
+      encoding = tokenizer.encode(text)
+      input_ids = [encoding.ids]
+      attention_mask = [Array.new(encoding.ids.size, 1)]
+
+      text_model.predict({ "input_ids" => input_ids, "attention_mask" => attention_mask })["output"].first
+    end
+
+    def encode_image(image)
+      image_tensor = image_preprocessor.preprocess(image).to_a
+      image_model.predict({ "pixel_values" => [image_tensor] })["output"].first
+    end
+
+    def text_model
+      @model_mutex.synchronize do
+        @text_model ||= OnnxRuntime::Model.new(textual_model_path)
+      end
+    end
+
+    def image_model
+      @model_mutex.synchronize do
+        @image_model ||= OnnxRuntime::Model.new(visual_model_path)
+      end
+    end
+
+    private
+
+    attr_reader :textual_model_path, :visual_model_path, :tokenizer, :image_preprocessor
+
+    def tokenizer_path
+      File.join(@download_dir, TOKENIZER_FILENAME)
+    end
+
+    def download_tokenizer
+      return if File.exist?(tokenizer_path)
+
+      Clip.logger ||= Logger.new($stdout)
+      Clip.logger.info("Downloading tokenizer from #{Clip::MULTILINGUAL_TOKENIZER_URL}")
+      Clip.download_file(Clip::MULTILINGUAL_TOKENIZER_URL, tokenizer_path)
+    end
+
+    def load_tokenizer
+      Tokenizers::Tokenizer.from_file(tokenizer_path)
+    end
+  end
+end
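
Since the multilingual model embeds different languages into the same space, translations of one caption should land close together. A sketch; the captions are illustrative:

```ruby
require "clip"

clip = Clip::MultilingualModel.new

english = clip.encode_text("a photo of a cat")
spanish = clip.encode_text("una foto de un gato")

# Embeddings of equivalent captions should have high cosine similarity.
puts Clip.similarity(english, spanish)
```
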
data/lib/clip/tokenizer.rb
CHANGED
@@ -1,3 +1,5 @@
+# frozen_string_literal: true
+
 require "zlib"
 require "set"
 
@@ -5,10 +7,10 @@ module Clip
   class Tokenizer
     INPUT_VECTOR_SIZE = 77
 
-    def initialize(bpe_path = __dir__
+    def initialize(bpe_path = File.join(__dir__, "..", "bpe_simple_vocab_16e6.txt.gz"))
      @byte_encoder = bytes_to_unicode
      @byte_decoder = @byte_encoder.invert
-      merges = Zlib::GzipReader.open(bpe_path).read.split("\n")[1..(49152 - 256 - 2)]
+      merges = Zlib::GzipReader.open(bpe_path) { |gz| gz.read }.split("\n")[1..(49152 - 256 - 2)]
      merges = merges.map { |merge| merge.split(" ") }
      vocab = @byte_encoder.values
      vocab += vocab.map { |v| "#{v}</w>" }
@@ -53,8 +55,14 @@ module Clip
      pairs
     end
 
+    # Clean text by decoding HTML entities and normalizing unicode
+    # Matches Python CLIP's basic_clean which uses ftfy.fix_text and html.unescape
     def basic_clean(text)
-
+      require "cgi"
+      # Decode HTML entities (called twice like Python original)
+      text = CGI.unescapeHTML(CGI.unescapeHTML(text))
+      # Normalize unicode to NFC form (similar to ftfy's fix_text for most cases)
+      text.unicode_normalize(:nfc).strip
     end
 
     def whitespace_clean(text)
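
The rewritten `basic_clean` is private, but its two steps can be seen in isolation. A standalone sketch of the same calls on a sample string containing double-escaped numeric entities:

```ruby
require "cgi"

text = "Caf&amp;#233; &amp; cr&amp;#232;me"

# First pass turns &amp; back into &, exposing the numeric entities.
text = CGI.unescapeHTML(text)  # => "Caf&#233; & cr&#232;me"

# Second pass decodes the numeric entities themselves.
text = CGI.unescapeHTML(text)  # => "Café & crème"

# NFC normalization and strip, as basic_clean now does.
puts text.unicode_normalize(:nfc).strip
```
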
data/lib/clip/version.rb
CHANGED
data/lib/clip-rb.rb
CHANGED
data/lib/clip.rb
CHANGED
@@ -1,50 +1,113 @@
+# frozen_string_literal: true
+
 require_relative "clip/model"
+require_relative "clip/multilingual_model"
 require_relative "clip/tokenizer"
 require_relative "clip/image_preprocessor"
 require "net/http"
+require "uri"
 require "fileutils"
 require "logger"
 
 module Clip
-  attr_accessor :logger
-
   BASE_URL = "https://huggingface.co/khasinski/"
   MODELS = {
-    textual
-    visual
-  }
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    "textual.onnx" => "clip-ViT-B-32-onnx/resolve/main/textual.onnx?download=true",
+    "visual.onnx" => "clip-ViT-B-32-onnx/resolve/main/visual.onnx?download=true"
+  }.freeze
+
+  MULTILINGUAL_MODELS = {
+    "textual.onnx" => "XLM-Roberta-Large-Vit-B-32-onnx/resolve/main/textual.onnx?download=true",
+    "visual.onnx" => "XLM-Roberta-Large-Vit-B-32-onnx/resolve/main/visual.onnx?download=true",
+    "data.bin" => "XLM-Roberta-Large-Vit-B-32-onnx/resolve/main/data.bin?download=true"
+  }.freeze
+
+  MULTILINGUAL_TOKENIZER_URL = "https://huggingface.co/M-CLIP/XLM-Roberta-Large-Vit-B-32/resolve/main/tokenizer.json"
+
+  DEFAULT_TIMEOUT = 300 # 5 minutes for large model files
+
+  class DownloadError < StandardError; end
+
+  class << self
+    attr_accessor :logger
+
+    def download_models(download_dir, models = MODELS)
+      @logger ||= Logger.new($stdout)
+      FileUtils.mkdir_p(download_dir)
+
+      models.each do |filename, path|
+        uri = URI.join(BASE_URL, path)
+        destination = File.join(download_dir, filename)
+
+        next if File.exist?(destination)
+
+        logger.info("Downloading #{filename} model from #{uri}")
+        download_file(uri.to_s, destination)
+      end
+    end
+
+    def download_file(url, destination, limit: 10, timeout: DEFAULT_TIMEOUT)
+      raise DownloadError, "Too many HTTP redirects" if limit == 0
+
+      uri = URI.parse(url)
+      http = Net::HTTP.new(uri.host, uri.port)
+      http.use_ssl = (uri.scheme == "https")
+      http.open_timeout = timeout
+      http.read_timeout = timeout
+
+      request = Net::HTTP::Get.new(uri.request_uri)
+
+      http.request(request) do |response|
+        case response
+        when Net::HTTPRedirection
+          location = response["location"]
+          # Handle relative redirects
+          new_url = if location.start_with?("/")
+                      "#{uri.scheme}://#{uri.host}#{location}"
+                    else
+                      location
+                    end
+          download_file(new_url, destination, limit: limit - 1, timeout: timeout)
+        when Net::HTTPSuccess
+          File.open(destination, "wb") do |file|
+            response.read_body do |chunk|
+              file.write(chunk)
+            end
          end
-          logger.info("Successfully downloaded #{type} model")
-          break
        else
-
-          raise "Failed to download #{type} model from #{uri}"
+          raise DownloadError, "Failed to download file: #{response.code} #{response.message}"
        end
      end
    end
-  end
 
-
-
+    def models_exist?(textual_model_path:, visual_model_path:)
+      File.exist?(textual_model_path) && File.exist?(visual_model_path)
+    end
+
+    # Normalize an embedding vector to unit length (L2 normalization)
+    # @param embedding [Array<Float>] The embedding vector
+    # @return [Array<Float>] The normalized embedding vector
+    def normalize(embedding)
+      magnitude = Math.sqrt(embedding.sum { |x| x * x })
+      return embedding if magnitude.zero?
+
+      embedding.map { |x| x / magnitude }
+    end
+
+    # Calculate cosine similarity between two embeddings
+    # @param embedding1 [Array<Float>] First embedding vector
+    # @param embedding2 [Array<Float>] Second embedding vector
+    # @return [Float] Cosine similarity score between -1 and 1
+    def similarity(embedding1, embedding2)
+      raise ArgumentError, "Embeddings must have the same length" if embedding1.length != embedding2.length
+
+      dot_product = embedding1.zip(embedding2).sum { |a, b| a * b }
+      magnitude1 = Math.sqrt(embedding1.sum { |x| x * x })
+      magnitude2 = Math.sqrt(embedding2.sum { |x| x * x })
+
+      return 0.0 if magnitude1.zero? || magnitude2.zero?
+
+      dot_product / (magnitude1 * magnitude2)
+    end
  end
 end
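
The two module helpers above relate in a simple way: once vectors have been through `Clip.normalize`, `Clip.similarity` reduces to a plain dot product. A small worked check with hand-picked vectors:

```ruby
require "clip"

a = [3.0, 4.0] # magnitude 5
b = [4.0, 3.0] # magnitude 5

# Cosine similarity from the raw vectors: (3*4 + 4*3) / (5 * 5) = 0.96
puts Clip.similarity(a, b)

# After L2 normalization the same value is just the dot product.
na = Clip.normalize(a) # => [0.6, 0.8]
nb = Clip.normalize(b) # => [0.8, 0.6]
puts na.zip(nb).sum { |x, y| x * y } # => 0.96
```
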
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: clip-rb
 version: !ruby/object:Gem::Version
-  version:
+  version: 2.0.0
 platform: ruby
 authors:
 - Krzysztof Hasiński
-autorequire:
+autorequire:
 bindir: exe
 cert_chain: []
-date: 2025-
+date: 2025-12-12 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: onnxruntime
@@ -84,16 +84,30 @@ dependencies:
   name: mini_magick
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - "
+    - - "~>"
       - !ruby/object:Gem::Version
-        version: '0'
+        version: '5.0'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - "
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '5.0'
+- !ruby/object:Gem::Dependency
+  name: tokenizers
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '0.5'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
       - !ruby/object:Gem::Version
-        version: '0'
+        version: '0.5'
 description: OpenAI CLIP embeddings, uses ONNX models. Allows to create embeddings
   for images and text
 email:
@@ -106,10 +120,12 @@ extra_rdoc_files: []
 files:
 - ".clip_models/.gitkeep"
 - ".rspec"
+- CHANGELOG.md
 - CODE_OF_CONDUCT.md
 - LICENSE.txt
 - README.md
 - Rakefile
+- UPGRADING.md
 - exe/clip-embed-image
 - exe/clip-embed-text
 - lib/bpe_simple_vocab_16e6.txt.gz
@@ -117,6 +133,7 @@ files:
 - lib/clip.rb
 - lib/clip/image_preprocessor.rb
 - lib/clip/model.rb
+- lib/clip/multilingual_model.rb
 - lib/clip/tokenizer.rb
 - lib/clip/version.rb
 - sig/clip.rbs
@@ -127,8 +144,8 @@ licenses:
 metadata:
   homepage_uri: https://github.com/khasinski/clip-rb
   source_code_uri: https://github.com/khasinski/clip-rb
-  changelog_uri: https://github.com/khasinski/clip-rb/CHANGELOG.md
-post_install_message:
+  changelog_uri: https://github.com/khasinski/clip-rb/blob/main/CHANGELOG.md
+post_install_message:
 rdoc_options: []
 require_paths:
 - lib
@@ -143,8 +160,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
     - !ruby/object:Gem::Version
      version: '0'
 requirements: []
-rubygems_version: 3.
-signing_key:
+rubygems_version: 3.0.3.1
+signing_key:
 specification_version: 4
 summary: OpenAI CLIP embeddings, uses ONNX models
 test_files: []