clip-rb 1.1.0 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +50 -0
- data/README.md +5 -3
- data/UPGRADING.md +103 -0
- data/exe/clip-embed-image +35 -5
- data/exe/clip-embed-text +35 -5
- data/lib/clip/image_preprocessor.rb +26 -19
- data/lib/clip/model.rb +12 -5
- data/lib/clip/multilingual_model.rb +41 -10
- data/lib/clip/tokenizer.rb +11 -3
- data/lib/clip/version.rb +1 -1
- data/lib/clip-rb.rb +2 -0
- data/lib/clip.rb +81 -35
- metadata +17 -15
checksums.yaml
CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: d6eff0db629a49c59e9c26a4ec877094ac84803e6d086bd8ba29d7c58648dc48
+  data.tar.gz: 3227054c9a2c3d74ab81e13d76ee4bc58451715b6e1ed708d3f7cbb1c497014e
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: c80c6fd029598a4000319a250191c202ab2fbdc3e636c0e6025325b11984d883740c3c2aa439f11baef25c33b413945becfb1be9171a4d2efefcc6ced729b01e
+  data.tar.gz: cdab5e8c1beebf3f1d55f9e8f2810fb087b822ba01c3493c7660bbb7653d2409b1e41a4dfc79010ba43855278193044d3fc0d66e0eb6a5d0a5a86d960d92640b
data/CHANGELOG.md
ADDED

@@ -0,0 +1,50 @@
+# Changelog
+
+All notable changes to this project will be documented in this file.
+
+The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
+and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+
+## [2.0.0] - 2024-12-13
+
+### Added
+- `Clip.similarity` method for calculating cosine similarity between embeddings
+- `Clip.normalize` method for L2 normalization of embedding vectors
+- `Clip::ImagePreprocessor::InvalidImageError` for better error handling
+- Image format validation with supported formats: jpg, jpeg, png, gif, bmp, webp, tiff
+- HTTP timeout (5 minutes default) for model downloads
+- `Clip::DownloadError` custom exception for download failures
+- Thread safety for lazy-loaded ONNX models using Mutex
+- CLI tools now support `--json` flag for JSON output
+- CLI tools now support `--multilingual` flag to use multilingual model
+- CLI tools now have `--help` flag with usage information
+
+### Fixed
+- Resource leak in tokenizer: GzipReader now properly closed after reading BPE vocabulary
+- `basic_clean` method now properly implements HTML entity decoding and unicode normalization
+- Inconsistent hash key types in MultilingualModel ONNX predict calls
+- Module-level `attr_accessor :logger` now works correctly
+- Changelog URL in gemspec now points to correct path
+- Multilingual tokenizer now downloads correctly (workaround for tokenizers gem bug)
+- HTTP redirects with relative URLs now handled correctly
+
+### Changed
+- **Breaking:** Removed `add_batch_dimension` method from ImagePreprocessor
+- **Breaking:** MultilingualModel tokenizer is now downloaded automatically instead of using `Tokenizers.from_pretrained`
+- Model downloads now skip files that already exist
+- CLI tools use OptionParser for proper argument handling
+
+## [1.1.0] - 2024-12-10
+
+### Added
+- XLM Roberta model for multilingual text embedding support
+- `Clip::MultilingualModel` class for multilingual CLIP
+
+## [1.0.0] - 2024-12-01
+
+### Added
+- Initial release
+- OpenAI CLIP ViT-B-32 model support
+- Text and image embedding generation
+- Automatic model downloading from Hugging Face
+- CLI tools: `clip-embed-text` and `clip-embed-image`
data/README.md
CHANGED

@@ -44,7 +44,9 @@ gem install clip-rb
 ```ruby
 require 'clip'
 
+# This will download the models on first use (default path is .clip_models)
+# If you don't want this behavior you can pass the path to the models as an argument.
 clip = Clip::Model.new
 
 text_embedding = clip.encode_text("a photo of a cat")
 # => [0.15546110272407532, 0.07329428941011429, ...]
@@ -62,6 +64,8 @@ Since the original CLIP only supports English embeddings this gem now has added
 ```ruby
 require 'clip'
 
+# This will download the models on first use (default path is .clip_models/multilingual)
+# If you don't want this behavior you can pass the path to the models as an argument.
 clip = Clip::MultilingualModel.new
 
 text_embedding = clip.encode_text("un photo de un gato")
@@ -71,8 +75,6 @@ image_embedding = clip.encode_image("test/fixtures/test.jpg")
 # => [0.22115306556224823, 0.19343754649162292, ...]
 ```
 
-```bash
-
 ## CLI
 
 Additionally you can fetch embeddings by calling:
data/UPGRADING.md
ADDED

@@ -0,0 +1,103 @@
+# Upgrading Guide
+
+## Upgrading from 1.x to 2.0
+
+### Breaking Changes
+
+#### 1. ImagePreprocessor: `add_batch_dimension` method removed
+
+The `add_batch_dimension` method was removed from `Clip::ImagePreprocessor` because it was misleadingly named - it didn't actually add a batch dimension.
+
+**Before (1.x):**
+```ruby
+preprocessor = Clip::ImagePreprocessor.new
+tensor = preprocessor.preprocess(image_path)
+# tensor shape was [3, 224, 224] despite method name suggesting [1, 3, 224, 224]
+```
+
+**After (2.0):**
+```ruby
+preprocessor = Clip::ImagePreprocessor.new
+tensor = preprocessor.preprocess(image_path)
+# tensor shape is [3, 224, 224] - same behavior, clearer code
+```
+
+If you were calling `add_batch_dimension` directly (unlikely since it was private), you'll need to remove those calls.
+
+#### 2. MultilingualModel: Tokenizer loading changed
+
+The `MultilingualModel` no longer uses `Tokenizers.from_pretrained` due to a bug in the tokenizers gem. Instead, it downloads the tokenizer.json file directly and loads it from disk.
+
+**Before (1.x):**
+```ruby
+model = Clip::MultilingualModel.new(
+  tokenizer: Tokenizers.from_pretrained("M-CLIP/XLM-Roberta-Large-Vit-B-32")
+)
+```
+
+**After (2.0):**
+```ruby
+# Tokenizer is downloaded automatically - no need to specify
+model = Clip::MultilingualModel.new
+
+# Or provide a custom tokenizer loaded from file
+model = Clip::MultilingualModel.new(
+  tokenizer: Tokenizers::Tokenizer.from_file("/path/to/tokenizer.json")
+)
+```
+
+### New Features
+
+#### Similarity and Normalization Helpers
+
+```ruby
+# Calculate cosine similarity between embeddings
+similarity = Clip.similarity(embedding1, embedding2)
+
+# Normalize embeddings to unit length
+normalized = Clip.normalize(embedding)
+```
+
+#### Image Validation
+
+Images are now validated before processing:
+
+```ruby
+begin
+  model.encode_image("invalid.xyz")
+rescue Clip::ImagePreprocessor::InvalidImageError => e
+  puts e.message # "Unsupported image format: xyz. Supported: jpg, jpeg, png, gif, bmp, webp, tiff"
+end
+```
+
+#### CLI Improvements
+
+```bash
+# JSON output for piping
+clip-embed-text --json "a photo of a cat"
+
+# Use multilingual model
+clip-embed-text --multilingual "une photo d'un chat"
+
+# Help
+clip-embed-text --help
+```
+
+#### Thread Safety
+
+Both `Model` and `MultilingualModel` now use mutex locks for thread-safe lazy loading of ONNX models.
+
+#### Download Improvements
+
+- HTTP timeout of 5 minutes (configurable)
+- Downloads skip files that already exist
+- Relative redirects handled correctly
+- Custom `Clip::DownloadError` exception
+
+### Migration Checklist
+
+- [ ] Remove any calls to `add_batch_dimension` (if applicable)
+- [ ] Update custom tokenizer initialization for `MultilingualModel` to use `from_file` instead of `from_pretrained`
+- [ ] Consider using new `Clip.similarity` and `Clip.normalize` helpers
+- [ ] Update error handling to catch `Clip::ImagePreprocessor::InvalidImageError`
+- [ ] Update error handling to catch `Clip::DownloadError`
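
For the thread-safety note above, a small sketch of the usage it is meant to make safe: several threads hitting a freshly constructed model at once, where 1.x could race on the lazy `||=` initialization (the captions here are illustrative):

```ruby
require "clip"

clip = Clip::Model.new

# In 2.0 the first concurrent calls serialize on a Mutex while the ONNX
# session is created; subsequent predict calls proceed as before.
threads = ["a cat", "a dog", "a bird"].map do |caption|
  Thread.new { clip.encode_text(caption) }
end
embeddings = threads.map(&:value)
```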
data/exe/clip-embed-image
CHANGED

@@ -1,16 +1,46 @@
 #!/usr/bin/env ruby
+# frozen_string_literal: true
 
 require_relative "../lib/clip"
+require "json"
+require "optparse"
 
+options = { format: :ruby }
 
-
-
+OptionParser.new do |opts|
+  opts.banner = "Usage: clip-embed-image [options] <image_file>"
+
+  opts.on("-j", "--json", "Output as JSON") do
+    options[:format] = :json
+  end
+
+  opts.on("-m", "--multilingual", "Use multilingual model") do
+    options[:multilingual] = true
+  end
+
+  opts.on("-h", "--help", "Show this help") do
+    puts opts
+    exit
+  end
+end.parse!
+
+if ARGV.empty?
+  puts "Usage: clip-embed-image [options] <image_file>"
+  puts "Run 'clip-embed-image --help' for options"
   exit 1
 end
 
 begin
-
-
-
+  model = options[:multilingual] ? Clip::MultilingualModel.new : Clip::Model.new
+  embedding = model.encode_image(ARGV[0])
+
+  case options[:format]
+  when :json
+    puts JSON.generate(embedding)
+  else
+    puts embedding.inspect
+  end
+rescue StandardError => e
+  warn "Error: #{e.message}"
   exit 1
 end
data/exe/clip-embed-text
CHANGED

@@ -1,16 +1,46 @@
 #!/usr/bin/env ruby
+# frozen_string_literal: true
 
 require_relative "../lib/clip"
+require "json"
+require "optparse"
 
+options = { format: :ruby }
 
-
-
+OptionParser.new do |opts|
+  opts.banner = "Usage: clip-embed-text [options] <text>"
+
+  opts.on("-j", "--json", "Output as JSON") do
+    options[:format] = :json
+  end
+
+  opts.on("-m", "--multilingual", "Use multilingual model") do
+    options[:multilingual] = true
+  end
+
+  opts.on("-h", "--help", "Show this help") do
+    puts opts
+    exit
+  end
+end.parse!
+
+if ARGV.empty?
+  puts "Usage: clip-embed-text [options] <text>"
+  puts "Run 'clip-embed-text --help' for options"
   exit 1
 end
 
 begin
-
-
-
+  model = options[:multilingual] ? Clip::MultilingualModel.new : Clip::Model.new
+  embedding = model.encode_text(ARGV[0])
+
+  case options[:format]
+  when :json
+    puts JSON.generate(embedding)
+  else
+    puts embedding.inspect
+  end
+rescue StandardError => e
+  warn "Error: #{e.message}"
   exit 1
 end
data/lib/clip/image_preprocessor.rb
CHANGED

@@ -1,31 +1,48 @@
+# frozen_string_literal: true
+
 require "mini_magick"
 require "numo/narray"
 
 module Clip
   class ImagePreprocessor
     # CLIP's expected image normalization parameters
-    MEAN = Numo::DFloat[
-    STD = Numo::DFloat[
+    MEAN = Numo::DFloat[0.48145466, 0.4578275, 0.40821073]
+    STD = Numo::DFloat[0.26862954, 0.26130258, 0.27577711]
+    SUPPORTED_FORMATS = %w[jpg jpeg png gif bmp webp tiff].freeze
+
+    class InvalidImageError < StandardError; end
 
     def initialize(target_size: 224)
      @target_size = target_size
     end
 
-    # Preprocess the image and return a tensor with shape [
+    # Preprocess the image and return a tensor with shape [3, 224, 224]
     def preprocess(image_path)
+      validate_image!(image_path)
       image = load_and_resize(image_path)
       tensor = image_to_tensor(image)
-
-      add_batch_dimension(normalized)
+      normalize(tensor)
     end
 
     private
 
+    # Validate that the image file exists and has a supported format
+    def validate_image!(image_path)
+      path = image_path.is_a?(File) ? image_path.path : image_path.to_s
+
+      raise InvalidImageError, "Image file not found: #{path}" unless File.exist?(path)
+
+      extension = File.extname(path).delete(".").downcase
+      return if SUPPORTED_FORMATS.include?(extension)
+
+      raise InvalidImageError, "Unsupported image format: #{extension}. Supported: #{SUPPORTED_FORMATS.join(', ')}"
+    end
+
     # Load image, convert to RGB, and resize to target size
     def load_and_resize(image_path)
       image = MiniMagick::Image.open(image_path)
-      image.format "png"
-      image
+      image.format "png"
+      image.combine_options do |c|
         c.resize "#{@target_size}x#{@target_size}!"
         c.quality 100
         c.colorspace "RGB"
@@ -33,30 +50,20 @@ module Clip
       image
     end
 
-    # Convert the image to a normalized
+    # Convert the image to a normalized tensor with shape [3, 224, 224]
     def image_to_tensor(image)
-      pixels = image.get_pixels
-      # Convert to Numo::NArray and reshape
+      pixels = image.get_pixels
       pixel_array = Numo::UInt8.asarray(pixels).cast_to(Numo::DFloat)
-      # Reshape to [height, width, channels]
       pixel_array = pixel_array.reshape(@target_size, @target_size, 3)
-      # Transpose to [channels, height, width]
       pixel_array = pixel_array.transpose(2, 0, 1)
-      # Normalize to [0, 1]
       pixel_array / 255.0
     end
 
     # Apply CLIP normalization: (x - mean) / std
     def normalize(tensor)
-      # Expand mean and std to match tensor shape
       mean = MEAN.reshape(3, 1, 1)
       std = STD.reshape(3, 1, 1)
       (tensor - mean) / std
     end
-
-    # Add batch dimension: [1, 3, 224, 224]
-    def add_batch_dimension(tensor)
-      tensor.reshape(3, @target_size, @target_size)
-    end
   end
 end
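
As the new comments state, `preprocess` returns a rank-3 tensor; the batch dimension is supplied at predict time by the model classes, which wrap the tensor in a one-element array. A short sketch of both the happy path and the new validation error (file paths are illustrative):

```ruby
require "clip"

preprocessor = Clip::ImagePreprocessor.new
tensor = preprocessor.preprocess("test/fixtures/test.jpg")
tensor.shape # => [3, 224, 224]

# Model#encode_image builds the batch by wrapping: { "input" => [tensor.to_a] }
batch = [tensor.to_a]

begin
  preprocessor.preprocess("notes.txt")
rescue Clip::ImagePreprocessor::InvalidImageError => e
  warn e.message # => Unsupported image format: txt. Supported: jpg, jpeg, ...
end
```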
data/lib/clip/model.rb
CHANGED

@@ -1,3 +1,5 @@
+# frozen_string_literal: true
+
 require "onnxruntime"
 
 module Clip
@@ -15,24 +17,29 @@ module Clip
       Clip.download_models(download_dir) if download_models && !Clip.models_exist?(textual_model_path: textual_model_path, visual_model_path: visual_model_path)
       @tokenizer = tokenizer
       @image_preprocessor = image_preprocessor
+      @model_mutex = Mutex.new
     end
 
     def encode_text(text)
       tokens = tokenizer.encode(text)
-      text_model.predict({ input
+      text_model.predict({ "input" => [tokens] })["output"].first
     end
 
     def encode_image(image)
-
-      image_model.predict({ input
+      image_tensor = image_preprocessor.preprocess(image).to_a
+      image_model.predict({ "input" => [image_tensor] })["output"].first
     end
 
     def text_model
-      @
+      @model_mutex.synchronize do
+        @text_model ||= OnnxRuntime::Model.new(textual_model_path)
+      end
     end
 
     def image_model
-      @
+      @model_mutex.synchronize do
+        @image_model ||= OnnxRuntime::Model.new(visual_model_path)
+      end
     end
 
     private
data/lib/clip/multilingual_model.rb
CHANGED

@@ -1,46 +1,77 @@
+# frozen_string_literal: true
+
 require "onnxruntime"
 require "tokenizers"
 
 module Clip
   class MultilingualModel
+    TOKENIZER_FILENAME = "tokenizer.json"
+
     def initialize(
       textual_model_path: ".clip_models/multilingual/textual.onnx",
       visual_model_path: ".clip_models/multilingual/visual.onnx",
-      tokenizer:
+      tokenizer: nil,
       image_preprocessor: Clip::ImagePreprocessor.new,
       download_models: true,
       download_dir: ".clip_models/multilingual"
     )
       @textual_model_path = textual_model_path
       @visual_model_path = visual_model_path
-
-
+      @download_dir = download_dir
+
+      if download_models
+        Clip.download_models(download_dir, Clip::MULTILINGUAL_MODELS) unless Clip.models_exist?(textual_model_path: textual_model_path, visual_model_path: visual_model_path)
+        download_tokenizer unless tokenizer
+      end
+
+      @tokenizer = tokenizer || load_tokenizer
       @image_preprocessor = image_preprocessor
+      @model_mutex = Mutex.new
     end
 
     def encode_text(text)
-      encoding
-      input_ids
+      encoding = tokenizer.encode(text)
+      input_ids = [encoding.ids]
       attention_mask = [Array.new(encoding.ids.size, 1)]
 
-      text_model.predict({ "input_ids" => input_ids, "attention_mask" => attention_mask })[
+      text_model.predict({ "input_ids" => input_ids, "attention_mask" => attention_mask })["output"].first
     end
 
     def encode_image(image)
-
-      image_model.predict({ pixel_values
+      image_tensor = image_preprocessor.preprocess(image).to_a
+      image_model.predict({ "pixel_values" => [image_tensor] })["output"].first
     end
 
     def text_model
-      @
+      @model_mutex.synchronize do
+        @text_model ||= OnnxRuntime::Model.new(textual_model_path)
+      end
     end
 
     def image_model
-      @
+      @model_mutex.synchronize do
+        @image_model ||= OnnxRuntime::Model.new(visual_model_path)
+      end
     end
 
     private
 
     attr_reader :textual_model_path, :visual_model_path, :tokenizer, :image_preprocessor
+
+    def tokenizer_path
+      File.join(@download_dir, TOKENIZER_FILENAME)
+    end
+
+    def download_tokenizer
+      return if File.exist?(tokenizer_path)
+
+      Clip.logger ||= Logger.new($stdout)
+      Clip.logger.info("Downloading tokenizer from #{Clip::MULTILINGUAL_TOKENIZER_URL}")
+      Clip.download_file(Clip::MULTILINGUAL_TOKENIZER_URL, tokenizer_path)
+    end
+
+    def load_tokenizer
+      Tokenizers::Tokenizer.from_file(tokenizer_path)
+    end
   end
 end
data/lib/clip/tokenizer.rb
CHANGED

@@ -1,3 +1,5 @@
+# frozen_string_literal: true
+
 require "zlib"
 require "set"
 
@@ -5,10 +7,10 @@ module Clip
   class Tokenizer
     INPUT_VECTOR_SIZE = 77
 
-    def initialize(bpe_path = __dir__
+    def initialize(bpe_path = File.join(__dir__, "..", "bpe_simple_vocab_16e6.txt.gz"))
       @byte_encoder = bytes_to_unicode
       @byte_decoder = @byte_encoder.invert
-      merges = Zlib::GzipReader.open(bpe_path).read.split("\n")[1..(49152 - 256 - 2)]
+      merges = Zlib::GzipReader.open(bpe_path) { |gz| gz.read }.split("\n")[1..(49152 - 256 - 2)]
       merges = merges.map { |merge| merge.split(" ") }
       vocab = @byte_encoder.values
       vocab += vocab.map { |v| "#{v}</w>" }
@@ -53,8 +55,14 @@ module Clip
       pairs
     end
 
+    # Clean text by decoding HTML entities and normalizing unicode
+    # Matches Python CLIP's basic_clean which uses ftfy.fix_text and html.unescape
     def basic_clean(text)
-
+      require "cgi"
+      # Decode HTML entities (called twice like Python original)
+      text = CGI.unescapeHTML(CGI.unescapeHTML(text))
+      # Normalize unicode to NFC form (similar to ftfy's fix_text for most cases)
+      text.unicode_normalize(:nfc).strip
    end
 
     def whitespace_clean(text)
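
The double `CGI.unescapeHTML` call mirrors the Python original, which unescapes twice to catch doubly encoded entities; NFC normalization then folds decomposed characters into their composed form. A quick illustration of both behaviors using only the Ruby standard library:

```ruby
require "cgi"

# "&amp;amp;" decodes to "&amp;" on the first pass and to "&" on the second.
CGI.unescapeHTML(CGI.unescapeHTML("fish &amp;amp; chips")) # => "fish & chips"

# NFC composes "e" plus a combining acute accent into a single "é".
"e\u0301".unicode_normalize(:nfc) # => "é"
```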
data/lib/clip/version.rb
CHANGED
data/lib/clip-rb.rb
CHANGED
data/lib/clip.rb
CHANGED

@@ -1,3 +1,5 @@
+# frozen_string_literal: true
+
 require_relative "clip/model"
 require_relative "clip/multilingual_model"
 require_relative "clip/tokenizer"
@@ -8,60 +10,104 @@ require "fileutils"
 require "logger"
 
 module Clip
-  attr_accessor :logger
-
 BASE_URL = "https://huggingface.co/khasinski/"
   MODELS = {
     "textual.onnx" => "clip-ViT-B-32-onnx/resolve/main/textual.onnx?download=true",
     "visual.onnx" => "clip-ViT-B-32-onnx/resolve/main/visual.onnx?download=true"
-  }
+  }.freeze
 
   MULTILINGUAL_MODELS = {
     "textual.onnx" => "XLM-Roberta-Large-Vit-B-32-onnx/resolve/main/textual.onnx?download=true",
-    "
-    "
-  }
+    "visual.onnx" => "XLM-Roberta-Large-Vit-B-32-onnx/resolve/main/visual.onnx?download=true",
+    "data.bin" => "XLM-Roberta-Large-Vit-B-32-onnx/resolve/main/data.bin?download=true"
+  }.freeze
+
+  MULTILINGUAL_TOKENIZER_URL = "https://huggingface.co/M-CLIP/XLM-Roberta-Large-Vit-B-32/resolve/main/tokenizer.json"
+
+  DEFAULT_TIMEOUT = 300 # 5 minutes for large model files
+
+  class DownloadError < StandardError; end
 
-
-    logger
-    FileUtils.mkdir_p(download_dir)
+  class << self
+    attr_accessor :logger
 
-    models
-
-
+    def download_models(download_dir, models = MODELS)
+      @logger ||= Logger.new($stdout)
+      FileUtils.mkdir_p(download_dir)
 
-
+      models.each do |filename, path|
+        uri = URI.join(BASE_URL, path)
+        destination = File.join(download_dir, filename)
+
+        next if File.exist?(destination)
+
+        logger.info("Downloading #{filename} model from #{uri}")
+        download_file(uri.to_s, destination)
+      end
     end
-  end
 
-
-
+    def download_file(url, destination, limit: 10, timeout: DEFAULT_TIMEOUT)
+      raise DownloadError, "Too many HTTP redirects" if limit == 0
 
-
-
-
+      uri = URI.parse(url)
+      http = Net::HTTP.new(uri.host, uri.port)
+      http.use_ssl = (uri.scheme == "https")
+      http.open_timeout = timeout
+      http.read_timeout = timeout
 
-
+      request = Net::HTTP::Get.new(uri.request_uri)
 
-
-
-
-
-
-
-
-
-
+      http.request(request) do |response|
+        case response
+        when Net::HTTPRedirection
+          location = response["location"]
+          # Handle relative redirects
+          new_url = if location.start_with?("/")
+                      "#{uri.scheme}://#{uri.host}#{location}"
+                    else
+                      location
+                    end
+          download_file(new_url, destination, limit: limit - 1, timeout: timeout)
+        when Net::HTTPSuccess
+          File.open(destination, "wb") do |file|
+            response.read_body do |chunk|
+              file.write(chunk)
+            end
           end
+        else
+          raise DownloadError, "Failed to download file: #{response.code} #{response.message}"
         end
-      puts "Downloaded #{url} to #{destination}"
-    else
-      raise "Failed to download file: #{response.code} #{response.message}"
       end
     end
-  end
 
-
-
+    def models_exist?(textual_model_path:, visual_model_path:)
+      File.exist?(textual_model_path) && File.exist?(visual_model_path)
+    end
+
+    # Normalize an embedding vector to unit length (L2 normalization)
+    # @param embedding [Array<Float>] The embedding vector
+    # @return [Array<Float>] The normalized embedding vector
+    def normalize(embedding)
+      magnitude = Math.sqrt(embedding.sum { |x| x * x })
+      return embedding if magnitude.zero?
+
+      embedding.map { |x| x / magnitude }
+    end
+
+    # Calculate cosine similarity between two embeddings
+    # @param embedding1 [Array<Float>] First embedding vector
+    # @param embedding2 [Array<Float>] Second embedding vector
+    # @return [Float] Cosine similarity score between -1 and 1
+    def similarity(embedding1, embedding2)
+      raise ArgumentError, "Embeddings must have the same length" if embedding1.length != embedding2.length
+
+      dot_product = embedding1.zip(embedding2).sum { |a, b| a * b }
+      magnitude1 = Math.sqrt(embedding1.sum { |x| x * x })
+      magnitude2 = Math.sqrt(embedding2.sum { |x| x * x })
+
+      return 0.0 if magnitude1.zero? || magnitude2.zero?
+
+      dot_product / (magnitude1 * magnitude2)
+    end
   end
 end
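
Since `download_file` is now a public singleton method with keyword arguments, the timeout and redirect budget can be tuned per call. A sketch using one of the model URLs from the constants above (tuning values are illustrative):

```ruby
require "clip"

begin
  Clip.download_file(
    "https://huggingface.co/khasinski/clip-ViT-B-32-onnx/resolve/main/textual.onnx?download=true",
    ".clip_models/textual.onnx",
    limit: 5,     # follow at most 5 redirects before raising
    timeout: 900  # allow 15 minutes on slow links (default is 300)
  )
rescue Clip::DownloadError => e
  warn "Download failed: #{e.message}"
end
```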
metadata
CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: clip-rb
 version: !ruby/object:Gem::Version
-  version: 1.1.0
+  version: 2.0.0
 platform: ruby
 authors:
 - Krzysztof Hasiński
-autorequire:
+autorequire:
 bindir: exe
 cert_chain: []
-date: 2025-
+date: 2025-12-12 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: onnxruntime
@@ -84,30 +84,30 @@ dependencies:
   name: mini_magick
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - ">="
+    - - "~>"
     - !ruby/object:Gem::Version
-      version: '0'
+      version: '5.0'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - ">="
+    - - "~>"
     - !ruby/object:Gem::Version
-      version: '0'
+      version: '5.0'
 - !ruby/object:Gem::Dependency
   name: tokenizers
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - ">="
+    - - "~>"
     - !ruby/object:Gem::Version
-      version: '0'
+      version: '0.5'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - ">="
+    - - "~>"
     - !ruby/object:Gem::Version
-      version: '0'
+      version: '0.5'
 description: OpenAI CLIP embeddings, uses ONNX models. Allows to create embeddings
   for images and text
 email:
@@ -120,10 +120,12 @@ extra_rdoc_files: []
 files:
 - ".clip_models/.gitkeep"
 - ".rspec"
+- CHANGELOG.md
 - CODE_OF_CONDUCT.md
 - LICENSE.txt
 - README.md
 - Rakefile
+- UPGRADING.md
 - exe/clip-embed-image
 - exe/clip-embed-text
 - lib/bpe_simple_vocab_16e6.txt.gz
@@ -142,8 +144,8 @@ licenses:
 metadata:
   homepage_uri: https://github.com/khasinski/clip-rb
   source_code_uri: https://github.com/khasinski/clip-rb
-  changelog_uri: https://github.com/khasinski/clip-rb/CHANGELOG.md
-post_install_message:
+  changelog_uri: https://github.com/khasinski/clip-rb/blob/main/CHANGELOG.md
+post_install_message:
 rdoc_options: []
 require_paths:
 - lib
@@ -158,8 +160,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
 - !ruby/object:Gem::Version
   version: '0'
 requirements: []
-rubygems_version: 3.
-signing_key:
+rubygems_version: 3.0.3.1
+signing_key:
 specification_version: 4
 summary: OpenAI CLIP embeddings, uses ONNX models
 test_files: []