clip-rb 0.1.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +17 -9
- data/lib/clip/image_preprocessor.rb +57 -0
- data/lib/clip/version.rb +1 -1
- data/sig/clip/.gitkeep +0 -0
- data/sig/clip.rbs +3 -0
- metadata +31 -2
- data/sig/clip/rb.rbs +0 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: f6ffc082bf5eaf141727d2aeddb71c9239db2c136b910098292bd1e6d277a424
+  data.tar.gz: c5045c10400ab66ff3513a51c5bc9d7681ad6db6ea1a7fbfcca75cd1028eeb8e
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 70caed58aa9ad77a2ef0c8ca658efef2a32020d497929cee06226ea2dfd4142585888625f595ad14243ad1fcf11dce8023e9e0ded954d38aa212de26794c3335
+  data.tar.gz: 1a6534a2626aa6ca674d519b37450898df25984d029c8801c1e03c292fb08847a00468f8c6c2b71ef6c8f86fb977d3bfbbb82eb2f4bd4bce6339060a16e3ec61
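For reference, a `.gem` archive is a tar file whose members include `metadata.gz` and `data.tar.gz`; the entries above are the SHA-256 and SHA-512 digests of those two members. A minimal verification sketch (not part of the package; it assumes the two members have already been extracted from a downloaded `clip-rb-0.3.0.gem` into the current directory):

```ruby
require "digest"

# Compare locally computed digests against the published checksums.yaml values.
%w[metadata.gz data.tar.gz].each do |member|
  puts "#{member} SHA256: #{Digest::SHA256.file(member).hexdigest}"
  puts "#{member} SHA512: #{Digest::SHA512.file(member).hexdigest}"
end
```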
data/README.md
CHANGED
@@ -1,18 +1,24 @@
 # clip-rb
 
-[](https://badge.fury.io/rb/clip-rb)
+[](https://badge.fury.io/rb/clip-rb)
 [](https://github.com/khasinski/clip-rb/actions/workflows/main.yml)
 
-
+**clip-rb** is a Ruby implementation of [OpenAI CLIP](https://openai.com/index/clip/) powered by ONNX models—no Python required!
+
+CLIP (Contrastive Language–Image Pre-training) is a powerful neural network developed by OpenAI. It connects text and images by learning shared representations, enabling tasks such as image-to-text matching, zero-shot classification, and visual search. With clip-rb, you can easily encode text and images into high-dimensional embeddings for similarity comparison or use in downstream applications like caption generation and vector search.
+
+---
 
 ## Requirements
 
 - Ruby 3.0.0 or later
-- ONNX models
+- ONNX CLIP models (downloaded automatically on first use)
+
+---
 
 ## Installation
 
-
+Add the gem to your application by executing:
 
 ```bash
 bundle add clip-rb
@@ -31,11 +37,15 @@ require 'clip'
 
 clip = Clip::Model.new
 
-clip.encode_text("a photo of a cat")
+text_embedding = clip.encode_text("a photo of a cat")
+# => [0.15546110272407532, 0.07329428941011429, ...]
 
-clip.encode_image("test/fixtures/test.jpg")
+image_embedding = clip.encode_image("test/fixtures/test.jpg")
+# => [0.22115306556224823, 0.19343754649162292, ...]
 ```
 
+💡 Tip: Use cosine similarity for KNN vector search when comparing embeddings!
+
 ## CLI
 
 Additionally you can fetch embeddings by calling:
@@ -45,8 +55,6 @@ $ clip-embed-text "a photo of a cat"
 $ clip-embed-image test/fixtures/test.jpg
 ```
 
-Use KNN vector search to find similar images, remember to use cosine distance!
-
 ## Development
 
 After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
@@ -55,7 +63,7 @@ To install this gem onto your local machine, run `bundle exec rake install`. To
 
 ## Contributing
 
-Bug reports and pull requests are welcome on GitHub at https://github.com/
+Bug reports and pull requests are welcome on GitHub at https://github.com/khasinski/clip-rb. This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [code of conduct](https://github.com/[USERNAME]/clip-rb/blob/main/CODE_OF_CONDUCT.md).
 
 ## License
 
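The README's new tip suggests cosine similarity for comparing embeddings; here is a minimal sketch (not from the gem) of that comparison, assuming the embeddings are plain Ruby float arrays as returned by `encode_text` and `encode_image` in the usage example above:

```ruby
# Cosine similarity: dot(a, b) / (|a| * |b|); 1.0 means identical direction.
def cosine_similarity(a, b)
  dot = a.zip(b).sum { |x, y| x * y }
  dot / (Math.sqrt(a.sum { |x| x * x }) * Math.sqrt(b.sum { |x| x * x }))
end

cosine_similarity(text_embedding, image_embedding)
# => a score in [-1.0, 1.0]; higher means the text and image match more closely
```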
data/lib/clip/image_preprocessor.rb
CHANGED
@@ -1,5 +1,62 @@
+require "mini_magick"
+require "numo/narray"
+
 module Clip
   class ImagePreprocessor
+    # CLIP's expected image normalization parameters
+    MEAN = Numo::DFloat[*[ 0.48145466, 0.4578275, 0.40821073 ]]
+    STD = Numo::DFloat[*[ 0.26862954, 0.26130258, 0.27577711 ]]
+
+    def initialize(target_size: 224)
+      @target_size = target_size
+    end
+
+    # Preprocess the image and return a tensor with shape [batch_size, 3, 224, 224]
+    def preprocess(image_path)
+      image = load_and_resize(image_path)
+      tensor = image_to_tensor(image)
+      normalized = normalize(tensor)
+      add_batch_dimension(normalized)
+    end
+
+    private
+
+    # Load image, convert to RGB, and resize to target size
+    def load_and_resize(image_path)
+      image = MiniMagick::Image.open(image_path)
+      image.format "png" # Ensure consistent format
+      image = image.combine_options do |c|
+        c.resize "#{@target_size}x#{@target_size}!"
+        c.quality 100
+        c.colorspace "RGB"
+      end
+      image
+    end
+
+    # Convert the image to a Numo array with shape [3, 224, 224]
+    def image_to_tensor(image)
+      pixels = image.get_pixels # Returns [[R, G, B], ...] for each row
+      # Convert to Numo::NArray and reshape
+      pixel_array = Numo::UInt8.asarray(pixels).cast_to(Numo::DFloat)
+      # Reshape to [height, width, channels]
+      pixel_array = pixel_array.reshape(@target_size, @target_size, 3)
+      # Transpose to [channels, height, width]
+      pixel_array = pixel_array.transpose(2, 0, 1)
+      # Normalize to [0, 1]
+      pixel_array / 255.0
+    end
+
+    # Apply CLIP normalization: (x - mean) / std
+    def normalize(tensor)
+      # Expand mean and std to match tensor shape
+      mean = MEAN.reshape(3, 1, 1)
+      std = STD.reshape(3, 1, 1)
+      (tensor - mean) / std
+    end
 
+    # Add batch dimension: [1, 3, 224, 224]
+    def add_batch_dimension(tensor)
+      tensor.reshape(1, 3, @target_size, @target_size)
+    end
   end
 end
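Based only on the API shown in the diff above, the new preprocessor would be driven like this (a sketch; the fixture path is the one the README uses):

```ruby
require "clip"

preprocessor = Clip::ImagePreprocessor.new # target_size defaults to 224
tensor = preprocessor.preprocess("test/fixtures/test.jpg")
# tensor is a Numo::DFloat of shape [1, 3, 224, 224]: pixels scaled to [0, 1],
# then normalized channel-wise as (x - MEAN) / STD, with a batch dimension added.
```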
data/lib/clip/version.rb
CHANGED
data/sig/clip/.gitkeep
ADDED
File without changes
data/sig/clip.rbs
ADDED
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: clip-rb
 version: !ruby/object:Gem::Version
-  version: 0.1.0
+  version: 0.3.0
 platform: ruby
 authors:
 - Krzysztof Hasiński
@@ -66,6 +66,34 @@ dependencies:
   - - "~>"
     - !ruby/object:Gem::Version
       version: '1.6'
+- !ruby/object:Gem::Dependency
+  name: numo-narray
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 0.9.2
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 0.9.2
+- !ruby/object:Gem::Dependency
+  name: mini_magick
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '5.0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '5.0'
 description: OpenAI CLIP embeddings, uses ONNX models. Allows to create embeddings
   for images and text
 email:
@@ -90,7 +118,8 @@ files:
 - lib/clip/model.rb
 - lib/clip/tokenizer.rb
 - lib/clip/version.rb
-- sig/clip/rb.rbs
+- sig/clip.rbs
+- sig/clip/.gitkeep
 homepage: https://github.com/khasinski/clip-rb
 licenses:
 - MIT