mini_embed 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +100 -37
- data/ext/mini_embed/mini_embed.c +466 -108
- data/lib/mini_embed.rb +14 -0
- metadata +1 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 2df9f5c081f8a7fa2447261817ebba58e5c062921a1dc6ee3ec8048fdc300022
|
|
4
|
+
data.tar.gz: 4ee4f87161506c59e6deda7dd12b819c33e86ae5f5843aa89a9754b57b27f968
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 4646b9f96a6ef525751d3046f0524d308940c1deb3a85623f775666ab2e1bbcddbec62c16f110f2cc1eae620f0b52c900a64f03cfa01f8e856d3546eb404ee98
|
|
7
|
+
data.tar.gz: f03b8103bddc296f1601d62bda851655529dc47cea1eee9f6bfcfc03fb5c005ad33f2b53a81e84d882b2813c670254bdbdb465b2b64f76c4ffc66248f5b27f73
|
data/README.md
CHANGED
|
@@ -1,74 +1,137 @@
|
|
|
1
1
|
# mini_embed
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
A minimal, dependency‑free C extension for Ruby that loads [GGUF](https://github.com/ggerganov/ggml/blob/master/docs/gguf.md) embedding models and computes text embeddings **locally**.
|
|
4
|
+
|
|
5
|
+
**⚠️ Important:** This gem is intended for **small projects, prototypes, and hobbyist use**. It allows you to experiment with embeddings without relying on external APIs or cloud costs. **Do not use MiniEmbed in production** – it lacks the performance, scalability, and tokenization robustness of dedicated solutions. For real applications, use a proper inference server like [llama.cpp](https://github.com/ggerganov/llama.cpp) with its HTTP API, or managed services such as OpenAI, Cohere, or Hugging Face.
|
|
6
|
+
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
## Why MiniEmbed?
|
|
10
|
+
|
|
11
|
+
- **Zero external dependencies** – no TensorFlow, PyTorch, or ONNX runtime.
|
|
12
|
+
- **Single‑file C extension** – fast loading and mean‑pooled embeddings.
|
|
13
|
+
- **Supports all common GGUF quantizations** – from `F32` to `Q2_K`.
|
|
14
|
+
- **Works entirely offline** – your data never leaves your machine.
|
|
15
|
+
- Perfect for **weekend projects**, **proof‑of‑concepts**, or **learning** about embeddings.
|
|
16
|
+
|
|
17
|
+
---
|
|
4
18
|
|
|
5
19
|
## Installation
|
|
6
20
|
|
|
7
|
-
Add to your Gemfile
|
|
21
|
+
Add this line to your application's `Gemfile`:
|
|
8
22
|
|
|
9
23
|
```ruby
|
|
10
24
|
gem 'mini_embed'
|
|
11
25
|
```
|
|
12
|
-
Or install globally:
|
|
13
26
|
|
|
14
|
-
|
|
27
|
+
Then execute:
|
|
28
|
+
|
|
29
|
+
```bash
|
|
30
|
+
bundle install
|
|
31
|
+
```
|
|
32
|
+
Or install it globally:
|
|
33
|
+
|
|
34
|
+
```bash
|
|
15
35
|
gem install mini_embed
|
|
16
36
|
```
|
|
17
37
|
|
|
18
|
-
|
|
38
|
+
|
|
39
|
+
## Requirements
|
|
40
|
+
|
|
41
|
+
A POSIX system (Linux, macOS, BSD) – Windows via WSL2 works.
|
|
42
|
+
|
|
43
|
+
A C compiler and make (for compiling the native extension).
|
|
44
|
+
|
|
45
|
+
A GGUF embedding model file (see Where to get models).
|
|
46
|
+
|
|
47
|
+
## Usage
|
|
48
|
+
|
|
19
49
|
```ruby
|
|
20
50
|
require 'mini_embed'
|
|
21
51
|
|
|
22
|
-
model
|
|
23
|
-
|
|
24
|
-
embeddings_array = embeddings_bin.unpack('f*') # => array of float
|
|
25
|
-
puts embeddings_array.size # => model dimension
|
|
26
|
-
```
|
|
52
|
+
# Load a GGUF model (F32, F16, Q8_0, Q4_K, etc. are all supported)
|
|
53
|
+
model = MiniEmbed.new(model: '/path/to/gte-small.Q8_0.gguf')
|
|
27
54
|
|
|
28
|
-
|
|
55
|
+
# Get embedding as an array of floats (default)
|
|
56
|
+
embedding = model.embeddings(text: 'hello world')
|
|
57
|
+
puts embedding.size # e.g. 384
|
|
58
|
+
puts embedding[0..4] # e.g. [0.0123, -0.0456, ...]
|
|
29
59
|
|
|
60
|
+
# Or get the raw binary string (little‑endian 32‑bit floats)
|
|
61
|
+
binary = model.embeddings(text: 'hello world', type: :binary)
|
|
62
|
+
embedding_from_binary = binary.unpack('e*')
|
|
30
63
|
```
|
|
31
|
-
F32, F16
|
|
32
64
|
|
|
33
|
-
|
|
65
|
+
Note: The `type` parameter is optional – it defaults to `:vector`, which returns a Ruby `Array<Float>`. Use `type: :binary` to get the raw binary string (compatible with the original C extension).
|
|
34
66
|
|
|
35
|
-
Q5_0, Q5_1
|
|
36
67
|
|
|
37
|
-
|
|
68
|
+
## Simple tokenization note
|
|
69
|
+
MiniEmbed uses a naive space‑based tokenizer. This means it splits input on spaces and looks up each token exactly in the model's vocabulary. For models trained with subword tokenization (like BERT), this will not work for out‑of‑vocabulary words.
|
|
70
|
+
If you need proper subword tokenization, you can:
|
|
38
71
|
|
|
39
|
-
|
|
40
|
-
|
|
72
|
+
- Pre‑tokenize in Ruby using the `tokenizers` gem and pass token IDs. Note that passing token IDs is not yet exposed in the C API (but is easy to add), so this requires extending the extension first.
|
|
73
|
+
- Stick to simple vocabulary words that exist in the model (e.g., "text", "hello", "dog").
|
|
41
74
|
|
|
42
|
-
##
|
|
75
|
+
## Supported Quantization Types
|
|
43
76
|
|
|
44
|
-
|
|
77
|
+
| Type | Description |
|
|
78
|
+
|------|---------------|
|
|
79
|
+
| 0 | F32 (float32) |
|
|
80
|
+
| 1 | F16 (float16) |
|
|
81
|
+
| 2 | Q4_0 |
|
|
82
|
+
| 3 | Q4_1 |
|
|
83
|
+
| 6 | Q5_0 |
|
|
84
|
+
| 7 | Q5_1 |
|
|
85
|
+
| 8 | Q8_0 |
|
|
86
|
+
| 9 | Q8_1 |
|
|
87
|
+
| 10 | Q2_K |
|
|
88
|
+
| 11 | Q3_K |
|
|
89
|
+
| 12 | Q4_K |
|
|
90
|
+
| 13 | Q5_K |
|
|
91
|
+
| 14 | Q6_K |
|
|
92
|
+
| 15 | Q8_K |
|
|
45
93
|
|
|
46
|
-
|
|
47
|
-
bundle install
|
|
48
|
-
bundle exec rake compile
|
|
49
|
-
```
|
|
94
|
+
The extension automatically dequantizes the embedding matrix on load, so inference speed is always that of a plain float32 lookup.
|
|
50
95
|
|
|
96
|
+
## Where to get models
|
|
97
|
+
Hugging Face offers many GGUF models, e.g.:
|
|
51
98
|
|
|
52
|
-
|
|
99
|
+
- `gte-small`
|
|
100
|
+
- `all-MiniLM-L6-v2`
|
|
53
101
|
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
102
|
+
You can convert any safetensors or PyTorch model using the `convert-hf-to-gguf.py` script from llama.cpp.
|
|
103
|
+
|
|
104
|
+
For testing, we recommend the `gte-small` model (384 dimensions, ~30k vocabulary).
|
|
105
|
+
|
|
106
|
+
## Limitations (Why this is not production‑ready)
|
|
107
|
+
|
|
108
|
+
- Single‑threaded, blocking C code – embedding computation runs on the Ruby thread, freezing the interpreter.
|
|
109
|
+
- No batching – only one text at a time.
|
|
110
|
+
- Space‑based tokenization only – works only for words present exactly in the vocabulary.
|
|
111
|
+
- Loads the entire embedding matrix into RAM – for large vocabularies this may consume significant memory.
|
|
112
|
+
- No GPU support – CPU only.
|
|
113
|
+
- Error handling is minimal – invalid models may crash the Ruby process.
|
|
57
114
|
|
|
58
|
-
|
|
115
|
+
If you need a robust, scalable solution, consider:
|
|
116
|
+
|
|
117
|
+
- Running llama.cpp as a server (`./server -m model.gguf --embeddings`) and calling its HTTP endpoint.
|
|
118
|
+
- Using a cloud embeddings API (OpenAI, Cohere, VoyageAI, etc.).
|
|
119
|
+
- Deploying a dedicated inference service with BentoML or Ray Serve.
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
## Development & Contributing
|
|
123
|
+
Bug reports and pull requests are welcome on GitHub.
|
|
124
|
+
To run the tests:
|
|
59
125
|
|
|
60
126
|
```bash
|
|
61
|
-
|
|
127
|
+
bundle exec rspec
|
|
62
128
|
```
|
|
63
|
-
Using in a Rails project
|
|
64
|
-
Add to Gemfile:
|
|
65
129
|
|
|
66
|
-
|
|
67
|
-
gem 'mini_embed', path: '/path/to/mini_embed'
|
|
68
|
-
```
|
|
130
|
+
The gem uses rake-compiler to build the extension. After making changes to the C source, run:
|
|
69
131
|
|
|
70
|
-
|
|
132
|
+
```bash
|
|
133
|
+
bundle exec rake compile
|
|
134
|
+
```
|
|
71
135
|
|
|
72
136
|
## License
|
|
73
|
-
|
|
74
|
-
MIT License. See [LICENSE](LICENSE).
|
|
137
|
+
MIT License. See [LICENSE](LICENSE).
|
data/ext/mini_embed/mini_embed.c
CHANGED
|
@@ -13,6 +13,8 @@
|
|
|
13
13
|
#define HASH_SIZE 131071
|
|
14
14
|
#define MAX_DIMS 4
|
|
15
15
|
#define GGUF_ALIGN 32
|
|
16
|
+
#define MAX_MERGES 10000
|
|
17
|
+
#define MAX_REGEX 256
|
|
16
18
|
|
|
17
19
|
enum ggml_type {
|
|
18
20
|
GGML_TYPE_F32 = 0,
|
|
@@ -31,7 +33,249 @@ enum ggml_type {
|
|
|
31
33
|
GGML_TYPE_Q8_K = 15,
|
|
32
34
|
};
|
|
33
35
|
|
|
36
|
+
enum llama_vocab_type {
|
|
37
|
+
LLAMA_VOCAB_TYPE_NONE = 0,
|
|
38
|
+
LLAMA_VOCAB_TYPE_SPM = 1,
|
|
39
|
+
LLAMA_VOCAB_TYPE_BPE = 2,
|
|
40
|
+
LLAMA_VOCAB_TYPE_WPM = 3,
|
|
41
|
+
};
|
|
42
|
+
|
|
43
|
+
/* ------------------------------------------------------------------------- */
|
|
44
|
+
// Unicode helper functions
|
|
45
|
+
static int unicode_len_utf8(char c) {
|
|
46
|
+
if ((c & 0x80) == 0) return 1;
|
|
47
|
+
if ((c & 0xE0) == 0xC0) return 2;
|
|
48
|
+
if ((c & 0xF0) == 0xE0) return 3;
|
|
49
|
+
if ((c & 0xF8) == 0xF0) return 4;
|
|
50
|
+
return 1;
|
|
51
|
+
}
|
|
52
|
+
|
|
53
|
+
static int unicode_is_letter(uint32_t cp) {
|
|
54
|
+
return (cp >= 0x41 && cp <= 0x5A) || (cp >= 0x61 && cp <= 0x7A) ||
|
|
55
|
+
(cp >= 0xC0 && cp <= 0xD6) || (cp >= 0xD8 && cp <= 0xF6) ||
|
|
56
|
+
(cp >= 0xF8 && cp <= 0x2FF) || (cp >= 0x370 && cp <= 0x37D) ||
|
|
57
|
+
(cp >= 0x37F && cp <= 0x1FFF) || (cp >= 0x200C && cp <= 0x200D) ||
|
|
58
|
+
(cp >= 0x2070 && cp <= 0x218F) || (cp >= 0x2C00 && cp <= 0x2FEF) ||
|
|
59
|
+
(cp >= 0x3001 && cp <= 0xD7FF) || (cp >= 0xF900 && cp <= 0xFDCF) ||
|
|
60
|
+
(cp >= 0xFDF0 && cp <= 0xFFFD);
|
|
61
|
+
}
|
|
62
|
+
|
|
63
|
+
static int unicode_is_number(uint32_t cp) {
|
|
64
|
+
return (cp >= 0x30 && cp <= 0x39) || (cp >= 0x660 && cp <= 0x669) ||
|
|
65
|
+
(cp >= 0x6F0 && cp <= 0x6F9) || (cp >= 0x7C0 && cp <= 0x7C9) ||
|
|
66
|
+
(cp >= 0x966 && cp <= 0x96F);
|
|
67
|
+
}
|
|
68
|
+
|
|
69
|
+
static uint32_t unicode_cpt_from_utf8(const char *s, size_t *len) {
|
|
70
|
+
unsigned char c = (unsigned char)s[0];
|
|
71
|
+
if (c < 0x80) { *len = 1; return c; }
|
|
72
|
+
if ((c & 0xE0) == 0xC0) { *len = 2; return ((c & 0x1F) << 6) | (s[1] & 0x3F); }
|
|
73
|
+
if ((c & 0xF0) == 0xE0) { *len = 3; return ((c & 0x0F) << 12) | ((s[1] & 0x3F) << 6) | (s[2] & 0x3F); }
|
|
74
|
+
if ((c & 0xF8) == 0xF0) { *len = 4; return ((c & 0x07) << 18) | ((s[1] & 0x3F) << 12) | ((s[2] & 0x3F) << 6) | (s[3] & 0x3F); }
|
|
75
|
+
*len = 1;
|
|
76
|
+
return c;
|
|
77
|
+
}
|
|
78
|
+
|
|
79
|
+
/* ------------------------------------------------------------------------- */
|
|
80
|
+
// Simple regex pattern matcher (simplified)
|
|
81
|
+
typedef struct {
|
|
82
|
+
char *pattern;
|
|
83
|
+
int pattern_len;
|
|
84
|
+
} RegexPattern;
|
|
85
|
+
|
|
86
|
+
static int match_regex(const char *text, const RegexPattern *patterns, int num_patterns) {
|
|
87
|
+
for (int i = 0; i < num_patterns; i++) {
|
|
88
|
+
const char *p = patterns[i].pattern;
|
|
89
|
+
if (strstr(p, "\\p{L}")) {
|
|
90
|
+
size_t len;
|
|
91
|
+
uint32_t cp = unicode_cpt_from_utf8(text, &len);
|
|
92
|
+
if (unicode_is_letter(cp)) return 1;
|
|
93
|
+
} else if (strstr(p, "\\p{N}")) {
|
|
94
|
+
size_t len;
|
|
95
|
+
uint32_t cp = unicode_cpt_from_utf8(text, &len);
|
|
96
|
+
if (unicode_is_number(cp)) return 1;
|
|
97
|
+
} else if (p[0] == '\\' && p[1] == 's') {
|
|
98
|
+
if (isspace(text[0])) return 1;
|
|
99
|
+
} else if (p[0] == '\\' && p[1] == 'r') {
|
|
100
|
+
if (text[0] == '\r') return 1;
|
|
101
|
+
} else if (p[0] == '\\' && p[1] == 'n') {
|
|
102
|
+
if (text[0] == '\n') return 1;
|
|
103
|
+
} else if (p[0] == '.' && p[1] == '*') {
|
|
104
|
+
return 1;
|
|
105
|
+
} else if (isalnum(p[0]) || ispunct(p[0])) {
|
|
106
|
+
if (text[0] == p[0]) return 1;
|
|
107
|
+
}
|
|
108
|
+
}
|
|
109
|
+
return 0;
|
|
110
|
+
}
|
|
111
|
+
|
|
112
|
+
static char** unicode_regex_split(const char *text, const RegexPattern *patterns, int num_patterns, int *num_words) {
|
|
113
|
+
char **words = NULL;
|
|
114
|
+
int word_count = 0, word_capacity = 0;
|
|
115
|
+
size_t text_len = strlen(text), pos = 0;
|
|
116
|
+
|
|
117
|
+
while (pos < text_len) {
|
|
118
|
+
size_t start = pos;
|
|
119
|
+
while (start < text_len && !match_regex(text + start, patterns, num_patterns)) start++;
|
|
120
|
+
if (start >= text_len) break;
|
|
121
|
+
size_t end = start;
|
|
122
|
+
while (end < text_len && match_regex(text + end, patterns, num_patterns)) end++;
|
|
123
|
+
if (end > start) {
|
|
124
|
+
size_t word_len = end - start;
|
|
125
|
+
char *word = malloc(word_len + 1);
|
|
126
|
+
if (!word) { while (--word_count >= 0) free(words[word_count]); free(words); *num_words = 0; return NULL; }
|
|
127
|
+
memcpy(word, text + start, word_len);
|
|
128
|
+
word[word_len] = '\0';
|
|
129
|
+
if (word_count >= word_capacity) {
|
|
130
|
+
word_capacity = word_capacity ? word_capacity * 2 : 16;
|
|
131
|
+
words = realloc(words, word_capacity * sizeof(char*));
|
|
132
|
+
if (!words) { free(word); while (--word_count >= 0) free(words[word_count]); *num_words = 0; return NULL; }
|
|
133
|
+
}
|
|
134
|
+
words[word_count++] = word;
|
|
135
|
+
}
|
|
136
|
+
pos = end;
|
|
137
|
+
}
|
|
138
|
+
*num_words = word_count;
|
|
139
|
+
return words;
|
|
140
|
+
}
|
|
141
|
+
|
|
142
|
+
/* ------------------------------------------------------------------------- */
|
|
143
|
+
// BPE merge structures
|
|
144
|
+
typedef struct {
|
|
145
|
+
char *left;
|
|
146
|
+
char *right;
|
|
147
|
+
char *merged;
|
|
148
|
+
int rank;
|
|
149
|
+
} BPEMerge;
|
|
150
|
+
|
|
151
|
+
typedef struct {
|
|
152
|
+
BPEMerge *merges;
|
|
153
|
+
int num_merges;
|
|
154
|
+
int capacity;
|
|
155
|
+
} BPEMergeTable;
|
|
156
|
+
|
|
157
|
+
static void bpe_merge_table_init(BPEMergeTable *table) {
|
|
158
|
+
memset(table, 0, sizeof(*table));
|
|
159
|
+
}
|
|
160
|
+
|
|
161
|
+
static void bpe_merge_table_add(BPEMergeTable *table, const char *left, const char *right, const char *merged, int rank) {
|
|
162
|
+
if (table->num_merges >= table->capacity) {
|
|
163
|
+
table->capacity = table->capacity ? table->capacity * 2 : 100;
|
|
164
|
+
table->merges = realloc(table->merges, table->capacity * sizeof(BPEMerge));
|
|
165
|
+
}
|
|
166
|
+
BPEMerge *m = &table->merges[table->num_merges++];
|
|
167
|
+
m->left = strdup(left);
|
|
168
|
+
m->right = strdup(right);
|
|
169
|
+
m->merged = strdup(merged);
|
|
170
|
+
m->rank = rank;
|
|
171
|
+
}
|
|
172
|
+
|
|
173
|
+
static void bpe_merge_table_free(BPEMergeTable *table) {
|
|
174
|
+
for (int i = 0; i < table->num_merges; i++) {
|
|
175
|
+
free(table->merges[i].left);
|
|
176
|
+
free(table->merges[i].right);
|
|
177
|
+
free(table->merges[i].merged);
|
|
178
|
+
}
|
|
179
|
+
free(table->merges);
|
|
180
|
+
table->merges = NULL;
|
|
181
|
+
table->num_merges = 0;
|
|
182
|
+
}
|
|
183
|
+
|
|
184
|
+
static int bpe_merge_rank(const BPEMergeTable *table, const char *left, const char *right) {
|
|
185
|
+
for (int i = 0; i < table->num_merges; i++) {
|
|
186
|
+
if (strcmp(table->merges[i].left, left) == 0 && strcmp(table->merges[i].right, right) == 0)
|
|
187
|
+
return table->merges[i].rank;
|
|
188
|
+
}
|
|
189
|
+
return -1;
|
|
190
|
+
}
|
|
191
|
+
|
|
192
|
+
/* ------------------------------------------------------------------------- */
|
|
193
|
+
// BPE tokenization
|
|
194
|
+
typedef struct {
|
|
195
|
+
char *text;
|
|
196
|
+
int start, end;
|
|
197
|
+
int prev, next;
|
|
198
|
+
int used;
|
|
199
|
+
} BPESymbol;
|
|
200
|
+
|
|
201
|
+
static void bpe_tokenize_word(const BPEMergeTable *merges, const char *word, int (*text_to_id)(void*, const char*), void *vocab_data, int *token_ids, int *num_tokens) {
|
|
202
|
+
int word_len = strlen(word);
|
|
203
|
+
int num_symbols = 0;
|
|
204
|
+
BPESymbol *symbols = malloc(word_len * sizeof(BPESymbol));
|
|
205
|
+
int offset = 0;
|
|
206
|
+
while (offset < word_len) {
|
|
207
|
+
int char_len = unicode_len_utf8(word[offset]);
|
|
208
|
+
symbols[num_symbols].text = (char*)word + offset;
|
|
209
|
+
symbols[num_symbols].start = offset;
|
|
210
|
+
symbols[num_symbols].end = offset + char_len;
|
|
211
|
+
symbols[num_symbols].prev = num_symbols - 1;
|
|
212
|
+
symbols[num_symbols].next = num_symbols + 1;
|
|
213
|
+
symbols[num_symbols].used = 1;
|
|
214
|
+
offset += char_len;
|
|
215
|
+
num_symbols++;
|
|
216
|
+
}
|
|
217
|
+
|
|
218
|
+
if (num_symbols <= 1) {
|
|
219
|
+
int id = text_to_id(vocab_data, word);
|
|
220
|
+
if (id != -1) token_ids[(*num_tokens)++] = id;
|
|
221
|
+
free(symbols);
|
|
222
|
+
return;
|
|
223
|
+
}
|
|
224
|
+
|
|
225
|
+
typedef struct { int left, right, rank; } Bigram;
|
|
226
|
+
Bigram *bigrams = malloc(num_symbols * num_symbols * sizeof(Bigram));
|
|
227
|
+
int num_bigrams = 0;
|
|
228
|
+
for (int i = 0; i < num_symbols - 1; i++) {
|
|
229
|
+
if (symbols[i].used && symbols[i+1].used) {
|
|
230
|
+
char *left_str = malloc(symbols[i].end - symbols[i].start + 1);
|
|
231
|
+
char *right_str = malloc(symbols[i+1].end - symbols[i+1].start + 1);
|
|
232
|
+
memcpy(left_str, symbols[i].text, symbols[i].end - symbols[i].start);
|
|
233
|
+
memcpy(right_str, symbols[i+1].text, symbols[i+1].end - symbols[i+1].start);
|
|
234
|
+
left_str[symbols[i].end - symbols[i].start] = '\0';
|
|
235
|
+
right_str[symbols[i+1].end - symbols[i+1].start] = '\0';
|
|
236
|
+
int rank = bpe_merge_rank(merges, left_str, right_str);
|
|
237
|
+
if (rank != -1) {
|
|
238
|
+
bigrams[num_bigrams].left = i;
|
|
239
|
+
bigrams[num_bigrams].right = i+1;
|
|
240
|
+
bigrams[num_bigrams].rank = rank;
|
|
241
|
+
num_bigrams++;
|
|
242
|
+
}
|
|
243
|
+
free(left_str); free(right_str);
|
|
244
|
+
}
|
|
245
|
+
}
|
|
246
|
+
for (int i = 0; i < num_bigrams - 1; i++)
|
|
247
|
+
for (int j = i+1; j < num_bigrams; j++)
|
|
248
|
+
if (bigrams[i].rank > bigrams[j].rank) {
|
|
249
|
+
Bigram tmp = bigrams[i];
|
|
250
|
+
bigrams[i] = bigrams[j];
|
|
251
|
+
bigrams[j] = tmp;
|
|
252
|
+
}
|
|
253
|
+
|
|
254
|
+
int *merged = calloc(num_symbols, sizeof(int));
|
|
255
|
+
for (int i = 0; i < num_bigrams; i++) {
|
|
256
|
+
int left = bigrams[i].left, right = bigrams[i].right;
|
|
257
|
+
if (merged[left] || merged[right]) continue;
|
|
258
|
+
symbols[left].end = symbols[right].end;
|
|
259
|
+
symbols[left].next = symbols[right].next;
|
|
260
|
+
merged[right] = 1;
|
|
261
|
+
if (symbols[right].next < num_symbols) symbols[symbols[right].next].prev = left;
|
|
262
|
+
}
|
|
263
|
+
|
|
264
|
+
for (int i = 0; i < num_symbols; i++) {
|
|
265
|
+
if (!merged[i] && symbols[i].used) {
|
|
266
|
+
char *substr = malloc(symbols[i].end - symbols[i].start + 1);
|
|
267
|
+
memcpy(substr, word + symbols[i].start, symbols[i].end - symbols[i].start);
|
|
268
|
+
substr[symbols[i].end - symbols[i].start] = '\0';
|
|
269
|
+
int id = text_to_id(vocab_data, substr);
|
|
270
|
+
if (id != -1) token_ids[(*num_tokens)++] = id;
|
|
271
|
+
free(substr);
|
|
272
|
+
}
|
|
273
|
+
}
|
|
274
|
+
free(bigrams); free(merged); free(symbols);
|
|
275
|
+
}
|
|
276
|
+
|
|
34
277
|
/* ------------------------------------------------------------------------- */
|
|
278
|
+
// GGUF parsing
|
|
35
279
|
static int safe_advance(uint8_t **p, uint8_t *end, size_t sz) {
|
|
36
280
|
if (*p + sz > end) return 0;
|
|
37
281
|
*p += sz;
|
|
@@ -39,14 +283,14 @@ static int safe_advance(uint8_t **p, uint8_t *end, size_t sz) {
|
|
|
39
283
|
}
|
|
40
284
|
|
|
41
285
|
static uint32_t rd32(uint8_t **p, uint8_t *end) {
|
|
42
|
-
uint32_t v
|
|
286
|
+
uint32_t v;
|
|
43
287
|
if (!safe_advance(p, end, 4)) return 0;
|
|
44
288
|
memcpy(&v, *p - 4, 4);
|
|
45
289
|
return v;
|
|
46
290
|
}
|
|
47
291
|
|
|
48
292
|
static uint64_t rd64(uint8_t **p, uint8_t *end) {
|
|
49
|
-
uint64_t v
|
|
293
|
+
uint64_t v;
|
|
50
294
|
if (!safe_advance(p, end, 8)) return 0;
|
|
51
295
|
memcpy(&v, *p - 8, 8);
|
|
52
296
|
return v;
|
|
@@ -57,9 +301,9 @@ static char *rdstr(uint8_t **p, uint8_t *end) {
|
|
|
57
301
|
uint64_t len;
|
|
58
302
|
memcpy(&len, *p, 8);
|
|
59
303
|
*p += 8;
|
|
60
|
-
if (len == 0 || len > (1
|
|
304
|
+
if (len == 0 || len > (1<<20)) return NULL;
|
|
61
305
|
if (*p + len > end) return NULL;
|
|
62
|
-
char *s = malloc(len
|
|
306
|
+
char *s = malloc(len+1);
|
|
63
307
|
if (!s) return NULL;
|
|
64
308
|
memcpy(s, *p, len);
|
|
65
309
|
s[len] = '\0';
|
|
@@ -70,27 +314,35 @@ static char *rdstr(uint8_t **p, uint8_t *end) {
|
|
|
70
314
|
static void align_to_32(uint8_t **p, uint8_t *end, uint8_t *base) {
|
|
71
315
|
size_t off = *p - base;
|
|
72
316
|
size_t aligned = (off + GGUF_ALIGN - 1) & ~(GGUF_ALIGN - 1);
|
|
73
|
-
if (base + aligned <= end)
|
|
74
|
-
*p = base + aligned;
|
|
317
|
+
if (base + aligned <= end) *p = base + aligned;
|
|
75
318
|
}
|
|
76
319
|
|
|
77
320
|
/* ------------------------------------------------------------------------- */
|
|
321
|
+
// Hash table for vocabulary
|
|
78
322
|
typedef struct HashNode {
|
|
79
323
|
char *key;
|
|
80
|
-
int
|
|
324
|
+
int id;
|
|
81
325
|
struct HashNode *next;
|
|
82
326
|
} HashNode;
|
|
83
327
|
|
|
84
328
|
typedef struct {
|
|
85
|
-
int
|
|
86
|
-
int
|
|
87
|
-
char
|
|
88
|
-
float
|
|
89
|
-
void
|
|
90
|
-
int
|
|
91
|
-
void
|
|
92
|
-
size_t
|
|
329
|
+
int vocab_size;
|
|
330
|
+
int dim;
|
|
331
|
+
char **tokens;
|
|
332
|
+
float *float_data;
|
|
333
|
+
void *tensor_data;
|
|
334
|
+
int tensor_type;
|
|
335
|
+
void *mapped;
|
|
336
|
+
size_t mapped_size;
|
|
93
337
|
HashNode **table;
|
|
338
|
+
BPEMergeTable merges;
|
|
339
|
+
RegexPattern *pre_patterns;
|
|
340
|
+
int num_pre_patterns;
|
|
341
|
+
int unknown_token_id;
|
|
342
|
+
int bos_token_id;
|
|
343
|
+
int eos_token_id;
|
|
344
|
+
int vocab_type;
|
|
345
|
+
char space_marker[8];
|
|
94
346
|
} EmbedModel;
|
|
95
347
|
|
|
96
348
|
typedef struct {
|
|
@@ -122,7 +374,12 @@ static int hget(EmbedModel *m, const char *k) {
|
|
|
122
374
|
return -1;
|
|
123
375
|
}
|
|
124
376
|
|
|
377
|
+
static int text_to_id(void *vocab_data, const char *text) {
|
|
378
|
+
return hget((EmbedModel*)vocab_data, text);
|
|
379
|
+
}
|
|
380
|
+
|
|
125
381
|
/* ------------------------------------------------------------------------- */
|
|
382
|
+
// File mapping
|
|
126
383
|
static void *map_file(const char *path, size_t *size) {
|
|
127
384
|
int fd = open(path, O_RDONLY);
|
|
128
385
|
if (fd < 0) return NULL;
|
|
@@ -131,29 +388,22 @@ static void *map_file(const char *path, size_t *size) {
|
|
|
131
388
|
*size = st.st_size;
|
|
132
389
|
void *data = mmap(NULL, *size, PROT_READ, MAP_PRIVATE, fd, 0);
|
|
133
390
|
close(fd);
|
|
134
|
-
|
|
135
|
-
return data;
|
|
391
|
+
return data == MAP_FAILED ? NULL : data;
|
|
136
392
|
}
|
|
137
393
|
|
|
138
394
|
/* ------------------------------------------------------------------------- */
|
|
395
|
+
// FP16 conversion
|
|
139
396
|
static float fp16_to_fp32(uint16_t h) {
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
if (exp ==
|
|
145
|
-
|
|
146
|
-
} else if (exp == 31) {
|
|
147
|
-
return 0.0f;
|
|
148
|
-
} else {
|
|
149
|
-
val = (1.0f + mant / 1024.0f) * (1 << (exp - 15));
|
|
150
|
-
}
|
|
151
|
-
return sign ? -val : val;
|
|
397
|
+
uint16_t sign = (h >> 15) & 1;
|
|
398
|
+
uint16_t exp = (h >> 10) & 0x1F;
|
|
399
|
+
uint16_t mant = h & 0x3FF;
|
|
400
|
+
if (exp == 0) return (mant / 1024.0f) * 6.103515625e-5f * (sign ? -1.0f : 1.0f);
|
|
401
|
+
if (exp == 31) return 0.0f;
|
|
402
|
+
return (1.0f + mant / 1024.0f) * (1 << (exp - 15)) * (sign ? -1.0f : 1.0f);
|
|
152
403
|
}
|
|
153
404
|
|
|
154
405
|
/* ------------------------------------------------------------------------- */
|
|
155
|
-
|
|
156
|
-
|
|
406
|
+
// Block dequantization functions
|
|
157
407
|
static void dequantize_row_q4_0(const void *vx, float *y, int k) {
|
|
158
408
|
const int nb = k / 32;
|
|
159
409
|
const uint8_t *x = vx;
|
|
@@ -241,7 +491,6 @@ static void dequantize_row_q8_1(const void *vx, float *y, int k) {
|
|
|
241
491
|
}
|
|
242
492
|
}
|
|
243
493
|
|
|
244
|
-
/* K-quants */
|
|
245
494
|
static void dequantize_row_q2_K(const void *vx, float *y, int k) {
|
|
246
495
|
const int nb = k / 256;
|
|
247
496
|
const uint8_t *x = vx;
|
|
@@ -368,7 +617,6 @@ static void dequantize_row_q8_K(const void *vx, float *y, int k) {
|
|
|
368
617
|
}
|
|
369
618
|
}
|
|
370
619
|
|
|
371
|
-
/* ------------------------------------------------------------------------- */
|
|
372
620
|
static float* dequantize_tensor(const void *data, int type, int n_rows, int n_cols) {
|
|
373
621
|
if (type == GGML_TYPE_F32) {
|
|
374
622
|
float *out = malloc(n_rows * n_cols * sizeof(float));
|
|
@@ -413,6 +661,14 @@ static float* dequantize_tensor(const void *data, int type, int n_rows, int n_co
|
|
|
413
661
|
for (int r = 0; r < n_rows; r++) {
|
|
414
662
|
dequant_func(in + r * row_bytes, out + r * n_cols, n_cols);
|
|
415
663
|
}
|
|
664
|
+
|
|
665
|
+
// Sanitize the tensor: replace NaNs, Infs, and astronomically large values with zero
|
|
666
|
+
int total = n_rows * n_cols;
|
|
667
|
+
for (int i = 0; i < total; i++) {
|
|
668
|
+
if (isnan(out[i]) || isinf(out[i]) || fabs(out[i]) > 1e10f) {
|
|
669
|
+
out[i] = 0.0f;
|
|
670
|
+
}
|
|
671
|
+
}
|
|
416
672
|
return out;
|
|
417
673
|
}
|
|
418
674
|
|
|
@@ -457,44 +713,89 @@ static void free_model_contents(EmbedModel *m) {
|
|
|
457
713
|
}
|
|
458
714
|
if (m->float_data) free(m->float_data);
|
|
459
715
|
if (m->mapped) munmap(m->mapped, m->mapped_size);
|
|
716
|
+
bpe_merge_table_free(&m->merges);
|
|
717
|
+
if (m->pre_patterns) {
|
|
718
|
+
for (int i = 0; i < m->num_pre_patterns; i++) free(m->pre_patterns[i].pattern);
|
|
719
|
+
free(m->pre_patterns);
|
|
720
|
+
}
|
|
460
721
|
free(m);
|
|
461
722
|
}
|
|
462
723
|
|
|
463
724
|
/* ------------------------------------------------------------------------- */
|
|
464
725
|
static int is_printable_string(const char *s, size_t len) {
|
|
465
|
-
for (size_t i = 0; i < len; i++)
|
|
466
|
-
if (!isprint((unsigned char)s[i])) return 0;
|
|
726
|
+
for (size_t i = 0; i < len; i++) if (!isprint((unsigned char)s[i])) return 0;
|
|
467
727
|
return 1;
|
|
468
728
|
}
|
|
469
729
|
|
|
470
|
-
/* Fallback: find the start of tensor info by scanning for a valid string */
|
|
471
730
|
static uint8_t *find_tensor_info_start(uint8_t *cur, uint8_t *end) {
|
|
472
731
|
uint8_t *scan = cur;
|
|
473
732
|
while (scan + 8 < end) {
|
|
474
733
|
uint64_t len;
|
|
475
734
|
memcpy(&len, scan, 8);
|
|
476
|
-
if (len > 0 && len < 256 && scan + 8 + len <= end)
|
|
477
|
-
|
|
478
|
-
return scan;
|
|
479
|
-
}
|
|
480
|
-
}
|
|
735
|
+
if (len > 0 && len < 256 && scan + 8 + len <= end && is_printable_string((char*)scan+8, len))
|
|
736
|
+
return scan;
|
|
481
737
|
scan++;
|
|
482
738
|
}
|
|
483
739
|
return NULL;
|
|
484
740
|
}
|
|
485
741
|
|
|
742
|
+
/* ------------------------------------------------------------------------- */
|
|
743
|
+
static void detect_space_marker(EmbedModel *m) {
|
|
744
|
+
const char *candidates[] = {"▁", "Ġ", " "};
|
|
745
|
+
for (int i = 0; i < 3; i++) {
|
|
746
|
+
const char *marker = candidates[i];
|
|
747
|
+
int marker_len = strlen(marker);
|
|
748
|
+
for (int j = 0; j < m->vocab_size; j++) {
|
|
749
|
+
if (strncmp(m->tokens[j], marker, marker_len) == 0) {
|
|
750
|
+
strcpy(m->space_marker, marker);
|
|
751
|
+
return;
|
|
752
|
+
}
|
|
753
|
+
}
|
|
754
|
+
}
|
|
755
|
+
m->space_marker[0] = '\0';
|
|
756
|
+
}
|
|
757
|
+
|
|
758
|
+
static void setup_default_pre_patterns(EmbedModel *m) {
|
|
759
|
+
const char *default_patterns[] = {
|
|
760
|
+
"(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])",
|
|
761
|
+
"[^\\r\\n\\p{L}\\p{N}]?\\p{L}+",
|
|
762
|
+
"\\p{N}{1,3}",
|
|
763
|
+
" ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*",
|
|
764
|
+
"\\s*[\\r\\n]+",
|
|
765
|
+
"\\s+(?!\\S)",
|
|
766
|
+
"\\s+"
|
|
767
|
+
};
|
|
768
|
+
m->num_pre_patterns = sizeof(default_patterns)/sizeof(default_patterns[0]);
|
|
769
|
+
m->pre_patterns = malloc(m->num_pre_patterns * sizeof(RegexPattern));
|
|
770
|
+
for (int i = 0; i < m->num_pre_patterns; i++) {
|
|
771
|
+
m->pre_patterns[i].pattern = strdup(default_patterns[i]);
|
|
772
|
+
m->pre_patterns[i].pattern_len = strlen(default_patterns[i]);
|
|
773
|
+
}
|
|
774
|
+
}
|
|
775
|
+
|
|
776
|
+
static void parse_merge(const char *merge_str, char **left, char **right) {
|
|
777
|
+
const char *space = strchr(merge_str, ' ');
|
|
778
|
+
if (space) {
|
|
779
|
+
int left_len = space - merge_str;
|
|
780
|
+
*left = malloc(left_len+1);
|
|
781
|
+
memcpy(*left, merge_str, left_len);
|
|
782
|
+
(*left)[left_len] = '\0';
|
|
783
|
+
*right = strdup(space+1);
|
|
784
|
+
} else {
|
|
785
|
+
*left = strdup(merge_str);
|
|
786
|
+
*right = strdup("");
|
|
787
|
+
}
|
|
788
|
+
}
|
|
789
|
+
|
|
486
790
|
/* ------------------------------------------------------------------------- */
|
|
487
791
|
static EmbedModel *embed_load_gguf(const char *path) {
|
|
488
792
|
size_t sz;
|
|
489
793
|
uint8_t *base = map_file(path, &sz);
|
|
490
794
|
if (!base) return NULL;
|
|
491
|
-
uint8_t *cur = base;
|
|
492
|
-
uint8_t *end = base + sz;
|
|
493
|
-
|
|
795
|
+
uint8_t *cur = base, *end = base + sz;
|
|
494
796
|
if (memcmp(cur, "GGUF", 4) != 0) { munmap(base, sz); return NULL; }
|
|
495
797
|
cur += 4;
|
|
496
798
|
uint32_t version = rd32(&cur, end);
|
|
497
|
-
(void)version;
|
|
498
799
|
uint64_t n_tensors = rd64(&cur, end);
|
|
499
800
|
uint64_t n_kv = rd64(&cur, end);
|
|
500
801
|
|
|
@@ -504,16 +805,20 @@ static EmbedModel *embed_load_gguf(const char *path) {
|
|
|
504
805
|
m->mapped_size = sz;
|
|
505
806
|
m->table = calloc(HASH_SIZE, sizeof(HashNode*));
|
|
506
807
|
if (!m->table) { free_model_contents(m); return NULL; }
|
|
808
|
+
bpe_merge_table_init(&m->merges);
|
|
809
|
+
setup_default_pre_patterns(m);
|
|
810
|
+
m->unknown_token_id = -1;
|
|
811
|
+
m->bos_token_id = -1;
|
|
812
|
+
m->eos_token_id = -1;
|
|
813
|
+
m->vocab_type = LLAMA_VOCAB_TYPE_NONE;
|
|
814
|
+
m->space_marker[0] = '\0';
|
|
507
815
|
|
|
508
|
-
/* ---------- Metadata ---------- */
|
|
509
816
|
int vocab_found = 0;
|
|
510
817
|
for (uint64_t i = 0; i < n_kv; i++) {
|
|
511
818
|
char *key = rdstr(&cur, end);
|
|
512
819
|
if (!key) { free_model_contents(m); return NULL; }
|
|
513
820
|
uint32_t type = rd32(&cur, end);
|
|
514
|
-
|
|
515
|
-
if ((strcmp(key, "tokenizer.ggml.tokens") == 0 ||
|
|
516
|
-
strcmp(key, "tokenizer.ggml.token_list") == 0) && type == 9) {
|
|
821
|
+
if ((strcmp(key, "tokenizer.ggml.tokens") == 0 || strcmp(key, "tokenizer.ggml.token_list") == 0) && type == 9) {
|
|
517
822
|
uint32_t subtype = rd32(&cur, end);
|
|
518
823
|
uint64_t n = rd64(&cur, end);
|
|
519
824
|
if (subtype != 8) { free(key); free_model_contents(m); return NULL; }
|
|
@@ -527,21 +832,50 @@ static EmbedModel *embed_load_gguf(const char *path) {
|
|
|
527
832
|
hset(m, tok, (int)j);
|
|
528
833
|
}
|
|
529
834
|
vocab_found = 1;
|
|
530
|
-
} else {
|
|
531
|
-
|
|
532
|
-
|
|
835
|
+
} else if (strcmp(key, "tokenizer.ggml.merges") == 0 && type == 9) {
|
|
836
|
+
uint32_t subtype = rd32(&cur, end);
|
|
837
|
+
uint64_t n = rd64(&cur, end);
|
|
838
|
+
if (subtype == 8) {
|
|
839
|
+
for (uint64_t j = 0; j < n && j < MAX_MERGES; j++) {
|
|
840
|
+
char *merge_str = rdstr(&cur, end);
|
|
841
|
+
if (merge_str) {
|
|
842
|
+
char *left, *right;
|
|
843
|
+
parse_merge(merge_str, &left, &right);
|
|
844
|
+
bpe_merge_table_add(&m->merges, left, right, merge_str, (int)j);
|
|
845
|
+
free(left); free(right);
|
|
846
|
+
free(merge_str);
|
|
847
|
+
}
|
|
848
|
+
}
|
|
849
|
+
} else {
|
|
850
|
+
if (!skip_value(&cur, end, type)) { free(key); free_model_contents(m); return NULL; }
|
|
533
851
|
}
|
|
852
|
+
} else if (strcmp(key, "tokenizer.ggml.model") == 0 && type == 8) {
|
|
853
|
+
char *model_type = rdstr(&cur, end);
|
|
854
|
+
if (model_type) {
|
|
855
|
+
if (strcmp(model_type, "gpt2") == 0 || strcmp(model_type, "llama") == 0) m->vocab_type = LLAMA_VOCAB_TYPE_BPE;
|
|
856
|
+
else if (strcmp(model_type, "bert") == 0) m->vocab_type = LLAMA_VOCAB_TYPE_WPM;
|
|
857
|
+
free(model_type);
|
|
858
|
+
}
|
|
859
|
+
} else if (strcmp(key, "tokenizer.ggml.pre") == 0 && type == 8) {
|
|
860
|
+
char *pre = rdstr(&cur, end);
|
|
861
|
+
if (pre) free(pre);
|
|
862
|
+
} else if (strcmp(key, "tokenizer.ggml.unknown_token_id") == 0 && type == 6) {
|
|
863
|
+
m->unknown_token_id = rd32(&cur, end);
|
|
864
|
+
} else if (strcmp(key, "tokenizer.ggml.bos_token_id") == 0 && type == 6) {
|
|
865
|
+
m->bos_token_id = rd32(&cur, end);
|
|
866
|
+
} else if (strcmp(key, "tokenizer.ggml.eos_token_id") == 0 && type == 6) {
|
|
867
|
+
m->eos_token_id = rd32(&cur, end);
|
|
868
|
+
} else {
|
|
869
|
+
if (!skip_value(&cur, end, type)) { free(key); free_model_contents(m); return NULL; }
|
|
534
870
|
}
|
|
535
871
|
free(key);
|
|
536
872
|
}
|
|
537
|
-
|
|
538
873
|
if (!vocab_found) { free_model_contents(m); return NULL; }
|
|
874
|
+
detect_space_marker(m);
|
|
539
875
|
|
|
540
876
|
uint8_t *after_kv = cur;
|
|
541
877
|
align_to_32(&cur, end, base);
|
|
542
878
|
uint8_t *tensor_start = cur;
|
|
543
|
-
|
|
544
|
-
/* ---------- Tensor info ---------- */
|
|
545
879
|
int embd_found = 0;
|
|
546
880
|
void *raw_tensor_data = NULL;
|
|
547
881
|
int tensor_type = -1;
|
|
@@ -555,39 +889,27 @@ static EmbedModel *embed_load_gguf(const char *path) {
|
|
|
555
889
|
if (!name) break;
|
|
556
890
|
uint32_t n_dims = rd32(&cur, end);
|
|
557
891
|
uint64_t dims[MAX_DIMS] = {0};
|
|
558
|
-
for (uint32_t d = 0; d < n_dims && d < MAX_DIMS; d++)
|
|
559
|
-
dims[d] = rd64(&cur, end);
|
|
892
|
+
for (uint32_t d = 0; d < n_dims && d < MAX_DIMS; d++) dims[d] = rd64(&cur, end);
|
|
560
893
|
uint32_t type = rd32(&cur, end);
|
|
561
894
|
uint64_t offset = rd64(&cur, end);
|
|
562
|
-
|
|
563
895
|
int is_token_embd = (strcmp(name, "token_embd.weight") == 0 ||
|
|
564
896
|
strcmp(name, "embeddings.word_embeddings.weight") == 0 ||
|
|
565
897
|
strcmp(name, "model.embed_tokens.weight") == 0);
|
|
566
|
-
|
|
567
898
|
if (!is_token_embd && n_dims == 2 && m->vocab_size > 0) {
|
|
568
|
-
if ((uint64_t)m->vocab_size == dims[0] && strstr(name, "embd")
|
|
569
|
-
|
|
570
|
-
else if ((uint64_t)m->vocab_size == dims[1] && strstr(name, "embd") != NULL)
|
|
571
|
-
is_token_embd = 1;
|
|
899
|
+
if ((uint64_t)m->vocab_size == dims[0] && strstr(name, "embd")) is_token_embd = 1;
|
|
900
|
+
else if ((uint64_t)m->vocab_size == dims[1] && strstr(name, "embd")) is_token_embd = 1;
|
|
572
901
|
}
|
|
573
|
-
|
|
574
902
|
if (!embd_found && is_token_embd) {
|
|
575
903
|
if (n_dims < 2 || dims[1] == 0) { free(name); free_model_contents(m); return NULL; }
|
|
576
|
-
dim0 = dims[0];
|
|
577
|
-
dim1 =
|
|
578
|
-
if (
|
|
579
|
-
|
|
580
|
-
need_transpose = 0;
|
|
581
|
-
} else if (dim1 == (uint64_t)m->vocab_size) {
|
|
582
|
-
m->dim = (int)dim0;
|
|
583
|
-
need_transpose = 1;
|
|
584
|
-
} else {
|
|
585
|
-
m->dim = (dim0 < dim1) ? (int)dim0 : (int)dim1;
|
|
586
|
-
need_transpose = (dim0 > dim1) ? 1 : 0;
|
|
587
|
-
}
|
|
904
|
+
dim0 = dims[0]; dim1 = dims[1];
|
|
905
|
+
if (dim0 == (uint64_t)m->vocab_size) { m->dim = (int)dim1; need_transpose = 0; }
|
|
906
|
+
else if (dim1 == (uint64_t)m->vocab_size) { m->dim = (int)dim0; need_transpose = 1; }
|
|
907
|
+
else { m->dim = (dim0 < dim1) ? (int)dim0 : (int)dim1; need_transpose = (dim0 > dim1) ? 1 : 0; }
|
|
588
908
|
raw_tensor_data = base + offset;
|
|
589
909
|
tensor_type = type;
|
|
590
910
|
embd_found = 1;
|
|
911
|
+
free(name);
|
|
912
|
+
break;
|
|
591
913
|
}
|
|
592
914
|
free(name);
|
|
593
915
|
}
|
|
@@ -597,13 +919,8 @@ static EmbedModel *embed_load_gguf(const char *path) {
|
|
|
597
919
|
if (!tensor_start) break;
|
|
598
920
|
}
|
|
599
921
|
}
|
|
922
|
+
if (!embd_found || m->dim == 0) { free_model_contents(m); return NULL; }
|
|
600
923
|
|
|
601
|
-
if (!embd_found || m->dim == 0) {
|
|
602
|
-
free_model_contents(m);
|
|
603
|
-
return NULL;
|
|
604
|
-
}
|
|
605
|
-
|
|
606
|
-
/* Dequantize */
|
|
607
924
|
if (tensor_type == GGML_TYPE_F32 && !need_transpose) {
|
|
608
925
|
m->float_data = NULL;
|
|
609
926
|
m->tensor_data = raw_tensor_data;
|
|
@@ -611,10 +928,7 @@ static EmbedModel *embed_load_gguf(const char *path) {
|
|
|
611
928
|
int n_rows = need_transpose ? (int)dim1 : (int)dim0;
|
|
612
929
|
int n_cols = need_transpose ? (int)dim0 : (int)dim1;
|
|
613
930
|
m->float_data = dequantize_tensor(raw_tensor_data, tensor_type, n_rows, n_cols);
|
|
614
|
-
if (!m->float_data) {
|
|
615
|
-
free_model_contents(m);
|
|
616
|
-
return NULL;
|
|
617
|
-
}
|
|
931
|
+
if (!m->float_data) { free_model_contents(m); return NULL; }
|
|
618
932
|
m->tensor_data = m->float_data;
|
|
619
933
|
}
|
|
620
934
|
m->tensor_type = tensor_type;
|
|
@@ -625,36 +939,84 @@ static EmbedModel *embed_load_gguf(const char *path) {
|
|
|
625
939
|
/* ------------------------------------------------------------------------- */
|
|
626
940
|
static void embed_text(EmbedModel *m, const char *txt, float *out) {
|
|
627
941
|
memset(out, 0, sizeof(float) * m->dim);
|
|
628
|
-
|
|
629
|
-
|
|
942
|
+
int num_words = 0;
|
|
943
|
+
char **words = unicode_regex_split(txt, m->pre_patterns, m->num_pre_patterns, &num_words);
|
|
944
|
+
if (!words || num_words == 0) {
|
|
945
|
+
// Fallback to simple space split
|
|
946
|
+
char *copy = strdup(txt);
|
|
947
|
+
if (copy) {
|
|
948
|
+
char *tok = strtok(copy, " \t\n\r");
|
|
949
|
+
int used = 0;
|
|
950
|
+
const float *embd = (float*)m->tensor_data;
|
|
951
|
+
while (tok) {
|
|
952
|
+
int id = hget(m, tok);
|
|
953
|
+
if (id >= 0 && id < m->vocab_size) {
|
|
954
|
+
const float *vec = embd + id * m->dim;
|
|
955
|
+
for (int i = 0; i < m->dim; i++) out[i] += vec[i];
|
|
956
|
+
used++;
|
|
957
|
+
}
|
|
958
|
+
tok = strtok(NULL, " \t\n\r");
|
|
959
|
+
}
|
|
960
|
+
if (used) { float inv = 1.0f / used; for (int i = 0; i < m->dim; i++) out[i] *= inv; }
|
|
961
|
+
free(copy);
|
|
962
|
+
}
|
|
963
|
+
if (words) free(words);
|
|
964
|
+
return;
|
|
965
|
+
}
|
|
630
966
|
|
|
631
|
-
|
|
967
|
+
int *token_ids = malloc(m->vocab_size * sizeof(int));
|
|
632
968
|
int used = 0;
|
|
633
|
-
const float *
|
|
634
|
-
|
|
635
|
-
|
|
636
|
-
int id = hget(m,
|
|
637
|
-
if (id
|
|
638
|
-
|
|
639
|
-
|
|
969
|
+
const float *embd = (float*)m->tensor_data;
|
|
970
|
+
for (int i = 0; i < num_words; i++) {
|
|
971
|
+
char *word = words[i];
|
|
972
|
+
int id = hget(m, word);
|
|
973
|
+
if (id == -1 && m->space_marker[0]) {
|
|
974
|
+
char *with_marker = malloc(strlen(m->space_marker) + strlen(word) + 1);
|
|
975
|
+
strcpy(with_marker, m->space_marker);
|
|
976
|
+
strcat(with_marker, word);
|
|
977
|
+
id = hget(m, with_marker);
|
|
978
|
+
free(with_marker);
|
|
979
|
+
}
|
|
980
|
+
if (id != -1) {
|
|
981
|
+
const float *vec = embd + id * m->dim;
|
|
982
|
+
for (int j = 0; j < m->dim; j++) out[j] += vec[j];
|
|
640
983
|
used++;
|
|
984
|
+
} else {
|
|
985
|
+
int num_tokens = 0;
|
|
986
|
+
bpe_tokenize_word(&m->merges, word, text_to_id, m, token_ids, &num_tokens);
|
|
987
|
+
for (int k = 0; k < num_tokens; k++) {
|
|
988
|
+
int tid = token_ids[k];
|
|
989
|
+
if (tid >= 0 && tid < m->vocab_size) {
|
|
990
|
+
const float *vec = embd + tid * m->dim;
|
|
991
|
+
for (int j = 0; j < m->dim; j++) out[j] += vec[j];
|
|
992
|
+
used++;
|
|
993
|
+
} else if (m->unknown_token_id != -1 && m->unknown_token_id < m->vocab_size) {
|
|
994
|
+
const float *vec = embd + m->unknown_token_id * m->dim;
|
|
995
|
+
for (int j = 0; j < m->dim; j++) out[j] += vec[j];
|
|
996
|
+
used++;
|
|
997
|
+
}
|
|
998
|
+
}
|
|
641
999
|
}
|
|
642
|
-
|
|
1000
|
+
free(word);
|
|
643
1001
|
}
|
|
644
|
-
|
|
1002
|
+
free(words);
|
|
1003
|
+
free(token_ids);
|
|
645
1004
|
if (used > 0) {
|
|
646
1005
|
float inv = 1.0f / used;
|
|
647
1006
|
for (int i = 0; i < m->dim; i++) out[i] *= inv;
|
|
648
1007
|
}
|
|
649
|
-
|
|
1008
|
+
for (int i = 0; i < m->dim; i++) {
|
|
1009
|
+
if (isnan(out[i]) || isinf(out[i])) {
|
|
1010
|
+
out[i] = 0.0f;
|
|
1011
|
+
}
|
|
1012
|
+
}
|
|
650
1013
|
}
|
|
651
1014
|
|
|
652
1015
|
/* ------------------------------------------------------------------------- */
|
|
1016
|
+
// Ruby bindings
|
|
653
1017
|
static void rb_embedder_free(void *p) {
|
|
654
1018
|
ruby_embedder *e = p;
|
|
655
|
-
if (
|
|
656
|
-
if (e->model) free_model_contents(e->model);
|
|
657
|
-
free(e);
|
|
1019
|
+
if (e) { if (e->model) free_model_contents(e->model); free(e); }
|
|
658
1020
|
}
|
|
659
1021
|
|
|
660
1022
|
static size_t rb_embedder_memsize(const void *p) {
|
|
@@ -675,22 +1037,18 @@ static VALUE rb_embedder_alloc(VALUE klass) {
|
|
|
675
1037
|
/*
 * MiniEmbed#initialize(opts)
 *
 * opts - Hash whose :model entry is the path of the GGUF file to load.
 *
 * Raises TypeError when opts is not a Hash or :model is not a String,
 * ArgumentError when the path contains a NUL byte (StringValueCStr), and
 * RuntimeError when the model cannot be loaded.
 * Returns self.
 */
static VALUE rb_embedder_initialize(VALUE self, VALUE opts) {
    ruby_embedder *e;
    TypedData_Get_Struct(self, ruby_embedder, &ruby_embedder_type, e);

    /* rb_hash_aref must only be called on a real Hash. */
    Check_Type(opts, T_HASH);
    VALUE path = rb_hash_aref(opts, ID2SYM(rb_intern("model")));
    const char *cpath = StringValueCStr(path);

    EmbedModel *loaded = embed_load_gguf(cpath);
    if (!loaded) rb_raise(rb_eRuntimeError, "failed to load GGUF model");

    /* Calling initialize again must not leak the model loaded earlier.
     * NOTE(review): relies on the allocator setting e->model to NULL, which
     * rb_embedder_free already depends on - confirm in rb_embedder_alloc. */
    if (e->model) free_model_contents(e->model);
    e->model = loaded;
    return self;
}
|
|
686
1046
|
|
|
687
1047
|
static VALUE rb_embed(VALUE self, VALUE opts) {
|
|
688
1048
|
ruby_embedder *e;
|
|
689
1049
|
TypedData_Get_Struct(self, ruby_embedder, &ruby_embedder_type, e);
|
|
690
|
-
|
|
691
1050
|
VALUE text = rb_hash_aref(opts, ID2SYM(rb_intern("text")));
|
|
692
1051
|
const char *ctext = StringValueCStr(text);
|
|
693
|
-
|
|
694
1052
|
VALUE out = rb_str_new(NULL, e->model->dim * sizeof(float));
|
|
695
1053
|
embed_text(e->model, ctext, (float*)RSTRING_PTR(out));
|
|
696
1054
|
return out;
|
|
@@ -700,5 +1058,5 @@ void Init_mini_embed(void) {
|
|
|
700
1058
|
VALUE c = rb_define_class("MiniEmbed", rb_cObject);
|
|
701
1059
|
rb_define_alloc_func(c, rb_embedder_alloc);
|
|
702
1060
|
rb_define_method(c, "initialize", rb_embedder_initialize, 1);
|
|
703
|
-
rb_define_method(c, "
|
|
1061
|
+
rb_define_method(c, "embed", rb_embed, 1);
|
|
704
1062
|
}
|
data/lib/mini_embed.rb
CHANGED
|
@@ -1,3 +1,17 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
require 'mini_embed/mini_embed'
|
|
4
|
+
|
|
5
|
+
class MiniEmbed
  # Computes the embedding of +text+ with the loaded model.
  #
  # @param text [String] text to extract embeddings from (required)
  # @param type [Symbol] :binary or :vector - shape of the returned data
  # @return [String, Array<Float>] the packed float string produced by the
  #   C extension for :binary, an array of floats for :vector
  # @raise [ArgumentError] if +type+ is neither :binary nor :vector
  def embeddings(text:, type: :vector)
    binary_data = embed(text: text) # call original C method

    case type
    when :binary then binary_data
    # The extension writes floats in the host's native byte order, so decode
    # with the native directive rather than the little-endian one.
    when :vector then binary_data.unpack('f*')
    else raise ArgumentError, "Unsupported data type: #{type}"
    end
  end
end
|