mini_embed 0.1.1 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +9 -5
- data/ext/mini_embed/mini_embed.c +185 -354
- data/lib/mini_embed.rb +14 -0
- metadata +1 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 2df9f5c081f8a7fa2447261817ebba58e5c062921a1dc6ee3ec8048fdc300022
|
|
4
|
+
data.tar.gz: 4ee4f87161506c59e6deda7dd12b819c33e86ae5f5843aa89a9754b57b27f968
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 4646b9f96a6ef525751d3046f0524d308940c1deb3a85623f775666ab2e1bbcddbec62c16f110f2cc1eae620f0b52c900a64f03cfa01f8e856d3546eb404ee98
|
|
7
|
+
data.tar.gz: f03b8103bddc296f1601d62bda851655529dc47cea1eee9f6bfcfc03fb5c005ad33f2b53a81e84d882b2813c670254bdbdb465b2b64f76c4ffc66248f5b27f73
|
data/README.md
CHANGED
|
@@ -52,15 +52,19 @@ require 'mini_embed'
|
|
|
52
52
|
# Load a GGUF model (F32, F16, Q8_0, Q4_K, etc. are all supported)
|
|
53
53
|
model = MiniEmbed.new(model: '/path/to/gte-small.Q8_0.gguf')
|
|
54
54
|
|
|
55
|
-
# Get
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
# Get an embedding as an array of floats
|
|
59
|
-
embedding = binary.unpack('e*')
|
|
55
|
+
# Get embedding as an array of floats (default)
|
|
56
|
+
embedding = model.embeddings(text: 'hello world')
|
|
60
57
|
puts embedding.size # e.g. 384
|
|
61
58
|
puts embedding[0..4] # e.g. [0.0123, -0.0456, ...]
|
|
59
|
+
|
|
60
|
+
# Or get the raw binary string (little‑endian 32‑bit floats)
|
|
61
|
+
binary = model.embeddings(text: 'hello world', type: :binary)
|
|
62
|
+
embedding_from_binary = binary.unpack('e*')
|
|
62
63
|
```
|
|
63
64
|
|
|
65
|
+
Note: The type parameter is optional – it defaults to :vector which returns a Ruby `Array<Float>`. Use `type: :binary` to get the raw binary string (compatible with the original C extension).
|
|
66
|
+
|
|
67
|
+
|
|
64
68
|
## Simple tokenization note
|
|
65
69
|
MiniEmbed uses a naive space‑based tokenizer. This means it splits input on spaces and looks up each token exactly in the model's vocabulary. For models trained with subword tokenization (like BERT), this will not work for out‑of‑vocabulary words.
|
|
66
70
|
If you need proper subword tokenization, you can:
|
data/ext/mini_embed/mini_embed.c
CHANGED
|
@@ -41,17 +41,16 @@ enum llama_vocab_type {
|
|
|
41
41
|
};
|
|
42
42
|
|
|
43
43
|
/* ------------------------------------------------------------------------- */
|
|
44
|
-
// Unicode helper functions
|
|
44
|
+
// Unicode helper functions
|
|
45
45
|
static int unicode_len_utf8(char c) {
|
|
46
46
|
if ((c & 0x80) == 0) return 1;
|
|
47
47
|
if ((c & 0xE0) == 0xC0) return 2;
|
|
48
48
|
if ((c & 0xF0) == 0xE0) return 3;
|
|
49
49
|
if ((c & 0xF8) == 0xF0) return 4;
|
|
50
|
-
return 1;
|
|
50
|
+
return 1;
|
|
51
51
|
}
|
|
52
52
|
|
|
53
53
|
static int unicode_is_letter(uint32_t cp) {
|
|
54
|
-
// Basic Unicode letter detection (simplified)
|
|
55
54
|
return (cp >= 0x41 && cp <= 0x5A) || (cp >= 0x61 && cp <= 0x7A) ||
|
|
56
55
|
(cp >= 0xC0 && cp <= 0xD6) || (cp >= 0xD8 && cp <= 0xF6) ||
|
|
57
56
|
(cp >= 0xF8 && cp <= 0x2FF) || (cp >= 0x370 && cp <= 0x37D) ||
|
|
@@ -68,74 +67,42 @@ static int unicode_is_number(uint32_t cp) {
|
|
|
68
67
|
}
|
|
69
68
|
|
|
70
69
|
static uint32_t unicode_cpt_from_utf8(const char *s, size_t *len) {
|
|
71
|
-
uint32_t cp = 0;
|
|
72
70
|
unsigned char c = (unsigned char)s[0];
|
|
73
|
-
|
|
74
|
-
if (c
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
} else if ((c & 0xE0) == 0xC0) {
|
|
78
|
-
*len = 2;
|
|
79
|
-
cp = (c & 0x1F) << 6;
|
|
80
|
-
cp |= (s[1] & 0x3F);
|
|
81
|
-
return cp;
|
|
82
|
-
} else if ((c & 0xF0) == 0xE0) {
|
|
83
|
-
*len = 3;
|
|
84
|
-
cp = (c & 0x0F) << 12;
|
|
85
|
-
cp |= (s[1] & 0x3F) << 6;
|
|
86
|
-
cp |= (s[2] & 0x3F);
|
|
87
|
-
return cp;
|
|
88
|
-
} else if ((c & 0xF8) == 0xF0) {
|
|
89
|
-
*len = 4;
|
|
90
|
-
cp = (c & 0x07) << 18;
|
|
91
|
-
cp |= (s[1] & 0x3F) << 12;
|
|
92
|
-
cp |= (s[2] & 0x3F) << 6;
|
|
93
|
-
cp |= (s[3] & 0x3F);
|
|
94
|
-
return cp;
|
|
95
|
-
}
|
|
96
|
-
|
|
71
|
+
if (c < 0x80) { *len = 1; return c; }
|
|
72
|
+
if ((c & 0xE0) == 0xC0) { *len = 2; return ((c & 0x1F) << 6) | (s[1] & 0x3F); }
|
|
73
|
+
if ((c & 0xF0) == 0xE0) { *len = 3; return ((c & 0x0F) << 12) | ((s[1] & 0x3F) << 6) | (s[2] & 0x3F); }
|
|
74
|
+
if ((c & 0xF8) == 0xF0) { *len = 4; return ((c & 0x07) << 18) | ((s[1] & 0x3F) << 12) | ((s[2] & 0x3F) << 6) | (s[3] & 0x3F); }
|
|
97
75
|
*len = 1;
|
|
98
76
|
return c;
|
|
99
77
|
}
|
|
100
78
|
|
|
101
79
|
/* ------------------------------------------------------------------------- */
|
|
102
|
-
// Simple regex pattern matcher
|
|
80
|
+
// Simple regex pattern matcher (simplified)
|
|
103
81
|
typedef struct {
|
|
104
82
|
char *pattern;
|
|
105
83
|
int pattern_len;
|
|
106
84
|
} RegexPattern;
|
|
107
85
|
|
|
108
86
|
static int match_regex(const char *text, const RegexPattern *patterns, int num_patterns) {
|
|
109
|
-
// Simplified implementation for common BPE patterns
|
|
110
|
-
// Full regex engine would be complex; this handles the most common cases
|
|
111
|
-
|
|
112
87
|
for (int i = 0; i < num_patterns; i++) {
|
|
113
88
|
const char *p = patterns[i].pattern;
|
|
114
|
-
int plen = patterns[i].pattern_len;
|
|
115
|
-
|
|
116
|
-
// Check for common patterns
|
|
117
89
|
if (strstr(p, "\\p{L}")) {
|
|
118
|
-
// Match Unicode letter
|
|
119
90
|
size_t len;
|
|
120
91
|
uint32_t cp = unicode_cpt_from_utf8(text, &len);
|
|
121
92
|
if (unicode_is_letter(cp)) return 1;
|
|
122
93
|
} else if (strstr(p, "\\p{N}")) {
|
|
123
|
-
// Match Unicode number
|
|
124
94
|
size_t len;
|
|
125
95
|
uint32_t cp = unicode_cpt_from_utf8(text, &len);
|
|
126
96
|
if (unicode_is_number(cp)) return 1;
|
|
127
97
|
} else if (p[0] == '\\' && p[1] == 's') {
|
|
128
|
-
// Match whitespace
|
|
129
98
|
if (isspace(text[0])) return 1;
|
|
130
99
|
} else if (p[0] == '\\' && p[1] == 'r') {
|
|
131
100
|
if (text[0] == '\r') return 1;
|
|
132
101
|
} else if (p[0] == '\\' && p[1] == 'n') {
|
|
133
102
|
if (text[0] == '\n') return 1;
|
|
134
103
|
} else if (p[0] == '.' && p[1] == '*') {
|
|
135
|
-
// Match anything
|
|
136
104
|
return 1;
|
|
137
105
|
} else if (isalnum(p[0]) || ispunct(p[0])) {
|
|
138
|
-
// Match literal character
|
|
139
106
|
if (text[0] == p[0]) return 1;
|
|
140
107
|
}
|
|
141
108
|
}
|
|
@@ -144,65 +111,36 @@ static int match_regex(const char *text, const RegexPattern *patterns, int num_p
|
|
|
144
111
|
|
|
145
112
|
static char** unicode_regex_split(const char *text, const RegexPattern *patterns, int num_patterns, int *num_words) {
|
|
146
113
|
char **words = NULL;
|
|
147
|
-
int word_count = 0;
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
size_t text_len = strlen(text);
|
|
151
|
-
size_t pos = 0;
|
|
152
|
-
|
|
114
|
+
int word_count = 0, word_capacity = 0;
|
|
115
|
+
size_t text_len = strlen(text), pos = 0;
|
|
116
|
+
|
|
153
117
|
while (pos < text_len) {
|
|
154
|
-
// Find the start of a word (character that matches any regex)
|
|
155
118
|
size_t start = pos;
|
|
156
|
-
while (start < text_len)
|
|
157
|
-
if (match_regex(text + start, patterns, num_patterns)) {
|
|
158
|
-
break;
|
|
159
|
-
}
|
|
160
|
-
start++;
|
|
161
|
-
}
|
|
162
|
-
|
|
119
|
+
while (start < text_len && !match_regex(text + start, patterns, num_patterns)) start++;
|
|
163
120
|
if (start >= text_len) break;
|
|
164
|
-
|
|
165
|
-
// Find the end of the word (character that doesn't match any regex)
|
|
166
121
|
size_t end = start;
|
|
167
|
-
while (end < text_len)
|
|
168
|
-
if (!match_regex(text + end, patterns, num_patterns)) {
|
|
169
|
-
break;
|
|
170
|
-
}
|
|
171
|
-
end++;
|
|
172
|
-
}
|
|
173
|
-
|
|
122
|
+
while (end < text_len && match_regex(text + end, patterns, num_patterns)) end++;
|
|
174
123
|
if (end > start) {
|
|
175
|
-
// Extract the word
|
|
176
124
|
size_t word_len = end - start;
|
|
177
125
|
char *word = malloc(word_len + 1);
|
|
178
|
-
if (word) {
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
words = realloc(words, word_capacity * sizeof(char*));
|
|
186
|
-
if (!words) {
|
|
187
|
-
for (int i = 0; i < word_count; i++) free(words[i]);
|
|
188
|
-
free(words);
|
|
189
|
-
*num_words = 0;
|
|
190
|
-
return NULL;
|
|
191
|
-
}
|
|
192
|
-
}
|
|
193
|
-
words[word_count++] = word;
|
|
126
|
+
if (!word) { while (--word_count >= 0) free(words[word_count]); free(words); *num_words = 0; return NULL; }
|
|
127
|
+
memcpy(word, text + start, word_len);
|
|
128
|
+
word[word_len] = '\0';
|
|
129
|
+
if (word_count >= word_capacity) {
|
|
130
|
+
word_capacity = word_capacity ? word_capacity * 2 : 16;
|
|
131
|
+
words = realloc(words, word_capacity * sizeof(char*));
|
|
132
|
+
if (!words) { free(word); while (--word_count >= 0) free(words[word_count]); *num_words = 0; return NULL; }
|
|
194
133
|
}
|
|
134
|
+
words[word_count++] = word;
|
|
195
135
|
}
|
|
196
|
-
|
|
197
136
|
pos = end;
|
|
198
137
|
}
|
|
199
|
-
|
|
200
138
|
*num_words = word_count;
|
|
201
139
|
return words;
|
|
202
140
|
}
|
|
203
141
|
|
|
204
142
|
/* ------------------------------------------------------------------------- */
|
|
205
|
-
// BPE merge
|
|
143
|
+
// BPE merge structures
|
|
206
144
|
typedef struct {
|
|
207
145
|
char *left;
|
|
208
146
|
char *right;
|
|
@@ -217,22 +155,19 @@ typedef struct {
|
|
|
217
155
|
} BPEMergeTable;
|
|
218
156
|
|
|
219
157
|
static void bpe_merge_table_init(BPEMergeTable *table) {
|
|
220
|
-
table
|
|
221
|
-
table->num_merges = 0;
|
|
222
|
-
table->capacity = 0;
|
|
158
|
+
memset(table, 0, sizeof(*table));
|
|
223
159
|
}
|
|
224
160
|
|
|
225
161
|
static void bpe_merge_table_add(BPEMergeTable *table, const char *left, const char *right, const char *merged, int rank) {
|
|
226
162
|
if (table->num_merges >= table->capacity) {
|
|
227
|
-
table->capacity = table->capacity
|
|
163
|
+
table->capacity = table->capacity ? table->capacity * 2 : 100;
|
|
228
164
|
table->merges = realloc(table->merges, table->capacity * sizeof(BPEMerge));
|
|
229
165
|
}
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
merge->rank = rank;
|
|
166
|
+
BPEMerge *m = &table->merges[table->num_merges++];
|
|
167
|
+
m->left = strdup(left);
|
|
168
|
+
m->right = strdup(right);
|
|
169
|
+
m->merged = strdup(merged);
|
|
170
|
+
m->rank = rank;
|
|
236
171
|
}
|
|
237
172
|
|
|
238
173
|
static void bpe_merge_table_free(BPEMergeTable *table) {
|
|
@@ -248,40 +183,25 @@ static void bpe_merge_table_free(BPEMergeTable *table) {
|
|
|
248
183
|
|
|
249
184
|
static int bpe_merge_rank(const BPEMergeTable *table, const char *left, const char *right) {
|
|
250
185
|
for (int i = 0; i < table->num_merges; i++) {
|
|
251
|
-
if (strcmp(table->merges[i].left, left) == 0 && strcmp(table->merges[i].right, right) == 0)
|
|
186
|
+
if (strcmp(table->merges[i].left, left) == 0 && strcmp(table->merges[i].right, right) == 0)
|
|
252
187
|
return table->merges[i].rank;
|
|
253
|
-
}
|
|
254
188
|
}
|
|
255
189
|
return -1;
|
|
256
190
|
}
|
|
257
191
|
|
|
258
|
-
static char* bpe_merge(const BPEMergeTable *table, const char *left, const char *right) {
|
|
259
|
-
for (int i = 0; i < table->num_merges; i++) {
|
|
260
|
-
if (strcmp(table->merges[i].left, left) == 0 && strcmp(table->merges[i].right, right) == 0) {
|
|
261
|
-
return table->merges[i].merged;
|
|
262
|
-
}
|
|
263
|
-
}
|
|
264
|
-
return NULL;
|
|
265
|
-
}
|
|
266
|
-
|
|
267
192
|
/* ------------------------------------------------------------------------- */
|
|
268
|
-
// BPE tokenization
|
|
193
|
+
// BPE tokenization
|
|
269
194
|
typedef struct {
|
|
270
195
|
char *text;
|
|
271
|
-
int start;
|
|
272
|
-
int
|
|
273
|
-
int prev;
|
|
274
|
-
int next;
|
|
196
|
+
int start, end;
|
|
197
|
+
int prev, next;
|
|
275
198
|
int used;
|
|
276
199
|
} BPESymbol;
|
|
277
200
|
|
|
278
201
|
static void bpe_tokenize_word(const BPEMergeTable *merges, const char *word, int (*text_to_id)(void*, const char*), void *vocab_data, int *token_ids, int *num_tokens) {
|
|
279
|
-
// Initialize symbols from characters
|
|
280
202
|
int word_len = strlen(word);
|
|
281
203
|
int num_symbols = 0;
|
|
282
204
|
BPESymbol *symbols = malloc(word_len * sizeof(BPESymbol));
|
|
283
|
-
|
|
284
|
-
// Split into UTF-8 characters
|
|
285
205
|
int offset = 0;
|
|
286
206
|
while (offset < word_len) {
|
|
287
207
|
int char_len = unicode_len_utf8(word[offset]);
|
|
@@ -294,39 +214,25 @@ static void bpe_tokenize_word(const BPEMergeTable *merges, const char *word, int
|
|
|
294
214
|
offset += char_len;
|
|
295
215
|
num_symbols++;
|
|
296
216
|
}
|
|
297
|
-
|
|
217
|
+
|
|
298
218
|
if (num_symbols <= 1) {
|
|
299
|
-
// Single character, just tokenize it
|
|
300
219
|
int id = text_to_id(vocab_data, word);
|
|
301
|
-
if (id != -1)
|
|
302
|
-
token_ids[*num_tokens] = id;
|
|
303
|
-
(*num_tokens)++;
|
|
304
|
-
}
|
|
220
|
+
if (id != -1) token_ids[(*num_tokens)++] = id;
|
|
305
221
|
free(symbols);
|
|
306
222
|
return;
|
|
307
223
|
}
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
int left;
|
|
312
|
-
int right;
|
|
313
|
-
int rank;
|
|
314
|
-
} Bigram;
|
|
315
|
-
|
|
316
|
-
Bigram *bigrams = malloc(word_len * word_len * sizeof(Bigram));
|
|
224
|
+
|
|
225
|
+
typedef struct { int left, right, rank; } Bigram;
|
|
226
|
+
Bigram *bigrams = malloc(num_symbols * num_symbols * sizeof(Bigram));
|
|
317
227
|
int num_bigrams = 0;
|
|
318
|
-
|
|
319
|
-
// Initialize bigrams
|
|
320
228
|
for (int i = 0; i < num_symbols - 1; i++) {
|
|
321
229
|
if (symbols[i].used && symbols[i+1].used) {
|
|
322
|
-
// Get the concatenated string for this pair
|
|
323
230
|
char *left_str = malloc(symbols[i].end - symbols[i].start + 1);
|
|
324
231
|
char *right_str = malloc(symbols[i+1].end - symbols[i+1].start + 1);
|
|
325
232
|
memcpy(left_str, symbols[i].text, symbols[i].end - symbols[i].start);
|
|
326
233
|
memcpy(right_str, symbols[i+1].text, symbols[i+1].end - symbols[i+1].start);
|
|
327
234
|
left_str[symbols[i].end - symbols[i].start] = '\0';
|
|
328
235
|
right_str[symbols[i+1].end - symbols[i+1].start] = '\0';
|
|
329
|
-
|
|
330
236
|
int rank = bpe_merge_rank(merges, left_str, right_str);
|
|
331
237
|
if (rank != -1) {
|
|
332
238
|
bigrams[num_bigrams].left = i;
|
|
@@ -334,70 +240,42 @@ static void bpe_tokenize_word(const BPEMergeTable *merges, const char *word, int
|
|
|
334
240
|
bigrams[num_bigrams].rank = rank;
|
|
335
241
|
num_bigrams++;
|
|
336
242
|
}
|
|
337
|
-
|
|
338
|
-
free(left_str);
|
|
339
|
-
free(right_str);
|
|
243
|
+
free(left_str); free(right_str);
|
|
340
244
|
}
|
|
341
245
|
}
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
for (int i = 0; i < num_bigrams - 1; i++) {
|
|
345
|
-
for (int j = i+1; j < num_bigrams; j++) {
|
|
246
|
+
for (int i = 0; i < num_bigrams - 1; i++)
|
|
247
|
+
for (int j = i+1; j < num_bigrams; j++)
|
|
346
248
|
if (bigrams[i].rank > bigrams[j].rank) {
|
|
347
|
-
Bigram
|
|
249
|
+
Bigram tmp = bigrams[i];
|
|
348
250
|
bigrams[i] = bigrams[j];
|
|
349
|
-
bigrams[j] =
|
|
251
|
+
bigrams[j] = tmp;
|
|
350
252
|
}
|
|
351
|
-
|
|
352
|
-
}
|
|
353
|
-
|
|
354
|
-
// Apply merges
|
|
253
|
+
|
|
355
254
|
int *merged = calloc(num_symbols, sizeof(int));
|
|
356
255
|
for (int i = 0; i < num_bigrams; i++) {
|
|
357
|
-
int left = bigrams[i].left;
|
|
358
|
-
int right = bigrams[i].right;
|
|
359
|
-
|
|
256
|
+
int left = bigrams[i].left, right = bigrams[i].right;
|
|
360
257
|
if (merged[left] || merged[right]) continue;
|
|
361
|
-
|
|
362
|
-
// Merge right into left
|
|
363
258
|
symbols[left].end = symbols[right].end;
|
|
364
259
|
symbols[left].next = symbols[right].next;
|
|
365
260
|
merged[right] = 1;
|
|
366
|
-
|
|
367
|
-
// Update next symbol's prev
|
|
368
|
-
if (symbols[right].next < num_symbols) {
|
|
369
|
-
symbols[symbols[right].next].prev = left;
|
|
370
|
-
}
|
|
261
|
+
if (symbols[right].next < num_symbols) symbols[symbols[right].next].prev = left;
|
|
371
262
|
}
|
|
372
|
-
|
|
373
|
-
// Collect final tokens
|
|
263
|
+
|
|
374
264
|
for (int i = 0; i < num_symbols; i++) {
|
|
375
265
|
if (!merged[i] && symbols[i].used) {
|
|
376
|
-
// Extract the substring
|
|
377
266
|
char *substr = malloc(symbols[i].end - symbols[i].start + 1);
|
|
378
267
|
memcpy(substr, word + symbols[i].start, symbols[i].end - symbols[i].start);
|
|
379
268
|
substr[symbols[i].end - symbols[i].start] = '\0';
|
|
380
|
-
|
|
381
269
|
int id = text_to_id(vocab_data, substr);
|
|
382
|
-
if (id != -1)
|
|
383
|
-
token_ids[*num_tokens] = id;
|
|
384
|
-
(*num_tokens)++;
|
|
385
|
-
} else {
|
|
386
|
-
// Unknown token - use byte-level fallback
|
|
387
|
-
// For simplicity, we'll use space as a placeholder
|
|
388
|
-
// In a full implementation, you'd encode bytes individually
|
|
389
|
-
}
|
|
390
|
-
|
|
270
|
+
if (id != -1) token_ids[(*num_tokens)++] = id;
|
|
391
271
|
free(substr);
|
|
392
272
|
}
|
|
393
273
|
}
|
|
394
|
-
|
|
395
|
-
free(bigrams);
|
|
396
|
-
free(merged);
|
|
397
|
-
free(symbols);
|
|
274
|
+
free(bigrams); free(merged); free(symbols);
|
|
398
275
|
}
|
|
399
276
|
|
|
400
277
|
/* ------------------------------------------------------------------------- */
|
|
278
|
+
// GGUF parsing
|
|
401
279
|
static int safe_advance(uint8_t **p, uint8_t *end, size_t sz) {
|
|
402
280
|
if (*p + sz > end) return 0;
|
|
403
281
|
*p += sz;
|
|
@@ -405,14 +283,14 @@ static int safe_advance(uint8_t **p, uint8_t *end, size_t sz) {
|
|
|
405
283
|
}
|
|
406
284
|
|
|
407
285
|
static uint32_t rd32(uint8_t **p, uint8_t *end) {
|
|
408
|
-
uint32_t v
|
|
286
|
+
uint32_t v;
|
|
409
287
|
if (!safe_advance(p, end, 4)) return 0;
|
|
410
288
|
memcpy(&v, *p - 4, 4);
|
|
411
289
|
return v;
|
|
412
290
|
}
|
|
413
291
|
|
|
414
292
|
static uint64_t rd64(uint8_t **p, uint8_t *end) {
|
|
415
|
-
uint64_t v
|
|
293
|
+
uint64_t v;
|
|
416
294
|
if (!safe_advance(p, end, 8)) return 0;
|
|
417
295
|
memcpy(&v, *p - 8, 8);
|
|
418
296
|
return v;
|
|
@@ -423,9 +301,9 @@ static char *rdstr(uint8_t **p, uint8_t *end) {
|
|
|
423
301
|
uint64_t len;
|
|
424
302
|
memcpy(&len, *p, 8);
|
|
425
303
|
*p += 8;
|
|
426
|
-
if (len == 0 || len > (1
|
|
304
|
+
if (len == 0 || len > (1<<20)) return NULL;
|
|
427
305
|
if (*p + len > end) return NULL;
|
|
428
|
-
char *s = malloc(len
|
|
306
|
+
char *s = malloc(len+1);
|
|
429
307
|
if (!s) return NULL;
|
|
430
308
|
memcpy(s, *p, len);
|
|
431
309
|
s[len] = '\0';
|
|
@@ -436,29 +314,27 @@ static char *rdstr(uint8_t **p, uint8_t *end) {
|
|
|
436
314
|
static void align_to_32(uint8_t **p, uint8_t *end, uint8_t *base) {
|
|
437
315
|
size_t off = *p - base;
|
|
438
316
|
size_t aligned = (off + GGUF_ALIGN - 1) & ~(GGUF_ALIGN - 1);
|
|
439
|
-
if (base + aligned <= end)
|
|
440
|
-
*p = base + aligned;
|
|
317
|
+
if (base + aligned <= end) *p = base + aligned;
|
|
441
318
|
}
|
|
442
319
|
|
|
443
320
|
/* ------------------------------------------------------------------------- */
|
|
321
|
+
// Hash table for vocabulary
|
|
444
322
|
typedef struct HashNode {
|
|
445
323
|
char *key;
|
|
446
|
-
int
|
|
324
|
+
int id;
|
|
447
325
|
struct HashNode *next;
|
|
448
326
|
} HashNode;
|
|
449
327
|
|
|
450
328
|
typedef struct {
|
|
451
|
-
int
|
|
452
|
-
int
|
|
453
|
-
char
|
|
454
|
-
float
|
|
455
|
-
void
|
|
456
|
-
int
|
|
457
|
-
void
|
|
458
|
-
size_t
|
|
329
|
+
int vocab_size;
|
|
330
|
+
int dim;
|
|
331
|
+
char **tokens;
|
|
332
|
+
float *float_data;
|
|
333
|
+
void *tensor_data;
|
|
334
|
+
int tensor_type;
|
|
335
|
+
void *mapped;
|
|
336
|
+
size_t mapped_size;
|
|
459
337
|
HashNode **table;
|
|
460
|
-
|
|
461
|
-
// BPE tokenization data
|
|
462
338
|
BPEMergeTable merges;
|
|
463
339
|
RegexPattern *pre_patterns;
|
|
464
340
|
int num_pre_patterns;
|
|
@@ -466,6 +342,7 @@ typedef struct {
|
|
|
466
342
|
int bos_token_id;
|
|
467
343
|
int eos_token_id;
|
|
468
344
|
int vocab_type;
|
|
345
|
+
char space_marker[8];
|
|
469
346
|
} EmbedModel;
|
|
470
347
|
|
|
471
348
|
typedef struct {
|
|
@@ -498,11 +375,11 @@ static int hget(EmbedModel *m, const char *k) {
|
|
|
498
375
|
}
|
|
499
376
|
|
|
500
377
|
static int text_to_id(void *vocab_data, const char *text) {
|
|
501
|
-
|
|
502
|
-
return hget(m, text);
|
|
378
|
+
return hget((EmbedModel*)vocab_data, text);
|
|
503
379
|
}
|
|
504
380
|
|
|
505
381
|
/* ------------------------------------------------------------------------- */
|
|
382
|
+
// File mapping
|
|
506
383
|
static void *map_file(const char *path, size_t *size) {
|
|
507
384
|
int fd = open(path, O_RDONLY);
|
|
508
385
|
if (fd < 0) return NULL;
|
|
@@ -511,29 +388,22 @@ static void *map_file(const char *path, size_t *size) {
|
|
|
511
388
|
*size = st.st_size;
|
|
512
389
|
void *data = mmap(NULL, *size, PROT_READ, MAP_PRIVATE, fd, 0);
|
|
513
390
|
close(fd);
|
|
514
|
-
|
|
515
|
-
return data;
|
|
391
|
+
return data == MAP_FAILED ? NULL : data;
|
|
516
392
|
}
|
|
517
393
|
|
|
518
394
|
/* ------------------------------------------------------------------------- */
|
|
395
|
+
// FP16 conversion
|
|
519
396
|
static float fp16_to_fp32(uint16_t h) {
|
|
520
|
-
|
|
521
|
-
|
|
522
|
-
|
|
523
|
-
|
|
524
|
-
if (exp ==
|
|
525
|
-
|
|
526
|
-
} else if (exp == 31) {
|
|
527
|
-
return 0.0f;
|
|
528
|
-
} else {
|
|
529
|
-
val = (1.0f + mant / 1024.0f) * (1 << (exp - 15));
|
|
530
|
-
}
|
|
531
|
-
return sign ? -val : val;
|
|
397
|
+
uint16_t sign = (h >> 15) & 1;
|
|
398
|
+
uint16_t exp = (h >> 10) & 0x1F;
|
|
399
|
+
uint16_t mant = h & 0x3FF;
|
|
400
|
+
if (exp == 0) return (mant / 1024.0f) * 6.103515625e-5f * (sign ? -1.0f : 1.0f);
|
|
401
|
+
if (exp == 31) return 0.0f;
|
|
402
|
+
return (1.0f + mant / 1024.0f) * (1 << (exp - 15)) * (sign ? -1.0f : 1.0f);
|
|
532
403
|
}
|
|
533
404
|
|
|
534
405
|
/* ------------------------------------------------------------------------- */
|
|
535
|
-
|
|
536
|
-
|
|
406
|
+
// Block dequantization functions
|
|
537
407
|
static void dequantize_row_q4_0(const void *vx, float *y, int k) {
|
|
538
408
|
const int nb = k / 32;
|
|
539
409
|
const uint8_t *x = vx;
|
|
@@ -621,7 +491,6 @@ static void dequantize_row_q8_1(const void *vx, float *y, int k) {
|
|
|
621
491
|
}
|
|
622
492
|
}
|
|
623
493
|
|
|
624
|
-
/* K-quants */
|
|
625
494
|
static void dequantize_row_q2_K(const void *vx, float *y, int k) {
|
|
626
495
|
const int nb = k / 256;
|
|
627
496
|
const uint8_t *x = vx;
|
|
@@ -748,7 +617,6 @@ static void dequantize_row_q8_K(const void *vx, float *y, int k) {
|
|
|
748
617
|
}
|
|
749
618
|
}
|
|
750
619
|
|
|
751
|
-
/* ------------------------------------------------------------------------- */
|
|
752
620
|
static float* dequantize_tensor(const void *data, int type, int n_rows, int n_cols) {
|
|
753
621
|
if (type == GGML_TYPE_F32) {
|
|
754
622
|
float *out = malloc(n_rows * n_cols * sizeof(float));
|
|
@@ -793,6 +661,14 @@ static float* dequantize_tensor(const void *data, int type, int n_rows, int n_co
|
|
|
793
661
|
for (int r = 0; r < n_rows; r++) {
|
|
794
662
|
dequant_func(in + r * row_bytes, out + r * n_cols, n_cols);
|
|
795
663
|
}
|
|
664
|
+
|
|
665
|
+
// Sanitize the tensor: replace NaNs, Infs, and astronomically large values with zero
|
|
666
|
+
int total = n_rows * n_cols;
|
|
667
|
+
for (int i = 0; i < total; i++) {
|
|
668
|
+
if (isnan(out[i]) || isinf(out[i]) || fabs(out[i]) > 1e10f) {
|
|
669
|
+
out[i] = 0.0f;
|
|
670
|
+
}
|
|
671
|
+
}
|
|
796
672
|
return out;
|
|
797
673
|
}
|
|
798
674
|
|
|
@@ -837,45 +713,49 @@ static void free_model_contents(EmbedModel *m) {
|
|
|
837
713
|
}
|
|
838
714
|
if (m->float_data) free(m->float_data);
|
|
839
715
|
if (m->mapped) munmap(m->mapped, m->mapped_size);
|
|
840
|
-
|
|
841
|
-
// Free BPE tokenization data
|
|
842
716
|
bpe_merge_table_free(&m->merges);
|
|
843
717
|
if (m->pre_patterns) {
|
|
844
|
-
for (int i = 0; i < m->num_pre_patterns; i++)
|
|
845
|
-
free(m->pre_patterns[i].pattern);
|
|
846
|
-
}
|
|
718
|
+
for (int i = 0; i < m->num_pre_patterns; i++) free(m->pre_patterns[i].pattern);
|
|
847
719
|
free(m->pre_patterns);
|
|
848
720
|
}
|
|
849
|
-
|
|
850
721
|
free(m);
|
|
851
722
|
}
|
|
852
723
|
|
|
853
724
|
/* ------------------------------------------------------------------------- */
|
|
854
725
|
static int is_printable_string(const char *s, size_t len) {
|
|
855
|
-
for (size_t i = 0; i < len; i++)
|
|
856
|
-
if (!isprint((unsigned char)s[i])) return 0;
|
|
726
|
+
for (size_t i = 0; i < len; i++) if (!isprint((unsigned char)s[i])) return 0;
|
|
857
727
|
return 1;
|
|
858
728
|
}
|
|
859
729
|
|
|
860
|
-
/* Fallback: find the start of tensor info by scanning for a valid string */
|
|
861
730
|
static uint8_t *find_tensor_info_start(uint8_t *cur, uint8_t *end) {
|
|
862
731
|
uint8_t *scan = cur;
|
|
863
732
|
while (scan + 8 < end) {
|
|
864
733
|
uint64_t len;
|
|
865
734
|
memcpy(&len, scan, 8);
|
|
866
|
-
if (len > 0 && len < 256 && scan + 8 + len <= end)
|
|
867
|
-
|
|
868
|
-
return scan;
|
|
869
|
-
}
|
|
870
|
-
}
|
|
735
|
+
if (len > 0 && len < 256 && scan + 8 + len <= end && is_printable_string((char*)scan+8, len))
|
|
736
|
+
return scan;
|
|
871
737
|
scan++;
|
|
872
738
|
}
|
|
873
739
|
return NULL;
|
|
874
740
|
}
|
|
875
741
|
|
|
876
742
|
/* ------------------------------------------------------------------------- */
|
|
743
|
+
static void detect_space_marker(EmbedModel *m) {
|
|
744
|
+
const char *candidates[] = {"▁", "Ġ", " "};
|
|
745
|
+
for (int i = 0; i < 3; i++) {
|
|
746
|
+
const char *marker = candidates[i];
|
|
747
|
+
int marker_len = strlen(marker);
|
|
748
|
+
for (int j = 0; j < m->vocab_size; j++) {
|
|
749
|
+
if (strncmp(m->tokens[j], marker, marker_len) == 0) {
|
|
750
|
+
strcpy(m->space_marker, marker);
|
|
751
|
+
return;
|
|
752
|
+
}
|
|
753
|
+
}
|
|
754
|
+
}
|
|
755
|
+
m->space_marker[0] = '\0';
|
|
756
|
+
}
|
|
757
|
+
|
|
877
758
|
static void setup_default_pre_patterns(EmbedModel *m) {
|
|
878
|
-
// Default pre-tokenization regex patterns (similar to Llama 3)
|
|
879
759
|
const char *default_patterns[] = {
|
|
880
760
|
"(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])",
|
|
881
761
|
"[^\\r\\n\\p{L}\\p{N}]?\\p{L}+",
|
|
@@ -885,29 +765,23 @@ static void setup_default_pre_patterns(EmbedModel *m) {
|
|
|
885
765
|
"\\s+(?!\\S)",
|
|
886
766
|
"\\s+"
|
|
887
767
|
};
|
|
888
|
-
|
|
889
|
-
m->num_pre_patterns = sizeof(default_patterns) / sizeof(default_patterns[0]);
|
|
768
|
+
m->num_pre_patterns = sizeof(default_patterns)/sizeof(default_patterns[0]);
|
|
890
769
|
m->pre_patterns = malloc(m->num_pre_patterns * sizeof(RegexPattern));
|
|
891
|
-
|
|
892
770
|
for (int i = 0; i < m->num_pre_patterns; i++) {
|
|
893
771
|
m->pre_patterns[i].pattern = strdup(default_patterns[i]);
|
|
894
772
|
m->pre_patterns[i].pattern_len = strlen(default_patterns[i]);
|
|
895
773
|
}
|
|
896
774
|
}
|
|
897
775
|
|
|
898
|
-
/* ------------------------------------------------------------------------- */
|
|
899
776
|
static void parse_merge(const char *merge_str, char **left, char **right) {
|
|
900
|
-
// Parse a merge string like "h ello" -> left="h", right="ello"
|
|
901
777
|
const char *space = strchr(merge_str, ' ');
|
|
902
778
|
if (space) {
|
|
903
779
|
int left_len = space - merge_str;
|
|
904
|
-
*left = malloc(left_len
|
|
780
|
+
*left = malloc(left_len+1);
|
|
905
781
|
memcpy(*left, merge_str, left_len);
|
|
906
782
|
(*left)[left_len] = '\0';
|
|
907
|
-
|
|
908
|
-
*right = strdup(space + 1);
|
|
783
|
+
*right = strdup(space+1);
|
|
909
784
|
} else {
|
|
910
|
-
// No space - treat as single token
|
|
911
785
|
*left = strdup(merge_str);
|
|
912
786
|
*right = strdup("");
|
|
913
787
|
}
|
|
@@ -918,13 +792,10 @@ static EmbedModel *embed_load_gguf(const char *path) {
|
|
|
918
792
|
size_t sz;
|
|
919
793
|
uint8_t *base = map_file(path, &sz);
|
|
920
794
|
if (!base) return NULL;
|
|
921
|
-
uint8_t *cur = base;
|
|
922
|
-
uint8_t *end = base + sz;
|
|
923
|
-
|
|
795
|
+
uint8_t *cur = base, *end = base + sz;
|
|
924
796
|
if (memcmp(cur, "GGUF", 4) != 0) { munmap(base, sz); return NULL; }
|
|
925
797
|
cur += 4;
|
|
926
798
|
uint32_t version = rd32(&cur, end);
|
|
927
|
-
(void)version;
|
|
928
799
|
uint64_t n_tensors = rd64(&cur, end);
|
|
929
800
|
uint64_t n_kv = rd64(&cur, end);
|
|
930
801
|
|
|
@@ -934,26 +805,20 @@ static EmbedModel *embed_load_gguf(const char *path) {
|
|
|
934
805
|
m->mapped_size = sz;
|
|
935
806
|
m->table = calloc(HASH_SIZE, sizeof(HashNode*));
|
|
936
807
|
if (!m->table) { free_model_contents(m); return NULL; }
|
|
937
|
-
|
|
938
|
-
// Initialize BPE structures
|
|
939
808
|
bpe_merge_table_init(&m->merges);
|
|
940
809
|
setup_default_pre_patterns(m);
|
|
941
|
-
|
|
942
|
-
// Default values
|
|
943
810
|
m->unknown_token_id = -1;
|
|
944
811
|
m->bos_token_id = -1;
|
|
945
812
|
m->eos_token_id = -1;
|
|
946
813
|
m->vocab_type = LLAMA_VOCAB_TYPE_NONE;
|
|
814
|
+
m->space_marker[0] = '\0';
|
|
947
815
|
|
|
948
|
-
/* ---------- Metadata ---------- */
|
|
949
816
|
int vocab_found = 0;
|
|
950
817
|
for (uint64_t i = 0; i < n_kv; i++) {
|
|
951
818
|
char *key = rdstr(&cur, end);
|
|
952
819
|
if (!key) { free_model_contents(m); return NULL; }
|
|
953
820
|
uint32_t type = rd32(&cur, end);
|
|
954
|
-
|
|
955
|
-
if ((strcmp(key, "tokenizer.ggml.tokens") == 0 ||
|
|
956
|
-
strcmp(key, "tokenizer.ggml.token_list") == 0) && type == 9) {
|
|
821
|
+
if ((strcmp(key, "tokenizer.ggml.tokens") == 0 || strcmp(key, "tokenizer.ggml.token_list") == 0) && type == 9) {
|
|
957
822
|
uint32_t subtype = rd32(&cur, end);
|
|
958
823
|
uint64_t n = rd64(&cur, end);
|
|
959
824
|
if (subtype != 8) { free(key); free_model_contents(m); return NULL; }
|
|
@@ -971,40 +836,29 @@ static EmbedModel *embed_load_gguf(const char *path) {
|
|
|
971
836
|
uint32_t subtype = rd32(&cur, end);
|
|
972
837
|
uint64_t n = rd64(&cur, end);
|
|
973
838
|
if (subtype == 8) {
|
|
974
|
-
// Parse merges
|
|
975
839
|
for (uint64_t j = 0; j < n && j < MAX_MERGES; j++) {
|
|
976
840
|
char *merge_str = rdstr(&cur, end);
|
|
977
841
|
if (merge_str) {
|
|
978
842
|
char *left, *right;
|
|
979
843
|
parse_merge(merge_str, &left, &right);
|
|
980
|
-
bpe_merge_table_add(&m->merges, left, right, merge_str, j);
|
|
981
|
-
free(left);
|
|
982
|
-
free(right);
|
|
844
|
+
bpe_merge_table_add(&m->merges, left, right, merge_str, (int)j);
|
|
845
|
+
free(left); free(right);
|
|
983
846
|
free(merge_str);
|
|
984
847
|
}
|
|
985
848
|
}
|
|
986
849
|
} else {
|
|
987
|
-
|
|
988
|
-
if (!skip_value(&cur, end, type)) {
|
|
989
|
-
free(key); free_model_contents(m); return NULL;
|
|
990
|
-
}
|
|
850
|
+
if (!skip_value(&cur, end, type)) { free(key); free_model_contents(m); return NULL; }
|
|
991
851
|
}
|
|
992
852
|
} else if (strcmp(key, "tokenizer.ggml.model") == 0 && type == 8) {
|
|
993
853
|
char *model_type = rdstr(&cur, end);
|
|
994
854
|
if (model_type) {
|
|
995
|
-
if (strcmp(model_type, "gpt2") == 0 || strcmp(model_type, "llama") == 0)
|
|
996
|
-
|
|
997
|
-
} else if (strcmp(model_type, "bert") == 0) {
|
|
998
|
-
m->vocab_type = LLAMA_VOCAB_TYPE_WPM;
|
|
999
|
-
}
|
|
855
|
+
if (strcmp(model_type, "gpt2") == 0 || strcmp(model_type, "llama") == 0) m->vocab_type = LLAMA_VOCAB_TYPE_BPE;
|
|
856
|
+
else if (strcmp(model_type, "bert") == 0) m->vocab_type = LLAMA_VOCAB_TYPE_WPM;
|
|
1000
857
|
free(model_type);
|
|
1001
858
|
}
|
|
1002
859
|
} else if (strcmp(key, "tokenizer.ggml.pre") == 0 && type == 8) {
|
|
1003
|
-
char *
|
|
1004
|
-
if (
|
|
1005
|
-
// Could load custom regex patterns here if needed
|
|
1006
|
-
free(pre_type);
|
|
1007
|
-
}
|
|
860
|
+
char *pre = rdstr(&cur, end);
|
|
861
|
+
if (pre) free(pre);
|
|
1008
862
|
} else if (strcmp(key, "tokenizer.ggml.unknown_token_id") == 0 && type == 6) {
|
|
1009
863
|
m->unknown_token_id = rd32(&cur, end);
|
|
1010
864
|
} else if (strcmp(key, "tokenizer.ggml.bos_token_id") == 0 && type == 6) {
|
|
@@ -1012,20 +866,16 @@ static EmbedModel *embed_load_gguf(const char *path) {
|
|
|
1012
866
|
} else if (strcmp(key, "tokenizer.ggml.eos_token_id") == 0 && type == 6) {
|
|
1013
867
|
m->eos_token_id = rd32(&cur, end);
|
|
1014
868
|
} else {
|
|
1015
|
-
if (!skip_value(&cur, end, type)) {
|
|
1016
|
-
free(key); free_model_contents(m); return NULL;
|
|
1017
|
-
}
|
|
869
|
+
if (!skip_value(&cur, end, type)) { free(key); free_model_contents(m); return NULL; }
|
|
1018
870
|
}
|
|
1019
871
|
free(key);
|
|
1020
872
|
}
|
|
1021
|
-
|
|
1022
873
|
if (!vocab_found) { free_model_contents(m); return NULL; }
|
|
874
|
+
detect_space_marker(m);
|
|
1023
875
|
|
|
1024
876
|
uint8_t *after_kv = cur;
|
|
1025
877
|
align_to_32(&cur, end, base);
|
|
1026
878
|
uint8_t *tensor_start = cur;
|
|
1027
|
-
|
|
1028
|
-
/* ---------- Tensor info ---------- */
|
|
1029
879
|
int embd_found = 0;
|
|
1030
880
|
void *raw_tensor_data = NULL;
|
|
1031
881
|
int tensor_type = -1;
|
|
@@ -1039,39 +889,27 @@ static EmbedModel *embed_load_gguf(const char *path) {
|
|
|
1039
889
|
if (!name) break;
|
|
1040
890
|
uint32_t n_dims = rd32(&cur, end);
|
|
1041
891
|
uint64_t dims[MAX_DIMS] = {0};
|
|
1042
|
-
for (uint32_t d = 0; d < n_dims && d < MAX_DIMS; d++)
|
|
1043
|
-
dims[d] = rd64(&cur, end);
|
|
892
|
+
for (uint32_t d = 0; d < n_dims && d < MAX_DIMS; d++) dims[d] = rd64(&cur, end);
|
|
1044
893
|
uint32_t type = rd32(&cur, end);
|
|
1045
894
|
uint64_t offset = rd64(&cur, end);
|
|
1046
|
-
|
|
1047
895
|
int is_token_embd = (strcmp(name, "token_embd.weight") == 0 ||
|
|
1048
896
|
strcmp(name, "embeddings.word_embeddings.weight") == 0 ||
|
|
1049
897
|
strcmp(name, "model.embed_tokens.weight") == 0);
|
|
1050
|
-
|
|
1051
898
|
if (!is_token_embd && n_dims == 2 && m->vocab_size > 0) {
|
|
1052
|
-
if ((uint64_t)m->vocab_size == dims[0] && strstr(name, "embd")
|
|
1053
|
-
|
|
1054
|
-
else if ((uint64_t)m->vocab_size == dims[1] && strstr(name, "embd") != NULL)
|
|
1055
|
-
is_token_embd = 1;
|
|
899
|
+
if ((uint64_t)m->vocab_size == dims[0] && strstr(name, "embd")) is_token_embd = 1;
|
|
900
|
+
else if ((uint64_t)m->vocab_size == dims[1] && strstr(name, "embd")) is_token_embd = 1;
|
|
1056
901
|
}
|
|
1057
|
-
|
|
1058
902
|
if (!embd_found && is_token_embd) {
|
|
1059
903
|
if (n_dims < 2 || dims[1] == 0) { free(name); free_model_contents(m); return NULL; }
|
|
1060
|
-
dim0 = dims[0];
|
|
1061
|
-
dim1 =
|
|
1062
|
-
if (
|
|
1063
|
-
|
|
1064
|
-
need_transpose = 0;
|
|
1065
|
-
} else if (dim1 == (uint64_t)m->vocab_size) {
|
|
1066
|
-
m->dim = (int)dim0;
|
|
1067
|
-
need_transpose = 1;
|
|
1068
|
-
} else {
|
|
1069
|
-
m->dim = (dim0 < dim1) ? (int)dim0 : (int)dim1;
|
|
1070
|
-
need_transpose = (dim0 > dim1) ? 1 : 0;
|
|
1071
|
-
}
|
|
904
|
+
dim0 = dims[0]; dim1 = dims[1];
|
|
905
|
+
if (dim0 == (uint64_t)m->vocab_size) { m->dim = (int)dim1; need_transpose = 0; }
|
|
906
|
+
else if (dim1 == (uint64_t)m->vocab_size) { m->dim = (int)dim0; need_transpose = 1; }
|
|
907
|
+
else { m->dim = (dim0 < dim1) ? (int)dim0 : (int)dim1; need_transpose = (dim0 > dim1) ? 1 : 0; }
|
|
1072
908
|
raw_tensor_data = base + offset;
|
|
1073
909
|
tensor_type = type;
|
|
1074
910
|
embd_found = 1;
|
|
911
|
+
free(name);
|
|
912
|
+
break;
|
|
1075
913
|
}
|
|
1076
914
|
free(name);
|
|
1077
915
|
}
|
|
@@ -1081,13 +919,8 @@ static EmbedModel *embed_load_gguf(const char *path) {
|
|
|
1081
919
|
if (!tensor_start) break;
|
|
1082
920
|
}
|
|
1083
921
|
}
|
|
922
|
+
if (!embd_found || m->dim == 0) { free_model_contents(m); return NULL; }
|
|
1084
923
|
|
|
1085
|
-
if (!embd_found || m->dim == 0) {
|
|
1086
|
-
free_model_contents(m);
|
|
1087
|
-
return NULL;
|
|
1088
|
-
}
|
|
1089
|
-
|
|
1090
|
-
/* Dequantize */
|
|
1091
924
|
if (tensor_type == GGML_TYPE_F32 && !need_transpose) {
|
|
1092
925
|
m->float_data = NULL;
|
|
1093
926
|
m->tensor_data = raw_tensor_data;
|
|
@@ -1095,10 +928,7 @@ static EmbedModel *embed_load_gguf(const char *path) {
|
|
|
1095
928
|
int n_rows = need_transpose ? (int)dim1 : (int)dim0;
|
|
1096
929
|
int n_cols = need_transpose ? (int)dim0 : (int)dim1;
|
|
1097
930
|
m->float_data = dequantize_tensor(raw_tensor_data, tensor_type, n_rows, n_cols);
|
|
1098
|
-
if (!m->float_data) {
|
|
1099
|
-
free_model_contents(m);
|
|
1100
|
-
return NULL;
|
|
1101
|
-
}
|
|
931
|
+
if (!m->float_data) { free_model_contents(m); return NULL; }
|
|
1102
932
|
m->tensor_data = m->float_data;
|
|
1103
933
|
}
|
|
1104
934
|
m->tensor_type = tensor_type;
|
|
@@ -1109,79 +939,84 @@ static EmbedModel *embed_load_gguf(const char *path) {
|
|
|
1109
939
|
/* ------------------------------------------------------------------------- */
|
|
1110
940
|
/* Compute a mean-pooled embedding for `txt` into `out` (length m->dim).
 *
 * Words come from the model's pre-tokenizer regex split; each word is first
 * looked up directly in the vocab (retrying with the tokenizer's space
 * marker prepended), and otherwise BPE-tokenized.  Matched token vectors are
 * summed and divided by the number of tokens used.  Non-finite components
 * are zeroed at the end.
 *
 * BUGFIX: the malloc results for `token_ids` and `with_marker` were used
 * unchecked; on allocation failure they are now skipped gracefully instead
 * of dereferencing NULL.
 */
static void embed_text(EmbedModel *m, const char *txt, float *out) {
    memset(out, 0, sizeof(float) * m->dim);
    int num_words = 0;
    char **words = unicode_regex_split(txt, m->pre_patterns, m->num_pre_patterns, &num_words);
    if (!words || num_words == 0) {
        /* Fallback to a simple whitespace split when regex pre-tokenization
         * yields nothing. */
        char *copy = strdup(txt);
        if (copy) {
            char *tok = strtok(copy, " \t\n\r");
            int used = 0;
            const float *embd = (const float *)m->tensor_data;
            while (tok) {
                int id = hget(m, tok);
                if (id >= 0 && id < m->vocab_size) {
                    const float *vec = embd + (size_t)id * m->dim;
                    for (int i = 0; i < m->dim; i++) out[i] += vec[i];
                    used++;
                }
                tok = strtok(NULL, " \t\n\r");
            }
            if (used) {
                float inv = 1.0f / used;
                for (int i = 0; i < m->dim; i++) out[i] *= inv;
            }
            free(copy);
        }
        if (words) free(words);
        return;
    }

    /* Scratch buffer for BPE token ids; a single word cannot expand to more
     * tokens than there are vocab entries. */
    int *token_ids = malloc((size_t)m->vocab_size * sizeof(int));
    int used = 0;
    const float *embd = (const float *)m->tensor_data;
    for (int i = 0; i < num_words; i++) {
        char *word = words[i];
        int id = hget(m, word);
        if (id == -1 && m->space_marker[0]) {
            /* Retry with the tokenizer's space marker prepended. */
            char *with_marker = malloc(strlen(m->space_marker) + strlen(word) + 1);
            if (with_marker) {                     /* BUGFIX: was unchecked */
                strcpy(with_marker, m->space_marker);
                strcat(with_marker, word);
                id = hget(m, with_marker);
                free(with_marker);
            }
        }
        if (id != -1) {
            const float *vec = embd + (size_t)id * m->dim;
            for (int j = 0; j < m->dim; j++) out[j] += vec[j];
            used++;
        } else if (token_ids) {                    /* BUGFIX: was unchecked */
            int num_tokens = 0;
            bpe_tokenize_word(&m->merges, word, text_to_id, m, token_ids, &num_tokens);
            for (int k = 0; k < num_tokens; k++) {
                int tid = token_ids[k];
                if (tid >= 0 && tid < m->vocab_size) {
                    const float *vec = embd + (size_t)tid * m->dim;
                    for (int j = 0; j < m->dim; j++) out[j] += vec[j];
                    used++;
                } else if (m->unknown_token_id != -1 && m->unknown_token_id < m->vocab_size) {
                    /* Out-of-range BPE result: fall back to the UNK vector. */
                    const float *vec = embd + (size_t)m->unknown_token_id * m->dim;
                    for (int j = 0; j < m->dim; j++) out[j] += vec[j];
                    used++;
                }
            }
        }
        free(word);
    }
    free(words);
    free(token_ids);
    if (used > 0) {
        float inv = 1.0f / used;
        for (int i = 0; i < m->dim; i++) out[i] *= inv;
    }
    /* Guard against non-finite values leaking into the result (e.g. from
     * degenerate model data). */
    for (int i = 0; i < m->dim; i++) {
        if (isnan(out[i]) || isinf(out[i])) out[i] = 0.0f;
    }
}
1178
1014
|
|
|
1179
1015
|
/* ------------------------------------------------------------------------- */
|
|
1016
|
+
// Ruby bindings
|
|
1180
1017
|
static void rb_embedder_free(void *p) {
|
|
1181
1018
|
ruby_embedder *e = p;
|
|
1182
|
-
if (
|
|
1183
|
-
if (e->model) free_model_contents(e->model);
|
|
1184
|
-
free(e);
|
|
1019
|
+
if (e) { if (e->model) free_model_contents(e->model); free(e); }
|
|
1185
1020
|
}
|
|
1186
1021
|
|
|
1187
1022
|
static size_t rb_embedder_memsize(const void *p) {
|
|
@@ -1202,22 +1037,18 @@ static VALUE rb_embedder_alloc(VALUE klass) {
|
|
|
1202
1037
|
/* MiniEmbed#initialize(model: path)
 *
 * Loads the GGUF model at `path` into the wrapped struct.
 * Raises ArgumentError when the :model option is missing (previously this
 * surfaced as a confusing TypeError from StringValueCStr on Qnil), and
 * RuntimeError when the file cannot be parsed as a supported GGUF model.
 */
static VALUE rb_embedder_initialize(VALUE self, VALUE opts) {
    ruby_embedder *e;
    TypedData_Get_Struct(self, ruby_embedder, &ruby_embedder_type, e);
    VALUE path = rb_hash_aref(opts, ID2SYM(rb_intern("model")));
    if (NIL_P(path)) rb_raise(rb_eArgError, "missing required option :model");
    const char *cpath = StringValueCStr(path);
    e->model = embed_load_gguf(cpath);
    if (!e->model) rb_raise(rb_eRuntimeError, "failed to load GGUF model");
    return self;
}
|
|
1213
1046
|
|
|
1214
1047
|
/* MiniEmbed#embed(text: str)
 *
 * Returns the embedding as a binary String of m->dim little-endian 32-bit
 * floats (unpack with 'e*' on the Ruby side).
 * BUGFIX: guards against being called before a model is loaded (alloc leaves
 * e->model NULL until #initialize succeeds) and against a missing :text
 * option, instead of dereferencing NULL / raising an opaque TypeError.
 */
static VALUE rb_embed(VALUE self, VALUE opts) {
    ruby_embedder *e;
    TypedData_Get_Struct(self, ruby_embedder, &ruby_embedder_type, e);
    if (!e->model) rb_raise(rb_eRuntimeError, "model not loaded");
    VALUE text = rb_hash_aref(opts, ID2SYM(rb_intern("text")));
    if (NIL_P(text)) rb_raise(rb_eArgError, "missing required option :text");
    const char *ctext = StringValueCStr(text);
    VALUE out = rb_str_new(NULL, (long)(e->model->dim * sizeof(float)));
    embed_text(e->model, ctext, (float *)RSTRING_PTR(out));
    return out;
}
|
@@ -1227,5 +1058,5 @@ void Init_mini_embed(void) {
|
|
|
1227
1058
|
VALUE c = rb_define_class("MiniEmbed", rb_cObject);
|
|
1228
1059
|
rb_define_alloc_func(c, rb_embedder_alloc);
|
|
1229
1060
|
rb_define_method(c, "initialize", rb_embedder_initialize, 1);
|
|
1230
|
-
rb_define_method(c, "
|
|
1061
|
+
rb_define_method(c, "embed", rb_embed, 1);
|
|
1231
1062
|
}
|
data/lib/mini_embed.rb
CHANGED
|
@@ -1,3 +1,17 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
require 'mini_embed/mini_embed'
|
|
4
|
+
|
|
5
|
+
class MiniEmbed
  # Compute an embedding for +text+ via the C extension.
  #
  # BUGFIX: the keyword default was self-referential (`text: text`), so
  # calling +embeddings+ without a +text:+ argument silently passed +nil+
  # down to the C method instead of reporting the missing argument.  The
  # keyword is now required, so Ruby raises ArgumentError at the call site.
  #
  # @param text [String] text to extract embeddings from
  # @param type [Symbol] :vector (default) for an Array<Float>,
  #   :binary for the raw string of little-endian 32-bit floats
  # @return [Array<Float>, String] depending on +type+
  # @raise [ArgumentError] when +type+ is not :vector or :binary
  def embeddings(text:, type: :vector)
    binary_data = embed(text: text) # call original C method

    case type
    when :binary then binary_data
    when :vector then binary_data.unpack('e*')
    else
      raise ArgumentError, "Unsupported data type: #{type}"
    end
  end
end
|