mini_embed 0.3.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (4) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +2 -0
  3. data/ext/mini_embed/mini_embed.c +491 -99
  4. metadata +1 -1
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: d6230ebfba3a401a8d26543f106e46952198bc6e89c5cf7632da40346933cf64
4
- data.tar.gz: d5d37dd58c4bb3671053acb280db02ebb2ef78722d9c115f57f2594ad3a9ab50
3
+ metadata.gz: 0f3a2f9365c3ba228faf709ec97d986dadcc22a78ab0b706d35ba3da5e1552ce
4
+ data.tar.gz: 778847fe77dc4cb8b8774b62fe6f212c435880e470868f15c2517f1adb37211a
5
5
  SHA512:
6
- metadata.gz: a826aad05808580120035f689412afdf976d77637cd9b1cb57df02740a7c86efff0120bf7fba172498e0b5d7ed82617bac99777e731d33e84bcbb823db543e29
7
- data.tar.gz: f5bb3db889b9c51348daed59c3fbab9496237c3e9a64cb908ef386a1093e5e678531a5ad10eb051d0614dbe1fb9217d93a32049e6a5b8392b053d2474d6e9606
6
+ metadata.gz: '086948ced123967c0aa5f7e0fcb6624dffb2f68f6c95e4abd2dbaa429fb8717d98be76f6b173925b21af1bc14a0fe8af9d6d68d891ba4ce90d5a0b2145df55ef'
7
+ data.tar.gz: 2ac2c25baf87dd7b21fc38ccce6c3be0a3c133008ef27fa1790a95d3bc6146d5cb522166c93d8961edb16c3128e13d8f0d9206869abee43af10586f0124e00f2
data/README.md CHANGED
@@ -1,5 +1,7 @@
1
1
  # mini_embed
2
2
 
3
+ [![CircleCI](https://dl.circleci.com/status-badge/img/gh/Makapoxa/mini_embed/tree/main.svg?style=svg)](https://dl.circleci.com/status-badge/redirect/gh/Makapoxa/mini_embed/tree/main) [![Gem Version](https://badge.fury.io/rb/mini_embed.svg)](https://badge.fury.io/rb/mini_embed)
4
+
3
5
  A minimal, dependency‑free C extension for Ruby that loads [GGUF](https://github.com/ggerganov/ggml/blob/master/docs/gguf.md) embedding models and computes text embeddings **locally**.
4
6
 
5
7
  **⚠️ Important:** This gem is intended for **small projects, prototypes, and hobbyist use**. It allows you to experiment with embeddings without relying on external APIs or cloud costs. **Do not use MiniEmbed in production** – it lacks the performance, scalability, and tokenization robustness of dedicated solutions. For real applications, use a proper inference server like [llama.cpp](https://github.com/ggerganov/llama.cpp) with its HTTP API, or managed services such as OpenAI, Cohere, or Hugging Face.
@@ -440,9 +440,23 @@ typedef struct HashNode {
440
440
  struct HashNode *next;
441
441
  } HashNode;
442
442
 
443
/* One entry parsed from the GGUF tensor-info section.
 * `data` first holds the file-relative offset cast to a pointer and is later
 * rebased to point into the mmap'd data section (see embed_load_gguf). */
typedef struct {
    char *name;              /* heap-allocated tensor name; freed in free_model_contents */
    uint32_t n_dims;         /* 1..MAX_DIMS */
    uint64_t dims[MAX_DIMS]; /* dims[0] = columns (innermost), dims[1] = rows */
    int type;                /* GGML_TYPE_* storage/quantization type */
    const uint8_t *data;     /* offset until rebased, then pointer into the mapping */
    size_t row_bytes;        /* bytes per row for this type, from get_row_bytes() */
} Tensor;
451
+
443
452
  typedef struct {
444
453
  int vocab_size;
445
454
  int dim;
455
+ int n_layers;
456
+ int n_heads;
457
+ int n_ctx;
458
+ int n_ff;
459
+ float eps;
446
460
  char **tokens;
447
461
  void *mapped;
448
462
  size_t mapped_size;
@@ -460,6 +474,11 @@ typedef struct {
460
474
  int need_transpose;
461
475
  uint64_t raw_dim0, raw_dim1;
462
476
  int normalize;
477
+ Tensor *tensors;
478
+ int n_tensors;
479
+ int sep_token_id;
480
+ int pad_token_id;
481
+ int cls_token_id;
463
482
  } EmbedModel;
464
483
 
465
484
  typedef struct {
@@ -541,6 +560,36 @@ static float fp16_to_fp32(uint16_t h) {
541
560
  return result;
542
561
  }
543
562
 
563
/*
 * Convert an IEEE-754 float to half-precision (binary16) bits.
 * Round-to-nearest, ties rounded up. Classes handled explicitly:
 *   - |x| < 2^-24               -> signed zero
 *   - 2^-24 <= |x| < 2^-14      -> half subnormal
 *   - finite |x| beyond range   -> +/-infinity
 *     (bug fix: the previous code emitted a NaN bit pattern for finite
 *      values >= 65536, because it reused the mantissa in the exp>=31 path)
 *   - float Inf/NaN             -> half Inf/NaN (quiet bit forced so a NaN
 *     whose payload lives only in the low 13 bits cannot become Inf)
 */
static uint16_t fp32_to_fp16(float f) {
    uint32_t x;
    memcpy(&x, &f, sizeof(x)); /* type-pun via memcpy: no strict-aliasing UB */

    uint32_t sign = (x >> 16) & 0x8000;
    uint32_t exp_f32 = (x >> 23) & 0xFF;  /* raw float exponent field */
    int exp = (int)exp_f32 - 127 + 15;    /* re-biased for half */
    uint32_t mant = x & 0x7FFFFF;

    if (exp_f32 == 0xFF) {
        /* Source is Inf or NaN and must stay Inf or NaN. */
        if (mant == 0) return (uint16_t)(sign | 0x7C00);
        return (uint16_t)(sign | 0x7E00 | (mant >> 13));
    }

    if (exp <= 0) {
        /* Result is subnormal, or underflows to signed zero. */
        if (exp < -10) return (uint16_t)sign;
        mant |= 0x800000; /* make the implicit leading 1 explicit */
        uint32_t t = mant >> (1 - exp);
        if (t & 0x00001000) t += 0x00002000; /* round on highest dropped bit */
        return (uint16_t)(sign | (t >> 13));
    }

    if (exp >= 31) {
        /* Finite overflow: saturate to infinity (was a NaN pattern before). */
        return (uint16_t)(sign | 0x7C00);
    }

    /* Normal case: round the 23-bit mantissa down to 10 bits. */
    if (mant & 0x00001000) {
        mant += 0x00002000;
        if (mant & 0x00800000) { /* rounding carried into the exponent */
            mant = 0;
            exp += 1;
        }
    }
    if (exp >= 31) return (uint16_t)(sign | 0x7C00); /* rounded up to Inf */
    return (uint16_t)(sign | ((uint32_t)exp << 10) | (mant >> 13));
}
592
+
544
593
  /* ------------------------------------------------------------------------- */
545
594
  // Block dequantization functions (correct sizes)
546
595
  static void dequantize_row_q4_0(const void *vx, float *y, int k) {
@@ -552,9 +601,9 @@ static void dequantize_row_q4_0(const void *vx, float *y, int k) {
552
601
  memcpy(&d16, block, 2);
553
602
  const float d = fp16_to_fp32(d16);
554
603
  const uint8_t *q = block + 2;
555
- for (int j = 0; j < 32; j++) {
556
- const int v = (q[j/2] >> (4*(j%2))) & 0x0F;
557
- y[i*32 + j] = (v - 8.0f) * d;
604
+ for (int j = 0; j < 16; j++) {
605
+ y[i*32 + j] = ((q[j] & 0x0F) - 8.0f) * d;
606
+ y[i*32 + j + 16] = ((q[j] >> 4) - 8.0f) * d;
558
607
  }
559
608
  }
560
609
  }
@@ -570,9 +619,9 @@ static void dequantize_row_q4_1(const void *vx, float *y, int k) {
570
619
  const float d = fp16_to_fp32(d16);
571
620
  const float m = fp16_to_fp32(m16);
572
621
  const uint8_t *q = block + 4;
573
- for (int j = 0; j < 32; j++) {
574
- const int v = (q[j/2] >> (4*(j%2))) & 0x0F;
575
- y[i*32 + j] = v * d + m;
622
+ for (int j = 0; j < 16; j++) {
623
+ y[i*32 + j] = (q[j] & 0x0F) * d + m;
624
+ y[i*32 + j + 16] = (q[j] >> 4) * d + m;
576
625
  }
577
626
  }
578
627
  }
@@ -622,9 +671,10 @@ static void dequantize_row_q8_0(const void *vx, float *y, int k) {
622
671
  const uint8_t *x = vx;
623
672
  for (int i = 0; i < nb; i++) {
624
673
  const uint8_t *block = x + i * 34;
625
- float d;
626
- memcpy(&d, block, 4);
627
- const int8_t *q = (const int8_t*)(block + 4);
674
+ uint16_t d16;
675
+ memcpy(&d16, block, 2);
676
+ const float d = fp16_to_fp32(d16);
677
+ const int8_t *q = (const int8_t*)(block + 2);
628
678
  for (int j = 0; j < 32; j++) {
629
679
  y[i*32 + j] = (float)q[j] * d;
630
680
  }
@@ -635,11 +685,13 @@ static void dequantize_row_q8_1(const void *vx, float *y, int k) {
635
685
  const int nb = k / QK8_0;
636
686
  const uint8_t *x = vx;
637
687
  for (int i = 0; i < nb; i++) {
638
- const uint8_t *block = x + i * 40;
639
- float d, s;
640
- memcpy(&d, block, 4);
641
- memcpy(&s, block + 4, 4);
642
- const int8_t *q = (const int8_t*)(block + 8);
688
+ const uint8_t *block = x + i * 36;
689
+ uint16_t d16, s16;
690
+ memcpy(&d16, block, 2);
691
+ memcpy(&s16, block + 2, 2);
692
+ const float d = fp16_to_fp32(d16);
693
+ const float s = fp16_to_fp32(s16);
694
+ const int8_t *q = (const int8_t*)(block + 4);
643
695
  for (int j = 0; j < 32; j++) {
644
696
  y[i*32 + j] = (float)q[j] * d + s;
645
697
  }
@@ -932,7 +984,7 @@ static size_t get_row_bytes(int type, int n_cols) {
932
984
  case GGML_TYPE_Q5_0: return (n_cols / 32) * 22;
933
985
  case GGML_TYPE_Q5_1: return (n_cols / 32) * 24;
934
986
  case GGML_TYPE_Q8_0: return (n_cols / 32) * 34;
935
- case GGML_TYPE_Q8_1: return (n_cols / 32) * 40;
987
+ case GGML_TYPE_Q8_1: return (n_cols / 32) * 36;
936
988
  case GGML_TYPE_Q2_K: return (n_cols / 256) * 84;
937
989
  case GGML_TYPE_Q3_K: return (n_cols / 256) * 110;
938
990
  case GGML_TYPE_Q4_K: return (n_cols / 256) * 144;
@@ -949,6 +1001,7 @@ static int skip_value(uint8_t **p, uint8_t *end, uint32_t type) {
949
1001
  case 0: case 1: case 7: return safe_advance(p, end, 1);
950
1002
  case 2: case 3: return safe_advance(p, end, 2);
951
1003
  case 4: case 5: case 6: return safe_advance(p, end, 4);
1004
+ case 10: case 11: case 12: return safe_advance(p, end, 8);
952
1005
  case 8: {
953
1006
  uint64_t len = rd64(p, end);
954
1007
  return safe_advance(p, end, len);
@@ -982,6 +1035,10 @@ static void free_model_contents(EmbedModel *m) {
982
1035
  }
983
1036
  free(m->table);
984
1037
  }
1038
+ if (m->tensors) {
1039
+ for (int i = 0; i < m->n_tensors; i++) free(m->tensors[i].name);
1040
+ free(m->tensors);
1041
+ }
985
1042
  if (m->mapped) munmap(m->mapped, m->mapped_size);
986
1043
  bpe_merge_table_free(&m->merges);
987
1044
  free(m);
@@ -1047,6 +1104,21 @@ static void parse_merge(const char *merge_str, char **left, char **right) {
1047
1104
  }
1048
1105
  }
1049
1106
 
1107
+ static Tensor *find_tensor(EmbedModel *m, const char *name) {
1108
+ if (!m || !m->tensors) return NULL;
1109
+ for (int i = 0; i < m->n_tensors; i++) {
1110
+ if (strcmp(m->tensors[i].name, name) == 0) return &m->tensors[i];
1111
+ }
1112
+ return NULL;
1113
+ }
1114
+
1115
/* Read a little-endian 32-bit IEEE float from the metadata stream.
 * Bounds handling is delegated to rd32(); the bit pattern is reinterpreted
 * through memcpy to avoid strict-aliasing UB. */
static float rd_float32(uint8_t **p, uint8_t *end) {
    const uint32_t raw = rd32(p, end);
    float value;
    memcpy(&value, &raw, sizeof(value));
    return value;
}
1121
+
1050
1122
  /* ------------------------------------------------------------------------- */
1051
1123
  static EmbedModel *embed_load_gguf(const char *path) {
1052
1124
  size_t sz;
@@ -1072,8 +1144,12 @@ static EmbedModel *embed_load_gguf(const char *path) {
1072
1144
  m->unknown_token_id = -1;
1073
1145
  m->bos_token_id = -1;
1074
1146
  m->eos_token_id = -1;
1147
+ m->sep_token_id = -1;
1148
+ m->pad_token_id = 0;
1149
+ m->cls_token_id = -1;
1075
1150
  m->vocab_type = LLAMA_VOCAB_TYPE_NONE;
1076
1151
  m->normalize = NORM_NONE;
1152
+ m->eps = 1e-12f;
1077
1153
 
1078
1154
  int vocab_found = 0;
1079
1155
  for (uint64_t i = 0; i < n_kv; i++) {
@@ -1136,12 +1212,31 @@ static EmbedModel *embed_load_gguf(const char *path) {
1136
1212
  } else if (strcmp(key, "tokenizer.ggml.pre") == 0 && type == 8) {
1137
1213
  char *pre = rdstr(&cur, end);
1138
1214
  free(pre);
1139
- } else if (strcmp(key, "tokenizer.ggml.unknown_token_id") == 0 && type == 6) {
1215
+ } else if (strcmp(key, "bert.block_count") == 0 && type == 4) {
1216
+ m->n_layers = (int)rd32(&cur, end);
1217
+ } else if (strcmp(key, "bert.context_length") == 0 && type == 4) {
1218
+ m->n_ctx = (int)rd32(&cur, end);
1219
+ } else if (strcmp(key, "bert.embedding_length") == 0 && type == 4) {
1220
+ m->dim = (int)rd32(&cur, end);
1221
+ } else if (strcmp(key, "bert.feed_forward_length") == 0 && type == 4) {
1222
+ m->n_ff = (int)rd32(&cur, end);
1223
+ } else if (strcmp(key, "bert.attention.head_count") == 0 && type == 4) {
1224
+ m->n_heads = (int)rd32(&cur, end);
1225
+ } else if (strcmp(key, "bert.attention.layer_norm_epsilon") == 0 && type == 6) {
1226
+ m->eps = rd_float32(&cur, end);
1227
+ } else if (strcmp(key, "tokenizer.ggml.unknown_token_id") == 0 && type == 4) {
1140
1228
  m->unknown_token_id = (int)rd32(&cur, end);
1141
- } else if (strcmp(key, "tokenizer.ggml.bos_token_id") == 0 && type == 6) {
1229
+ } else if (strcmp(key, "tokenizer.ggml.bos_token_id") == 0 && type == 4) {
1142
1230
  m->bos_token_id = (int)rd32(&cur, end);
1143
- } else if (strcmp(key, "tokenizer.ggml.eos_token_id") == 0 && type == 6) {
1231
+ } else if (strcmp(key, "tokenizer.ggml.eos_token_id") == 0 && type == 4) {
1144
1232
  m->eos_token_id = (int)rd32(&cur, end);
1233
+ m->sep_token_id = m->eos_token_id;
1234
+ } else if (strcmp(key, "tokenizer.ggml.seperator_token_id") == 0 && type == 4) {
1235
+ m->sep_token_id = (int)rd32(&cur, end);
1236
+ } else if (strcmp(key, "tokenizer.ggml.padding_token_id") == 0 && type == 4) {
1237
+ m->pad_token_id = (int)rd32(&cur, end);
1238
+ } else if (strcmp(key, "tokenizer.ggml.cls_token_id") == 0 && type == 4) {
1239
+ m->cls_token_id = (int)rd32(&cur, end);
1145
1240
  } else if (strcmp(key, "general.alignment") == 0 && type == 6) {
1146
1241
  rd32(&cur, end);
1147
1242
  } else {
@@ -1153,107 +1248,404 @@ static EmbedModel *embed_load_gguf(const char *path) {
1153
1248
  if (!vocab_found) { free_model_contents(m); return NULL; }
1154
1249
  detect_space_marker(m);
1155
1250
 
1156
- uint8_t *after_kv = cur;
1251
+ m->tensors = calloc((size_t)n_tensors, sizeof(Tensor));
1252
+ if (!m->tensors) { free_model_contents(m); return NULL; }
1253
+ m->n_tensors = (int)n_tensors;
1254
+
1255
+ for (uint64_t i = 0; i < n_tensors; i++) {
1256
+ Tensor *t = &m->tensors[i];
1257
+ t->name = rdstr(&cur, end);
1258
+ if (!t->name) { free_model_contents(m); return NULL; }
1259
+ t->n_dims = rd32(&cur, end);
1260
+ if (t->n_dims == 0 || t->n_dims > MAX_DIMS) { free_model_contents(m); return NULL; }
1261
+ for (uint32_t d = 0; d < t->n_dims; d++) t->dims[d] = rd64(&cur, end);
1262
+ t->type = (int)rd32(&cur, end);
1263
+ uint64_t offset = rd64(&cur, end);
1264
+ t->row_bytes = get_row_bytes(t->type, (int)t->dims[0]);
1265
+ if (t->row_bytes == 0) { free_model_contents(m); return NULL; }
1266
+ t->data = (const uint8_t*)(uintptr_t)offset;
1267
+ }
1268
+
1157
1269
  align_to_32(&cur, end, base);
1158
- uint8_t *tensor_start = cur;
1159
- int embd_found = 0;
1160
-
1161
- for (int attempt = 0; attempt < 2; attempt++) {
1162
- cur = tensor_start;
1163
- for (uint64_t i = 0; i < n_tensors; i++) {
1164
- char *name = rdstr(&cur, end);
1165
- if (!name) break;
1166
- uint32_t n_dims = rd32(&cur, end);
1167
- uint64_t dims[MAX_DIMS] = {0};
1168
- for (uint32_t d = 0; d < n_dims && d < MAX_DIMS; d++) dims[d] = rd64(&cur, end);
1169
- uint32_t type = rd32(&cur, end);
1170
- uint64_t offset = rd64(&cur, end);
1171
-
1172
- int is_token_embd = (strcmp(name, "token_embd.weight") == 0 ||
1173
- strcmp(name, "embeddings.word_embeddings.weight") == 0 ||
1174
- strcmp(name, "model.embed_tokens.weight") == 0);
1175
-
1176
- if (!is_token_embd && n_dims == 2 && m->vocab_size > 0) {
1177
- if ((uint64_t)m->vocab_size == dims[0] && strstr(name, "embd")) is_token_embd = 1;
1178
- else if ((uint64_t)m->vocab_size == dims[1] && strstr(name, "embd")) is_token_embd = 1;
1270
+ uint8_t *data_start = cur;
1271
+ for (int i = 0; i < m->n_tensors; i++) {
1272
+ Tensor *t = &m->tensors[i];
1273
+ uint64_t offset = (uint64_t)(uintptr_t)t->data;
1274
+ size_t rows = t->n_dims > 1 ? (size_t)t->dims[1] : 1;
1275
+ size_t total_size = rows * t->row_bytes;
1276
+ if (offset > (uint64_t)sz || data_start + offset < data_start ||
1277
+ data_start + offset + total_size > end) {
1278
+ free_model_contents(m);
1279
+ return NULL;
1280
+ }
1281
+ t->data = data_start + offset;
1282
+ }
1283
+
1284
+ Tensor *embd = find_tensor(m, "token_embd.weight");
1285
+ if (!embd) embd = find_tensor(m, "embeddings.word_embeddings.weight");
1286
+ if (!embd || embd->n_dims < 2 || embd->dims[1] != (uint64_t)m->vocab_size) {
1287
+ free_model_contents(m);
1288
+ return NULL;
1289
+ }
1290
+
1291
+ if (m->dim == 0) m->dim = (int)embd->dims[0];
1292
+ if (m->n_ctx == 0) m->n_ctx = 512;
1293
+ if (m->n_ff == 0) m->n_ff = m->dim * 4;
1294
+ if (m->n_heads == 0) m->n_heads = 12;
1295
+ if (m->n_layers == 0) m->n_layers = 12;
1296
+ if (m->cls_token_id < 0) m->cls_token_id = m->bos_token_id;
1297
+ if (m->sep_token_id < 0) m->sep_token_id = m->eos_token_id;
1298
+
1299
+ m->raw_tensor_data = embd->data;
1300
+ m->tensor_type = embd->type;
1301
+ m->row_bytes = embd->row_bytes;
1302
+ m->raw_dim0 = embd->dims[0];
1303
+ m->raw_dim1 = embd->dims[1];
1304
+ m->need_transpose = 0;
1305
+
1306
+ if (m->dim <= 0 || m->dim > MAX_DIM) {
1307
+ free_model_contents(m); return NULL;
1308
+ }
1309
+
1310
+ return m;
1311
+ }
1312
+
1313
+ /* ------------------------------------------------------------------------- */
1314
+ // L2 normalization
1315
+ static void normalize_l2(float *vec, int dim) {
1316
+ double sum = 0.0;
1317
+ for (int i = 0; i < dim; i++) sum += vec[i] * vec[i];
1318
+ double norm = sqrt(sum);
1319
+ if (norm > 0.0) {
1320
+ float inv = (float)(1.0 / norm);
1321
+ for (int i = 0; i < dim; i++) vec[i] *= inv;
1322
+ }
1323
+ }
1324
+
1325
+ static void tensor_get_row(const Tensor *t, int row, float *out) {
1326
+ if (!t || row < 0 || (t->n_dims > 1 && row >= (int)t->dims[1])) {
1327
+ return;
1328
+ }
1329
+
1330
+ const uint8_t *raw = t->data + (size_t)row * t->row_bytes;
1331
+ int cols = (int)t->dims[0];
1332
+ switch (t->type) {
1333
+ case GGML_TYPE_F32:
1334
+ memcpy(out, raw, (size_t)cols * sizeof(float));
1335
+ break;
1336
+ case GGML_TYPE_F16:
1337
+ for (int i = 0; i < cols; i++) {
1338
+ uint16_t h;
1339
+ memcpy(&h, raw + (size_t)i * sizeof(uint16_t), sizeof(uint16_t));
1340
+ out[i] = fp16_to_fp32(h);
1179
1341
  }
1342
+ break;
1343
+ case GGML_TYPE_Q4_0:
1344
+ dequantize_row_q4_0(raw, out, cols);
1345
+ break;
1346
+ case GGML_TYPE_Q8_0:
1347
+ dequantize_row_q8_0(raw, out, cols);
1348
+ break;
1349
+ default:
1350
+ memset(out, 0, (size_t)cols * sizeof(float));
1351
+ break;
1352
+ }
1353
+ }
1180
1354
 
1181
- if (!embd_found && is_token_embd) {
1182
- if (n_dims < 2 || dims[1] == 0) {
1183
- free(name); free_model_contents(m); return NULL;
1184
- }
1185
-
1186
- uint64_t ne0 = dims[0];
1187
- uint64_t ne1 = dims[1];
1188
-
1189
- int need_transpose = 0;
1190
- int dim;
1191
-
1192
- if (ne1 == (uint64_t)m->vocab_size) {
1193
- dim = (int)ne0;
1194
- need_transpose = 0;
1195
- } else if (ne0 == (uint64_t)m->vocab_size) {
1196
- dim = (int)ne1;
1197
- need_transpose = 1;
1198
- } else {
1199
- dim = (ne0 < ne1) ? (int)ne0 : (int)ne1;
1200
- need_transpose = (ne0 > ne1) ? 1 : 0;
1201
- }
1355
+ static const float *tensor_f32_data(const Tensor *t) {
1356
+ if (!t || t->type != GGML_TYPE_F32) return NULL;
1357
+ return (const float*)t->data;
1358
+ }
1202
1359
 
1203
- if (dim <= 0 || dim > MAX_DIM) {
1204
- free(name); free_model_contents(m); return NULL;
1205
- }
1360
+ static float dot_q4_0_q8_0_like_ggml(const uint8_t *raw, const float *x, int n) {
1361
+ int nb = n / QK8_0;
1362
+ float sumf = 0.0f;
1206
1363
 
1207
- size_t row_bytes = get_row_bytes(type, (int)(need_transpose ? ne1 : ne0));
1208
- size_t total_size = (size_t)(need_transpose ? ne1 : ne0) * row_bytes;
1209
-
1210
- if (offset >= sz || offset + total_size > sz) {
1211
- free(name);
1212
- free_model_contents(m);
1213
- return NULL;
1214
- }
1364
+ for (int ib = 0; ib < nb; ib++) {
1365
+ const uint8_t *block = raw + (size_t)ib * 18;
1366
+ uint16_t d16;
1367
+ memcpy(&d16, block, 2);
1368
+ const float dx = fp16_to_fp32(d16);
1369
+ const uint8_t *q = block + 2;
1370
+
1371
+ const float *xb = x + (size_t)ib * QK8_0;
1372
+ float amax = 0.0f;
1373
+ for (int j = 0; j < QK8_0; j++) {
1374
+ float av = fabsf(xb[j]);
1375
+ if (av > amax) amax = av;
1376
+ }
1377
+
1378
+ const float d = amax / 127.0f;
1379
+ const float id = d ? 1.0f / d : 0.0f;
1380
+ const float dy = fp16_to_fp32(fp32_to_fp16(d));
1381
+ int8_t qy[QK8_0];
1382
+ for (int j = 0; j < QK8_0; j++) qy[j] = (int8_t)roundf(xb[j] * id);
1383
+
1384
+ int sumi0 = 0;
1385
+ int sumi1 = 0;
1386
+ for (int j = 0; j < QK8_0/2; j++) {
1387
+ const int v0 = (q[j] & 0x0F) - 8;
1388
+ const int v1 = (q[j] >> 4) - 8;
1389
+ sumi0 += v0 * qy[j];
1390
+ sumi1 += v1 * qy[j + QK8_0/2];
1391
+ }
1392
+ sumf += (float)(sumi0 + sumi1) * dx * dy;
1393
+ }
1215
1394
 
1216
- m->dim = dim;
1217
- m->raw_dim0 = ne0;
1218
- m->raw_dim1 = ne1;
1219
- m->need_transpose = need_transpose;
1220
- m->raw_tensor_data = base + offset;
1221
- m->tensor_type = type;
1222
- m->row_bytes = row_bytes;
1223
- embd_found = 1;
1224
- free(name);
1395
+ return sumf;
1396
+ }
1397
+
1398
/* Simplified ASCII WordPiece-style tokenizer: CLS + greedy longest-match
 * pieces + SEP, writing token ids into `ids` (capacity `max_ids`) and
 * returning the count. Words are lowercased alnum/underscore runs (truncated
 * to 255 bytes, the overflow skipped); any other byte is a one-char word.
 * Each word is prefixed with the \xE2\x96\x81 space marker before matching
 * against the vocab via hget(). NOTE(review): marker-prefix matching looks
 * SentencePiece-flavored rather than "##"-continuation BERT WordPiece —
 * assumed to match how this model's vocab is stored (detect_space_marker). */
static int ascii_wordpiece_tokenize(EmbedModel *m, const char *txt, int *ids, int max_ids) {
    int n = 0;
    if (m->cls_token_id >= 0 && n < max_ids) ids[n++] = m->cls_token_id;

    size_t len = strlen(txt);
    size_t i = 0;
    /* max_ids - 1 keeps one slot reserved for the trailing SEP token */
    while (i < len && n < max_ids - 1) {
        while (i < len && isspace((unsigned char)txt[i])) i++;
        if (i >= len) break;

        char word[256];
        int wl = 0;
        if (isalnum((unsigned char)txt[i])) {
            while (i < len && (isalnum((unsigned char)txt[i]) || txt[i] == '_') && wl < (int)sizeof(word) - 1) {
                word[wl++] = (char)tolower((unsigned char)txt[i++]);
            }
            /* skip the remainder of an over-long word */
            while (i < len && (isalnum((unsigned char)txt[i]) || txt[i] == '_')) i++;
        } else {
            /* punctuation and other bytes become single-character words */
            word[wl++] = txt[i++];
        }
        word[wl] = '\0';
        if (wl == 0) continue;

        /* prepend the UTF-8 space marker expected by the vocabulary */
        char word1[260];
        const char marker[] = "\xE2\x96\x81";
        memcpy(word1, marker, 3);
        memcpy(word1 + 3, word, (size_t)wl + 1);
        int w1l = wl + 3;

        int current_tokens = n;
        /* greedy longest-match: try the longest piece starting at `start`,
         * then resume after it; one unmatched position rolls the whole word
         * back so it can be emitted as a single unknown token */
        for (int start = 0; start < w1l && n < max_ids - 1; start++) {
            int matched = 0;
            for (int end_pos = w1l; end_pos > start; end_pos--) {
                char piece[260];
                int plen = end_pos - start;
                memcpy(piece, word1 + start, plen);
                piece[plen] = '\0';
                int piece_id = hget(m, piece);
                if (piece_id >= 0) {
                    ids[n++] = piece_id;
                    start = end_pos - 1; /* continue after the matched piece */
                    matched = 1;
                    break;
                }
            }
            if (!matched) {
                n = current_tokens; /* roll back partial matches for this word */
                break;
            }
        }

        if (n == current_tokens && m->unknown_token_id >= 0 && n < max_ids - 1) ids[n++] = m->unknown_token_id;
    }

    if (m->sep_token_id >= 0 && n < max_ids) ids[n++] = m->sep_token_id;
    return n;
}
1455
+
1456
/* out = W x + b for a single input vector.
 * W is a GGUF 2-D tensor with dims[0] = input width, dims[1] = output width;
 * each stored row of W is dotted with x. Q4_0 weights take the fused
 * quantized-dot path and never materialize an f32 row; every other type is
 * dequantized row-by-row into `row`, caller-provided scratch of at least
 * dims[0] floats. `b`, when present, must be a plain f32 tensor
 * (tensor_f32_data returns NULL otherwise, which acts as a zero bias). */
static void linear_one(const Tensor *w, const Tensor *b, const float *x, float *out, float *row) {
    int in = (int)w->dims[0];
    int out_dim = (int)w->dims[1];
    const float *bias = tensor_f32_data(b);
    for (int o = 0; o < out_dim; o++) {
        float sum = bias ? bias[o] : 0.0f;
        if (w->type == GGML_TYPE_Q4_0) {
            /* quantized fast path, matching ggml's integer accumulation */
            const uint8_t *raw = w->data + (size_t)o * w->row_bytes;
            sum += dot_q4_0_q8_0_like_ggml(raw, x, in);
        } else {
            tensor_get_row(w, o, row);
            for (int i = 0; i < in; i++) sum += row[i] * x[i];
        }
        out[o] = sum;
    }
}
1237
- free_model_contents(m); return NULL;
1473
+ static void linear_batch(const Tensor *w, const Tensor *b, const float *x, int seq, float *out, float *row) {
1474
+ int in = (int)w->dims[0];
1475
+ int out_dim = (int)w->dims[1];
1476
+ for (int t = 0; t < seq; t++) {
1477
+ linear_one(w, b, x + (size_t)t * in, out + (size_t)t * out_dim, row);
1238
1478
  }
1479
+ }
1239
1480
 
1240
- return m;
1481
+ static void layer_norm(const float *x, const Tensor *w, const Tensor *b, int seq, int dim, float eps, float *out) {
1482
+ const float *weight = tensor_f32_data(w);
1483
+ const float *bias = tensor_f32_data(b);
1484
+ for (int t = 0; t < seq; t++) {
1485
+ const float *src = x + (size_t)t * dim;
1486
+ float *dst = out + (size_t)t * dim;
1487
+ float mean = 0.0f;
1488
+ for (int i = 0; i < dim; i++) mean += src[i];
1489
+ mean /= (float)dim;
1490
+ float var = 0.0f;
1491
+ for (int i = 0; i < dim; i++) {
1492
+ float d = src[i] - mean;
1493
+ var += d * d;
1494
+ }
1495
+ var /= (float)dim;
1496
+ float scale = 1.0f / sqrtf(var + eps);
1497
+ for (int i = 0; i < dim; i++) {
1498
+ dst[i] = (src[i] - mean) * scale * (weight ? weight[i] : 1.0f) + (bias ? bias[i] : 0.0f);
1499
+ }
1500
+ }
1241
1501
  }
1242
1502
 
1243
- /* ------------------------------------------------------------------------- */
1244
- // L2 normalization
1245
- static void normalize_l2(float *vec, int dim) {
1246
- float sum = 0;
1247
- for (int i = 0; i < dim; i++) sum += vec[i] * vec[i];
1248
- float norm = sqrtf(sum);
1249
- if (norm > 1e-8f) {
1250
- float inv = 1.0f / norm;
1251
- for (int i = 0; i < dim; i++) vec[i] *= inv;
1503
/* tanh-approximation GELU with fp16 round-trips on input and output,
 * mimicking ggml's half-precision (table-driven) GELU so results line up. */
static float gelu_approx(float x) {
    if (x <= -10.0f) return 0.0f; /* saturated low end */
    if (x >= 10.0f) return x;     /* effectively identity for large inputs */
    const float sqrt_2_over_pi = 0.7978845608028654f;
    const float h = fp16_to_fp32(fp32_to_fp16(x));
    const float g = 0.5f * h * (1.0f + tanhf(sqrt_2_over_pi * h * (1.0f + 0.044715f * h * h)));
    return fp16_to_fp32(fp32_to_fp16(g));
}
1511
+
1512
/* End-to-end BERT-style encoder: tokenize `txt`, run the embedding layer plus
 * m->n_layers transformer blocks, mean-pool over tokens, and L2-normalize
 * into `out` (m->dim floats).
 *
 * The return value is a "handled" flag, not success/failure:
 *   0 -> this model is not a WPM/BERT graph (wrong vocab type or missing
 *        blk.0.attn_q.weight); caller should use its fallback path.
 *   1 -> handled, including degenerate cases (empty text, allocation
 *        failure) in which `out` is left as the zero vector.
 */
static int bert_embed_text(EmbedModel *m, const char *txt, float *out) {
    if (m->vocab_type != LLAMA_VOCAB_TYPE_WPM || !find_tensor(m, "blk.0.attn_q.weight")) return 0;

    memset(out, 0, (size_t)m->dim * sizeof(float));
    if (!txt || !*txt) return 1;

    /* Tokenize, truncated to the model's context window. */
    int max_seq = m->n_ctx > 0 ? m->n_ctx : 512;
    int *ids = malloc((size_t)max_seq * sizeof(int));
    if (!ids) return 1;
    int seq = ascii_wordpiece_tokenize(m, txt, ids, max_seq);
    if (seq <= 0) { free(ids); return 1; }

    int dim = m->dim;
    int ff = m->n_ff;
    int heads = m->n_heads;
    int head_dim = dim / heads; /* assumes dim % heads == 0 — TODO confirm for all models */
    /* Activation buffers, each (seq x width). `row` doubles as dequant
     * scratch and must hold the widest row (ffn width or hidden size). */
    float *hidden = calloc((size_t)seq * dim, sizeof(float));
    float *tmp = calloc((size_t)seq * dim, sizeof(float));
    float *q = calloc((size_t)seq * dim, sizeof(float));
    float *k = calloc((size_t)seq * dim, sizeof(float));
    float *v = calloc((size_t)seq * dim, sizeof(float));
    float *ctx = calloc((size_t)seq * dim, sizeof(float));
    float *proj = calloc((size_t)seq * dim, sizeof(float));
    float *ffn = calloc((size_t)seq * ff, sizeof(float));
    float *row = malloc((size_t)(ff > dim ? ff : dim) * sizeof(float));
    float *scores = malloc((size_t)seq * sizeof(float));
    if (!hidden || !tmp || !q || !k || !v || !ctx || !proj || !ffn || !row || !scores) {
        free(ids); free(hidden); free(tmp); free(q); free(k); free(v); free(ctx); free(proj); free(ffn); free(row); free(scores);
        return 1;
    }

    Tensor *tok_emb = find_tensor(m, "token_embd.weight");
    Tensor *pos_emb = find_tensor(m, "position_embd.weight");
    Tensor *typ_emb = find_tensor(m, "token_types.weight");
    Tensor *emb_norm_w = find_tensor(m, "token_embd_norm.weight");
    Tensor *emb_norm_b = find_tensor(m, "token_embd_norm.bias");

    /* `tok` aliases the `row` scratch buffer (freed once, as `row`). */
    float *tok = row;
    float *pos = malloc((size_t)dim * sizeof(float));
    float *typ = malloc((size_t)dim * sizeof(float));
    if (!tok_emb || !pos_emb || !typ_emb || !pos || !typ) {
        free(ids); free(hidden); free(tmp); free(q); free(k); free(v); free(ctx); free(proj); free(ffn); free(row); free(scores); free(pos); free(typ);
        return 1;
    }

    /* Embedding layer: token + learned position + token-type 0, then norm. */
    for (int t = 0; t < seq; t++) {
        tensor_get_row(tok_emb, ids[t], tok);
        tensor_get_row(pos_emb, t, pos);
        tensor_get_row(typ_emb, 0, typ);
        for (int d = 0; d < dim; d++) hidden[(size_t)t * dim + d] = tok[d] + pos[d] + typ[d];
    }
    layer_norm(hidden, emb_norm_w, emb_norm_b, seq, dim, m->eps, tmp);
    memcpy(hidden, tmp, (size_t)seq * dim * sizeof(float));

    for (int layer = 0; layer < m->n_layers; layer++) {
        char name[80];
        /* TENSOR("x") looks up "blk.<layer>.x" */
        #define TENSOR(suffix) (snprintf(name, sizeof(name), "blk.%d.%s", layer, suffix), find_tensor(m, name))
        Tensor *qw = TENSOR("attn_q.weight");
        Tensor *qb = TENSOR("attn_q.bias");
        Tensor *kw = TENSOR("attn_k.weight");
        Tensor *kb = TENSOR("attn_k.bias");
        Tensor *vw = TENSOR("attn_v.weight");
        Tensor *vb = TENSOR("attn_v.bias");
        Tensor *ow = TENSOR("attn_output.weight");
        Tensor *ob = TENSOR("attn_output.bias");
        Tensor *an_w = TENSOR("attn_output_norm.weight");
        Tensor *an_b = TENSOR("attn_output_norm.bias");
        Tensor *fu_w = TENSOR("ffn_up.weight");
        Tensor *fu_b = TENSOR("ffn_up.bias");
        Tensor *fd_w = TENSOR("ffn_down.weight");
        Tensor *fd_b = TENSOR("ffn_down.bias");
        Tensor *ln_w = TENSOR("layer_output_norm.weight");
        Tensor *ln_b = TENSOR("layer_output_norm.bias");
        #undef TENSOR

        /* Incomplete layer: stop and embed with what has been computed. */
        if (!qw || !qb || !kw || !kb || !vw || !vb || !ow || !ob || !an_w || !an_b ||
            !fu_w || !fu_b || !fd_w || !fd_b || !ln_w || !ln_b) break;

        linear_batch(qw, qb, hidden, seq, q, row);
        linear_batch(kw, kb, hidden, seq, k, row);
        linear_batch(vw, vb, hidden, seq, v, row);
        memset(ctx, 0, (size_t)seq * dim * sizeof(float));

        /* Multi-head self-attention with a max-subtracted stable softmax. */
        float att_scale = 1.0f / sqrtf((float)head_dim);
        for (int h = 0; h < heads; h++) {
            int off = h * head_dim;
            for (int ti = 0; ti < seq; ti++) {
                float max_score = -INFINITY;
                for (int tj = 0; tj < seq; tj++) {
                    float dot = 0.0f;
                    const float *qv0 = q + (size_t)ti * dim + off;
                    const float *kv0 = k + (size_t)tj * dim + off;
                    for (int d = 0; d < head_dim; d++) dot += qv0[d] * kv0[d];
                    scores[tj] = dot * att_scale;
                    if (scores[tj] > max_score) max_score = scores[tj];
                }
                double sum = 0.0;
                for (int tj = 0; tj < seq; tj++) {
                    scores[tj] = expf(scores[tj] - max_score);
                    sum += scores[tj];
                }
                float inv_sum = (float)(1.0 / sum);
                float *dst = ctx + (size_t)ti * dim + off;
                for (int tj = 0; tj < seq; tj++) {
                    float p = scores[tj] * inv_sum;
                    const float *vv0 = v + (size_t)tj * dim + off;
                    for (int d = 0; d < head_dim; d++) dst[d] += p * vv0[d];
                }
            }
        }

        /* Attention output projection + residual + post-attention norm. */
        linear_batch(ow, ob, ctx, seq, proj, row);
        for (int i = 0; i < seq * dim; i++) tmp[i] = hidden[i] + proj[i];
        layer_norm(tmp, an_w, an_b, seq, dim, m->eps, hidden);

        /* Feed-forward (GELU) + residual + output norm. */
        linear_batch(fu_w, fu_b, hidden, seq, ffn, row);
        for (int i = 0; i < seq * ff; i++) ffn[i] = gelu_approx(ffn[i]);
        linear_batch(fd_w, fd_b, ffn, seq, proj, row);
        for (int i = 0; i < seq * dim; i++) tmp[i] = hidden[i] + proj[i];
        layer_norm(tmp, ln_w, ln_b, seq, dim, m->eps, hidden);
    }

    /* Mean-pool over all token positions, then L2-normalize. */
    for (int t = 0; t < seq; t++) {
        for (int d = 0; d < dim; d++) out[d] += hidden[(size_t)t * dim + d];
    }
    float inv = 1.0f / (float)seq;
    for (int d = 0; d < dim; d++) out[d] *= inv;
    normalize_l2(out, dim);

    free(ids); free(hidden); free(tmp); free(q); free(k); free(v); free(ctx); free(proj); free(ffn); free(row); free(scores); free(pos); free(typ);
    return 1;
}
1254
1644
 
1255
1645
  /* ------------------------------------------------------------------------- */
1256
1646
  static void embed_text(EmbedModel *m, const char *txt, float *out) {
1647
+ if (bert_embed_text(m, txt, out)) return;
1648
+
1257
1649
  memset(out, 0, sizeof(float) * m->dim);
1258
1650
  if (!txt || !*txt) return;
1259
1651
 
@@ -1413,4 +1805,4 @@ void Init_mini_embed(void) {
1413
1805
  rb_define_alloc_func(c, rb_embedder_alloc);
1414
1806
  rb_define_method(c, "initialize", rb_embedder_initialize, 1);
1415
1807
  rb_define_method(c, "embed", rb_embed, 1);
1416
- }
1808
+ }
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: mini_embed
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0
4
+ version: 0.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Makapoxa