mini_embed 0.3.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +2 -0
- data/ext/mini_embed/mini_embed.c +491 -99
- metadata +1 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 0f3a2f9365c3ba228faf709ec97d986dadcc22a78ab0b706d35ba3da5e1552ce
|
|
4
|
+
data.tar.gz: 778847fe77dc4cb8b8774b62fe6f212c435880e470868f15c2517f1adb37211a
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: '086948ced123967c0aa5f7e0fcb6624dffb2f68f6c95e4abd2dbaa429fb8717d98be76f6b173925b21af1bc14a0fe8af9d6d68d891ba4ce90d5a0b2145df55ef'
|
|
7
|
+
data.tar.gz: 2ac2c25baf87dd7b21fc38ccce6c3be0a3c133008ef27fa1790a95d3bc6146d5cb522166c93d8961edb16c3128e13d8f0d9206869abee43af10586f0124e00f2
|
data/README.md
CHANGED
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
# mini_embed
|
|
2
2
|
|
|
3
|
+
[CircleCI build status](https://dl.circleci.com/status-badge/redirect/gh/Makapoxa/mini_embed/tree/main) [Gem Version](https://badge.fury.io/rb/mini_embed)
|
|
4
|
+
|
|
3
5
|
A minimal, dependency‑free C extension for Ruby that loads [GGUF](https://github.com/ggerganov/ggml/blob/master/docs/gguf.md) embedding models and computes text embeddings **locally**.
|
|
4
6
|
|
|
5
7
|
**⚠️ Important:** This gem is intended for **small projects, prototypes, and hobbyist use**. It allows you to experiment with embeddings without relying on external APIs or cloud costs. **Do not use MiniEmbed in production** – it lacks the performance, scalability, and tokenization robustness of dedicated solutions. For real applications, use a proper inference server like [llama.cpp](https://github.com/ggerganov/llama.cpp) with its HTTP API, or managed services such as OpenAI, Cohere, or Hugging Face.
|
data/ext/mini_embed/mini_embed.c
CHANGED
|
@@ -440,9 +440,23 @@ typedef struct HashNode {
|
|
|
440
440
|
struct HashNode *next;
|
|
441
441
|
} HashNode;
|
|
442
442
|
|
|
443
|
+
/* Descriptor for one tensor listed in the GGUF tensor-info table. */
typedef struct {
|
|
444
|
+
char *name; /* tensor name string; released with free() when the model is torn down */
|
|
445
|
+
uint32_t n_dims; /* number of dims[] entries in use; the loader rejects 0 and values above MAX_DIMS */
|
|
446
|
+
uint64_t dims[MAX_DIMS]; /* dims[0] is used as the row length (columns), dims[1] as the row count */
|
|
447
|
+
int type; /* element/quantization type id as read from the file (compared against GGML_TYPE_*) */
|
|
448
|
+
const uint8_t *data; /* start of the tensor payload; the loader first stores the raw file offset here, then resolves it to a pointer */
|
|
449
|
+
size_t row_bytes; /* encoded size of one row: get_row_bytes(type, dims[0]) */
|
|
450
|
+
} Tensor;
|
|
451
|
+
|
|
443
452
|
typedef struct {
|
|
444
453
|
int vocab_size;
|
|
445
454
|
int dim;
|
|
455
|
+
int n_layers;
|
|
456
|
+
int n_heads;
|
|
457
|
+
int n_ctx;
|
|
458
|
+
int n_ff;
|
|
459
|
+
float eps;
|
|
446
460
|
char **tokens;
|
|
447
461
|
void *mapped;
|
|
448
462
|
size_t mapped_size;
|
|
@@ -460,6 +474,11 @@ typedef struct {
|
|
|
460
474
|
int need_transpose;
|
|
461
475
|
uint64_t raw_dim0, raw_dim1;
|
|
462
476
|
int normalize;
|
|
477
|
+
Tensor *tensors;
|
|
478
|
+
int n_tensors;
|
|
479
|
+
int sep_token_id;
|
|
480
|
+
int pad_token_id;
|
|
481
|
+
int cls_token_id;
|
|
463
482
|
} EmbedModel;
|
|
464
483
|
|
|
465
484
|
typedef struct {
|
|
@@ -541,6 +560,36 @@ static float fp16_to_fp32(uint16_t h) {
|
|
|
541
560
|
return result;
|
|
542
561
|
}
|
|
543
562
|
|
|
563
|
+
/*
 * Convert an IEEE-754 binary32 value to its binary16 bit pattern.
 *
 * Rounding: the highest discarded mantissa bit decides (ties round up in
 * magnitude), unchanged from the previous implementation.
 *
 * Special cases:
 *   - +/-Inf stays +/-Inf.
 *   - NaN stays NaN; if truncating the payload would leave a zero mantissa
 *     (which would decode as Inf) a quiet bit is forced instead.
 *   - Finite values too large for binary16 become +/-Inf. Previously such
 *     values with a non-zero mantissa were encoded as NaN.
 *   - Magnitudes below the smallest half subnormal flush to signed zero.
 */
static uint16_t fp32_to_fp16(float f) {
    uint32_t x;
    memcpy(&x, &f, sizeof(x));

    const uint32_t sign = (x >> 16) & 0x8000;
    const uint32_t src_exp = (x >> 23) & 0xFF;
    uint32_t mant = x & 0x7FFFFF;

    /* Inf / NaN are identified by the SOURCE exponent, not the rebased one. */
    if (src_exp == 0xFF) {
        if (mant == 0) return (uint16_t)(sign | 0x7C00);
        uint32_t payload = mant >> 13;
        if (payload == 0) payload = 0x0200; /* keep it a NaN */
        return (uint16_t)(sign | 0x7C00 | payload);
    }

    /* Rebase the exponent from bias 127 to bias 15. */
    int exp = (int)src_exp - 127 + 15;

    /* Finite but out of range for binary16: saturate to infinity. */
    if (exp >= 31) return (uint16_t)(sign | 0x7C00);

    if (exp <= 0) {
        /* Result is a half subnormal (or zero). */
        if (exp < -10) return (uint16_t)sign;
        mant |= 0x800000; /* make the implicit leading 1 explicit */
        uint32_t t = mant >> (1 - exp);
        if (t & 0x00001000) t += 0x00002000;
        return (uint16_t)(sign | (t >> 13));
    }

    /* Normal range: round, propagating a mantissa carry into the exponent. */
    if (mant & 0x00001000) {
        mant += 0x00002000;
        if (mant & 0x00800000) {
            mant = 0;
            exp += 1;
        }
    }
    if (exp >= 31) return (uint16_t)(sign | 0x7C00);
    return (uint16_t)(sign | ((uint32_t)exp << 10) | (mant >> 13));
}
|
|
592
|
+
|
|
544
593
|
/* ------------------------------------------------------------------------- */
|
|
545
594
|
// Block dequantization functions (correct sizes)
|
|
546
595
|
static void dequantize_row_q4_0(const void *vx, float *y, int k) {
|
|
@@ -552,9 +601,9 @@ static void dequantize_row_q4_0(const void *vx, float *y, int k) {
|
|
|
552
601
|
memcpy(&d16, block, 2);
|
|
553
602
|
const float d = fp16_to_fp32(d16);
|
|
554
603
|
const uint8_t *q = block + 2;
|
|
555
|
-
for (int j = 0; j <
|
|
556
|
-
|
|
557
|
-
y[i*32 + j] = (
|
|
604
|
+
for (int j = 0; j < 16; j++) {
|
|
605
|
+
y[i*32 + j] = ((q[j] & 0x0F) - 8.0f) * d;
|
|
606
|
+
y[i*32 + j + 16] = ((q[j] >> 4) - 8.0f) * d;
|
|
558
607
|
}
|
|
559
608
|
}
|
|
560
609
|
}
|
|
@@ -570,9 +619,9 @@ static void dequantize_row_q4_1(const void *vx, float *y, int k) {
|
|
|
570
619
|
const float d = fp16_to_fp32(d16);
|
|
571
620
|
const float m = fp16_to_fp32(m16);
|
|
572
621
|
const uint8_t *q = block + 4;
|
|
573
|
-
for (int j = 0; j <
|
|
574
|
-
|
|
575
|
-
y[i*32 + j] =
|
|
622
|
+
for (int j = 0; j < 16; j++) {
|
|
623
|
+
y[i*32 + j] = (q[j] & 0x0F) * d + m;
|
|
624
|
+
y[i*32 + j + 16] = (q[j] >> 4) * d + m;
|
|
576
625
|
}
|
|
577
626
|
}
|
|
578
627
|
}
|
|
@@ -622,9 +671,10 @@ static void dequantize_row_q8_0(const void *vx, float *y, int k) {
|
|
|
622
671
|
const uint8_t *x = vx;
|
|
623
672
|
for (int i = 0; i < nb; i++) {
|
|
624
673
|
const uint8_t *block = x + i * 34;
|
|
625
|
-
|
|
626
|
-
memcpy(&
|
|
627
|
-
const
|
|
674
|
+
uint16_t d16;
|
|
675
|
+
memcpy(&d16, block, 2);
|
|
676
|
+
const float d = fp16_to_fp32(d16);
|
|
677
|
+
const int8_t *q = (const int8_t*)(block + 2);
|
|
628
678
|
for (int j = 0; j < 32; j++) {
|
|
629
679
|
y[i*32 + j] = (float)q[j] * d;
|
|
630
680
|
}
|
|
@@ -635,11 +685,13 @@ static void dequantize_row_q8_1(const void *vx, float *y, int k) {
|
|
|
635
685
|
const int nb = k / QK8_0;
|
|
636
686
|
const uint8_t *x = vx;
|
|
637
687
|
for (int i = 0; i < nb; i++) {
|
|
638
|
-
const uint8_t *block = x + i *
|
|
639
|
-
|
|
640
|
-
memcpy(&
|
|
641
|
-
memcpy(&
|
|
642
|
-
const
|
|
688
|
+
const uint8_t *block = x + i * 36;
|
|
689
|
+
uint16_t d16, s16;
|
|
690
|
+
memcpy(&d16, block, 2);
|
|
691
|
+
memcpy(&s16, block + 2, 2);
|
|
692
|
+
const float d = fp16_to_fp32(d16);
|
|
693
|
+
const float s = fp16_to_fp32(s16);
|
|
694
|
+
const int8_t *q = (const int8_t*)(block + 4);
|
|
643
695
|
for (int j = 0; j < 32; j++) {
|
|
644
696
|
y[i*32 + j] = (float)q[j] * d + s;
|
|
645
697
|
}
|
|
@@ -932,7 +984,7 @@ static size_t get_row_bytes(int type, int n_cols) {
|
|
|
932
984
|
case GGML_TYPE_Q5_0: return (n_cols / 32) * 22;
|
|
933
985
|
case GGML_TYPE_Q5_1: return (n_cols / 32) * 24;
|
|
934
986
|
case GGML_TYPE_Q8_0: return (n_cols / 32) * 34;
|
|
935
|
-
case GGML_TYPE_Q8_1: return (n_cols / 32) *
|
|
987
|
+
case GGML_TYPE_Q8_1: return (n_cols / 32) * 36;
|
|
936
988
|
case GGML_TYPE_Q2_K: return (n_cols / 256) * 84;
|
|
937
989
|
case GGML_TYPE_Q3_K: return (n_cols / 256) * 110;
|
|
938
990
|
case GGML_TYPE_Q4_K: return (n_cols / 256) * 144;
|
|
@@ -949,6 +1001,7 @@ static int skip_value(uint8_t **p, uint8_t *end, uint32_t type) {
|
|
|
949
1001
|
case 0: case 1: case 7: return safe_advance(p, end, 1);
|
|
950
1002
|
case 2: case 3: return safe_advance(p, end, 2);
|
|
951
1003
|
case 4: case 5: case 6: return safe_advance(p, end, 4);
|
|
1004
|
+
case 10: case 11: case 12: return safe_advance(p, end, 8);
|
|
952
1005
|
case 8: {
|
|
953
1006
|
uint64_t len = rd64(p, end);
|
|
954
1007
|
return safe_advance(p, end, len);
|
|
@@ -982,6 +1035,10 @@ static void free_model_contents(EmbedModel *m) {
|
|
|
982
1035
|
}
|
|
983
1036
|
free(m->table);
|
|
984
1037
|
}
|
|
1038
|
+
if (m->tensors) {
|
|
1039
|
+
for (int i = 0; i < m->n_tensors; i++) free(m->tensors[i].name);
|
|
1040
|
+
free(m->tensors);
|
|
1041
|
+
}
|
|
985
1042
|
if (m->mapped) munmap(m->mapped, m->mapped_size);
|
|
986
1043
|
bpe_merge_table_free(&m->merges);
|
|
987
1044
|
free(m);
|
|
@@ -1047,6 +1104,21 @@ static void parse_merge(const char *merge_str, char **left, char **right) {
|
|
|
1047
1104
|
}
|
|
1048
1105
|
}
|
|
1049
1106
|
|
|
1107
|
+
/*
 * Look up a tensor descriptor by exact name.
 * Returns a pointer into m->tensors, or NULL when the model is NULL, has no
 * tensor table, or no entry carries that name.
 */
static Tensor *find_tensor(EmbedModel *m, const char *name) {
    if (m == NULL || m->tensors == NULL) {
        return NULL;
    }

    int idx = 0;
    while (idx < m->n_tensors) {
        Tensor *candidate = &m->tensors[idx];
        if (!strcmp(candidate->name, name)) {
            return candidate;
        }
        idx++;
    }
    return NULL;
}
|
|
1114
|
+
|
|
1115
|
+
/*
 * Read a 32-bit value through rd32() and reinterpret its bits as a float.
 * memcpy is used for the reinterpretation so no pointer-cast aliasing occurs.
 */
static float rd_float32(uint8_t **p, uint8_t *end) {
    const uint32_t raw = rd32(p, end);
    float value;
    memcpy(&value, &raw, sizeof value);
    return value;
}
|
|
1121
|
+
|
|
1050
1122
|
/* ------------------------------------------------------------------------- */
|
|
1051
1123
|
static EmbedModel *embed_load_gguf(const char *path) {
|
|
1052
1124
|
size_t sz;
|
|
@@ -1072,8 +1144,12 @@ static EmbedModel *embed_load_gguf(const char *path) {
|
|
|
1072
1144
|
m->unknown_token_id = -1;
|
|
1073
1145
|
m->bos_token_id = -1;
|
|
1074
1146
|
m->eos_token_id = -1;
|
|
1147
|
+
m->sep_token_id = -1;
|
|
1148
|
+
m->pad_token_id = 0;
|
|
1149
|
+
m->cls_token_id = -1;
|
|
1075
1150
|
m->vocab_type = LLAMA_VOCAB_TYPE_NONE;
|
|
1076
1151
|
m->normalize = NORM_NONE;
|
|
1152
|
+
m->eps = 1e-12f;
|
|
1077
1153
|
|
|
1078
1154
|
int vocab_found = 0;
|
|
1079
1155
|
for (uint64_t i = 0; i < n_kv; i++) {
|
|
@@ -1136,12 +1212,31 @@ static EmbedModel *embed_load_gguf(const char *path) {
|
|
|
1136
1212
|
} else if (strcmp(key, "tokenizer.ggml.pre") == 0 && type == 8) {
|
|
1137
1213
|
char *pre = rdstr(&cur, end);
|
|
1138
1214
|
free(pre);
|
|
1139
|
-
} else if (strcmp(key, "
|
|
1215
|
+
} else if (strcmp(key, "bert.block_count") == 0 && type == 4) {
|
|
1216
|
+
m->n_layers = (int)rd32(&cur, end);
|
|
1217
|
+
} else if (strcmp(key, "bert.context_length") == 0 && type == 4) {
|
|
1218
|
+
m->n_ctx = (int)rd32(&cur, end);
|
|
1219
|
+
} else if (strcmp(key, "bert.embedding_length") == 0 && type == 4) {
|
|
1220
|
+
m->dim = (int)rd32(&cur, end);
|
|
1221
|
+
} else if (strcmp(key, "bert.feed_forward_length") == 0 && type == 4) {
|
|
1222
|
+
m->n_ff = (int)rd32(&cur, end);
|
|
1223
|
+
} else if (strcmp(key, "bert.attention.head_count") == 0 && type == 4) {
|
|
1224
|
+
m->n_heads = (int)rd32(&cur, end);
|
|
1225
|
+
} else if (strcmp(key, "bert.attention.layer_norm_epsilon") == 0 && type == 6) {
|
|
1226
|
+
m->eps = rd_float32(&cur, end);
|
|
1227
|
+
} else if (strcmp(key, "tokenizer.ggml.unknown_token_id") == 0 && type == 4) {
|
|
1140
1228
|
m->unknown_token_id = (int)rd32(&cur, end);
|
|
1141
|
-
} else if (strcmp(key, "tokenizer.ggml.bos_token_id") == 0 && type ==
|
|
1229
|
+
} else if (strcmp(key, "tokenizer.ggml.bos_token_id") == 0 && type == 4) {
|
|
1142
1230
|
m->bos_token_id = (int)rd32(&cur, end);
|
|
1143
|
-
} else if (strcmp(key, "tokenizer.ggml.eos_token_id") == 0 && type ==
|
|
1231
|
+
} else if (strcmp(key, "tokenizer.ggml.eos_token_id") == 0 && type == 4) {
|
|
1144
1232
|
m->eos_token_id = (int)rd32(&cur, end);
|
|
1233
|
+
m->sep_token_id = m->eos_token_id;
|
|
1234
|
+
} else if (strcmp(key, "tokenizer.ggml.seperator_token_id") == 0 && type == 4) {
|
|
1235
|
+
m->sep_token_id = (int)rd32(&cur, end);
|
|
1236
|
+
} else if (strcmp(key, "tokenizer.ggml.padding_token_id") == 0 && type == 4) {
|
|
1237
|
+
m->pad_token_id = (int)rd32(&cur, end);
|
|
1238
|
+
} else if (strcmp(key, "tokenizer.ggml.cls_token_id") == 0 && type == 4) {
|
|
1239
|
+
m->cls_token_id = (int)rd32(&cur, end);
|
|
1145
1240
|
} else if (strcmp(key, "general.alignment") == 0 && type == 6) {
|
|
1146
1241
|
rd32(&cur, end);
|
|
1147
1242
|
} else {
|
|
@@ -1153,107 +1248,404 @@ static EmbedModel *embed_load_gguf(const char *path) {
|
|
|
1153
1248
|
if (!vocab_found) { free_model_contents(m); return NULL; }
|
|
1154
1249
|
detect_space_marker(m);
|
|
1155
1250
|
|
|
1156
|
-
|
|
1251
|
+
m->tensors = calloc((size_t)n_tensors, sizeof(Tensor));
|
|
1252
|
+
if (!m->tensors) { free_model_contents(m); return NULL; }
|
|
1253
|
+
m->n_tensors = (int)n_tensors;
|
|
1254
|
+
|
|
1255
|
+
for (uint64_t i = 0; i < n_tensors; i++) {
|
|
1256
|
+
Tensor *t = &m->tensors[i];
|
|
1257
|
+
t->name = rdstr(&cur, end);
|
|
1258
|
+
if (!t->name) { free_model_contents(m); return NULL; }
|
|
1259
|
+
t->n_dims = rd32(&cur, end);
|
|
1260
|
+
if (t->n_dims == 0 || t->n_dims > MAX_DIMS) { free_model_contents(m); return NULL; }
|
|
1261
|
+
for (uint32_t d = 0; d < t->n_dims; d++) t->dims[d] = rd64(&cur, end);
|
|
1262
|
+
t->type = (int)rd32(&cur, end);
|
|
1263
|
+
uint64_t offset = rd64(&cur, end);
|
|
1264
|
+
t->row_bytes = get_row_bytes(t->type, (int)t->dims[0]);
|
|
1265
|
+
if (t->row_bytes == 0) { free_model_contents(m); return NULL; }
|
|
1266
|
+
t->data = (const uint8_t*)(uintptr_t)offset;
|
|
1267
|
+
}
|
|
1268
|
+
|
|
1157
1269
|
align_to_32(&cur, end, base);
|
|
1158
|
-
uint8_t *
|
|
1159
|
-
int
|
|
1160
|
-
|
|
1161
|
-
|
|
1162
|
-
|
|
1163
|
-
|
|
1164
|
-
|
|
1165
|
-
|
|
1166
|
-
|
|
1167
|
-
|
|
1168
|
-
|
|
1169
|
-
|
|
1170
|
-
|
|
1171
|
-
|
|
1172
|
-
|
|
1173
|
-
|
|
1174
|
-
|
|
1175
|
-
|
|
1176
|
-
|
|
1177
|
-
|
|
1178
|
-
|
|
1270
|
+
uint8_t *data_start = cur;
|
|
1271
|
+
for (int i = 0; i < m->n_tensors; i++) {
|
|
1272
|
+
Tensor *t = &m->tensors[i];
|
|
1273
|
+
uint64_t offset = (uint64_t)(uintptr_t)t->data;
|
|
1274
|
+
size_t rows = t->n_dims > 1 ? (size_t)t->dims[1] : 1;
|
|
1275
|
+
size_t total_size = rows * t->row_bytes;
|
|
1276
|
+
if (offset > (uint64_t)sz || data_start + offset < data_start ||
|
|
1277
|
+
data_start + offset + total_size > end) {
|
|
1278
|
+
free_model_contents(m);
|
|
1279
|
+
return NULL;
|
|
1280
|
+
}
|
|
1281
|
+
t->data = data_start + offset;
|
|
1282
|
+
}
|
|
1283
|
+
|
|
1284
|
+
Tensor *embd = find_tensor(m, "token_embd.weight");
|
|
1285
|
+
if (!embd) embd = find_tensor(m, "embeddings.word_embeddings.weight");
|
|
1286
|
+
if (!embd || embd->n_dims < 2 || embd->dims[1] != (uint64_t)m->vocab_size) {
|
|
1287
|
+
free_model_contents(m);
|
|
1288
|
+
return NULL;
|
|
1289
|
+
}
|
|
1290
|
+
|
|
1291
|
+
if (m->dim == 0) m->dim = (int)embd->dims[0];
|
|
1292
|
+
if (m->n_ctx == 0) m->n_ctx = 512;
|
|
1293
|
+
if (m->n_ff == 0) m->n_ff = m->dim * 4;
|
|
1294
|
+
if (m->n_heads == 0) m->n_heads = 12;
|
|
1295
|
+
if (m->n_layers == 0) m->n_layers = 12;
|
|
1296
|
+
if (m->cls_token_id < 0) m->cls_token_id = m->bos_token_id;
|
|
1297
|
+
if (m->sep_token_id < 0) m->sep_token_id = m->eos_token_id;
|
|
1298
|
+
|
|
1299
|
+
m->raw_tensor_data = embd->data;
|
|
1300
|
+
m->tensor_type = embd->type;
|
|
1301
|
+
m->row_bytes = embd->row_bytes;
|
|
1302
|
+
m->raw_dim0 = embd->dims[0];
|
|
1303
|
+
m->raw_dim1 = embd->dims[1];
|
|
1304
|
+
m->need_transpose = 0;
|
|
1305
|
+
|
|
1306
|
+
if (m->dim <= 0 || m->dim > MAX_DIM) {
|
|
1307
|
+
free_model_contents(m); return NULL;
|
|
1308
|
+
}
|
|
1309
|
+
|
|
1310
|
+
return m;
|
|
1311
|
+
}
|
|
1312
|
+
|
|
1313
|
+
/* ------------------------------------------------------------------------- */
|
|
1314
|
+
// L2 normalization
|
|
1315
|
+
static void normalize_l2(float *vec, int dim) {
|
|
1316
|
+
double sum = 0.0;
|
|
1317
|
+
for (int i = 0; i < dim; i++) sum += vec[i] * vec[i];
|
|
1318
|
+
double norm = sqrt(sum);
|
|
1319
|
+
if (norm > 0.0) {
|
|
1320
|
+
float inv = (float)(1.0 / norm);
|
|
1321
|
+
for (int i = 0; i < dim; i++) vec[i] *= inv;
|
|
1322
|
+
}
|
|
1323
|
+
}
|
|
1324
|
+
|
|
1325
|
+
static void tensor_get_row(const Tensor *t, int row, float *out) {
|
|
1326
|
+
if (!t || row < 0 || (t->n_dims > 1 && row >= (int)t->dims[1])) {
|
|
1327
|
+
return;
|
|
1328
|
+
}
|
|
1329
|
+
|
|
1330
|
+
const uint8_t *raw = t->data + (size_t)row * t->row_bytes;
|
|
1331
|
+
int cols = (int)t->dims[0];
|
|
1332
|
+
switch (t->type) {
|
|
1333
|
+
case GGML_TYPE_F32:
|
|
1334
|
+
memcpy(out, raw, (size_t)cols * sizeof(float));
|
|
1335
|
+
break;
|
|
1336
|
+
case GGML_TYPE_F16:
|
|
1337
|
+
for (int i = 0; i < cols; i++) {
|
|
1338
|
+
uint16_t h;
|
|
1339
|
+
memcpy(&h, raw + (size_t)i * sizeof(uint16_t), sizeof(uint16_t));
|
|
1340
|
+
out[i] = fp16_to_fp32(h);
|
|
1179
1341
|
}
|
|
1342
|
+
break;
|
|
1343
|
+
case GGML_TYPE_Q4_0:
|
|
1344
|
+
dequantize_row_q4_0(raw, out, cols);
|
|
1345
|
+
break;
|
|
1346
|
+
case GGML_TYPE_Q8_0:
|
|
1347
|
+
dequantize_row_q8_0(raw, out, cols);
|
|
1348
|
+
break;
|
|
1349
|
+
default:
|
|
1350
|
+
memset(out, 0, (size_t)cols * sizeof(float));
|
|
1351
|
+
break;
|
|
1352
|
+
}
|
|
1353
|
+
}
|
|
1180
1354
|
|
|
1181
|
-
|
|
1182
|
-
|
|
1183
|
-
|
|
1184
|
-
|
|
1185
|
-
|
|
1186
|
-
uint64_t ne0 = dims[0];
|
|
1187
|
-
uint64_t ne1 = dims[1];
|
|
1188
|
-
|
|
1189
|
-
int need_transpose = 0;
|
|
1190
|
-
int dim;
|
|
1191
|
-
|
|
1192
|
-
if (ne1 == (uint64_t)m->vocab_size) {
|
|
1193
|
-
dim = (int)ne0;
|
|
1194
|
-
need_transpose = 0;
|
|
1195
|
-
} else if (ne0 == (uint64_t)m->vocab_size) {
|
|
1196
|
-
dim = (int)ne1;
|
|
1197
|
-
need_transpose = 1;
|
|
1198
|
-
} else {
|
|
1199
|
-
dim = (ne0 < ne1) ? (int)ne0 : (int)ne1;
|
|
1200
|
-
need_transpose = (ne0 > ne1) ? 1 : 0;
|
|
1201
|
-
}
|
|
1355
|
+
static const float *tensor_f32_data(const Tensor *t) {
|
|
1356
|
+
if (!t || t->type != GGML_TYPE_F32) return NULL;
|
|
1357
|
+
return (const float*)t->data;
|
|
1358
|
+
}
|
|
1202
1359
|
|
|
1203
|
-
|
|
1204
|
-
|
|
1205
|
-
|
|
1360
|
+
static float dot_q4_0_q8_0_like_ggml(const uint8_t *raw, const float *x, int n) {
|
|
1361
|
+
int nb = n / QK8_0;
|
|
1362
|
+
float sumf = 0.0f;
|
|
1206
1363
|
|
|
1207
|
-
|
|
1208
|
-
|
|
1209
|
-
|
|
1210
|
-
|
|
1211
|
-
|
|
1212
|
-
|
|
1213
|
-
|
|
1214
|
-
|
|
1364
|
+
for (int ib = 0; ib < nb; ib++) {
|
|
1365
|
+
const uint8_t *block = raw + (size_t)ib * 18;
|
|
1366
|
+
uint16_t d16;
|
|
1367
|
+
memcpy(&d16, block, 2);
|
|
1368
|
+
const float dx = fp16_to_fp32(d16);
|
|
1369
|
+
const uint8_t *q = block + 2;
|
|
1370
|
+
|
|
1371
|
+
const float *xb = x + (size_t)ib * QK8_0;
|
|
1372
|
+
float amax = 0.0f;
|
|
1373
|
+
for (int j = 0; j < QK8_0; j++) {
|
|
1374
|
+
float av = fabsf(xb[j]);
|
|
1375
|
+
if (av > amax) amax = av;
|
|
1376
|
+
}
|
|
1377
|
+
|
|
1378
|
+
const float d = amax / 127.0f;
|
|
1379
|
+
const float id = d ? 1.0f / d : 0.0f;
|
|
1380
|
+
const float dy = fp16_to_fp32(fp32_to_fp16(d));
|
|
1381
|
+
int8_t qy[QK8_0];
|
|
1382
|
+
for (int j = 0; j < QK8_0; j++) qy[j] = (int8_t)roundf(xb[j] * id);
|
|
1383
|
+
|
|
1384
|
+
int sumi0 = 0;
|
|
1385
|
+
int sumi1 = 0;
|
|
1386
|
+
for (int j = 0; j < QK8_0/2; j++) {
|
|
1387
|
+
const int v0 = (q[j] & 0x0F) - 8;
|
|
1388
|
+
const int v1 = (q[j] >> 4) - 8;
|
|
1389
|
+
sumi0 += v0 * qy[j];
|
|
1390
|
+
sumi1 += v1 * qy[j + QK8_0/2];
|
|
1391
|
+
}
|
|
1392
|
+
sumf += (float)(sumi0 + sumi1) * dx * dy;
|
|
1393
|
+
}
|
|
1215
1394
|
|
|
1216
|
-
|
|
1217
|
-
|
|
1218
|
-
|
|
1219
|
-
|
|
1220
|
-
|
|
1221
|
-
|
|
1222
|
-
|
|
1223
|
-
|
|
1224
|
-
|
|
1395
|
+
return sumf;
|
|
1396
|
+
}
|
|
1397
|
+
|
|
1398
|
+
/*
 * Greedy longest-match tokenizer for ASCII text.
 *
 * Writes at most `max_ids` token ids into `ids` and returns the count.
 * Layout: [CLS if configured] word pieces... [SEP if configured]; one slot
 * is kept free for SEP while words are processed.
 *
 * Words are runs of alphanumerics/underscores (lower-cased, capped at 255
 * bytes, the excess is skipped) or a single other non-space byte. Each word
 * is prefixed with the U+2581 marker and split by repeatedly taking the
 * longest vocabulary entry found via hget(). If any part of a word cannot be
 * matched, the pieces already emitted for that word are discarded and the
 * unknown token (when configured) is emitted instead.
 */
static int ascii_wordpiece_tokenize(EmbedModel *m, const char *txt, int *ids, int max_ids) {
    enum { MARK_LEN = 3, WORD_CAP = 255 };
    const char marker[] = "\xE2\x96\x81";
    const size_t text_len = strlen(txt);
    const int budget = max_ids - 1;   /* keep one slot for SEP */
    int count = 0;

    if (m->cls_token_id >= 0 && count < max_ids) {
        ids[count++] = m->cls_token_id;
    }

    size_t cur = 0;
    while (cur < text_len && count < budget) {
        /* Skip inter-word whitespace. */
        while (cur < text_len && isspace((unsigned char)txt[cur])) {
            cur++;
        }
        if (cur >= text_len) {
            break;
        }

        /* Build "<marker><word>" directly in one buffer. */
        char marked[260];
        char *word = marked + MARK_LEN;
        int word_len = 0;
        memcpy(marked, marker, MARK_LEN);

        if (isalnum((unsigned char)txt[cur])) {
            while (cur < text_len && word_len < WORD_CAP &&
                   (isalnum((unsigned char)txt[cur]) || txt[cur] == '_')) {
                word[word_len++] = (char)tolower((unsigned char)txt[cur]);
                cur++;
            }
            /* Drop whatever part of an over-long word did not fit. */
            while (cur < text_len && (isalnum((unsigned char)txt[cur]) || txt[cur] == '_')) {
                cur++;
            }
        } else {
            word[word_len++] = txt[cur];
            cur++;
        }
        word[word_len] = '\0';
        if (word_len == 0) {
            continue;
        }

        const int total_len = word_len + MARK_LEN;
        const int word_first_token = count;

        int pos = 0;
        while (pos < total_len && count < budget) {
            int next_pos = -1;
            /* Try the longest remaining substring first, then shrink. */
            for (int stop = total_len; stop > pos; stop--) {
                char piece[260];
                const int piece_len = stop - pos;
                memcpy(piece, marked + pos, piece_len);
                piece[piece_len] = '\0';

                const int id = hget(m, piece);
                if (id >= 0) {
                    ids[count++] = id;
                    next_pos = stop;
                    break;
                }
            }
            if (next_pos < 0) {
                /* Unmatched remainder: roll the whole word back. */
                count = word_first_token;
                break;
            }
            pos = next_pos;
        }

        if (count == word_first_token && m->unknown_token_id >= 0 && count < budget) {
            ids[count++] = m->unknown_token_id;
        }
    }

    if (m->sep_token_id >= 0 && count < max_ids) {
        ids[count++] = m->sep_token_id;
    }
    return count;
}
|
|
1455
|
+
|
|
1456
|
+
static void linear_one(const Tensor *w, const Tensor *b, const float *x, float *out, float *row) {
|
|
1457
|
+
int in = (int)w->dims[0];
|
|
1458
|
+
int out_dim = (int)w->dims[1];
|
|
1459
|
+
const float *bias = tensor_f32_data(b);
|
|
1460
|
+
for (int o = 0; o < out_dim; o++) {
|
|
1461
|
+
float sum = bias ? bias[o] : 0.0f;
|
|
1462
|
+
if (w->type == GGML_TYPE_Q4_0) {
|
|
1463
|
+
const uint8_t *raw = w->data + (size_t)o * w->row_bytes;
|
|
1464
|
+
sum += dot_q4_0_q8_0_like_ggml(raw, x, in);
|
|
1465
|
+
} else {
|
|
1466
|
+
tensor_get_row(w, o, row);
|
|
1467
|
+
for (int i = 0; i < in; i++) sum += row[i] * x[i];
|
|
1233
1468
|
}
|
|
1469
|
+
out[o] = sum;
|
|
1234
1470
|
}
|
|
1471
|
+
}
|
|
1235
1472
|
|
|
1236
|
-
|
|
1237
|
-
|
|
1473
|
+
static void linear_batch(const Tensor *w, const Tensor *b, const float *x, int seq, float *out, float *row) {
|
|
1474
|
+
int in = (int)w->dims[0];
|
|
1475
|
+
int out_dim = (int)w->dims[1];
|
|
1476
|
+
for (int t = 0; t < seq; t++) {
|
|
1477
|
+
linear_one(w, b, x + (size_t)t * in, out + (size_t)t * out_dim, row);
|
|
1238
1478
|
}
|
|
1479
|
+
}
|
|
1239
1480
|
|
|
1240
|
-
|
|
1481
|
+
/*
 * Layer normalization over `seq` rows of width `dim`.
 *
 * Each row is shifted to zero mean and scaled by 1/sqrt(variance + eps),
 * then multiplied by the gain `w` and offset by `b`. A gain or bias that is
 * missing or not F32 is treated as 1 and 0 respectively.
 */
static void layer_norm(const float *x, const Tensor *w, const Tensor *b, int seq, int dim, float eps, float *out) {
    const float *gain = tensor_f32_data(w);
    const float *shift = tensor_f32_data(b);

    for (int r = 0; r < seq; r++) {
        const float *src = x + (size_t)r * dim;
        float *dst = out + (size_t)r * dim;

        float mean = 0.0f;
        for (int i = 0; i < dim; i++) {
            mean += src[i];
        }
        mean /= (float)dim;

        float var = 0.0f;
        for (int i = 0; i < dim; i++) {
            const float centered = src[i] - mean;
            var += centered * centered;
        }
        var /= (float)dim;

        const float scale = 1.0f / sqrtf(var + eps);
        for (int i = 0; i < dim; i++) {
            dst[i] = (src[i] - mean) * scale * (gain ? gain[i] : 1.0f) + (shift ? shift[i] : 0.0f);
        }
    }
}
|
|
1242
1502
|
|
|
1243
|
-
|
|
1244
|
-
|
|
1245
|
-
|
|
1246
|
-
float
|
|
1247
|
-
|
|
1248
|
-
float
|
|
1249
|
-
|
|
1250
|
-
|
|
1251
|
-
|
|
1503
|
+
/*
 * tanh-based GELU approximation with fp16 rounding of input and output.
 *
 * Inputs at or below -10 return 0 and inputs at or above 10 return x
 * unchanged. Everything else is rounded to fp16, evaluated, and the result
 * rounded to fp16 again.
 */
static float gelu_approx(float x) {
    const float sqrt_2_over_pi = 0.7978845608028654f;

    if (x <= -10.0f) {
        return 0.0f;
    }
    if (x >= 10.0f) {
        return x;
    }

    const float hx = fp16_to_fp32(fp32_to_fp16(x));
    const float y = 0.5f * hx * (1.0f + tanhf(sqrt_2_over_pi * hx * (1.0f + 0.044715f * hx * hx)));
    return fp16_to_fp32(fp32_to_fp16(y));
}
|
|
1511
|
+
|
|
1512
|
+
static int bert_embed_text(EmbedModel *m, const char *txt, float *out) {
|
|
1513
|
+
if (m->vocab_type != LLAMA_VOCAB_TYPE_WPM || !find_tensor(m, "blk.0.attn_q.weight")) return 0;
|
|
1514
|
+
|
|
1515
|
+
memset(out, 0, (size_t)m->dim * sizeof(float));
|
|
1516
|
+
if (!txt || !*txt) return 1;
|
|
1517
|
+
|
|
1518
|
+
int max_seq = m->n_ctx > 0 ? m->n_ctx : 512;
|
|
1519
|
+
int *ids = malloc((size_t)max_seq * sizeof(int));
|
|
1520
|
+
if (!ids) return 1;
|
|
1521
|
+
int seq = ascii_wordpiece_tokenize(m, txt, ids, max_seq);
|
|
1522
|
+
if (seq <= 0) { free(ids); return 1; }
|
|
1523
|
+
|
|
1524
|
+
int dim = m->dim;
|
|
1525
|
+
int ff = m->n_ff;
|
|
1526
|
+
int heads = m->n_heads;
|
|
1527
|
+
int head_dim = dim / heads;
|
|
1528
|
+
float *hidden = calloc((size_t)seq * dim, sizeof(float));
|
|
1529
|
+
float *tmp = calloc((size_t)seq * dim, sizeof(float));
|
|
1530
|
+
float *q = calloc((size_t)seq * dim, sizeof(float));
|
|
1531
|
+
float *k = calloc((size_t)seq * dim, sizeof(float));
|
|
1532
|
+
float *v = calloc((size_t)seq * dim, sizeof(float));
|
|
1533
|
+
float *ctx = calloc((size_t)seq * dim, sizeof(float));
|
|
1534
|
+
float *proj = calloc((size_t)seq * dim, sizeof(float));
|
|
1535
|
+
float *ffn = calloc((size_t)seq * ff, sizeof(float));
|
|
1536
|
+
float *row = malloc((size_t)(ff > dim ? ff : dim) * sizeof(float));
|
|
1537
|
+
float *scores = malloc((size_t)seq * sizeof(float));
|
|
1538
|
+
if (!hidden || !tmp || !q || !k || !v || !ctx || !proj || !ffn || !row || !scores) {
|
|
1539
|
+
free(ids); free(hidden); free(tmp); free(q); free(k); free(v); free(ctx); free(proj); free(ffn); free(row); free(scores);
|
|
1540
|
+
return 1;
|
|
1541
|
+
}
|
|
1542
|
+
|
|
1543
|
+
Tensor *tok_emb = find_tensor(m, "token_embd.weight");
|
|
1544
|
+
Tensor *pos_emb = find_tensor(m, "position_embd.weight");
|
|
1545
|
+
Tensor *typ_emb = find_tensor(m, "token_types.weight");
|
|
1546
|
+
Tensor *emb_norm_w = find_tensor(m, "token_embd_norm.weight");
|
|
1547
|
+
Tensor *emb_norm_b = find_tensor(m, "token_embd_norm.bias");
|
|
1548
|
+
|
|
1549
|
+
float *tok = row;
|
|
1550
|
+
float *pos = malloc((size_t)dim * sizeof(float));
|
|
1551
|
+
float *typ = malloc((size_t)dim * sizeof(float));
|
|
1552
|
+
if (!tok_emb || !pos_emb || !typ_emb || !pos || !typ) {
|
|
1553
|
+
free(ids); free(hidden); free(tmp); free(q); free(k); free(v); free(ctx); free(proj); free(ffn); free(row); free(scores); free(pos); free(typ);
|
|
1554
|
+
return 1;
|
|
1252
1555
|
}
|
|
1556
|
+
|
|
1557
|
+
for (int t = 0; t < seq; t++) {
|
|
1558
|
+
tensor_get_row(tok_emb, ids[t], tok);
|
|
1559
|
+
tensor_get_row(pos_emb, t, pos);
|
|
1560
|
+
tensor_get_row(typ_emb, 0, typ);
|
|
1561
|
+
for (int d = 0; d < dim; d++) hidden[(size_t)t * dim + d] = tok[d] + pos[d] + typ[d];
|
|
1562
|
+
}
|
|
1563
|
+
layer_norm(hidden, emb_norm_w, emb_norm_b, seq, dim, m->eps, tmp);
|
|
1564
|
+
memcpy(hidden, tmp, (size_t)seq * dim * sizeof(float));
|
|
1565
|
+
|
|
1566
|
+
for (int layer = 0; layer < m->n_layers; layer++) {
|
|
1567
|
+
char name[80];
|
|
1568
|
+
#define TENSOR(suffix) (snprintf(name, sizeof(name), "blk.%d.%s", layer, suffix), find_tensor(m, name))
|
|
1569
|
+
Tensor *qw = TENSOR("attn_q.weight");
|
|
1570
|
+
Tensor *qb = TENSOR("attn_q.bias");
|
|
1571
|
+
Tensor *kw = TENSOR("attn_k.weight");
|
|
1572
|
+
Tensor *kb = TENSOR("attn_k.bias");
|
|
1573
|
+
Tensor *vw = TENSOR("attn_v.weight");
|
|
1574
|
+
Tensor *vb = TENSOR("attn_v.bias");
|
|
1575
|
+
Tensor *ow = TENSOR("attn_output.weight");
|
|
1576
|
+
Tensor *ob = TENSOR("attn_output.bias");
|
|
1577
|
+
Tensor *an_w = TENSOR("attn_output_norm.weight");
|
|
1578
|
+
Tensor *an_b = TENSOR("attn_output_norm.bias");
|
|
1579
|
+
Tensor *fu_w = TENSOR("ffn_up.weight");
|
|
1580
|
+
Tensor *fu_b = TENSOR("ffn_up.bias");
|
|
1581
|
+
Tensor *fd_w = TENSOR("ffn_down.weight");
|
|
1582
|
+
Tensor *fd_b = TENSOR("ffn_down.bias");
|
|
1583
|
+
Tensor *ln_w = TENSOR("layer_output_norm.weight");
|
|
1584
|
+
Tensor *ln_b = TENSOR("layer_output_norm.bias");
|
|
1585
|
+
#undef TENSOR
|
|
1586
|
+
|
|
1587
|
+
if (!qw || !qb || !kw || !kb || !vw || !vb || !ow || !ob || !an_w || !an_b ||
|
|
1588
|
+
!fu_w || !fu_b || !fd_w || !fd_b || !ln_w || !ln_b) break;
|
|
1589
|
+
|
|
1590
|
+
linear_batch(qw, qb, hidden, seq, q, row);
|
|
1591
|
+
linear_batch(kw, kb, hidden, seq, k, row);
|
|
1592
|
+
linear_batch(vw, vb, hidden, seq, v, row);
|
|
1593
|
+
memset(ctx, 0, (size_t)seq * dim * sizeof(float));
|
|
1594
|
+
|
|
1595
|
+
float att_scale = 1.0f / sqrtf((float)head_dim);
|
|
1596
|
+
for (int h = 0; h < heads; h++) {
|
|
1597
|
+
int off = h * head_dim;
|
|
1598
|
+
for (int ti = 0; ti < seq; ti++) {
|
|
1599
|
+
float max_score = -INFINITY;
|
|
1600
|
+
for (int tj = 0; tj < seq; tj++) {
|
|
1601
|
+
float dot = 0.0f;
|
|
1602
|
+
const float *qv0 = q + (size_t)ti * dim + off;
|
|
1603
|
+
const float *kv0 = k + (size_t)tj * dim + off;
|
|
1604
|
+
for (int d = 0; d < head_dim; d++) dot += qv0[d] * kv0[d];
|
|
1605
|
+
scores[tj] = dot * att_scale;
|
|
1606
|
+
if (scores[tj] > max_score) max_score = scores[tj];
|
|
1607
|
+
}
|
|
1608
|
+
double sum = 0.0;
|
|
1609
|
+
for (int tj = 0; tj < seq; tj++) {
|
|
1610
|
+
scores[tj] = expf(scores[tj] - max_score);
|
|
1611
|
+
sum += scores[tj];
|
|
1612
|
+
}
|
|
1613
|
+
float inv_sum = (float)(1.0 / sum);
|
|
1614
|
+
float *dst = ctx + (size_t)ti * dim + off;
|
|
1615
|
+
for (int tj = 0; tj < seq; tj++) {
|
|
1616
|
+
float p = scores[tj] * inv_sum;
|
|
1617
|
+
const float *vv0 = v + (size_t)tj * dim + off;
|
|
1618
|
+
for (int d = 0; d < head_dim; d++) dst[d] += p * vv0[d];
|
|
1619
|
+
}
|
|
1620
|
+
}
|
|
1621
|
+
}
|
|
1622
|
+
|
|
1623
|
+
linear_batch(ow, ob, ctx, seq, proj, row);
|
|
1624
|
+
for (int i = 0; i < seq * dim; i++) tmp[i] = hidden[i] + proj[i];
|
|
1625
|
+
layer_norm(tmp, an_w, an_b, seq, dim, m->eps, hidden);
|
|
1626
|
+
|
|
1627
|
+
linear_batch(fu_w, fu_b, hidden, seq, ffn, row);
|
|
1628
|
+
for (int i = 0; i < seq * ff; i++) ffn[i] = gelu_approx(ffn[i]);
|
|
1629
|
+
linear_batch(fd_w, fd_b, ffn, seq, proj, row);
|
|
1630
|
+
for (int i = 0; i < seq * dim; i++) tmp[i] = hidden[i] + proj[i];
|
|
1631
|
+
layer_norm(tmp, ln_w, ln_b, seq, dim, m->eps, hidden);
|
|
1632
|
+
}
|
|
1633
|
+
|
|
1634
|
+
for (int t = 0; t < seq; t++) {
|
|
1635
|
+
for (int d = 0; d < dim; d++) out[d] += hidden[(size_t)t * dim + d];
|
|
1636
|
+
}
|
|
1637
|
+
float inv = 1.0f / (float)seq;
|
|
1638
|
+
for (int d = 0; d < dim; d++) out[d] *= inv;
|
|
1639
|
+
normalize_l2(out, dim);
|
|
1640
|
+
|
|
1641
|
+
free(ids); free(hidden); free(tmp); free(q); free(k); free(v); free(ctx); free(proj); free(ffn); free(row); free(scores); free(pos); free(typ);
|
|
1642
|
+
return 1;
|
|
1253
1643
|
}
|
|
1254
1644
|
|
|
1255
1645
|
/* ------------------------------------------------------------------------- */
|
|
1256
1646
|
static void embed_text(EmbedModel *m, const char *txt, float *out) {
|
|
1647
|
+
if (bert_embed_text(m, txt, out)) return;
|
|
1648
|
+
|
|
1257
1649
|
memset(out, 0, sizeof(float) * m->dim);
|
|
1258
1650
|
if (!txt || !*txt) return;
|
|
1259
1651
|
|
|
@@ -1413,4 +1805,4 @@ void Init_mini_embed(void) {
|
|
|
1413
1805
|
rb_define_alloc_func(c, rb_embedder_alloc);
|
|
1414
1806
|
rb_define_method(c, "initialize", rb_embedder_initialize, 1);
|
|
1415
1807
|
rb_define_method(c, "embed", rb_embed, 1);
|
|
1416
|
-
}
|
|
1808
|
+
}
|