fasttextembed 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fasttextembed-1.0.0/CMakeLists.txt +29 -0
- fasttextembed-1.0.0/PKG-INFO +37 -0
- fasttextembed-1.0.0/README.md +22 -0
- fasttextembed-1.0.0/csrc/include/fte/fte.h +30 -0
- fasttextembed-1.0.0/csrc/src/api.c +197 -0
- fasttextembed-1.0.0/csrc/src/config.h +20 -0
- fasttextembed-1.0.0/csrc/src/fp16.h +43 -0
- fasttextembed-1.0.0/csrc/src/fte_format.h +33 -0
- fasttextembed-1.0.0/csrc/src/kernels/kernels.h +48 -0
- fasttextembed-1.0.0/csrc/src/kernels/kernels_scalar.c +407 -0
- fasttextembed-1.0.0/csrc/src/layer_matmul_names.c +18 -0
- fasttextembed-1.0.0/csrc/src/loader.c +44 -0
- fasttextembed-1.0.0/csrc/src/loader.h +22 -0
- fasttextembed-1.0.0/csrc/src/model_bert.c +156 -0
- fasttextembed-1.0.0/csrc/src/model_bert.h +15 -0
- fasttextembed-1.0.0/csrc/src/pack.c +41 -0
- fasttextembed-1.0.0/csrc/src/pack.h +21 -0
- fasttextembed-1.0.0/csrc/src/tensor.c +31 -0
- fasttextembed-1.0.0/csrc/src/tensor.h +13 -0
- fasttextembed-1.0.0/csrc/src/threadpool.c +204 -0
- fasttextembed-1.0.0/csrc/src/threadpool.h +24 -0
- fasttextembed-1.0.0/csrc/src/tokenizer/tokenizer.c +113 -0
- fasttextembed-1.0.0/csrc/src/tokenizer/tokenizer.h +12 -0
- fasttextembed-1.0.0/pyproject.toml +31 -0
- fasttextembed-1.0.0/src/fasttextembed/__init__.py +128 -0
- fasttextembed-1.0.0/vendor.sh +12 -0
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
cmake_minimum_required(VERSION 3.20)
project(fasttextembed_py C)

set(CMAKE_C_STANDARD 11)
set(CMAKE_C_STANDARD_REQUIRED ON)

# Locate the C engine.
#   - Repo layout when building in-tree: the engine lives two directories up.
#   - Vendored ./csrc layout for sdist / isolated builds.
# Probe for the public header instead of a bare `src` directory so that an unrelated
# `../../src` next to an unpacked sdist is never mistaken for the engine.
set(ROOT "${CMAKE_CURRENT_SOURCE_DIR}/../..")
if(NOT EXISTS "${ROOT}/include/fte/fte.h")
  set(ROOT "${CMAKE_CURRENT_SOURCE_DIR}/csrc")
endif()

# CONFIGURE_DEPENDS re-runs the glob at build time so added/removed sources are noticed.
file(GLOB CORE CONFIGURE_DEPENDS
  "${ROOT}/src/*.c"
  "${ROOT}/src/kernels/*.c"
  "${ROOT}/src/tokenizer/*.c"
)
if(NOT CORE)
  message(FATAL_ERROR "fasttextembed: no C sources found under ${ROOT}/src")
endif()

add_library(fte SHARED ${CORE})
target_include_directories(fte PRIVATE "${ROOT}/include" "${ROOT}/src")

# -ffp-contract=off: forbid the compiler from fusing a*b+c into FMA on its own.
target_compile_options(fte PRIVATE -O3 -ffp-contract=off)

# SIMD features. NOTE for publishing: -mcpu=native targets the BUILD machine and is wrong for
# redistributable wheels. CI should override (e.g. -DCMAKE_C_FLAGS="-march=armv8.2-a+fp16" on
# arm64, or a baseline + future runtime dispatch). Fine for local installs.
# Setting the FTE_PORTABLE environment variable (to any value) at configure time disables it.
include(CheckCCompilerFlag)
check_c_compiler_flag("-mcpu=native" HAS_MCPU_NATIVE)
if(HAS_MCPU_NATIVE AND NOT DEFINED ENV{FTE_PORTABLE})
  target_compile_options(fte PRIVATE -mcpu=native)
endif()

# libm for the math kernels; pthreads for the batch workers / thread pool.
# Threads is optional so that single-threaded (FTE_NO_THREADS) configurations still configure.
find_package(Threads)
target_link_libraries(fte PRIVATE m)
if(TARGET Threads::Threads)
  target_link_libraries(fte PRIVATE Threads::Threads)
endif()

# Drop the compiled library into the importable package directory.
install(TARGETS fte LIBRARY DESTINATION fasttextembed RUNTIME DESTINATION fasttextembed)
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
Metadata-Version: 2.2
|
|
2
|
+
Name: fasttextembed
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Fast, dependency-free text embeddings (BAAI/bge-small-en-v1.5) in pure C
|
|
5
|
+
Keywords: embeddings,bge,vector-search,nlp,onnx-free
|
|
6
|
+
Author: Cemsina Guzel
|
|
7
|
+
License: MIT
|
|
8
|
+
Classifier: Programming Language :: C
|
|
9
|
+
Classifier: Programming Language :: Python :: 3
|
|
10
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
11
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
12
|
+
Project-URL: Homepage, https://github.com/cemsina/fasttextembed
|
|
13
|
+
Requires-Python: >=3.8
|
|
14
|
+
Description-Content-Type: text/markdown
|
|
15
|
+
|
|
16
|
+
# fasttextembed (Python)
|
|
17
|
+
|
|
18
|
+
Fast, dependency-free text embeddings for `BAAI/bge-small-en-v1.5`, powered by a small pure-C
|
|
19
|
+
engine (no PyTorch, no ONNX Runtime). The model (~64 MB) is downloaded and cached on first use.
|
|
20
|
+
|
|
21
|
+
```bash
|
|
22
|
+
pip install fasttextembed
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
```python
|
|
26
|
+
from fasttextembed import TextEmbedding
|
|
27
|
+
|
|
28
|
+
model = TextEmbedding() # downloads + caches the model on first call
|
|
29
|
+
vectors = model.embed(["hello world", "fast"]) # list of 384-float vectors
|
|
30
|
+
one = model.embed_one("hello world")
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
- `embed(texts, threads=0)` — batch; `threads<=0` uses all cores.
|
|
34
|
+
- Returns plain `list[list[float]]` (wrap with `numpy.asarray(...)` if you want an array).
|
|
35
|
+
|
|
36
|
+
Environment overrides: `FTE_MODEL_DIR` (use local `model.fte`/`vocab.tsv`), `FTE_MODEL_URL`
|
|
37
|
+
(download base URL), `FTE_CACHE` (cache dir), `FTE_LIB` (path to the C shared library).
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
# fasttextembed (Python)
|
|
2
|
+
|
|
3
|
+
Fast, dependency-free text embeddings for `BAAI/bge-small-en-v1.5`, powered by a small pure-C
|
|
4
|
+
engine (no PyTorch, no ONNX Runtime). The model (~64 MB) is downloaded and cached on first use.
|
|
5
|
+
|
|
6
|
+
```bash
|
|
7
|
+
pip install fasttextembed
|
|
8
|
+
```
|
|
9
|
+
|
|
10
|
+
```python
|
|
11
|
+
from fasttextembed import TextEmbedding
|
|
12
|
+
|
|
13
|
+
model = TextEmbedding() # downloads + caches the model on first call
|
|
14
|
+
vectors = model.embed(["hello world", "fast"]) # list of 384-float vectors
|
|
15
|
+
one = model.embed_one("hello world")
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
- `embed(texts, threads=0)` — batch; `threads<=0` uses all cores.
|
|
19
|
+
- Returns plain `list[list[float]]` (wrap with `numpy.asarray(...)` if you want an array).
|
|
20
|
+
|
|
21
|
+
Environment overrides: `FTE_MODEL_DIR` (use local `model.fte`/`vocab.tsv`), `FTE_MODEL_URL`
|
|
22
|
+
(download base URL), `FTE_CACHE` (cache dir), `FTE_LIB` (path to the C shared library).
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
#ifndef FTE_H
#define FTE_H
#include <stddef.h>

/* Public C API of the fasttextembed engine. */

#ifdef __cplusplus
extern "C" { /* keep unmangled symbol names when included from C++ */
#endif

/* Opaque model handle; created by fte_init(), released by fte_free(). */
typedef struct fte_model fte_model;

/* Status codes returned by every fallible entry point (FTE_OK == 0). */
typedef enum {
    FTE_OK = 0,
    FTE_ERR_IO,            /* model or vocabulary file could not be read */
    FTE_ERR_FORMAT,        /* model file is not a valid .fte file */
    FTE_ERR_ARCH_MISMATCH, /* model file does not match the compiled-in architecture */
    FTE_ERR_OOM,           /* allocation failed */
    FTE_ERR_INPUT          /* NULL argument or text that cannot be encoded */
} fte_status;

/* Number of floats in one embedding vector. */
#define FTE_DIM 384

/* Load the model at `fte_path` and the vocabulary at `vocab_tsv`; on success store a new
 * handle in *out. All three arguments must be non-NULL. */
fte_status fte_init(const char *fte_path, const char *vocab_tsv, fte_model **out);

/* Embed one NUL-terminated text into out384[FTE_DIM].
 * Returns FTE_ERR_INPUT for NULL arguments or when the text encodes to fewer than 2 tokens.
 * Uses the model's own scratch memory, so do not call it concurrently on the same model. */
fte_status fte_embed(fte_model *m, const char *text, float *out384);

/* Embed n documents into out[n*384]. Parallelized across `threads` worker threads
 * (<=0 means auto = number of cores). Weights are shared read-only; each thread has
 * its own scratch. Thread-safe for distinct calls.
 * A document that encodes to fewer than 2 tokens yields an all-zero vector. */
fte_status fte_embed_batch(fte_model *m, const char *const *texts, size_t n,
                           float *out, int threads);

/* Release a model and everything it owns. NULL is accepted and ignored. */
void fte_free(fte_model *m);

/* Static, human-readable description of a status code. */
const char *fte_strerror(fte_status s);

#ifdef __cplusplus
}
#endif

#endif
|
|
@@ -0,0 +1,197 @@
|
|
|
1
|
+
#include "fte/fte.h"
|
|
2
|
+
#include "loader.h"
|
|
3
|
+
#include "tokenizer/tokenizer.h"
|
|
4
|
+
#include "model_bert.h"
|
|
5
|
+
#include "threadpool.h"
|
|
6
|
+
#include "config.h"
|
|
7
|
+
#ifndef FTE_NO_THREADS
|
|
8
|
+
#include <pthread.h>
|
|
9
|
+
#include <stdatomic.h>
|
|
10
|
+
#endif
|
|
11
|
+
#include <stdlib.h>
|
|
12
|
+
#include <unistd.h>
|
|
13
|
+
|
|
14
|
+
struct fte_model {
|
|
15
|
+
fte_weights w;
|
|
16
|
+
fte_packed packed;
|
|
17
|
+
fte_tokenizer *tok;
|
|
18
|
+
fte_arena arena;
|
|
19
|
+
fte_pool *pool; /* intra-doc parallelism for single fte_embed calls */
|
|
20
|
+
};
|
|
21
|
+
|
|
22
|
+
#if defined(__APPLE__)
|
|
23
|
+
#include <sys/sysctl.h>
|
|
24
|
+
#endif
|
|
25
|
+
|
|
26
|
+
/* Threads for intra-document parallelism, chosen for the actual hardware:
 *  - FTE_THREADS env var overrides everything (tuning / explicit control); it is
 *    honoured when it parses to an integer >= 1 that fits in an int.
 *  - Apple Silicon: performance cores only minus 2 (efficiency cores straggle at the
 *    fork/join barrier; leaving headroom avoids preempting the main thread). Applied
 *    only when more than 2 performance cores are reported.
 *  - Everything else (homogeneous Linux ARM/x86, or Apple fallback): all online cores.
 *    No reason to copy the Mac's "minus 2" onto a homogeneous machine.
 * Always returns at least 1. */
static int fte_intra_threads(void) {
    const char *env = getenv("FTE_THREADS");
    if (env) {
        /* strtol rather than atoi: atoi has undefined behaviour on out-of-range input.
         * Trailing garbage after the digits is tolerated, as it was with atoi. */
        const long int_max = (long)(~0u >> 1); /* INT_MAX without needing <limits.h> */
        long requested = strtol(env, NULL, 10);
        if (requested >= 1 && requested <= int_max) return (int)requested;
    }
#if defined(__APPLE__)
    {
        int perf_cores = 0;
        size_t sz = sizeof perf_cores;
        if (sysctlbyname("hw.perflevel0.physicalcpu", &perf_cores, &sz, NULL, 0) == 0 &&
            perf_cores > 2)
            return perf_cores - 2;
    }
#endif
    long nc = sysconf(_SC_NPROCESSORS_ONLN); /* homogeneous: use all cores (measured best on N1) */
    return nc > 0 ? (int)nc : 1;
}
|
|
44
|
+
|
|
45
|
+
/* Create a model from a weight file and a vocabulary file.
 *
 *   fte_path  path of the .fte weight file
 *   vocab     path of the vocabulary file
 *   out       receives the new handle on success; set to NULL on any failure so the
 *             caller never observes a stale or uninitialised pointer
 *
 * Returns FTE_OK, or:
 *   FTE_ERR_INPUT          a NULL argument
 *   FTE_ERR_ARCH_MISMATCH  fte_weights_open() returned -3
 *   FTE_ERR_FORMAT         fte_weights_open() returned -2
 *   FTE_ERR_IO             any other weight-open failure, or the tokenizer failed to load
 *   FTE_ERR_OOM            allocation of the model, the scratch arena or the packed weights failed
 *
 * A failure to create the intra-document thread pool is not an error: the pool pointer is
 * stored as returned by fte_pool_create(). */
fte_status fte_init(const char *fte_path, const char *vocab, fte_model **out) {
    fte_status st;
    fte_model *m;
    int r, pc;

    if (!fte_path || !vocab || !out) return FTE_ERR_INPUT;
    *out = NULL;

    m = calloc(1, sizeof *m);
    if (!m) return FTE_ERR_OOM;

    r = fte_weights_open(fte_path, &m->w);
    if (r != 0) {
        st = (r == -3) ? FTE_ERR_ARCH_MISMATCH
           : (r == -2) ? FTE_ERR_FORMAT
                       : FTE_ERR_IO;
        goto fail_model;
    }
    if (fte_tokenizer_load(vocab, &m->tok) != 0) {
        st = FTE_ERR_IO;
        goto fail_weights;
    }
    if (fte_arena_init(&m->arena, (size_t)64 * 1024 * 1024) != 0) { /* 64MB scratch */
        st = FTE_ERR_OOM;
        goto fail_tokenizer;
    }
    if (fte_pack_build(&m->w, &m->packed) != 0) {
        st = FTE_ERR_OOM;
        goto fail_pack;
    }

    pc = fte_intra_threads();
    m->pool = fte_pool_create(pc > 1 ? pc : 1); /* intra-doc parallelism; NULL if 1 core */
    *out = m;
    return FTE_OK;

    /* Unwind in reverse order of construction; each label releases one more stage. */
fail_pack:
    fte_pack_free(&m->packed); /* a failed build may have left partial state behind */
    fte_arena_free(&m->arena);
fail_tokenizer:
    fte_tokenizer_free(m->tok);
fail_weights:
    fte_weights_close(&m->w);
fail_model:
    free(m);
    return st;
}
|
|
79
|
+
|
|
80
|
+
/* Embed a single text into out[FTE_DIM] using the model's own arena and thread pool.
 * Returns FTE_ERR_INPUT for NULL arguments or when the tokenizer yields fewer than
 * 2 tokens, FTE_ERR_OOM when the forward pass reports failure, FTE_OK otherwise. */
fte_status fte_embed(fte_model *m, const char *text, float *out) {
    int token_ids[FTE_MAX_POS];
    int n_tokens;
    int failed;

    if (m == NULL || text == NULL || out == NULL) {
        return FTE_ERR_INPUT;
    }

    n_tokens = fte_tokenizer_encode(m->tok, text, token_ids, FTE_MAX_POS);
    if (n_tokens < 2) {
        return FTE_ERR_INPUT;
    }

    /* Scratch from the previous call is discarded before the forward pass. */
    fte_arena_reset(&m->arena);

    /* The pool is bracketed by begin/end around the forward pass. */
    fte_pool_begin(m->pool);
    failed = fte_bert_embed(&m->w, &m->packed, m->pool, &m->arena, token_ids, n_tokens, out);
    fte_pool_end(m->pool);

    return failed ? FTE_ERR_OOM : FTE_OK;
}
|
|
92
|
+
|
|
93
|
+
#ifdef FTE_NO_THREADS
|
|
94
|
+
/* Single-threaded batch (e.g. WebAssembly): embed each doc on the caller. */
|
|
95
|
+
fte_status fte_embed_batch(fte_model *m, const char *const *texts, size_t n,
|
|
96
|
+
float *out, int threads) {
|
|
97
|
+
(void)threads;
|
|
98
|
+
if (!m || !texts || !out) return FTE_ERR_INPUT;
|
|
99
|
+
int ids[FTE_MAX_POS];
|
|
100
|
+
for (size_t i = 0; i < n; i++) {
|
|
101
|
+
float *o = out + i * FTE_DIM;
|
|
102
|
+
int seq = fte_tokenizer_encode(m->tok, texts[i], ids, FTE_MAX_POS);
|
|
103
|
+
if (seq < 2) { for (int d = 0; d < FTE_DIM; d++) o[d] = 0.0f; continue; }
|
|
104
|
+
fte_arena_reset(&m->arena);
|
|
105
|
+
if (fte_bert_embed(&m->w, &m->packed, NULL, &m->arena, ids, seq, o) != 0) return FTE_ERR_OOM;
|
|
106
|
+
}
|
|
107
|
+
return FTE_OK;
|
|
108
|
+
}
|
|
109
|
+
#else
|
|
110
|
+
|
|
111
|
+
typedef struct {
|
|
112
|
+
fte_model *m;
|
|
113
|
+
const char *const *texts;
|
|
114
|
+
float *out;
|
|
115
|
+
size_t n;
|
|
116
|
+
_Atomic size_t *next; /* shared work-stealing doc counter */
|
|
117
|
+
int err;
|
|
118
|
+
} batch_job;
|
|
119
|
+
|
|
120
|
+
static void *batch_worker(void *p) {
|
|
121
|
+
batch_job *j = p;
|
|
122
|
+
fte_arena a;
|
|
123
|
+
if (fte_arena_init(&a, (size_t)64 * 1024 * 1024) != 0) { j->err = 1; return NULL; }
|
|
124
|
+
int ids[FTE_MAX_POS];
|
|
125
|
+
/* Work-stealing: grab the next document dynamically. On asymmetric CPUs (e.g. Apple
|
|
126
|
+
* Silicon's perf+efficiency cores) this stops slow cores from stalling the whole batch. */
|
|
127
|
+
for (;;) {
|
|
128
|
+
size_t i = atomic_fetch_add_explicit(j->next, 1, memory_order_relaxed);
|
|
129
|
+
if (i >= j->n) break;
|
|
130
|
+
float *o = j->out + i * FTE_DIM;
|
|
131
|
+
int seq = fte_tokenizer_encode(j->m->tok, j->texts[i], ids, FTE_MAX_POS);
|
|
132
|
+
if (seq < 2) { for (int d = 0; d < FTE_DIM; d++) o[d] = 0.0f; continue; }
|
|
133
|
+
fte_arena_reset(&a);
|
|
134
|
+
if (fte_bert_embed(&j->m->w, &j->m->packed, NULL, &a, ids, seq, o) != 0) { j->err = 1; break; }
|
|
135
|
+
}
|
|
136
|
+
fte_arena_free(&a);
|
|
137
|
+
return NULL;
|
|
138
|
+
}
|
|
139
|
+
|
|
140
|
+
/* Threaded batch embedding.
 *
 * Spawns up to `threads` workers (auto-detected from the online core count when
 * threads <= 0, never more than n, hard-capped at 256) that pull documents from a shared
 * atomic counter. If a thread cannot be spawned, that worker's loop runs inline on the
 * caller. Returns FTE_ERR_INPUT for NULL arguments, FTE_OK for an empty batch, and
 * FTE_ERR_OOM if any worker flagged an error. */
fte_status fte_embed_batch(fte_model *m, const char *const *texts, size_t n,
                           float *out, int threads) {
    enum { MAX_WORKERS = 256 };

    if (m == NULL || texts == NULL || out == NULL) return FTE_ERR_INPUT;
    if (n == 0) return FTE_OK;

    if (threads <= 0) {
        long online = sysconf(_SC_NPROCESSORS_ONLN);
        threads = (online > 0) ? (int)online : 1;
    }
    if ((size_t)threads > n) threads = (int)n;
    if (threads > MAX_WORKERS) threads = MAX_WORKERS;

    pthread_t handles[MAX_WORKERS];
    batch_job slots[MAX_WORKERS];
    char spawned[MAX_WORKERS] = {0};
    _Atomic size_t cursor = 0;
    int w;

    for (w = 0; w < threads; w++) {
        batch_job *slot = &slots[w];
        slot->m = m;
        slot->texts = texts;
        slot->out = out;
        slot->n = n;
        slot->next = &cursor;
        slot->err = 0;

        if (pthread_create(&handles[w], NULL, batch_worker, slot) != 0) {
            batch_worker(slot); /* run inline if spawn fails */
        } else {
            spawned[w] = 1;
        }
    }

    fte_status result = FTE_OK;
    for (w = 0; w < threads; w++) {
        if (spawned[w]) pthread_join(handles[w], NULL);
        if (slots[w].err) result = FTE_ERR_OOM;
    }
    return result;
}
|
|
169
|
+
#endif /* FTE_NO_THREADS */
|
|
170
|
+
|
|
171
|
+
#ifdef FTE_PROFILE
|
|
172
|
+
extern void fte_profile_dump(void);
|
|
173
|
+
#endif
|
|
174
|
+
|
|
175
|
+
/* Destroy a model created by fte_init(). A NULL handle is a no-op.
 * With FTE_PROFILE defined, profiling data is dumped before teardown. */
void fte_free(fte_model *m) {
    if (m == NULL) {
        return;
    }
#ifdef FTE_PROFILE
    fte_profile_dump();
#endif
    /* Tear down in this order: pool, arena, packed weights, tokenizer, weight file. */
    fte_pool_destroy(m->pool);
    fte_arena_free(&m->arena);
    fte_pack_free(&m->packed);
    fte_tokenizer_free(m->tok);
    fte_weights_close(&m->w);
    free(m);
}
|
|
187
|
+
|
|
188
|
+
/* Map a status code to a static, human-readable string. The returned pointer refers to a
 * string literal and must not be freed. Values outside the fte_status enumeration are
 * reported as "unknown error" rather than being mislabelled as an input error. */
const char *fte_strerror(fte_status s) {
    switch (s) {
        case FTE_OK: return "ok";
        case FTE_ERR_IO: return "io error";
        case FTE_ERR_FORMAT: return "bad format";
        case FTE_ERR_ARCH_MISMATCH: return "arch mismatch";
        case FTE_ERR_OOM: return "out of memory";
        case FTE_ERR_INPUT: return "bad input";
        default: return "unknown error";
    }
}
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
#ifndef FTE_CONFIG_H
#define FTE_CONFIG_H

/* Compile-time constants specialized to BAAI/bge-small-en-v1.5
 * (model_optimized.onnx, fp16 weights run in fp32 via com.microsoft fused ops). */

/* Architecture dimensions. */
#define FTE_HIDDEN 384
#define FTE_LAYERS 12
#define FTE_HEADS 12
#define FTE_HEAD_DIM 32 /* HIDDEN / HEADS */
#define FTE_INTERMEDIATE 1536
#define FTE_VOCAB 30522
#define FTE_MAX_POS 512 /* also the size of the token-id buffers used by the API layer */
#define FTE_TYPE_VOCAB 2

/* Numeric constants. */
#define FTE_LN_EPS 1e-12f
#define FTE_ATTN_SCALE 0.17677669529663687f /* 1/sqrt(32) */
/* Parenthesized so the leading minus cannot merge with a neighbouring operator
 * at the point of use. */
#define FTE_MASK_FILTER (-3.4028234663852886e+38f)
#define FTE_GELU_C 0.7978845608028654f /* sqrt(2/pi) */
#define FTE_GELU_A 0.044715f

#endif
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
#ifndef FTE_FP16_H
#define FTE_FP16_H
#include <stdint.h>
#include <string.h>

/* Raw IEEE-754 half stored as 16 bits. */
typedef unsigned short fte_f16;

/* fte_h2f: widen one half-precision value to float.
 * Two interchangeable implementations are provided; define FTE_NO_FLOAT16 to force the
 * software one even when the compiler offers _Float16. */
#if defined(__FLT16_MAX__) && !defined(FTE_NO_FLOAT16)

/* Hardware path (clang/gcc on arm64, x86-64 with the feature): matches NEON vcvt exactly. */
static inline float fte_h2f(fte_f16 h) {
    _Float16 half;
    memcpy(&half, &h, sizeof half); /* reinterpret the bits without aliasing violations */
    return (float)half;
}

#else

/* Software IEEE half -> float, for targets without _Float16 (e.g. WebAssembly). */
static inline float fte_h2f(fte_f16 h) {
    const uint32_t sign_bit = ((uint32_t)h & 0x8000u) << 16;
    uint32_t exponent = ((uint32_t)h >> 10) & 0x1fu;
    uint32_t fraction = (uint32_t)h & 0x3ffu;
    uint32_t word;
    float result;

    if (exponent == 0x1fu) {
        /* inf / nan: all-ones exponent, payload carried over */
        word = sign_bit | 0x7f800000u | (fraction << 13);
    } else if (exponent != 0) {
        /* normal: rebias the exponent from 15 to 127 */
        word = sign_bit | ((exponent + (127u - 15u)) << 23) | (fraction << 13);
    } else if (fraction == 0) {
        /* +/- zero */
        word = sign_bit;
    } else {
        /* subnormal: shift the fraction up until its implicit leading 1 appears,
         * lowering the exponent once per shift */
        exponent = 127 - 15 + 1;
        do {
            fraction <<= 1;
            exponent--;
        } while ((fraction & 0x400u) == 0);
        fraction &= 0x3ffu;
        word = sign_bit | (exponent << 23) | (fraction << 13);
    }

    memcpy(&result, &word, 4);
    return result;
}

#endif

#endif
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
#ifndef FTE_FORMAT_H
#define FTE_FORMAT_H
#include <stdint.h>

/* On-disk .fte format. Shared contract between tools/convert.py and src/loader.c. */
#define FTE_MAGIC 0x31455446u /* "FTE1" little-endian */
#define FTE_VERSION 2u        /* v2: per-tensor dtype (fp32 or fp16) */
#define FTE_NAME_MAX 64       /* size of the fixed name field in a tensor entry */
#define FTE_ALIGN 64

/* Values of fte_tensor_entry.dtype. */
#define FTE_DT_F32 0
#define FTE_DT_F16 1

typedef struct {      /* one entry per weight tensor; 104 bytes, no padding */
    uint64_t offset;  /* absolute byte offset into the file */
    uint64_t nbytes;  /* = product(shape) * elem_size(dtype) */
    char name[FTE_NAME_MAX];
    int32_t ndim;
    int32_t shape[4];
    int32_t dtype;    /* FTE_DT_F32 | FTE_DT_F16 */
} fte_tensor_entry;

/* The record size is part of the file format; fail the build rather than silently
 * misread files if a compiler ever lays the struct out differently. */
_Static_assert(sizeof(fte_tensor_entry) == 104,
               "fte_tensor_entry must be exactly 104 bytes (on-disk layout)");

typedef struct {
    /* 8-byte fields first to avoid implicit padding (Python writer matches byte-for-byte) */
    uint64_t table_offset; /* byte offset of fte_tensor_entry[n_tensors] */
    uint64_t blob_offset;  /* byte offset of the weight blob */
    uint32_t magic, version;
    uint32_t hidden, layers, heads, intermediate, vocab, max_pos, type_vocab;
    uint32_t n_tensors;
    uint32_t _pad;
    /* NOTE(review): the fields above add up to 60 bytes (2*8 + 11*4), so on ABIs that
     * align uint64_t to 8 bytes sizeof(fte_header) is 64 with 4 bytes of trailing
     * padding. Confirm the writer and loader agree on which size is on disk. */
} fte_header;

#endif
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
#ifndef FTE_KERNELS_H
#define FTE_KERNELS_H
#include <stddef.h>
#include "fp16.h"

/* Compute kernels. Every matrix is row-major. */

/* ---- matrix multiply ------------------------------------------------------------- */

/* C[M,N] = A[M,K] * B[K,N]; B is not transposed; everything is fp32. */
void fte_matmul(const float *A, const float *B, float *C, int M, int K, int N);

/* As fte_matmul, with B stored as fp16 weights that are widened to fp32 in-flight and
 * accumulated in fp32. Bit-identical to fte_matmul on the fp32-widened weights.
 * NEON-accelerated on arm64. */
void fte_matmul_f16w(const float *A, const fte_f16 *B, float *C, int M, int K, int N);

/* As fte_matmul_f16w, with B pre-packed into 16-column panels (see pack.h). Keeps a
 * 16-wide C tile in NEON registers across k and reads the packed fp16 contiguously.
 * The k-order is unchanged, so results are bit-identical to fte_matmul_f16w.
 * N must be a multiple of 16. */
void fte_matmul_f16w_packed(const float *A, const fte_f16 *Bp, float *C, int M, int K, int N);

/* As fte_matmul_f16w_packed, restricted to output column-panels [nb0,nb1) (16 columns
 * per panel). Used to split one matmul across threads: outputs are disjoint, so there
 * are no races. */
void fte_matmul_f16w_packed_range(const float *A, const fte_f16 *Bp, float *C,
                                  int M, int K, int N, int nb0, int nb1);

/* fp16-ACCUMULATE variant (matches ONNX Runtime MLAS HalfGemmKernelNeon): A is already
 * fp16, B is packed fp16, and accumulation is fp16 (8-wide .8h FMA, 2x the fp32 kernel
 * per core). Output is written as fp32. Computes panels [nb0,nb1). */
void fte_matmul_f16_packed_range(const fte_f16 *A, const fte_f16 *Bp, float *C,
                                 int M, int K, int N, int nb0, int nb1);

/* ---- element-wise / row-wise ops --------------------------------------------------- */

/* C[m,n] += bias[n] */
void fte_add_bias(float *C, const float *bias, int M, int N);

/* In-place on X[M,D]: LayerNorm over D using gamma, beta and eps (population variance). */
void fte_layernorm(float *X, const float *gamma, const float *beta, int M, int D, float eps);

/* OUT[M,D] = LayerNorm(X + skip + bias) over D, then * gamma + beta */
void fte_skip_layernorm(const float *X, const float *skip, const float *bias,
                        const float *gamma, const float *beta, float *OUT,
                        int M, int D, float eps);

/* In-place on X[M,D]: FastGelu of (x + bias[d]). */
void fte_fastgelu(float *X, const float *bias, int M, int D);

/* In-place on X[rows,cols]: softmax over the first `valid` entries of each row; the
 * remaining entries are set to 0. */
void fte_softmax_rows(float *X, int rows, int cols, int valid);

#endif
|