fasttextembed 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,29 @@
1
+ cmake_minimum_required(VERSION 3.20)
2
+ project(fasttextembed_py C)
3
+
4
+ set(CMAKE_C_STANDARD 11)
5
+ set(CMAKE_C_STANDARD_REQUIRED ON)
6
+ add_compile_options(-O3 -ffp-contract=off)
7
+
8
+ # SIMD features. NOTE for publishing: -mcpu=native targets the BUILD machine and is wrong for
9
+ # redistributable wheels. CI should override (e.g. -DCMAKE_C_FLAGS="-march=armv8.2-a+fp16" on
10
+ # arm64, or a baseline + future runtime dispatch). Fine for local installs. Defining the FTE_PORTABLE environment variable at configure time skips -mcpu=native.
11
+ include(CheckCCompilerFlag)
12
+ check_c_compiler_flag("-mcpu=native" HAS_MCPU_NATIVE)
13
+ if(HAS_MCPU_NATIVE AND NOT DEFINED ENV{FTE_PORTABLE})
14
+ add_compile_options(-mcpu=native)
15
+ endif()
16
+
17
+ # Repo layout when building in-tree; vendored ./csrc layout for sdist / isolated builds.
18
+ set(ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../..)
19
+ if(NOT EXISTS ${ROOT}/src)
20
+ set(ROOT ${CMAKE_CURRENT_SOURCE_DIR}/csrc)
21
+ endif()
22
+ file(GLOB CORE CONFIGURE_DEPENDS ${ROOT}/src/*.c ${ROOT}/src/kernels/*.c ${ROOT}/src/tokenizer/*.c)  # CONFIGURE_DEPENDS: re-check the glob at build time so added/removed sources trigger a re-configure
23
+
24
+ add_library(fte SHARED ${CORE})
25
+ target_include_directories(fte PRIVATE ${ROOT}/include ${ROOT}/src)
26
+ target_link_libraries(fte PRIVATE m)  # libm is only needed to build fte itself; PRIVATE avoids the legacy keyword-less signature
27
+
28
+ # Drop the compiled library into the importable package directory.
29
+ install(TARGETS fte LIBRARY DESTINATION fasttextembed RUNTIME DESTINATION fasttextembed)
@@ -0,0 +1,37 @@
1
+ Metadata-Version: 2.2
2
+ Name: fasttextembed
3
+ Version: 1.0.0
4
+ Summary: Fast, dependency-free text embeddings (BAAI/bge-small-en-v1.5) in pure C
5
+ Keywords: embeddings,bge,vector-search,nlp,onnx-free
6
+ Author: Cemsina Guzel
7
+ License: MIT
8
+ Classifier: Programming Language :: C
9
+ Classifier: Programming Language :: Python :: 3
10
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
11
+ Classifier: License :: OSI Approved :: MIT License
12
+ Project-URL: Homepage, https://github.com/cemsina/fasttextembed
13
+ Requires-Python: >=3.8
14
+ Description-Content-Type: text/markdown
15
+
16
+ # fasttextembed (Python)
17
+
18
+ Fast, dependency-free text embeddings for `BAAI/bge-small-en-v1.5`, powered by a small pure-C
19
+ engine (no PyTorch, no ONNX Runtime). The model (~64 MB) is downloaded and cached on first use.
20
+
21
+ ```bash
22
+ pip install fasttextembed
23
+ ```
24
+
25
+ ```python
26
+ from fasttextembed import TextEmbedding
27
+
28
+ model = TextEmbedding() # downloads + caches the model on first call
29
+ vectors = model.embed(["hello world", "fast"]) # list of 384-float vectors
30
+ one = model.embed_one("hello world")
31
+ ```
32
+
33
+ - `embed(texts, threads=0)` — batch; `threads<=0` uses all cores.
34
+ - Returns plain `list[list[float]]` (wrap with `numpy.asarray(...)` if you want an array).
35
+
36
+ Environment overrides: `FTE_MODEL_DIR` (use local `model.fte`/`vocab.tsv`), `FTE_MODEL_URL`
37
+ (download base URL), `FTE_CACHE` (cache dir), `FTE_LIB` (path to the C shared library).
@@ -0,0 +1,22 @@
1
+ # fasttextembed (Python)
2
+
3
+ Fast, dependency-free text embeddings for `BAAI/bge-small-en-v1.5`, powered by a small pure-C
4
+ engine (no PyTorch, no ONNX Runtime). The model (~64 MB) is downloaded and cached on first use.
5
+
6
+ ```bash
7
+ pip install fasttextembed
8
+ ```
9
+
10
+ ```python
11
+ from fasttextembed import TextEmbedding
12
+
13
+ model = TextEmbedding() # downloads + caches the model on first call
14
+ vectors = model.embed(["hello world", "fast"]) # list of 384-float vectors
15
+ one = model.embed_one("hello world")
16
+ ```
17
+
18
+ - `embed(texts, threads=0)` — batch; `threads<=0` uses all cores.
19
+ - Returns plain `list[list[float]]` (wrap with `numpy.asarray(...)` if you want an array).
20
+
21
+ Environment overrides: `FTE_MODEL_DIR` (use local `model.fte`/`vocab.tsv`), `FTE_MODEL_URL`
22
+ (download base URL), `FTE_CACHE` (cache dir), `FTE_LIB` (path to the C shared library).
@@ -0,0 +1,30 @@
1
+ #ifndef FTE_H
2
+ #define FTE_H
3
+ #include <stddef.h>
4
+
5
+ typedef struct fte_model fte_model;
6
+
7
+ typedef enum {
8
+ FTE_OK = 0,
9
+ FTE_ERR_IO,
10
+ FTE_ERR_FORMAT,
11
+ FTE_ERR_ARCH_MISMATCH,
12
+ FTE_ERR_OOM,
13
+ FTE_ERR_INPUT
14
+ } fte_status;
15
+
16
+ #define FTE_DIM 384
17
+
18
+ fte_status fte_init(const char *fte_path, const char *vocab_tsv, fte_model **out);
19
+ fte_status fte_embed(fte_model *m, const char *text, float *out384);
20
+
21
+ /* Embed n documents into out[n*384]. Parallelized across `threads` worker threads
22
+ * (<=0 means auto = number of cores). Weights are shared read-only; each thread has
23
+ * its own scratch. Thread-safe for distinct calls. */
24
+ fte_status fte_embed_batch(fte_model *m, const char *const *texts, size_t n,
25
+ float *out, int threads);
26
+
27
+ void fte_free(fte_model *m);
28
+ const char *fte_strerror(fte_status s);
29
+
30
+ #endif
@@ -0,0 +1,197 @@
1
+ #include "fte/fte.h"
2
+ #include "loader.h"
3
+ #include "tokenizer/tokenizer.h"
4
+ #include "model_bert.h"
5
+ #include "threadpool.h"
6
+ #include "config.h"
7
+ #ifndef FTE_NO_THREADS
8
+ #include <pthread.h>
9
+ #include <stdatomic.h>
10
+ #endif
11
+ #include <stdlib.h>
12
+ #include <unistd.h>
13
+
14
+ struct fte_model {
15
+ fte_weights w;
16
+ fte_packed packed;
17
+ fte_tokenizer *tok;
18
+ fte_arena arena;
19
+ fte_pool *pool; /* intra-doc parallelism for single fte_embed calls */
20
+ };
21
+
22
+ #if defined(__APPLE__)
23
+ #include <sys/sysctl.h>
24
+ #endif
25
+
26
+ /* Threads for intra-document parallelism, chosen for the actual hardware:
27
+ * - FTE_THREADS env var overrides everything (tuning / explicit control).
28
+ * - Apple Silicon: performance cores only minus 2 (efficiency cores straggle at the
29
+ * fork/join barrier; leaving headroom avoids preempting the main thread).
30
+ * - Homogeneous CPUs (Linux ARM/x86): all online cores, none held back (measured best on N1).
31
+ * No reason to copy the Mac's "minus 2" onto a homogeneous machine. */
32
+ static int fte_intra_threads(void) {
33
+ const char *env = getenv("FTE_THREADS");
34
+ if (env) { int v = atoi(env); if (v >= 1) return v; }
35
+ #if defined(__APPLE__)
36
+ int v = 0;
37
+ size_t sz = sizeof v;
38
+ if (sysctlbyname("hw.perflevel0.physicalcpu", &v, &sz, NULL, 0) == 0 && v > 2)
39
+ return v - 2;
40
+ #endif
41
+ long nc = sysconf(_SC_NPROCESSORS_ONLN); /* homogeneous: use all cores (measured best on N1) */
42
+ return nc > 0 ? (int)nc : 1;
43
+ }
44
+
45
+ fte_status fte_init(const char *fte_path, const char *vocab, fte_model **out) {
46
+ if (!fte_path || !vocab || !out) return FTE_ERR_INPUT;
47
+ fte_model *m = calloc(1, sizeof *m);
48
+ if (!m) return FTE_ERR_OOM;
49
+
50
+ int r = fte_weights_open(fte_path, &m->w);
51
+ if (r == -3) { free(m); return FTE_ERR_ARCH_MISMATCH; }
52
+ if (r == -2) { free(m); return FTE_ERR_FORMAT; }
53
+ if (r != 0) { free(m); return FTE_ERR_IO; }
54
+
55
+ if (fte_tokenizer_load(vocab, &m->tok) != 0) {
56
+ fte_weights_close(&m->w);
57
+ free(m);
58
+ return FTE_ERR_IO;
59
+ }
60
+ if (fte_arena_init(&m->arena, (size_t)64 * 1024 * 1024) != 0) { /* 64MB scratch */
61
+ fte_tokenizer_free(m->tok);
62
+ fte_weights_close(&m->w);
63
+ free(m);
64
+ return FTE_ERR_OOM;
65
+ }
66
+ if (fte_pack_build(&m->w, &m->packed) != 0) {
67
+ fte_pack_free(&m->packed);
68
+ fte_arena_free(&m->arena);
69
+ fte_tokenizer_free(m->tok);
70
+ fte_weights_close(&m->w);
71
+ free(m);
72
+ return FTE_ERR_OOM;
73
+ }
74
+ int pc = fte_intra_threads();
75
+ m->pool = fte_pool_create(pc > 1 ? pc : 1); /* intra-doc parallelism; NULL if 1 core */
76
+ *out = m;
77
+ return FTE_OK;
78
+ }
79
+
80
+ fte_status fte_embed(fte_model *m, const char *text, float *out) {
81
+ if (!m || !text || !out) return FTE_ERR_INPUT;
82
+ int ids[FTE_MAX_POS];
83
+ int seq = fte_tokenizer_encode(m->tok, text, ids, FTE_MAX_POS);
84
+ if (seq < 2) return FTE_ERR_INPUT;
85
+ fte_arena_reset(&m->arena);
86
+ fte_pool_begin(m->pool);
87
+ int rc = fte_bert_embed(&m->w, &m->packed, m->pool, &m->arena, ids, seq, out);
88
+ fte_pool_end(m->pool);
89
+ if (rc != 0) return FTE_ERR_OOM;
90
+ return FTE_OK;
91
+ }
92
+
93
+ #ifdef FTE_NO_THREADS
94
+ /* Single-threaded batch (e.g. WebAssembly): embed each doc on the caller. */
95
+ fte_status fte_embed_batch(fte_model *m, const char *const *texts, size_t n,
96
+ float *out, int threads) {
97
+ (void)threads; /* no worker threads in this build; every document runs on the caller */
98
+ if (!m || !texts || !out) return FTE_ERR_INPUT;
99
+ int ids[FTE_MAX_POS];
100
+ for (size_t i = 0; i < n; i++) {
101
+ float *o = out + i * FTE_DIM;
102
+ int seq = texts[i] ? fte_tokenizer_encode(m->tok, texts[i], ids, FTE_MAX_POS) : 0; /* NULL entry never reaches the tokenizer; it takes the zero-vector path below */
103
+ if (seq < 2) { for (int d = 0; d < FTE_DIM; d++) o[d] = 0.0f; continue; }
104
+ fte_arena_reset(&m->arena);
105
+ if (fte_bert_embed(&m->w, &m->packed, NULL, &m->arena, ids, seq, o) != 0) return FTE_ERR_OOM;
106
+ }
107
+ return FTE_OK;
108
+ }
109
+ #else
110
+
111
+ typedef struct {
112
+ fte_model *m;
113
+ const char *const *texts;
114
+ float *out;
115
+ size_t n;
116
+ _Atomic size_t *next; /* shared work-stealing doc counter */
117
+ int err;
118
+ } batch_job;
119
+
120
+ static void *batch_worker(void *p) {
121
+ batch_job *j = p;
122
+ fte_arena a;
123
+ if (fte_arena_init(&a, (size_t)64 * 1024 * 1024) != 0) { j->err = 1; return NULL; }
124
+ int ids[FTE_MAX_POS];
125
+ /* Work-stealing: grab the next document dynamically. On asymmetric CPUs (e.g. Apple
126
+ * Silicon's perf+efficiency cores) this stops slow cores from stalling the whole batch. */
127
+ for (;;) {
128
+ size_t i = atomic_fetch_add_explicit(j->next, 1, memory_order_relaxed);
129
+ if (i >= j->n) break;
130
+ float *o = j->out + i * FTE_DIM;
131
+ int seq = j->texts[i] ? fte_tokenizer_encode(j->m->tok, j->texts[i], ids, FTE_MAX_POS) : 0; /* NULL entry never reaches the tokenizer; it takes the zero-vector path below */
132
+ if (seq < 2) { for (int d = 0; d < FTE_DIM; d++) o[d] = 0.0f; continue; }
133
+ fte_arena_reset(&a);
134
+ if (fte_bert_embed(&j->m->w, &j->m->packed, NULL, &a, ids, seq, o) != 0) { j->err = 1; break; }
135
+ }
136
+ fte_arena_free(&a);
137
+ return NULL;
138
+ }
139
+
140
+ fte_status fte_embed_batch(fte_model *m, const char *const *texts, size_t n,
141
+ float *out, int threads) {
142
+ if (!m || !texts || !out) return FTE_ERR_INPUT;
143
+ if (n == 0) return FTE_OK;
144
+ if (threads <= 0) {
145
+ long nc = sysconf(_SC_NPROCESSORS_ONLN);
146
+ threads = nc > 0 ? (int)nc : 1;
147
+ }
148
+ if ((size_t)threads > n) threads = (int)n;
149
+
150
+ if (threads > 256) threads = 256;
151
+ pthread_t tid[256];
152
+ batch_job jobs[256];
153
+ char created[256] = {0};
154
+ _Atomic size_t next = 0;
155
+ for (int t = 0; t < threads; t++) {
156
+ jobs[t] = (batch_job){m, texts, out, n, &next, 0};
157
+ if (pthread_create(&tid[t], NULL, batch_worker, &jobs[t]) == 0)
158
+ created[t] = 1;
159
+ else
160
+ batch_worker(&jobs[t]); /* run inline if spawn fails */
161
+ }
162
+ fte_status st = FTE_OK;
163
+ for (int t = 0; t < threads; t++) {
164
+ if (created[t]) pthread_join(tid[t], NULL);
165
+ if (jobs[t].err) st = FTE_ERR_OOM;
166
+ }
167
+ return st;
168
+ }
169
+ #endif /* FTE_NO_THREADS */
170
+
171
+ #ifdef FTE_PROFILE
172
+ extern void fte_profile_dump(void);
173
+ #endif
174
+
175
+ void fte_free(fte_model *m) {
176
+ if (!m) return;
177
+ #ifdef FTE_PROFILE
178
+ fte_profile_dump();
179
+ #endif
180
+ fte_pool_destroy(m->pool);
181
+ fte_arena_free(&m->arena);
182
+ fte_pack_free(&m->packed);
183
+ fte_tokenizer_free(m->tok);
184
+ fte_weights_close(&m->w);
185
+ free(m);
186
+ }
187
+
188
+ const char *fte_strerror(fte_status s) {
189
+ switch (s) {
190
+ case FTE_OK: return "ok";
191
+ case FTE_ERR_IO: return "io error";
192
+ case FTE_ERR_FORMAT: return "bad format";
193
+ case FTE_ERR_ARCH_MISMATCH: return "arch mismatch";
194
+ case FTE_ERR_OOM: return "out of memory";
195
+ default: return "bad input";
196
+ }
197
+ }
@@ -0,0 +1,20 @@
1
+ #ifndef FTE_CONFIG_H
2
+ #define FTE_CONFIG_H
3
+
4
+ /* Compile-time constants specialized to BAAI/bge-small-en-v1.5
5
+ * (model_optimized.onnx, fp16 weights run in fp32 via com.microsoft fused ops). */
6
+ #define FTE_HIDDEN 384
7
+ #define FTE_LAYERS 12
8
+ #define FTE_HEADS 12
9
+ #define FTE_HEAD_DIM 32 /* HIDDEN / HEADS */
10
+ #define FTE_INTERMEDIATE 1536
11
+ #define FTE_VOCAB 30522
12
+ #define FTE_MAX_POS 512
13
+ #define FTE_TYPE_VOCAB 2
14
+ #define FTE_LN_EPS 1e-12f
15
+ #define FTE_ATTN_SCALE 0.17677669529663687f /* 1/sqrt(32) */
16
+ #define FTE_MASK_FILTER -3.4028234663852886e+38f
17
+ #define FTE_GELU_C 0.7978845608028654f /* sqrt(2/pi) */
18
+ #define FTE_GELU_A 0.044715f
19
+
20
+ #endif
@@ -0,0 +1,43 @@
1
+ #ifndef FTE_FP16_H
2
+ #define FTE_FP16_H
3
+ #include <stdint.h>
4
+ #include <string.h>
5
+
6
+ /* Raw IEEE-754 half stored as 16 bits. */
7
+ typedef unsigned short fte_f16;
8
+
9
+ #if defined(__FLT16_MAX__) && !defined(FTE_NO_FLOAT16)
10
+ /* Hardware path (clang/gcc on arm64, x86-64 with the feature): matches NEON vcvt exactly. */
11
+ static inline float fte_h2f(fte_f16 h) {
12
+ _Float16 v;
13
+ memcpy(&v, &h, sizeof v);
14
+ return (float)v;
15
+ }
16
+ #else
17
+ /* Software IEEE half -> float, for targets without _Float16 (e.g. WebAssembly). */
18
+ static inline float fte_h2f(fte_f16 h) {
19
+ uint32_t sign = (uint32_t)(h & 0x8000u) << 16;
20
+ uint32_t exp = (h >> 10) & 0x1fu;
21
+ uint32_t mant = h & 0x3ffu;
22
+ uint32_t bits;
23
+ if (exp == 0) {
24
+ if (mant == 0) {
25
+ bits = sign; /* +/- zero */
26
+ } else { /* subnormal */
27
+ exp = 127 - 15 + 1;
28
+ while (!(mant & 0x400u)) { mant <<= 1; exp--; }
29
+ mant &= 0x3ffu;
30
+ bits = sign | (exp << 23) | (mant << 13);
31
+ }
32
+ } else if (exp == 0x1f) { /* inf / nan */
33
+ bits = sign | 0x7f800000u | (mant << 13);
34
+ } else { /* normal */
35
+ bits = sign | ((exp - 15 + 127) << 23) | (mant << 13);
36
+ }
37
+ float f;
38
+ memcpy(&f, &bits, 4);
39
+ return f;
40
+ }
41
+ #endif
42
+
43
+ #endif
@@ -0,0 +1,33 @@
1
+ #ifndef FTE_FORMAT_H
2
+ #define FTE_FORMAT_H
3
+ #include <stdint.h>
4
+
5
+ /* On-disk .fte format. Shared contract between tools/convert.py and src/loader.c. */
6
+ #define FTE_MAGIC 0x31455446u /* "FTE1" little-endian */
7
+ #define FTE_VERSION 2u /* v2: per-tensor dtype (fp32 or fp16) */
8
+ #define FTE_NAME_MAX 64
9
+ #define FTE_ALIGN 64
10
+
11
+ #define FTE_DT_F32 0
12
+ #define FTE_DT_F16 1
13
+
14
+ typedef struct { /* one entry per weight tensor; 104 bytes, no padding */
15
+ uint64_t offset; /* absolute byte offset into the file */
16
+ uint64_t nbytes; /* = product(shape) * elem_size(dtype) */
17
+ char name[FTE_NAME_MAX];
18
+ int32_t ndim;
19
+ int32_t shape[4];
20
+ int32_t dtype; /* FTE_DT_F32 | FTE_DT_F16 */
21
+ } fte_tensor_entry;
22
+
23
+ typedef struct {
24
+ /* 8-byte fields first to avoid implicit padding (Python writer matches byte-for-byte) */
25
+ uint64_t table_offset; /* byte offset of fte_tensor_entry[n_tensors] */
26
+ uint64_t blob_offset; /* byte offset of the weight blob */
27
+ uint32_t magic, version;
28
+ uint32_t hidden, layers, heads, intermediate, vocab, max_pos, type_vocab;
29
+ uint32_t n_tensors;
30
+ uint32_t _pad;
31
+ } fte_header;
32
+
33
+ #endif
@@ -0,0 +1,48 @@
1
+ #ifndef FTE_KERNELS_H
2
+ #define FTE_KERNELS_H
3
+ #include <stddef.h>
4
+ #include "fp16.h"
5
+
6
+ /* All row-major. */
7
+
8
+ /* A:[M,K] B:[K,N] -> C:[M,N] (B not transposed), all fp32. */
9
+ void fte_matmul(const float *A, const float *B, float *C, int M, int K, int N);
10
+
11
+ /* Same, but B is fp16 weights (widened to fp32 in-flight); fp32 accumulation.
12
+ * Bit-identical to fte_matmul on the fp32-widened weights. NEON-accelerated on arm64. */
13
+ void fte_matmul_f16w(const float *A, const fte_f16 *B, float *C, int M, int K, int N);
14
+
15
+ /* B pre-packed into 16-column panels (see pack.h). Keeps a 16-wide C tile in NEON
16
+ * registers across k; reads packed fp16 contiguously. Same k-order ⇒ bit-identical
17
+ * to fte_matmul_f16w. N must be a multiple of 16. */
18
+ void fte_matmul_f16w_packed(const float *A, const fte_f16 *Bp, float *C, int M, int K, int N);
19
+
20
+ /* Same, but only computes output column-panels [nb0,nb1) (each panel is 16 cols).
21
+ * Used to split one matmul across threads — disjoint output, no races. */
22
+ void fte_matmul_f16w_packed_range(const float *A, const fte_f16 *Bp, float *C,
23
+ int M, int K, int N, int nb0, int nb1);
24
+
25
+ /* fp16-ACCUMULATE variant (matches ONNX Runtime MLAS HalfGemmKernelNeon): A is already
26
+ * fp16, B is packed fp16, accumulation is fp16 (8-wide .8h FMA — 2x the fp32 kernel per
27
+ * core). Output written as fp32. Panels [nb0,nb1). */
28
+ void fte_matmul_f16_packed_range(const fte_f16 *A, const fte_f16 *Bp, float *C,
29
+ int M, int K, int N, int nb0, int nb1);
30
+
31
+ /* C[m,n] += bias[n] */
32
+ void fte_add_bias(float *C, const float *bias, int M, int N);
33
+
34
+ /* in/out:[M,D]; LayerNorm over D with gamma,beta,eps (population variance) */
35
+ void fte_layernorm(float *X, const float *gamma, const float *beta, int M, int D, float eps);
36
+
37
+ /* OUT[M,D] = LayerNorm(X + skip + bias) over D, then * gamma + beta */
38
+ void fte_skip_layernorm(const float *X, const float *skip, const float *bias,
39
+ const float *gamma, const float *beta, float *OUT,
40
+ int M, int D, float eps);
41
+
42
+ /* in/out:[M,D]; FastGelu of (x + bias[d]) */
43
+ void fte_fastgelu(float *X, const float *bias, int M, int D);
44
+
45
+ /* in/out:[rows,cols]; softmax over each row's first `valid` entries, rest set to 0 */
46
+ void fte_softmax_rows(float *X, int rows, int cols, int valid);
47
+
48
+ #endif