rllama 1.0.1-aarch64-linux-musl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/rllama/cpp.rb ADDED
@@ -0,0 +1,690 @@
+ # frozen_string_literal: true
+
+ require 'ffi'
+
+ module Rllama
+   module Cpp
+     extend FFI::Library
+
+     LIB_NAME = 'llama'
+
+     platform =
+       case FFI::Platform::OS
+       when 'darwin'
+         FFI::Platform::ARCH == 'aarch64' ? 'arm64-darwin' : 'x86_64-darwin'
+       when 'windows', 'mingw32'
+         'x64-mingw32'
+       else
+         FFI::Platform::ARCH == 'aarch64' ? 'aarch64-linux' : 'x86_64-linux'
+       end
+
+     lib_file =
+       case FFI::Platform::OS
+       when 'darwin'
+         "lib#{LIB_NAME}.dylib"
+       when 'windows', 'mingw32'
+         "#{LIB_NAME}.dll"
+       else
+         "lib#{LIB_NAME}.so"
+       end
+
+     platform_dir = File.join(__dir__, platform)
+     platform_path = File.join(platform_dir, lib_file)
+
+     lib_paths = []
+     lib_paths << platform_path if File.exist?(platform_path)
+
+     lib_paths +=
+       case FFI::Platform::OS
+       when 'darwin'
+         [
+           "lib#{LIB_NAME}.dylib",
+           "/opt/homebrew/lib/lib#{LIB_NAME}.dylib",
+           "/usr/local/lib/lib#{LIB_NAME}.dylib"
+         ]
+       when 'windows', 'mingw32'
+         [
+           "#{LIB_NAME}.dll",
+           "lib#{LIB_NAME}.dll"
+         ]
+       else
+         [
+           "lib#{LIB_NAME}.so",
+           "/usr/lib/lib#{LIB_NAME}.so",
+           "/usr/local/lib/lib#{LIB_NAME}.so"
+         ]
+       end
+
+     ffi_lib lib_paths
+
+     # --- Typedefs and Opaque Pointers ---
+     typedef :pointer, :llama_vocab_p
+     typedef :pointer, :llama_model_p
+     typedef :pointer, :llama_context_p
+     typedef :pointer, :llama_sampler_p
+     typedef :pointer, :llama_memory_t
+     typedef :pointer, :llama_adapter_lora_p
+     typedef :pointer, :llama_sampler_context_t
+     typedef :pointer, :ggml_threadpool_t
+
+     typedef :int32, :llama_pos
+     typedef :int32, :llama_token
+     typedef :int32, :llama_seq_id
+     typedef :uint32, :llama_state_seq_flags
+
+     # --- Callbacks ---
+     # from ggml-backend.h
+     callback :ggml_backend_sched_eval_callback, %i[pointer bool pointer], :bool
+     callback :ggml_abort_callback, [:pointer], :bool
+     callback :ggml_log_callback, %i[int string pointer], :void # Assuming ggml_log_level is int
+
+     # from llama.h
+     callback :llama_progress_callback, %i[float pointer], :bool
+
+     # for training
+     callback :llama_opt_param_filter, %i[pointer pointer], :bool
+
+     # --- Enums and Constants as Module Constants ---
+
+     # from ggml.h (ggml_type)
+     GGML_TYPE_F32 = 0
+     GGML_TYPE_F16 = 1
+     GGML_TYPE_Q4_0 = 2
+     GGML_TYPE_Q4_1 = 3
+     GGML_TYPE_Q5_0 = 6
+     GGML_TYPE_Q5_1 = 7
+     GGML_TYPE_Q8_0 = 8
+     GGML_TYPE_Q8_1 = 9
+     GGML_TYPE_Q2_K = 10
+     GGML_TYPE_Q3_K = 11
+     GGML_TYPE_Q4_K = 12
+     GGML_TYPE_Q5_K = 13
+     GGML_TYPE_Q6_K = 14
+     GGML_TYPE_Q8_K = 15
+     GGML_TYPE_IQ2_XXS = 16
+     GGML_TYPE_IQ2_XS = 17
+     GGML_TYPE_IQ3_XXS = 18
+     GGML_TYPE_IQ1_S = 19
+     GGML_TYPE_IQ4_NL = 20
+     GGML_TYPE_IQ3_S = 21
+     GGML_TYPE_IQ2_S = 22
+     GGML_TYPE_IQ4_XS = 23
+     GGML_TYPE_I8 = 24
+     GGML_TYPE_I16 = 25
+     GGML_TYPE_I32 = 26
+     GGML_TYPE_I64 = 27
+     GGML_TYPE_F64 = 28
+     GGML_TYPE_IQ1_M = 29
+     GGML_TYPE_COUNT = 30
+
+     # from llama.h
+     attach_function :llama_max_devices, [], :size_t
+     LLAMA_MAX_DEVICES = llama_max_devices
+
+     LLAMA_DEFAULT_SEED = 0xFFFFFFFF
+     LLAMA_TOKEN_NULL = -1
+     LLAMA_FILE_MAGIC_GGLA = 0x67676C61
+     LLAMA_FILE_MAGIC_GGSN = 0x6767736E
+     LLAMA_FILE_MAGIC_GGSQ = 0x67677371
+     LLAMA_SESSION_MAGIC = LLAMA_FILE_MAGIC_GGSN
+     LLAMA_SESSION_VERSION = 9
+     LLAMA_STATE_SEQ_MAGIC = LLAMA_FILE_MAGIC_GGSQ
+     LLAMA_STATE_SEQ_VERSION = 2
+     LLAMA_STATE_SEQ_FLAGS_SWA_ONLY = 1
+
+     # enum llama_vocab_type
+     LLAMA_VOCAB_TYPE_NONE = 0
+     LLAMA_VOCAB_TYPE_SPM = 1
+     LLAMA_VOCAB_TYPE_BPE = 2
+     LLAMA_VOCAB_TYPE_WPM = 3
+     LLAMA_VOCAB_TYPE_UGM = 4
+     LLAMA_VOCAB_TYPE_RWKV = 5
+     LLAMA_VOCAB_TYPE_PLAMO2 = 6
+
+     # enum llama_rope_type
+     GGML_ROPE_TYPE_NEOX = 2
+     GGML_ROPE_TYPE_MROPE = 8
+     GGML_ROPE_TYPE_VISION = 24
+     LLAMA_ROPE_TYPE_NONE = -1
+     LLAMA_ROPE_TYPE_NORM = 0
+     LLAMA_ROPE_TYPE_NEOX = GGML_ROPE_TYPE_NEOX
+     LLAMA_ROPE_TYPE_MROPE = GGML_ROPE_TYPE_MROPE
+     LLAMA_ROPE_TYPE_VISION = GGML_ROPE_TYPE_VISION
+
+     # enum llama_token_type
+     LLAMA_TOKEN_TYPE_UNDEFINED = 0
+     LLAMA_TOKEN_TYPE_NORMAL = 1
+     LLAMA_TOKEN_TYPE_UNKNOWN = 2
+     LLAMA_TOKEN_TYPE_CONTROL = 3
+     LLAMA_TOKEN_TYPE_USER_DEFINED = 4
+     LLAMA_TOKEN_TYPE_UNUSED = 5
+     LLAMA_TOKEN_TYPE_BYTE = 6
+
+     # enum llama_token_attr
+     LLAMA_TOKEN_ATTR_UNDEFINED = 0
+     LLAMA_TOKEN_ATTR_UNKNOWN = 1 << 0
+     LLAMA_TOKEN_ATTR_UNUSED = 1 << 1
+     LLAMA_TOKEN_ATTR_NORMAL = 1 << 2
+     LLAMA_TOKEN_ATTR_CONTROL = 1 << 3
+     LLAMA_TOKEN_ATTR_USER_DEFINED = 1 << 4
+     LLAMA_TOKEN_ATTR_BYTE = 1 << 5
+     LLAMA_TOKEN_ATTR_NORMALIZED = 1 << 6
+     LLAMA_TOKEN_ATTR_LSTRIP = 1 << 7
+     LLAMA_TOKEN_ATTR_RSTRIP = 1 << 8
+     LLAMA_TOKEN_ATTR_SINGLE_WORD = 1 << 9
+
+     # enum llama_ftype
+     LLAMA_FTYPE_ALL_F32 = 0
+     LLAMA_FTYPE_MOSTLY_F16 = 1
+     LLAMA_FTYPE_MOSTLY_Q4_0 = 2
+     LLAMA_FTYPE_MOSTLY_Q4_1 = 3
+     LLAMA_FTYPE_MOSTLY_Q8_0 = 7
+     LLAMA_FTYPE_MOSTLY_Q5_0 = 8
+     LLAMA_FTYPE_MOSTLY_Q5_1 = 9
+     LLAMA_FTYPE_MOSTLY_Q2_K = 10
+     LLAMA_FTYPE_MOSTLY_Q3_K_S = 11
+     LLAMA_FTYPE_MOSTLY_Q3_K_M = 12
+     LLAMA_FTYPE_MOSTLY_Q3_K_L = 13
+     LLAMA_FTYPE_MOSTLY_Q4_K_S = 14
+     LLAMA_FTYPE_MOSTLY_Q4_K_M = 15
+     LLAMA_FTYPE_MOSTLY_Q5_K_S = 16
+     LLAMA_FTYPE_MOSTLY_Q5_K_M = 17
+     LLAMA_FTYPE_MOSTLY_Q6_K = 18
+     LLAMA_FTYPE_MOSTLY_IQ2_XXS = 19
+     LLAMA_FTYPE_MOSTLY_IQ2_XS = 20
+     LLAMA_FTYPE_MOSTLY_Q2_K_S = 21
+     LLAMA_FTYPE_MOSTLY_IQ3_XS = 22
+     LLAMA_FTYPE_MOSTLY_IQ3_XXS = 23
+     LLAMA_FTYPE_MOSTLY_IQ1_S = 24
+     LLAMA_FTYPE_MOSTLY_IQ4_NL = 25
+     LLAMA_FTYPE_MOSTLY_IQ3_S = 26
+     LLAMA_FTYPE_MOSTLY_IQ3_M = 27
+     LLAMA_FTYPE_MOSTLY_IQ2_S = 28
+     LLAMA_FTYPE_MOSTLY_IQ2_M = 29
+     LLAMA_FTYPE_MOSTLY_IQ4_XS = 30
+     LLAMA_FTYPE_MOSTLY_IQ1_M = 31
+     LLAMA_FTYPE_MOSTLY_BF16 = 32
+     LLAMA_FTYPE_MOSTLY_TQ1_0 = 36
+     LLAMA_FTYPE_MOSTLY_TQ2_0 = 37
+     LLAMA_FTYPE_MOSTLY_MXFP4_MOE = 38
+     LLAMA_FTYPE_GUESSED = 1024
+
+     # enum llama_rope_scaling_type
+     LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED = -1
+     LLAMA_ROPE_SCALING_TYPE_NONE = 0
+     LLAMA_ROPE_SCALING_TYPE_LINEAR = 1
+     LLAMA_ROPE_SCALING_TYPE_YARN = 2
+     LLAMA_ROPE_SCALING_TYPE_LONGROPE = 3
+     LLAMA_ROPE_SCALING_TYPE_MAX_VALUE = LLAMA_ROPE_SCALING_TYPE_LONGROPE
+
+     # enum llama_pooling_type
+     LLAMA_POOLING_TYPE_UNSPECIFIED = -1
+     LLAMA_POOLING_TYPE_NONE = 0
+     LLAMA_POOLING_TYPE_MEAN = 1
+     LLAMA_POOLING_TYPE_CLS = 2
+     LLAMA_POOLING_TYPE_LAST = 3
+     LLAMA_POOLING_TYPE_RANK = 4
+
+     # enum llama_attention_type
+     LLAMA_ATTENTION_TYPE_UNSPECIFIED = -1
+     LLAMA_ATTENTION_TYPE_CAUSAL = 0
+     LLAMA_ATTENTION_TYPE_NON_CAUSAL = 1
+
+     # enum llama_flash_attn_type
+     LLAMA_FLASH_ATTN_TYPE_AUTO = -1
+     LLAMA_FLASH_ATTN_TYPE_DISABLED = 0
+     LLAMA_FLASH_ATTN_TYPE_ENABLED = 1
+
+     # enum llama_split_mode
+     LLAMA_SPLIT_MODE_NONE = 0
+     LLAMA_SPLIT_MODE_LAYER = 1
+     LLAMA_SPLIT_MODE_ROW = 2
+
+     # enum llama_model_kv_override_type
+     LLAMA_KV_OVERRIDE_TYPE_INT = 0
+     LLAMA_KV_OVERRIDE_TYPE_FLOAT = 1
+     LLAMA_KV_OVERRIDE_TYPE_BOOL = 2
+     LLAMA_KV_OVERRIDE_TYPE_STR = 3
+
+     # enum ggml_numa_strategy
+     GGML_NUMA_STRATEGY_DISABLED = 0
+     GGML_NUMA_STRATEGY_DISTRIBUTE = 1
+     GGML_NUMA_STRATEGY_ISOLATE = 2
+     GGML_NUMA_STRATEGY_NUMACTL = 3
+     GGML_NUMA_STRATEGY_MIRROR = 4
+     GGML_NUMA_STRATEGY_COUNT = 5
+
+     # --- Structs and Unions ---
+
+     class LlamaTokenData < FFI::Struct
+       layout :id, :llama_token,
+              :logit, :float,
+              :p, :float
+     end
+
+     class LlamaTokenDataArray < FFI::Struct
+       layout :data, :pointer, # LlamaTokenData*
+              :size, :size_t,
+              :selected, :int64,
+              :sorted, :bool
+     end
+
+     class LlamaBatch < FFI::Struct
+       layout :n_tokens, :int32,
+              :token, :pointer, # llama_token*
+              :embd, :pointer, # float*
+              :pos, :pointer, # llama_pos*
+              :n_seq_id, :pointer, # int32*
+              :seq_id, :pointer, # llama_seq_id**
+              :logits, :pointer # int8*
+     end
+
+     class LlamaModelKvOverrideValue < FFI::Union
+       layout :val_i64, :int64,
+              :val_f64, :double,
+              :val_bool, :bool,
+              :val_str, [:char, 128]
+     end
+
+     class LlamaModelKvOverride < FFI::Struct
+       layout :tag, :int, # enum llama_model_kv_override_type
+              :key, [:char, 128],
+              :value, LlamaModelKvOverrideValue
+     end
+
+     class LlamaModelTensorBuftOverride < FFI::Struct
+       layout :pattern, :string,
+              :buft, :pointer # ggml_backend_buffer_type_t
+     end
+
+     class LlamaModelParams < FFI::Struct
+       layout :devices, :pointer, # ggml_backend_dev_t*
+              :tensor_buft_overrides, :pointer, # LlamaModelTensorBuftOverride*
+              :n_gpu_layers, :int32,
+              :split_mode, :int, # enum llama_split_mode
+              :main_gpu, :int32,
+              :tensor_split, :pointer, # const float *
+              :progress_callback, :llama_progress_callback,
+              :progress_callback_user_data, :pointer,
+              :kv_overrides, :pointer, # const LlamaModelKvOverride*
+              :vocab_only, :bool,
+              :use_mmap, :bool,
+              :use_mlock, :bool,
+              :check_tensors, :bool,
+              :use_extra_bufts, :bool
+     end
+
+     class LlamaContextParams < FFI::Struct
+       layout :n_ctx, :uint32,
+              :n_batch, :uint32,
+              :n_ubatch, :uint32,
+              :n_seq_max, :uint32,
+              :n_threads, :int32,
+              :n_threads_batch, :int32,
+              :rope_scaling_type, :int, # enum llama_rope_scaling_type
+              :pooling_type, :int, # enum llama_pooling_type
+              :attention_type, :int, # enum llama_attention_type
+              :flash_attn_type, :int, # enum llama_flash_attn_type
+              :rope_freq_base, :float,
+              :rope_freq_scale, :float,
+              :yarn_ext_factor, :float,
+              :yarn_attn_factor, :float,
+              :yarn_beta_fast, :float,
+              :yarn_beta_slow, :float,
+              :yarn_orig_ctx, :uint32,
+              :defrag_thold, :float,
+              :cb_eval, :ggml_backend_sched_eval_callback,
+              :cb_eval_user_data, :pointer,
+              :type_k, :int, # enum ggml_type
+              :type_v, :int, # enum ggml_type
+              :abort_callback, :ggml_abort_callback,
+              :abort_callback_data, :pointer,
+              :embeddings, :bool,
+              :offload_kqv, :bool,
+              :no_perf, :bool,
+              :op_offload, :bool,
+              :swa_full, :bool,
+              :kv_unified, :bool
+     end
+
+     class LlamaModelQuantizeParams < FFI::Struct
+       layout :nthread, :int32,
+              :ftype, :int, # enum llama_ftype
+              :output_tensor_type, :int, # enum ggml_type
+              :token_embedding_type, :int, # enum ggml_type
+              :allow_requantize, :bool,
+              :quantize_output_tensor, :bool,
+              :only_copy, :bool,
+              :pure, :bool,
+              :keep_split, :bool,
+              :imatrix, :pointer,
+              :kv_overrides, :pointer,
+              :tensor_types, :pointer,
+              :prune_layers, :pointer
+     end
+
+     class LlamaLogitBias < FFI::Struct
+       layout :token, :llama_token,
+              :bias, :float
+     end
+
+     class LlamaSamplerChainParams < FFI::Struct
+       layout :no_perf, :bool
+     end
+
+     class LlamaChatMessage < FFI::Struct
+       layout :role, :pointer,
+              :content, :pointer
+     end
+
+     class LlamaSamplerI < FFI::Struct; end
+
+     class LlamaSampler < FFI::Struct
+       layout :iface, :pointer, # const LlamaSamplerI *
+              :ctx, :llama_sampler_context_t
+     end
+
+     callback :llama_sampler_i_name, [:pointer], :pointer
+     callback :llama_sampler_i_accept, %i[pointer llama_token], :void
+     callback :llama_sampler_i_apply, %i[pointer pointer], :void # pointer to LlamaTokenDataArray
+     callback :llama_sampler_i_reset, [:pointer], :void
+     callback :llama_sampler_i_clone, [:pointer], :pointer
+     callback :llama_sampler_i_free, [:pointer], :void
+
+     LlamaSamplerI.layout(
+       :name, :llama_sampler_i_name,
+       :accept, :llama_sampler_i_accept,
+       :apply, :llama_sampler_i_apply,
+       :reset, :llama_sampler_i_reset,
+       :clone, :llama_sampler_i_clone,
+       :free, :llama_sampler_i_free
+     )
+
+     class LlamaPerfContextData < FFI::Struct
+       layout :t_start_ms, :double,
+              :t_load_ms, :double,
+              :t_p_eval_ms, :double,
+              :t_eval_ms, :double,
+              :n_p_eval, :int32,
+              :n_eval, :int32,
+              :n_reused, :int32
+     end
+
+     class LlamaPerfSamplerData < FFI::Struct
+       layout :t_sample_ms, :double,
+              :n_sample, :int32
+     end
+
+     class LlamaOptParams < FFI::Struct
+       layout :n_ctx_train, :uint32,
+              :param_filter, :llama_opt_param_filter,
+              :param_filter_ud, :pointer,
+              :get_opt_pars, :pointer, # ggml_opt_get_optimizer_params
+              :get_opt_pars_ud, :pointer,
+              :optimizer_type, :int # enum ggml_opt_optimizer_type
+     end
+
+     # --- Function Attachments ---
+
+     # Default params
+     attach_function :llama_model_default_params, [], LlamaModelParams.by_value
+     attach_function :llama_context_default_params, [], LlamaContextParams.by_value
+     attach_function :llama_sampler_chain_default_params, [], LlamaSamplerChainParams.by_value
+     attach_function :llama_model_quantize_default_params, [], LlamaModelQuantizeParams.by_value
+
+     # Backend init/free
+     attach_function :llama_backend_init, [], :void
+     attach_function :llama_backend_free, [], :void
+     attach_function :llama_numa_init, [:int], :void # ggml_numa_strategy
+
+     # Threadpool
+     attach_function :llama_attach_threadpool, %i[llama_context_p ggml_threadpool_t ggml_threadpool_t], :void
+     attach_function :llama_detach_threadpool, [:llama_context_p], :void
+
+     # Model loading
+     attach_function :llama_load_model_from_file, [:string, LlamaModelParams.by_value], :llama_model_p # DEPRECATED
+     attach_function :llama_model_load_from_file, [:string, LlamaModelParams.by_value], :llama_model_p
+     attach_function :llama_model_load_from_splits, [:pointer, :size_t, LlamaModelParams.by_value], :llama_model_p
+     attach_function :llama_model_save_to_file, %i[llama_model_p string], :void
+     attach_function :llama_free_model, [:llama_model_p], :void # DEPRECATED
+     attach_function :llama_model_free, [:llama_model_p], :void
+
+     # Context creation
+     attach_function :llama_init_from_model, [:llama_model_p, LlamaContextParams.by_value], :llama_context_p
+     # DEPRECATED
+     attach_function :llama_new_context_with_model, [:llama_model_p, LlamaContextParams.by_value], :llama_context_p
+     attach_function :llama_free, [:llama_context_p], :void
+
+     # System info and support checks
+     attach_function :llama_time_us, [], :int64
+     # llama_max_devices already attached
+     attach_function :llama_max_parallel_sequences, [], :size_t
+     attach_function :llama_supports_mmap, [], :bool
+     attach_function :llama_supports_mlock, [], :bool
+     attach_function :llama_supports_gpu_offload, [], :bool
+     attach_function :llama_supports_rpc, [], :bool
+     attach_function :llama_flash_attn_type_name, [:int], :string
+
+     # Context info
+     attach_function :llama_n_ctx, [:llama_context_p], :uint32
+     attach_function :llama_n_batch, [:llama_context_p], :uint32
+     attach_function :llama_n_ubatch, [:llama_context_p], :uint32
+     attach_function :llama_n_seq_max, [:llama_context_p], :uint32
+     attach_function :llama_get_model, [:llama_context_p], :llama_model_p
+     attach_function :llama_get_memory, [:llama_context_p], :llama_memory_t
+     attach_function :llama_pooling_type, [:llama_context_p], :int # enum llama_pooling_type
+
+     # Model info
+     attach_function :llama_model_get_vocab, [:llama_model_p], :llama_vocab_p
+     attach_function :llama_model_rope_type, [:llama_model_p], :int # enum llama_rope_type
+     attach_function :llama_model_n_ctx_train, [:llama_model_p], :int32
+     attach_function :llama_model_n_embd, [:llama_model_p], :int32
+     attach_function :llama_model_n_layer, [:llama_model_p], :int32
+     attach_function :llama_model_n_head, [:llama_model_p], :int32
+     attach_function :llama_model_n_head_kv, [:llama_model_p], :int32
+     attach_function :llama_model_n_swa, [:llama_model_p], :int32
+     attach_function :llama_model_rope_freq_scale_train, [:llama_model_p], :float
+     attach_function :llama_model_n_cls_out, [:llama_model_p], :uint32
+     attach_function :llama_model_cls_label, %i[llama_model_p uint32], :string
+     attach_function :llama_model_meta_val_str, %i[llama_model_p string pointer size_t], :int32
+     attach_function :llama_model_meta_count, [:llama_model_p], :int32
+     attach_function :llama_model_meta_key_by_index, %i[llama_model_p int32 pointer size_t], :int32
+     attach_function :llama_model_meta_val_str_by_index, %i[llama_model_p int32 pointer size_t], :int32
+     attach_function :llama_model_desc, %i[llama_model_p pointer size_t], :int32
+     attach_function :llama_model_size, [:llama_model_p], :uint64
+     attach_function :llama_model_chat_template, %i[llama_model_p string], :string
+     attach_function :llama_model_n_params, [:llama_model_p], :uint64
+     attach_function :llama_model_has_encoder, [:llama_model_p], :bool
+     attach_function :llama_model_has_decoder, [:llama_model_p], :bool
+     attach_function :llama_model_decoder_start_token, [:llama_model_p], :llama_token
+     attach_function :llama_model_is_recurrent, [:llama_model_p], :bool
+     attach_function :llama_model_is_diffusion, [:llama_model_p], :bool
+
+     # Vocab info
+     attach_function :llama_vocab_type, [:llama_vocab_p], :int # enum llama_vocab_type
+     attach_function :llama_vocab_n_tokens, [:llama_vocab_p], :int32
+
+     # Quantization
+     attach_function :llama_model_quantize, %i[string string pointer], :uint32
+
+     # Adapters
+     attach_function :llama_adapter_lora_init, %i[llama_model_p string], :llama_adapter_lora_p
+     attach_function :llama_adapter_meta_val_str, %i[llama_adapter_lora_p string pointer size_t], :int32
+     attach_function :llama_adapter_meta_count, [:llama_adapter_lora_p], :int32
+     attach_function :llama_adapter_meta_key_by_index, %i[llama_adapter_lora_p int32 pointer size_t], :int32
+     attach_function :llama_adapter_meta_val_str_by_index, %i[llama_adapter_lora_p int32 pointer size_t], :int32
+     attach_function :llama_adapter_lora_free, [:llama_adapter_lora_p], :void
+     attach_function :llama_adapter_get_alora_n_invocation_tokens, [:llama_adapter_lora_p], :uint64
+     attach_function :llama_adapter_get_alora_invocation_tokens, [:llama_adapter_lora_p], :pointer # const llama_token*
+     attach_function :llama_set_adapter_lora, %i[llama_context_p llama_adapter_lora_p float], :int32
+     attach_function :llama_rm_adapter_lora, %i[llama_context_p llama_adapter_lora_p], :int32
+     attach_function :llama_clear_adapter_lora, [:llama_context_p], :void
+     attach_function :llama_apply_adapter_cvec, %i[llama_context_p pointer size_t int32 int32 int32], :int32
+
+     # Memory management
+     attach_function :llama_memory_clear, %i[llama_memory_t bool], :void
+     attach_function :llama_memory_seq_rm, %i[llama_memory_t llama_seq_id llama_pos llama_pos], :bool
+     attach_function :llama_memory_seq_cp, %i[llama_memory_t llama_seq_id llama_seq_id llama_pos llama_pos], :void
+     attach_function :llama_memory_seq_keep, %i[llama_memory_t llama_seq_id], :void
+     attach_function :llama_memory_seq_add, %i[llama_memory_t llama_seq_id llama_pos llama_pos llama_pos], :void
+     attach_function :llama_memory_seq_div, %i[llama_memory_t llama_seq_id llama_pos llama_pos int], :void
+     attach_function :llama_memory_seq_pos_min, %i[llama_memory_t llama_seq_id], :llama_pos
+     attach_function :llama_memory_seq_pos_max, %i[llama_memory_t llama_seq_id], :llama_pos
+     attach_function :llama_memory_can_shift, [:llama_memory_t], :bool
+
+     # State / sessions
+     attach_function :llama_state_get_size, [:llama_context_p], :size_t
+     attach_function :llama_state_get_data, %i[llama_context_p pointer size_t], :size_t
+     attach_function :llama_state_set_data, %i[llama_context_p pointer size_t], :size_t
+     attach_function :llama_state_load_file, %i[llama_context_p string pointer size_t pointer], :bool
+     attach_function :llama_state_save_file, %i[llama_context_p string pointer size_t], :bool
+     attach_function :llama_state_seq_get_size, %i[llama_context_p llama_seq_id], :size_t
+     attach_function :llama_state_seq_get_data, %i[llama_context_p pointer size_t llama_seq_id], :size_t
+     attach_function :llama_state_seq_set_data, %i[llama_context_p pointer size_t llama_seq_id], :size_t
+     attach_function :llama_state_seq_save_file, %i[llama_context_p string llama_seq_id pointer size_t], :size_t
+     attach_function :llama_state_seq_load_file, %i[llama_context_p string llama_seq_id pointer size_t pointer], :size_t
+     attach_function :llama_state_seq_get_size_ext, %i[llama_context_p llama_seq_id llama_state_seq_flags], :size_t
+     attach_function :llama_state_seq_get_data_ext,
+                     %i[llama_context_p pointer size_t llama_seq_id llama_state_seq_flags], :size_t
+     attach_function :llama_state_seq_set_data_ext,
+                     %i[llama_context_p pointer size_t llama_seq_id llama_state_seq_flags], :size_t
+
+     # Decoding
+     attach_function :llama_batch_get_one, %i[pointer int32], LlamaBatch.by_value
+     attach_function :llama_batch_init, %i[int32 int32 int32], LlamaBatch.by_value
+     attach_function :llama_batch_free, [LlamaBatch.by_value], :void
+     attach_function :llama_encode, [:llama_context_p, LlamaBatch.by_value], :int32
+     attach_function :llama_decode, [:llama_context_p, LlamaBatch.by_value], :int32
+
+     # Decoding settings
+     attach_function :llama_set_n_threads, %i[llama_context_p int32 int32], :void
+     attach_function :llama_n_threads, [:llama_context_p], :int32
+     attach_function :llama_n_threads_batch, [:llama_context_p], :int32
+     attach_function :llama_set_embeddings, %i[llama_context_p bool], :void
+     attach_function :llama_set_causal_attn, %i[llama_context_p bool], :void
+     attach_function :llama_set_warmup, %i[llama_context_p bool], :void
+     attach_function :llama_set_abort_callback, %i[llama_context_p ggml_abort_callback pointer], :void
+     attach_function :llama_synchronize, [:llama_context_p], :void
+
+     # Get results
+     attach_function :llama_get_logits, [:llama_context_p], :pointer # float*
+     attach_function :llama_get_logits_ith, %i[llama_context_p int32], :pointer # float*
+     attach_function :llama_get_embeddings, [:llama_context_p], :pointer # float*
+     attach_function :llama_get_embeddings_ith, %i[llama_context_p int32], :pointer # float*
+     attach_function :llama_get_embeddings_seq, %i[llama_context_p llama_seq_id], :pointer # float*
+
+     # Vocab utils
+     attach_function :llama_vocab_get_text, %i[llama_vocab_p llama_token], :string
+     attach_function :llama_vocab_get_score, %i[llama_vocab_p llama_token], :float
+     attach_function :llama_vocab_get_attr, %i[llama_vocab_p llama_token], :int # enum llama_token_attr
+     attach_function :llama_vocab_is_eog, %i[llama_vocab_p llama_token], :bool
+     attach_function :llama_vocab_is_control, %i[llama_vocab_p llama_token], :bool
+     attach_function :llama_vocab_bos, [:llama_vocab_p], :llama_token
+     attach_function :llama_vocab_eos, [:llama_vocab_p], :llama_token
+     attach_function :llama_vocab_eot, [:llama_vocab_p], :llama_token
+     attach_function :llama_vocab_sep, [:llama_vocab_p], :llama_token
+     attach_function :llama_vocab_nl, [:llama_vocab_p], :llama_token
+     attach_function :llama_vocab_pad, [:llama_vocab_p], :llama_token
+     attach_function :llama_vocab_mask, [:llama_vocab_p], :llama_token
+     attach_function :llama_vocab_get_add_bos, [:llama_vocab_p], :bool
+     attach_function :llama_vocab_get_add_eos, [:llama_vocab_p], :bool
+     attach_function :llama_vocab_get_add_sep, [:llama_vocab_p], :bool
+     attach_function :llama_vocab_fim_pre, [:llama_vocab_p], :llama_token
+     attach_function :llama_vocab_fim_suf, [:llama_vocab_p], :llama_token
+     attach_function :llama_vocab_fim_mid, [:llama_vocab_p], :llama_token
+     attach_function :llama_vocab_fim_pad, [:llama_vocab_p], :llama_token
+     attach_function :llama_vocab_fim_rep, [:llama_vocab_p], :llama_token
+     attach_function :llama_vocab_fim_sep, [:llama_vocab_p], :llama_token
+     attach_function :llama_vocab_cls, [:llama_vocab_p], :llama_token # DEPRECATED
+
+     # Tokenization
+     attach_function :llama_tokenize, %i[llama_vocab_p string int32 pointer int32 bool bool], :int32
+     attach_function :llama_token_to_piece, %i[llama_vocab_p llama_token pointer int32 int32 bool], :int32
+     attach_function :llama_detokenize, %i[llama_vocab_p pointer int32 pointer int32 bool bool], :int32
+
+     # Chat templates
+     attach_function :llama_chat_apply_template, %i[string pointer size_t bool pointer int32], :int32
+     attach_function :llama_chat_builtin_templates, %i[pointer size_t], :int32
+
+     # Sampling API
+     attach_function :llama_sampler_init, %i[pointer llama_sampler_context_t], :llama_sampler_p
+     attach_function :llama_sampler_name, [:llama_sampler_p], :string
+     attach_function :llama_sampler_accept, %i[llama_sampler_p llama_token], :void
+     attach_function :llama_sampler_apply, %i[llama_sampler_p pointer], :void
+     attach_function :llama_sampler_reset, [:llama_sampler_p], :void
+     attach_function :llama_sampler_clone, [:llama_sampler_p], :llama_sampler_p
+     attach_function :llama_sampler_free, [:llama_sampler_p], :void
+
+     # Sampler chain
+     attach_function :llama_sampler_chain_init, [LlamaSamplerChainParams.by_value], :llama_sampler_p
+     attach_function :llama_sampler_chain_add, %i[llama_sampler_p llama_sampler_p], :void
+     attach_function :llama_sampler_chain_get, %i[llama_sampler_p int32], :llama_sampler_p
+     attach_function :llama_sampler_chain_n, [:llama_sampler_p], :int
+     attach_function :llama_sampler_chain_remove, %i[llama_sampler_p int32], :llama_sampler_p
+
+     # Built-in samplers
+     attach_function :llama_sampler_init_greedy, [], :llama_sampler_p
+     attach_function :llama_sampler_init_dist, [:uint32], :llama_sampler_p
+     attach_function :llama_sampler_init_top_k, [:int32], :llama_sampler_p
+     attach_function :llama_sampler_init_top_p, %i[float size_t], :llama_sampler_p
+     attach_function :llama_sampler_init_min_p, %i[float size_t], :llama_sampler_p
+     attach_function :llama_sampler_init_typical, %i[float size_t], :llama_sampler_p
+     attach_function :llama_sampler_init_temp, [:float], :llama_sampler_p
+     attach_function :llama_sampler_init_temp_ext, %i[float float float], :llama_sampler_p
+     attach_function :llama_sampler_init_xtc, %i[float float size_t uint32], :llama_sampler_p
+     attach_function :llama_sampler_init_top_n_sigma, [:float], :llama_sampler_p
+     attach_function :llama_sampler_init_mirostat, %i[int32 uint32 float float int32], :llama_sampler_p
+     attach_function :llama_sampler_init_mirostat_v2, %i[uint32 float float], :llama_sampler_p
+     attach_function :llama_sampler_init_grammar, %i[llama_vocab_p string string], :llama_sampler_p
+     attach_function :llama_sampler_init_grammar_lazy_patterns,
+                     %i[llama_vocab_p string string pointer size_t pointer size_t], :llama_sampler_p
+     attach_function :llama_sampler_init_penalties, %i[int32 float float float], :llama_sampler_p
+     attach_function :llama_sampler_init_dry, %i[llama_vocab_p int32 float float int32 int32 pointer size_t],
+                     :llama_sampler_p
+     attach_function :llama_sampler_init_logit_bias, %i[int32 int32 pointer], :llama_sampler_p
+     attach_function :llama_sampler_init_infill, [:llama_vocab_p], :llama_sampler_p
+     attach_function :llama_sampler_get_seed, [:llama_sampler_p], :uint32
+     attach_function :llama_sampler_sample, %i[llama_sampler_p llama_context_p int32], :llama_token
+
+     # Model split
+     attach_function :llama_split_path, %i[pointer size_t string int int], :int
+     attach_function :llama_split_prefix, %i[pointer size_t string int int], :int
+
+     # Logging
+     attach_function :llama_print_system_info, [], :string
+     attach_function :llama_log_set, %i[ggml_log_callback pointer], :void
+
+     # Performance utils
+     attach_function :llama_perf_context, [:llama_context_p], LlamaPerfContextData.by_value
+     attach_function :llama_perf_context_print, [:llama_context_p], :void
+     attach_function :llama_perf_context_reset, [:llama_context_p], :void
+     attach_function :llama_perf_sampler, [:llama_sampler_p], LlamaPerfSamplerData.by_value
+     attach_function :llama_perf_sampler_print, [:llama_sampler_p], :void
+     attach_function :llama_perf_sampler_reset, [:llama_sampler_p], :void
+
+     # Training
+     attach_function :llama_opt_param_filter_all, %i[pointer pointer], :bool
+     attach_function :llama_opt_init, [:llama_context_p, :llama_model_p, LlamaOptParams.by_value], :void
+     attach_function :llama_opt_epoch, %i[llama_context_p pointer pointer pointer int64 pointer pointer], :void
+
+     SILENCE_LOG_CALLBACK = FFI::Function.new(:void, %i[int string pointer], proc {})
+
+     module_function
+
+     def silence_log!
+       llama_log_set(SILENCE_LOG_CALLBACK, nil)
+     end
+
+     def set_log(io = $stdout)
+       @log_callback = FFI::Function.new(:void, %i[int string pointer]) { |_level, msg, _ud| io << msg }
+
+       llama_log_set(@log_callback, nil)
+     end
+
+     llama_backend_init
+
+     silence_log!
+
+     freeze
+   end
+ end
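
As a quick orientation for readers of this diff, here is a minimal usage sketch of the bindings defined above. It is not part of the gem: the require path, the model path, and the buffer sizes are illustrative assumptions, and error handling is omitted. It loads a GGUF model, tokenizes a prompt, decodes it in one batch, greedily samples a token, and converts it back to text.

# A minimal sketch, assuming this file is loadable as 'rllama/cpp' and a
# local GGUF model exists at the placeholder path below.
require 'rllama/cpp'

model = Rllama::Cpp.llama_model_load_from_file('/path/to/model.gguf',
                                               Rllama::Cpp.llama_model_default_params)
ctx   = Rllama::Cpp.llama_init_from_model(model, Rllama::Cpp.llama_context_default_params)
vocab = Rllama::Cpp.llama_model_get_vocab(model)

# llama_tokenize fills a caller-provided int32 buffer and returns the token
# count (negative if the buffer was too small).
prompt = 'Hello'
buf = FFI::MemoryPointer.new(:int32, 32)
n_tokens = Rllama::Cpp.llama_tokenize(vocab, prompt, prompt.bytesize, buf, 32, true, false)

# Decode the prompt as a single batch, then sample greedily from the last
# position's logits (index -1).
batch = Rllama::Cpp.llama_batch_get_one(buf, n_tokens)
Rllama::Cpp.llama_decode(ctx, batch)

chain = Rllama::Cpp.llama_sampler_chain_init(Rllama::Cpp.llama_sampler_chain_default_params)
Rllama::Cpp.llama_sampler_chain_add(chain, Rllama::Cpp.llama_sampler_init_greedy)
token = Rllama::Cpp.llama_sampler_sample(chain, ctx, -1)

# Convert the sampled token id back to text.
piece = FFI::MemoryPointer.new(:char, 128)
len = Rllama::Cpp.llama_token_to_piece(vocab, token, piece, 128, 0, true)
puts piece.read_string(len)

# Free native resources in reverse order of creation.
Rllama::Cpp.llama_sampler_free(chain)
Rllama::Cpp.llama_free(ctx)
Rllama::Cpp.llama_model_free(model)

Note that the module calls llama_backend_init and silence_log! at require time (see the bottom of cpp.rb), so no explicit backend setup is needed; Rllama::Cpp.set_log($stderr) re-enables llama.cpp log output.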