@fugood/llama.node 0.3.2 → 0.3.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (286)
  1. package/CMakeLists.txt +7 -0
  2. package/bin/darwin/arm64/llama-node.node +0 -0
  3. package/bin/darwin/x64/llama-node.node +0 -0
  4. package/bin/linux/arm64/llama-node.node +0 -0
  5. package/bin/linux/x64/llama-node.node +0 -0
  6. package/bin/linux-vulkan/arm64/llama-node.node +0 -0
  7. package/bin/linux-vulkan/x64/llama-node.node +0 -0
  8. package/bin/win32/arm64/llama-node.node +0 -0
  9. package/bin/win32/arm64/node.lib +0 -0
  10. package/bin/win32/x64/llama-node.node +0 -0
  11. package/bin/win32/x64/node.lib +0 -0
  12. package/bin/win32-vulkan/arm64/llama-node.node +0 -0
  13. package/bin/win32-vulkan/arm64/node.lib +0 -0
  14. package/bin/win32-vulkan/x64/llama-node.node +0 -0
  15. package/bin/win32-vulkan/x64/node.lib +0 -0
  16. package/lib/binding.ts +18 -1
  17. package/package.json +1 -1
  18. package/src/DetokenizeWorker.cpp +1 -1
  19. package/src/EmbeddingWorker.cpp +17 -7
  20. package/src/EmbeddingWorker.h +2 -1
  21. package/src/LlamaCompletionWorker.cpp +8 -8
  22. package/src/LlamaCompletionWorker.h +2 -2
  23. package/src/LlamaContext.cpp +89 -27
  24. package/src/LlamaContext.h +2 -0
  25. package/src/TokenizeWorker.cpp +1 -1
  26. package/src/common.hpp +4 -4
  27. package/src/llama.cpp/.github/workflows/build.yml +240 -168
  28. package/src/llama.cpp/.github/workflows/docker.yml +8 -8
  29. package/src/llama.cpp/.github/workflows/python-lint.yml +8 -1
  30. package/src/llama.cpp/.github/workflows/server.yml +21 -14
  31. package/src/llama.cpp/CMakeLists.txt +14 -6
  32. package/src/llama.cpp/Sources/llama/llama.h +4 -0
  33. package/src/llama.cpp/cmake/arm64-apple-clang.cmake +16 -0
  34. package/src/llama.cpp/cmake/common.cmake +33 -0
  35. package/src/llama.cpp/cmake/x64-windows-llvm.cmake +11 -0
  36. package/src/llama.cpp/common/CMakeLists.txt +6 -4
  37. package/src/llama.cpp/common/arg.cpp +986 -770
  38. package/src/llama.cpp/common/arg.h +22 -22
  39. package/src/llama.cpp/common/common.cpp +212 -351
  40. package/src/llama.cpp/common/common.h +204 -117
  41. package/src/llama.cpp/common/json-schema-to-grammar.cpp +1 -1
  42. package/src/llama.cpp/common/log.cpp +50 -50
  43. package/src/llama.cpp/common/log.h +18 -18
  44. package/src/llama.cpp/common/ngram-cache.cpp +36 -36
  45. package/src/llama.cpp/common/ngram-cache.h +19 -19
  46. package/src/llama.cpp/common/sampling.cpp +163 -121
  47. package/src/llama.cpp/common/sampling.h +41 -20
  48. package/src/llama.cpp/common/speculative.cpp +274 -0
  49. package/src/llama.cpp/common/speculative.h +28 -0
  50. package/src/llama.cpp/docs/build.md +134 -161
  51. package/src/llama.cpp/examples/CMakeLists.txt +33 -14
  52. package/src/llama.cpp/examples/batched/CMakeLists.txt +1 -1
  53. package/src/llama.cpp/examples/batched/batched.cpp +19 -18
  54. package/src/llama.cpp/examples/batched-bench/CMakeLists.txt +1 -1
  55. package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +10 -11
  56. package/src/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +1 -1
  57. package/src/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +1 -1
  58. package/src/llama.cpp/examples/cvector-generator/CMakeLists.txt +1 -1
  59. package/src/llama.cpp/examples/cvector-generator/cvector-generator.cpp +9 -9
  60. package/src/llama.cpp/examples/deprecation-warning/deprecation-warning.cpp +1 -1
  61. package/src/llama.cpp/examples/embedding/CMakeLists.txt +1 -1
  62. package/src/llama.cpp/examples/embedding/embedding.cpp +12 -12
  63. package/src/llama.cpp/examples/eval-callback/CMakeLists.txt +3 -2
  64. package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +8 -8
  65. package/src/llama.cpp/examples/export-lora/CMakeLists.txt +1 -1
  66. package/src/llama.cpp/examples/export-lora/export-lora.cpp +5 -5
  67. package/src/llama.cpp/examples/gbnf-validator/CMakeLists.txt +1 -1
  68. package/src/llama.cpp/examples/gbnf-validator/gbnf-validator.cpp +4 -7
  69. package/src/llama.cpp/examples/gen-docs/CMakeLists.txt +1 -1
  70. package/src/llama.cpp/examples/gen-docs/gen-docs.cpp +7 -7
  71. package/src/llama.cpp/examples/gguf/CMakeLists.txt +1 -1
  72. package/src/llama.cpp/examples/gguf-hash/CMakeLists.txt +8 -1
  73. package/src/llama.cpp/examples/gguf-split/CMakeLists.txt +1 -1
  74. package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +2 -2
  75. package/src/llama.cpp/examples/gritlm/CMakeLists.txt +1 -1
  76. package/src/llama.cpp/examples/gritlm/gritlm.cpp +18 -18
  77. package/src/llama.cpp/examples/imatrix/CMakeLists.txt +1 -1
  78. package/src/llama.cpp/examples/imatrix/imatrix.cpp +31 -13
  79. package/src/llama.cpp/examples/infill/CMakeLists.txt +1 -1
  80. package/src/llama.cpp/examples/infill/infill.cpp +41 -87
  81. package/src/llama.cpp/examples/llama-bench/CMakeLists.txt +1 -1
  82. package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +439 -459
  83. package/src/llama.cpp/examples/llama.android/llama/build.gradle.kts +2 -0
  84. package/src/llama.cpp/examples/llama.android/llama/src/main/cpp/llama-android.cpp +11 -14
  85. package/src/llama.cpp/examples/llava/CMakeLists.txt +10 -3
  86. package/src/llama.cpp/examples/llava/clip.cpp +263 -66
  87. package/src/llama.cpp/examples/llava/clip.h +8 -2
  88. package/src/llama.cpp/examples/llava/llava-cli.cpp +23 -23
  89. package/src/llama.cpp/examples/llava/llava.cpp +83 -22
  90. package/src/llama.cpp/examples/llava/minicpmv-cli.cpp +21 -21
  91. package/src/llama.cpp/examples/llava/qwen2vl-cli.cpp +581 -0
  92. package/src/llama.cpp/examples/lookahead/CMakeLists.txt +1 -1
  93. package/src/llama.cpp/examples/lookahead/lookahead.cpp +26 -26
  94. package/src/llama.cpp/examples/lookup/CMakeLists.txt +4 -4
  95. package/src/llama.cpp/examples/lookup/lookup-create.cpp +7 -7
  96. package/src/llama.cpp/examples/lookup/lookup-merge.cpp +4 -4
  97. package/src/llama.cpp/examples/lookup/lookup-stats.cpp +16 -15
  98. package/src/llama.cpp/examples/lookup/lookup.cpp +30 -30
  99. package/src/llama.cpp/examples/main/CMakeLists.txt +1 -1
  100. package/src/llama.cpp/examples/main/main.cpp +73 -114
  101. package/src/llama.cpp/examples/main-cmake-pkg/CMakeLists.txt +1 -1
  102. package/src/llama.cpp/examples/parallel/CMakeLists.txt +1 -1
  103. package/src/llama.cpp/examples/parallel/parallel.cpp +18 -19
  104. package/src/llama.cpp/examples/passkey/CMakeLists.txt +1 -1
  105. package/src/llama.cpp/examples/passkey/passkey.cpp +14 -14
  106. package/src/llama.cpp/examples/perplexity/CMakeLists.txt +1 -1
  107. package/src/llama.cpp/examples/perplexity/perplexity.cpp +99 -120
  108. package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
  109. package/src/llama.cpp/examples/quantize/quantize.cpp +0 -3
  110. package/src/llama.cpp/examples/quantize-stats/CMakeLists.txt +1 -1
  111. package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +10 -9
  112. package/src/llama.cpp/examples/retrieval/CMakeLists.txt +1 -1
  113. package/src/llama.cpp/examples/retrieval/retrieval.cpp +16 -16
  114. package/src/llama.cpp/examples/rpc/rpc-server.cpp +3 -1
  115. package/src/llama.cpp/examples/run/CMakeLists.txt +5 -0
  116. package/src/llama.cpp/examples/run/run.cpp +911 -0
  117. package/src/llama.cpp/examples/save-load-state/CMakeLists.txt +1 -1
  118. package/src/llama.cpp/examples/save-load-state/save-load-state.cpp +38 -21
  119. package/src/llama.cpp/examples/server/CMakeLists.txt +3 -16
  120. package/src/llama.cpp/examples/server/server.cpp +2073 -1339
  121. package/src/llama.cpp/examples/server/tests/requirements.txt +2 -2
  122. package/src/llama.cpp/examples/server/utils.hpp +354 -277
  123. package/src/llama.cpp/examples/simple/CMakeLists.txt +2 -2
  124. package/src/llama.cpp/examples/simple/simple.cpp +130 -94
  125. package/src/llama.cpp/examples/simple-chat/CMakeLists.txt +5 -0
  126. package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +200 -0
  127. package/src/llama.cpp/examples/speculative/CMakeLists.txt +1 -1
  128. package/src/llama.cpp/examples/speculative/speculative.cpp +68 -64
  129. package/src/llama.cpp/examples/speculative-simple/CMakeLists.txt +5 -0
  130. package/src/llama.cpp/examples/speculative-simple/speculative-simple.cpp +265 -0
  131. package/src/llama.cpp/examples/tokenize/CMakeLists.txt +1 -1
  132. package/src/llama.cpp/examples/tokenize/tokenize.cpp +3 -3
  133. package/src/llama.cpp/examples/tts/CMakeLists.txt +5 -0
  134. package/src/llama.cpp/examples/tts/tts.cpp +932 -0
  135. package/src/llama.cpp/ggml/CMakeLists.txt +54 -36
  136. package/src/llama.cpp/ggml/include/ggml-backend.h +63 -34
  137. package/src/llama.cpp/ggml/include/ggml-blas.h +5 -3
  138. package/src/llama.cpp/ggml/include/ggml-cann.h +9 -7
  139. package/src/llama.cpp/ggml/include/ggml-cpp.h +38 -0
  140. package/src/llama.cpp/ggml/include/ggml-cpu.h +135 -0
  141. package/src/llama.cpp/ggml/include/ggml-cuda.h +12 -12
  142. package/src/llama.cpp/ggml/include/ggml-kompute.h +7 -3
  143. package/src/llama.cpp/ggml/include/ggml-metal.h +11 -7
  144. package/src/llama.cpp/ggml/include/ggml-opencl.h +26 -0
  145. package/src/llama.cpp/ggml/include/ggml-opt.h +216 -0
  146. package/src/llama.cpp/ggml/include/ggml-rpc.h +9 -5
  147. package/src/llama.cpp/ggml/include/ggml-sycl.h +18 -11
  148. package/src/llama.cpp/ggml/include/ggml-vulkan.h +10 -8
  149. package/src/llama.cpp/ggml/include/ggml.h +159 -417
  150. package/src/llama.cpp/ggml/src/CMakeLists.txt +121 -1155
  151. package/src/llama.cpp/ggml/src/ggml-alloc.c +23 -28
  152. package/src/llama.cpp/ggml/src/ggml-backend-impl.h +57 -36
  153. package/src/llama.cpp/ggml/src/ggml-backend-reg.cpp +552 -0
  154. package/src/llama.cpp/ggml/src/ggml-backend.cpp +306 -867
  155. package/src/llama.cpp/ggml/src/ggml-blas/CMakeLists.txt +87 -0
  156. package/src/llama.cpp/ggml/src/{ggml-blas.cpp → ggml-blas/ggml-blas.cpp} +216 -65
  157. package/src/llama.cpp/ggml/src/ggml-cann/CMakeLists.txt +76 -0
  158. package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +456 -111
  159. package/src/llama.cpp/ggml/src/ggml-cann/common.h +6 -3
  160. package/src/llama.cpp/ggml/src/{ggml-cann.cpp → ggml-cann/ggml-cann.cpp} +343 -177
  161. package/src/llama.cpp/ggml/src/ggml-cann/kernels/CMakeLists.txt +2 -5
  162. package/src/llama.cpp/ggml/src/ggml-cann/kernels/dup.cpp +22 -9
  163. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f16.cpp +24 -13
  164. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_f32.cpp +23 -13
  165. package/src/llama.cpp/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +11 -0
  166. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +10 -0
  167. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +10 -0
  168. package/src/llama.cpp/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +17 -0
  169. package/src/llama.cpp/ggml/src/ggml-common.h +42 -42
  170. package/src/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt +336 -0
  171. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.cpp +220 -0
  172. package/src/llama.cpp/ggml/src/ggml-cpu/amx/amx.h +8 -0
  173. package/src/llama.cpp/ggml/src/ggml-cpu/amx/common.h +91 -0
  174. package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.cpp +2511 -0
  175. package/src/llama.cpp/ggml/src/ggml-cpu/amx/mmq.h +10 -0
  176. package/src/llama.cpp/ggml/src/ggml-cpu/cpu-feats-x86.cpp +323 -0
  177. package/src/llama.cpp/ggml/src/{ggml-aarch64.c → ggml-cpu/ggml-cpu-aarch64.cpp} +1299 -246
  178. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-aarch64.h +8 -0
  179. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.cpp +55 -0
  180. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-hbm.h +8 -0
  181. package/src/llama.cpp/ggml/src/{ggml-cpu-impl.h → ggml-cpu/ggml-cpu-impl.h} +14 -242
  182. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.c +10835 -0
  183. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-quants.h +63 -0
  184. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.cpp +36 -0
  185. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu-traits.h +38 -0
  186. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +14123 -0
  187. package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.cpp +628 -0
  188. package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.cpp +666 -0
  189. package/src/llama.cpp/ggml/src/ggml-cuda/CMakeLists.txt +152 -0
  190. package/src/llama.cpp/ggml/src/ggml-cuda/vendors/hip.h +8 -0
  191. package/src/llama.cpp/ggml/src/ggml-hip/CMakeLists.txt +104 -0
  192. package/src/llama.cpp/ggml/src/ggml-impl.h +393 -22
  193. package/src/llama.cpp/ggml/src/ggml-kompute/CMakeLists.txt +166 -0
  194. package/src/llama.cpp/ggml/src/{ggml-kompute.cpp → ggml-kompute/ggml-kompute.cpp} +360 -127
  195. package/src/llama.cpp/ggml/src/ggml-metal/CMakeLists.txt +105 -0
  196. package/src/llama.cpp/ggml/src/ggml-metal/ggml-metal-impl.h +288 -0
  197. package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +107 -0
  198. package/src/llama.cpp/ggml/src/ggml-opencl/CMakeLists.txt +147 -0
  199. package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +4004 -0
  200. package/src/llama.cpp/ggml/src/ggml-opt.cpp +854 -0
  201. package/src/llama.cpp/ggml/src/ggml-quants.c +188 -10702
  202. package/src/llama.cpp/ggml/src/ggml-quants.h +78 -125
  203. package/src/llama.cpp/ggml/src/ggml-rpc/CMakeLists.txt +9 -0
  204. package/src/llama.cpp/ggml/src/{ggml-rpc.cpp → ggml-rpc/ggml-rpc.cpp} +478 -300
  205. package/src/llama.cpp/ggml/src/ggml-sycl/CMakeLists.txt +84 -0
  206. package/src/llama.cpp/ggml/src/ggml-sycl/backend.hpp +3 -0
  207. package/src/llama.cpp/ggml/src/ggml-sycl/common.cpp +36 -5
  208. package/src/llama.cpp/ggml/src/ggml-sycl/common.hpp +259 -0
  209. package/src/llama.cpp/ggml/src/ggml-sycl/concat.cpp +3 -2
  210. package/src/llama.cpp/ggml/src/ggml-sycl/convert.cpp +1 -1
  211. package/src/llama.cpp/ggml/src/ggml-sycl/dmmv.cpp +5 -5
  212. package/src/llama.cpp/ggml/src/ggml-sycl/dpct/helper.hpp +34 -35
  213. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.cpp +1030 -0
  214. package/src/llama.cpp/ggml/src/ggml-sycl/element_wise.hpp +76 -0
  215. package/src/llama.cpp/ggml/src/ggml-sycl/gemm.hpp +4 -4
  216. package/src/llama.cpp/ggml/src/{ggml-sycl.cpp → ggml-sycl/ggml-sycl.cpp} +3638 -4151
  217. package/src/llama.cpp/ggml/src/ggml-sycl/im2col.cpp +3 -2
  218. package/src/llama.cpp/ggml/src/ggml-sycl/mmq.cpp +6 -6
  219. package/src/llama.cpp/ggml/src/ggml-sycl/mmvq.cpp +75 -87
  220. package/src/llama.cpp/ggml/src/ggml-sycl/norm.cpp +7 -6
  221. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.cpp +56 -0
  222. package/src/llama.cpp/ggml/src/ggml-sycl/outprod.hpp +11 -0
  223. package/src/llama.cpp/ggml/src/ggml-sycl/presets.hpp +6 -0
  224. package/src/llama.cpp/ggml/src/ggml-sycl/rope.cpp +4 -3
  225. package/src/llama.cpp/ggml/src/ggml-sycl/softmax.cpp +7 -7
  226. package/src/llama.cpp/ggml/src/ggml-sycl/tsembd.cpp +1 -0
  227. package/src/llama.cpp/ggml/src/ggml-sycl/vecdotq.hpp +4 -4
  228. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.cpp +141 -0
  229. package/src/llama.cpp/ggml/src/ggml-sycl/wkv6.hpp +10 -0
  230. package/src/llama.cpp/ggml/src/ggml-threading.cpp +12 -0
  231. package/src/llama.cpp/ggml/src/ggml-threading.h +14 -0
  232. package/src/llama.cpp/ggml/src/ggml-vulkan/CMakeLists.txt +92 -0
  233. package/src/llama.cpp/ggml/src/{ggml-vulkan.cpp → ggml-vulkan/ggml-vulkan.cpp} +2138 -887
  234. package/src/llama.cpp/ggml/src/{vulkan-shaders → ggml-vulkan/vulkan-shaders}/CMakeLists.txt +3 -1
  235. package/src/llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +593 -0
  236. package/src/llama.cpp/ggml/src/ggml.c +4427 -20125
  237. package/src/llama.cpp/include/llama-cpp.h +25 -0
  238. package/src/llama.cpp/include/llama.h +93 -52
  239. package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.inp +112 -0
  240. package/src/llama.cpp/models/ggml-vocab-roberta-bpe.gguf.out +46 -0
  241. package/src/llama.cpp/pocs/CMakeLists.txt +3 -1
  242. package/src/llama.cpp/pocs/vdot/CMakeLists.txt +2 -2
  243. package/src/llama.cpp/pocs/vdot/q8dot.cpp +4 -3
  244. package/src/llama.cpp/pocs/vdot/vdot.cpp +8 -7
  245. package/src/llama.cpp/src/CMakeLists.txt +4 -8
  246. package/src/llama.cpp/src/llama-grammar.cpp +15 -15
  247. package/src/llama.cpp/src/llama-grammar.h +2 -5
  248. package/src/llama.cpp/src/llama-sampling.cpp +779 -194
  249. package/src/llama.cpp/src/llama-sampling.h +21 -2
  250. package/src/llama.cpp/src/llama-vocab.cpp +55 -10
  251. package/src/llama.cpp/src/llama-vocab.h +35 -11
  252. package/src/llama.cpp/src/llama.cpp +4317 -2979
  253. package/src/llama.cpp/src/unicode-data.cpp +2 -2
  254. package/src/llama.cpp/src/unicode.cpp +62 -51
  255. package/src/llama.cpp/src/unicode.h +9 -10
  256. package/src/llama.cpp/tests/CMakeLists.txt +48 -38
  257. package/src/llama.cpp/tests/test-arg-parser.cpp +15 -15
  258. package/src/llama.cpp/tests/test-backend-ops.cpp +324 -80
  259. package/src/llama.cpp/tests/test-barrier.cpp +1 -0
  260. package/src/llama.cpp/tests/test-chat-template.cpp +59 -9
  261. package/src/llama.cpp/tests/test-gguf.cpp +1303 -0
  262. package/src/llama.cpp/tests/test-grammar-integration.cpp +3 -6
  263. package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +17 -4
  264. package/src/llama.cpp/tests/test-llama-grammar.cpp +2 -4
  265. package/src/llama.cpp/tests/test-log.cpp +2 -2
  266. package/src/llama.cpp/tests/test-opt.cpp +853 -142
  267. package/src/llama.cpp/tests/test-quantize-fns.cpp +24 -21
  268. package/src/llama.cpp/tests/test-quantize-perf.cpp +16 -14
  269. package/src/llama.cpp/tests/test-rope.cpp +62 -20
  270. package/src/llama.cpp/tests/test-sampling.cpp +163 -138
  271. package/src/llama.cpp/tests/test-tokenizer-0.cpp +7 -7
  272. package/src/llama.cpp/tests/test-tokenizer-1-bpe.cpp +5 -5
  273. package/src/llama.cpp/tests/test-tokenizer-1-spm.cpp +5 -5
  274. package/src/llama.cpp/.github/workflows/nix-ci-aarch64.yml +0 -72
  275. package/src/llama.cpp/.github/workflows/nix-ci.yml +0 -79
  276. package/src/llama.cpp/.github/workflows/nix-flake-update.yml +0 -22
  277. package/src/llama.cpp/.github/workflows/nix-publish-flake.yml +0 -36
  278. package/src/llama.cpp/common/train.cpp +0 -1515
  279. package/src/llama.cpp/common/train.h +0 -233
  280. package/src/llama.cpp/examples/baby-llama/CMakeLists.txt +0 -5
  281. package/src/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -1639
  282. package/src/llama.cpp/ggml/src/ggml-aarch64.h +0 -39
  283. package/src/llama.cpp/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +0 -600
  284. package/src/llama.cpp/tests/test-grad0.cpp +0 -1683
  285. /package/src/llama.cpp/ggml/{cmake → src/ggml-cpu/cmake}/FindSIMD.cmake +0 -0
  286. /package/src/llama.cpp/ggml/src/{llamafile → ggml-cpu/llamafile}/sgemm.h +0 -0
package/src/llama.cpp/examples/run/run.cpp (new file)
@@ -0,0 +1,911 @@
+ #if defined(_WIN32)
+ # include <windows.h>
+ #else
+ # include <sys/file.h>
+ # include <sys/ioctl.h>
+ # include <unistd.h>
+ #endif
+
+ #if defined(LLAMA_USE_CURL)
+ # include <curl/curl.h>
+ #endif
+
+ #include <climits>
+ #include <cstdarg>
+ #include <cstdio>
+ #include <cstring>
+ #include <filesystem>
+ #include <iostream>
+ #include <sstream>
+ #include <string>
+ #include <vector>
+
+ #include "common.h"
+ #include "json.hpp"
+ #include "llama-cpp.h"
+
+ GGML_ATTRIBUTE_FORMAT(1, 2)
+ static std::string fmt(const char * fmt, ...) {
+     va_list ap;
+     va_list ap2;
+     va_start(ap, fmt);
+     va_copy(ap2, ap);
+     const int size = vsnprintf(NULL, 0, fmt, ap);
+     GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT
+     std::string buf;
+     buf.resize(size);
+     const int size2 = vsnprintf(const_cast<char *>(buf.data()), buf.size() + 1, fmt, ap2);
+     GGML_ASSERT(size2 == size);
+     va_end(ap2);
+     va_end(ap);
+
+     return buf;
+ }
+
+ GGML_ATTRIBUTE_FORMAT(1, 2)
+ static int printe(const char * fmt, ...) {
+     va_list args;
+     va_start(args, fmt);
+     const int ret = vfprintf(stderr, fmt, args);
+     va_end(args);
+
+     return ret;
+ }
+
+ class Opt {
+   public:
+     int init(int argc, const char ** argv) {
+         // Parse arguments
+         if (parse(argc, argv)) {
+             printe("Error: Failed to parse arguments.\n");
+             help();
+             return 1;
+         }
+
+         // If help is requested, show help and exit
+         if (help_) {
+             help();
+             return 2;
+         }
+
+         return 0; // Success
+     }
+
+     std::string model_;
+     std::string user_;
+     int context_size_ = -1, ngl_ = -1;
+     bool verbose_ = false;
+
+   private:
+     bool help_ = false;
+
+     bool parse_flag(const char ** argv, int i, const char * short_opt, const char * long_opt) {
+         return strcmp(argv[i], short_opt) == 0 || strcmp(argv[i], long_opt) == 0;
+     }
+
+     int handle_option_with_value(int argc, const char ** argv, int & i, int & option_value) {
+         if (i + 1 >= argc) {
+             return 1;
+         }
+
+         option_value = std::atoi(argv[++i]);
+         return 0;
+     }
+
+     int parse(int argc, const char ** argv) {
+         bool options_parsing = true;
+         for (int i = 1, positional_args_i = 0; i < argc; ++i) {
+             if (options_parsing && (strcmp(argv[i], "-c") == 0 || strcmp(argv[i], "--context-size") == 0)) {
+                 if (handle_option_with_value(argc, argv, i, context_size_) == 1) {
+                     return 1;
+                 }
+             } else if (options_parsing && (strcmp(argv[i], "-n") == 0 || strcmp(argv[i], "--ngl") == 0)) {
+                 if (handle_option_with_value(argc, argv, i, ngl_) == 1) {
+                     return 1;
+                 }
+             } else if (options_parsing &&
+                        (parse_flag(argv, i, "-v", "--verbose") || parse_flag(argv, i, "-v", "--log-verbose"))) {
+                 verbose_ = true;
+             } else if (options_parsing && parse_flag(argv, i, "-h", "--help")) {
+                 help_ = true;
+                 return 0;
+             } else if (options_parsing && strcmp(argv[i], "--") == 0) {
+                 options_parsing = false;
+             } else if (positional_args_i == 0) {
+                 if (!argv[i][0] || argv[i][0] == '-') {
+                     return 1;
+                 }
+
+                 ++positional_args_i;
+                 model_ = argv[i];
+             } else if (positional_args_i == 1) {
+                 ++positional_args_i;
+                 user_ = argv[i];
+             } else {
+                 user_ += " " + std::string(argv[i]);
+             }
+         }
+
+         return 0;
+     }
+
+     void help() const {
+         printf(
+             "Description:\n"
+             " Runs a llm\n"
+             "\n"
+             "Usage:\n"
+             " llama-run [options] model [prompt]\n"
+             "\n"
+             "Options:\n"
+             " -c, --context-size <value>\n"
+             " Context size (default: %d)\n"
+             " -n, --ngl <value>\n"
+             " Number of GPU layers (default: %d)\n"
+             " -v, --verbose, --log-verbose\n"
+             " Set verbosity level to infinity (i.e. log all messages, useful for debugging)\n"
+             " -h, --help\n"
+             " Show help message\n"
+             "\n"
+             "Commands:\n"
+             " model\n"
+             " Model is a string with an optional prefix of \n"
+             " huggingface:// (hf://), ollama://, https:// or file://.\n"
+             " If no protocol is specified and a file exists in the specified\n"
+             " path, file:// is assumed, otherwise if a file does not exist in\n"
+             " the specified path, ollama:// is assumed. Models that are being\n"
+             " pulled are downloaded with .partial extension while being\n"
+             " downloaded and then renamed as the file without the .partial\n"
+             " extension when complete.\n"
+             "\n"
+             "Examples:\n"
+             " llama-run llama3\n"
+             " llama-run ollama://granite-code\n"
+             " llama-run ollama://smollm:135m\n"
+             " llama-run hf://QuantFactory/SmolLM-135M-GGUF/SmolLM-135M.Q2_K.gguf\n"
+             " llama-run "
+             "huggingface://bartowski/SmolLM-1.7B-Instruct-v0.2-GGUF/SmolLM-1.7B-Instruct-v0.2-IQ3_M.gguf\n"
+             " llama-run https://example.com/some-file1.gguf\n"
+             " llama-run some-file2.gguf\n"
+             " llama-run file://some-file3.gguf\n"
+             " llama-run --ngl 999 some-file4.gguf\n"
+             " llama-run --ngl 999 some-file5.gguf Hello World\n",
+             llama_context_default_params().n_batch, llama_model_default_params().n_gpu_layers);
+     }
+ };
+
+ struct progress_data {
+     size_t file_size = 0;
+     std::chrono::steady_clock::time_point start_time = std::chrono::steady_clock::now();
+     bool printed = false;
+ };
+
+ static int get_terminal_width() {
+ #if defined(_WIN32)
+     CONSOLE_SCREEN_BUFFER_INFO csbi;
+     GetConsoleScreenBufferInfo(GetStdHandle(STD_OUTPUT_HANDLE), &csbi);
+     return csbi.srWindow.Right - csbi.srWindow.Left + 1;
+ #else
+     struct winsize w;
+     ioctl(STDOUT_FILENO, TIOCGWINSZ, &w);
+     return w.ws_col;
+ #endif
+ }
+
+ #ifdef LLAMA_USE_CURL
+ class File {
+   public:
+     FILE * file = nullptr;
+
+     FILE * open(const std::string & filename, const char * mode) {
+         file = fopen(filename.c_str(), mode);
+
+         return file;
+     }
+
+     int lock() {
+         if (file) {
+ # ifdef _WIN32
+             fd = _fileno(file);
+             hFile = (HANDLE) _get_osfhandle(fd);
+             if (hFile == INVALID_HANDLE_VALUE) {
+                 fd = -1;
+
+                 return 1;
+             }
+
+             OVERLAPPED overlapped = { 0 };
+             if (!LockFileEx(hFile, LOCKFILE_EXCLUSIVE_LOCK | LOCKFILE_FAIL_IMMEDIATELY, 0, MAXDWORD, MAXDWORD,
+                             &overlapped)) {
+                 fd = -1;
+
+                 return 1;
+             }
+ # else
+             fd = fileno(file);
+             if (flock(fd, LOCK_EX | LOCK_NB) != 0) {
+                 fd = -1;
+
+                 return 1;
+             }
+ # endif
+         }
+
+         return 0;
+     }
+
+     ~File() {
+         if (fd >= 0) {
+ # ifdef _WIN32
+             if (hFile != INVALID_HANDLE_VALUE) {
+                 OVERLAPPED overlapped = { 0 };
+                 UnlockFileEx(hFile, 0, MAXDWORD, MAXDWORD, &overlapped);
+             }
+ # else
+             flock(fd, LOCK_UN);
+ # endif
+         }
+
+         if (file) {
+             fclose(file);
+         }
+     }
+
+   private:
+     int fd = -1;
+ # ifdef _WIN32
+     HANDLE hFile;
+ # endif
+ };
+
+ class HttpClient {
+   public:
+     int init(const std::string & url, const std::vector<std::string> & headers, const std::string & output_file,
+              const bool progress, std::string * response_str = nullptr) {
+         std::string output_file_partial;
+         curl = curl_easy_init();
+         if (!curl) {
+             return 1;
+         }
+
+         progress_data data;
+         File out;
+         if (!output_file.empty()) {
+             output_file_partial = output_file + ".partial";
+             if (!out.open(output_file_partial, "ab")) {
+                 printe("Failed to open file\n");
+
+                 return 1;
+             }
+
+             if (out.lock()) {
+                 printe("Failed to exclusively lock file\n");
+
+                 return 1;
+             }
+         }
+
+         set_write_options(response_str, out);
+         data.file_size = set_resume_point(output_file_partial);
+         set_progress_options(progress, data);
+         set_headers(headers);
+         perform(url);
+         if (!output_file.empty()) {
+             std::filesystem::rename(output_file_partial, output_file);
+         }
+
+         return 0;
+     }
+
+     ~HttpClient() {
+         if (chunk) {
+             curl_slist_free_all(chunk);
+         }
+
+         if (curl) {
+             curl_easy_cleanup(curl);
+         }
+     }
+
+   private:
+     CURL * curl = nullptr;
+     struct curl_slist * chunk = nullptr;
+
+     void set_write_options(std::string * response_str, const File & out) {
+         if (response_str) {
+             curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, capture_data);
+             curl_easy_setopt(curl, CURLOPT_WRITEDATA, response_str);
+         } else {
+             curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_data);
+             curl_easy_setopt(curl, CURLOPT_WRITEDATA, out.file);
+         }
+     }
+
+     size_t set_resume_point(const std::string & output_file) {
+         size_t file_size = 0;
+         if (std::filesystem::exists(output_file)) {
+             file_size = std::filesystem::file_size(output_file);
+             curl_easy_setopt(curl, CURLOPT_RESUME_FROM_LARGE, static_cast<curl_off_t>(file_size));
+         }
+
+         return file_size;
+     }
+
+     void set_progress_options(bool progress, progress_data & data) {
+         if (progress) {
+             curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 0L);
+             curl_easy_setopt(curl, CURLOPT_XFERINFODATA, &data);
+             curl_easy_setopt(curl, CURLOPT_XFERINFOFUNCTION, update_progress);
+         }
+     }
+
+     void set_headers(const std::vector<std::string> & headers) {
+         if (!headers.empty()) {
+             if (chunk) {
+                 curl_slist_free_all(chunk);
+                 chunk = 0;
+             }
+
+             for (const auto & header : headers) {
+                 chunk = curl_slist_append(chunk, header.c_str());
+             }
+
+             curl_easy_setopt(curl, CURLOPT_HTTPHEADER, chunk);
+         }
+     }
+
+     void perform(const std::string & url) {
+         CURLcode res;
+         curl_easy_setopt(curl, CURLOPT_URL, url.c_str());
+         curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
+         curl_easy_setopt(curl, CURLOPT_DEFAULT_PROTOCOL, "https");
+         curl_easy_setopt(curl, CURLOPT_FAILONERROR, 1L);
+         res = curl_easy_perform(curl);
+         if (res != CURLE_OK) {
+             printe("curl_easy_perform() failed: %s\n", curl_easy_strerror(res));
+         }
+     }
+
+     static std::string human_readable_time(double seconds) {
+         int hrs = static_cast<int>(seconds) / 3600;
+         int mins = (static_cast<int>(seconds) % 3600) / 60;
+         int secs = static_cast<int>(seconds) % 60;
+
+         if (hrs > 0) {
+             return fmt("%dh %02dm %02ds", hrs, mins, secs);
+         } else if (mins > 0) {
+             return fmt("%dm %02ds", mins, secs);
+         } else {
+             return fmt("%ds", secs);
+         }
+     }
+
+     static std::string human_readable_size(curl_off_t size) {
+         static const char * suffix[] = { "B", "KB", "MB", "GB", "TB" };
+         char length = sizeof(suffix) / sizeof(suffix[0]);
+         int i = 0;
+         double dbl_size = size;
+         if (size > 1024) {
+             for (i = 0; (size / 1024) > 0 && i < length - 1; i++, size /= 1024) {
+                 dbl_size = size / 1024.0;
+             }
+         }
+
+         return fmt("%.2f %s", dbl_size, suffix[i]);
+     }
+
+     static int update_progress(void * ptr, curl_off_t total_to_download, curl_off_t now_downloaded, curl_off_t,
+                                curl_off_t) {
+         progress_data * data = static_cast<progress_data *>(ptr);
+         if (total_to_download <= 0) {
+             return 0;
+         }
+
+         total_to_download += data->file_size;
+         const curl_off_t now_downloaded_plus_file_size = now_downloaded + data->file_size;
+         const curl_off_t percentage = calculate_percentage(now_downloaded_plus_file_size, total_to_download);
+         std::string progress_prefix = generate_progress_prefix(percentage);
+
+         const double speed = calculate_speed(now_downloaded, data->start_time);
+         const double tim = (total_to_download - now_downloaded) / speed;
+         std::string progress_suffix =
+             generate_progress_suffix(now_downloaded_plus_file_size, total_to_download, speed, tim);
+
+         int progress_bar_width = calculate_progress_bar_width(progress_prefix, progress_suffix);
+         std::string progress_bar;
+         generate_progress_bar(progress_bar_width, percentage, progress_bar);
+
+         print_progress(progress_prefix, progress_bar, progress_suffix);
+         data->printed = true;
+
+         return 0;
+     }
+
+     static curl_off_t calculate_percentage(curl_off_t now_downloaded_plus_file_size, curl_off_t total_to_download) {
+         return (now_downloaded_plus_file_size * 100) / total_to_download;
+     }
+
+     static std::string generate_progress_prefix(curl_off_t percentage) { return fmt("%3ld%% |", percentage); }
+
+     static double calculate_speed(curl_off_t now_downloaded, const std::chrono::steady_clock::time_point & start_time) {
+         const auto now = std::chrono::steady_clock::now();
+         const std::chrono::duration<double> elapsed_seconds = now - start_time;
+         return now_downloaded / elapsed_seconds.count();
+     }
+
+     static std::string generate_progress_suffix(curl_off_t now_downloaded_plus_file_size, curl_off_t total_to_download,
+                                                 double speed, double estimated_time) {
+         const int width = 10;
+         return fmt("%*s/%*s%*s/s%*s", width, human_readable_size(now_downloaded_plus_file_size).c_str(), width,
+                    human_readable_size(total_to_download).c_str(), width, human_readable_size(speed).c_str(), width,
+                    human_readable_time(estimated_time).c_str());
+     }
+
+     static int calculate_progress_bar_width(const std::string & progress_prefix, const std::string & progress_suffix) {
+         int progress_bar_width = get_terminal_width() - progress_prefix.size() - progress_suffix.size() - 3;
+         if (progress_bar_width < 1) {
+             progress_bar_width = 1;
+         }
+
+         return progress_bar_width;
+     }
+
+     static std::string generate_progress_bar(int progress_bar_width, curl_off_t percentage,
+                                              std::string & progress_bar) {
+         const curl_off_t pos = (percentage * progress_bar_width) / 100;
+         for (int i = 0; i < progress_bar_width; ++i) {
+             progress_bar.append((i < pos) ? "█" : " ");
+         }
+
+         return progress_bar;
+     }
+
+     static void print_progress(const std::string & progress_prefix, const std::string & progress_bar,
+                                const std::string & progress_suffix) {
+         printe("\r%*s\r%s%s| %s", get_terminal_width(), " ", progress_prefix.c_str(), progress_bar.c_str(),
+                progress_suffix.c_str());
+     }
+     // Function to write data to a file
+     static size_t write_data(void * ptr, size_t size, size_t nmemb, void * stream) {
+         FILE * out = static_cast<FILE *>(stream);
+         return fwrite(ptr, size, nmemb, out);
+     }
+
+     // Function to capture data into a string
+     static size_t capture_data(void * ptr, size_t size, size_t nmemb, void * stream) {
+         std::string * str = static_cast<std::string *>(stream);
+         str->append(static_cast<char *>(ptr), size * nmemb);
+         return size * nmemb;
+     }
+ };
+ #endif
+
+ class LlamaData {
+   public:
+     llama_model_ptr model;
+     llama_sampler_ptr sampler;
+     llama_context_ptr context;
+     std::vector<llama_chat_message> messages;
+     std::vector<std::string> msg_strs;
+     std::vector<char> fmtted;
+
+     int init(Opt & opt) {
+         model = initialize_model(opt);
+         if (!model) {
+             return 1;
+         }
+
+         context = initialize_context(model, opt.context_size_);
+         if (!context) {
+             return 1;
+         }
+
+         sampler = initialize_sampler();
+         return 0;
+     }
+
+   private:
+ #ifdef LLAMA_USE_CURL
+     int download(const std::string & url, const std::vector<std::string> & headers, const std::string & output_file,
+                  const bool progress, std::string * response_str = nullptr) {
+         HttpClient http;
+         if (http.init(url, headers, output_file, progress, response_str)) {
+             return 1;
+         }
+
+         return 0;
+     }
+ #else
+     int download(const std::string &, const std::vector<std::string> &, const std::string &, const bool,
+                  std::string * = nullptr) {
+         printe("%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__);
+         return 1;
+     }
+ #endif
+
+     int huggingface_dl(const std::string & model, const std::vector<std::string> headers, const std::string & bn) {
+         // Find the second occurrence of '/' after protocol string
+         size_t pos = model.find('/');
+         pos = model.find('/', pos + 1);
+         if (pos == std::string::npos) {
+             return 1;
+         }
+
+         const std::string hfr = model.substr(0, pos);
+         const std::string hff = model.substr(pos + 1);
+         const std::string url = "https://huggingface.co/" + hfr + "/resolve/main/" + hff;
+         return download(url, headers, bn, true);
+     }
+
+     int ollama_dl(std::string & model, const std::vector<std::string> headers, const std::string & bn) {
+         if (model.find('/') == std::string::npos) {
+             model = "library/" + model;
+         }
+
+         std::string model_tag = "latest";
+         size_t colon_pos = model.find(':');
+         if (colon_pos != std::string::npos) {
+             model_tag = model.substr(colon_pos + 1);
+             model = model.substr(0, colon_pos);
+         }
+
+         std::string manifest_url = "https://registry.ollama.ai/v2/" + model + "/manifests/" + model_tag;
+         std::string manifest_str;
+         const int ret = download(manifest_url, headers, "", false, &manifest_str);
+         if (ret) {
+             return ret;
+         }
+
+         nlohmann::json manifest = nlohmann::json::parse(manifest_str);
+         std::string layer;
+         for (const auto & l : manifest["layers"]) {
+             if (l["mediaType"] == "application/vnd.ollama.image.model") {
+                 layer = l["digest"];
+                 break;
+             }
+         }
+
+         std::string blob_url = "https://registry.ollama.ai/v2/" + model + "/blobs/" + layer;
+         return download(blob_url, headers, bn, true);
+     }
+
+     std::string basename(const std::string & path) {
+         const size_t pos = path.find_last_of("/\\");
+         if (pos == std::string::npos) {
+             return path;
+         }
+
+         return path.substr(pos + 1);
+     }
+
+     int remove_proto(std::string & model_) {
+         const std::string::size_type pos = model_.find("://");
+         if (pos == std::string::npos) {
+             return 1;
+         }
+
+         model_ = model_.substr(pos + 3); // Skip past "://"
+         return 0;
+     }
+
+     int resolve_model(std::string & model_) {
+         int ret = 0;
+         if (string_starts_with(model_, "file://") || std::filesystem::exists(model_)) {
+             remove_proto(model_);
+
+             return ret;
+         }
+
+         const std::string bn = basename(model_);
+         const std::vector<std::string> headers = { "--header",
+                                                    "Accept: application/vnd.docker.distribution.manifest.v2+json" };
+         if (string_starts_with(model_, "hf://") || string_starts_with(model_, "huggingface://")) {
+             remove_proto(model_);
+             ret = huggingface_dl(model_, headers, bn);
+         } else if (string_starts_with(model_, "ollama://")) {
+             remove_proto(model_);
+             ret = ollama_dl(model_, headers, bn);
+         } else if (string_starts_with(model_, "https://")) {
+             download(model_, headers, bn, true);
+         } else {
+             ret = ollama_dl(model_, headers, bn);
+         }
+
+         model_ = bn;
+
+         return ret;
+     }
+
+     // Initializes the model and returns a unique pointer to it
+     llama_model_ptr initialize_model(Opt & opt) {
+         ggml_backend_load_all();
+         llama_model_params model_params = llama_model_default_params();
+         model_params.n_gpu_layers = opt.ngl_ >= 0 ? opt.ngl_ : model_params.n_gpu_layers;
+         resolve_model(opt.model_);
+         printe(
+             "\r%*s"
+             "\rLoading model",
+             get_terminal_width(), " ");
+         llama_model_ptr model(llama_load_model_from_file(opt.model_.c_str(), model_params));
+         if (!model) {
+             printe("%s: error: unable to load model from file: %s\n", __func__, opt.model_.c_str());
+         }
+
+         printe("\r%*s\r", static_cast<int>(sizeof("Loading model")), " ");
+         return model;
+     }
+
+     // Initializes the context with the specified parameters
+     llama_context_ptr initialize_context(const llama_model_ptr & model, const int n_ctx) {
+         llama_context_params ctx_params = llama_context_default_params();
+         ctx_params.n_ctx = ctx_params.n_batch = n_ctx >= 0 ? n_ctx : ctx_params.n_batch;
+         llama_context_ptr context(llama_new_context_with_model(model.get(), ctx_params));
+         if (!context) {
+             printe("%s: error: failed to create the llama_context\n", __func__);
+         }
+
+         return context;
+     }
+
+     // Initializes and configures the sampler
+     llama_sampler_ptr initialize_sampler() {
+         llama_sampler_ptr sampler(llama_sampler_chain_init(llama_sampler_chain_default_params()));
+         llama_sampler_chain_add(sampler.get(), llama_sampler_init_min_p(0.05f, 1));
+         llama_sampler_chain_add(sampler.get(), llama_sampler_init_temp(0.8f));
+         llama_sampler_chain_add(sampler.get(), llama_sampler_init_dist(LLAMA_DEFAULT_SEED));
+
+         return sampler;
+     }
+ };
+
+ // Add a message to `messages` and store its content in `msg_strs`
+ static void add_message(const char * role, const std::string & text, LlamaData & llama_data) {
+     llama_data.msg_strs.push_back(std::move(text));
+     llama_data.messages.push_back({ role, llama_data.msg_strs.back().c_str() });
+ }
+
+ // Function to apply the chat template and resize `formatted` if needed
+ static int apply_chat_template(LlamaData & llama_data, const bool append) {
+     int result = llama_chat_apply_template(
+         llama_data.model.get(), nullptr, llama_data.messages.data(), llama_data.messages.size(), append,
+         append ? llama_data.fmtted.data() : nullptr, append ? llama_data.fmtted.size() : 0);
+     if (append && result > static_cast<int>(llama_data.fmtted.size())) {
+         llama_data.fmtted.resize(result);
+         result = llama_chat_apply_template(llama_data.model.get(), nullptr, llama_data.messages.data(),
+                                            llama_data.messages.size(), append, llama_data.fmtted.data(),
+                                            llama_data.fmtted.size());
+     }
+
+     return result;
+ }
+
+ // Function to tokenize the prompt
+ static int tokenize_prompt(const llama_model_ptr & model, const std::string & prompt,
+                            std::vector<llama_token> & prompt_tokens) {
+     const int n_prompt_tokens = -llama_tokenize(model.get(), prompt.c_str(), prompt.size(), NULL, 0, true, true);
+     prompt_tokens.resize(n_prompt_tokens);
+     if (llama_tokenize(model.get(), prompt.c_str(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), true,
+                        true) < 0) {
+         printe("failed to tokenize the prompt\n");
+         return -1;
+     }
+
+     return n_prompt_tokens;
+ }
+
+ // Check if we have enough space in the context to evaluate this batch
+ static int check_context_size(const llama_context_ptr & ctx, const llama_batch & batch) {
+     const int n_ctx = llama_n_ctx(ctx.get());
+     const int n_ctx_used = llama_get_kv_cache_used_cells(ctx.get());
+     if (n_ctx_used + batch.n_tokens > n_ctx) {
+         printf("\033[0m\n");
+         printe("context size exceeded\n");
+         return 1;
+     }
+
+     return 0;
+ }
+
+ // convert the token to a string
+ static int convert_token_to_string(const llama_model_ptr & model, const llama_token token_id, std::string & piece) {
+     char buf[256];
+     int n = llama_token_to_piece(model.get(), token_id, buf, sizeof(buf), 0, true);
+     if (n < 0) {
+         printe("failed to convert token to piece\n");
+         return 1;
+     }
+
+     piece = std::string(buf, n);
+     return 0;
+ }
+
+ static void print_word_and_concatenate_to_response(const std::string & piece, std::string & response) {
+     printf("%s", piece.c_str());
+     fflush(stdout);
+     response += piece;
+ }
+
+ // helper function to evaluate a prompt and generate a response
+ static int generate(LlamaData & llama_data, const std::string & prompt, std::string & response) {
+     std::vector<llama_token> tokens;
+     if (tokenize_prompt(llama_data.model, prompt, tokens) < 0) {
+         return 1;
+     }
+
+     // prepare a batch for the prompt
+     llama_batch batch = llama_batch_get_one(tokens.data(), tokens.size());
+     llama_token new_token_id;
+     while (true) {
+         check_context_size(llama_data.context, batch);
+         if (llama_decode(llama_data.context.get(), batch)) {
+             printe("failed to decode\n");
+             return 1;
+         }
+
+         // sample the next token, check is it an end of generation?
+         new_token_id = llama_sampler_sample(llama_data.sampler.get(), llama_data.context.get(), -1);
+         if (llama_token_is_eog(llama_data.model.get(), new_token_id)) {
+             break;
+         }
+
+         std::string piece;
+         if (convert_token_to_string(llama_data.model, new_token_id, piece)) {
+             return 1;
+         }
+
+         print_word_and_concatenate_to_response(piece, response);
+
+         // prepare the next batch with the sampled token
+         batch = llama_batch_get_one(&new_token_id, 1);
+     }
+
+     return 0;
+ }
+
+ static int read_user_input(std::string & user) {
+     std::getline(std::cin, user);
+     return user.empty(); // Should have data in happy path
+ }
+
+ // Function to generate a response based on the prompt
+ static int generate_response(LlamaData & llama_data, const std::string & prompt, std::string & response,
+                              const bool stdout_a_terminal) {
+     // Set response color
+     if (stdout_a_terminal) {
+         printf("\033[33m");
+     }
+
+     if (generate(llama_data, prompt, response)) {
+         printe("failed to generate response\n");
+         return 1;
+     }
+
+     // End response with color reset and newline
+     printf("\n%s", stdout_a_terminal ? "\033[0m" : "");
+     return 0;
+ }
+
+ // Helper function to apply the chat template and handle errors
+ static int apply_chat_template_with_error_handling(LlamaData & llama_data, const bool append, int & output_length) {
+     const int new_len = apply_chat_template(llama_data, append);
+     if (new_len < 0) {
+         printe("failed to apply the chat template\n");
+         return -1;
+     }
+
+     output_length = new_len;
+     return 0;
+ }
+
+ // Helper function to handle user input
+ static int handle_user_input(std::string & user_input, const std::string & user_) {
+     if (!user_.empty()) {
+         user_input = user_;
+         return 0; // No need for interactive input
+     }
+
+     printf(
+         "\r%*s"
+         "\r\033[32m> \033[0m",
+         get_terminal_width(), " ");
+     return read_user_input(user_input); // Returns true if input ends the loop
+ }
+
+ static bool is_stdin_a_terminal() {
+ #if defined(_WIN32)
+     HANDLE hStdin = GetStdHandle(STD_INPUT_HANDLE);
+     DWORD mode;
+     return GetConsoleMode(hStdin, &mode);
+ #else
+     return isatty(STDIN_FILENO);
+ #endif
+ }
+
+ static bool is_stdout_a_terminal() {
+ #if defined(_WIN32)
+     HANDLE hStdout = GetStdHandle(STD_OUTPUT_HANDLE);
+     DWORD mode;
+     return GetConsoleMode(hStdout, &mode);
+ #else
+     return isatty(STDOUT_FILENO);
+ #endif
+ }
+
+ // Function to tokenize the prompt
+ static int chat_loop(LlamaData & llama_data, const std::string & user_) {
+     int prev_len = 0;
+     llama_data.fmtted.resize(llama_n_ctx(llama_data.context.get()));
+     static const bool stdout_a_terminal = is_stdout_a_terminal();
+     while (true) {
+         // Get user input
+         std::string user_input;
+         while (handle_user_input(user_input, user_)) {
+         }
+
+         add_message("user", user_.empty() ? user_input : user_, llama_data);
+         int new_len;
+         if (apply_chat_template_with_error_handling(llama_data, true, new_len) < 0) {
+             return 1;
+         }
+
+         std::string prompt(llama_data.fmtted.begin() + prev_len, llama_data.fmtted.begin() + new_len);
+         std::string response;
+         if (generate_response(llama_data, prompt, response, stdout_a_terminal)) {
+             return 1;
+         }
+
+         if (!user_.empty()) {
+             break;
+         }
+
+         add_message("assistant", response, llama_data);
+         if (apply_chat_template_with_error_handling(llama_data, false, prev_len) < 0) {
+             return 1;
+         }
+     }
+
+     return 0;
+ }
+
+ static void log_callback(const enum ggml_log_level level, const char * text, void * p) {
+     const Opt * opt = static_cast<Opt *>(p);
+     if (opt->verbose_ || level == GGML_LOG_LEVEL_ERROR) {
+         printe("%s", text);
+     }
+ }
+
+ static std::string read_pipe_data() {
+     std::ostringstream result;
+     result << std::cin.rdbuf(); // Read all data from std::cin
+     return result.str();
+ }
+
+ int main(int argc, const char ** argv) {
+     Opt opt;
+     const int ret = opt.init(argc, argv);
+     if (ret == 2) {
+         return 0;
+     } else if (ret) {
+         return 1;
+     }
+
+     if (!is_stdin_a_terminal()) {
+         if (!opt.user_.empty()) {
+             opt.user_ += "\n\n";
+         }
+
+         opt.user_ += read_pipe_data();
+     }
+
+     llama_log_set(log_callback, &opt);
+     LlamaData llama_data;
+     if (llama_data.init(opt)) {
+         return 1;
+     }
+
+     if (chat_loop(llama_data, opt.user_)) {
+         return 1;
+     }
+
+     return 0;
+ }