llama-cpp-python 0.1.77__tar.gz → 0.1.79__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/CHANGELOG.md +13 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/PKG-INFO +6 -3
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/README.md +5 -2
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/docker/openblas_simple/Dockerfile +1 -1
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/llama_cpp/llama.py +99 -60
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/llama_cpp/llama_cpp.py +489 -280
- llama_cpp_python-0.1.79/llama_cpp/llama_grammar.py +1188 -0
- llama_cpp_python-0.1.79/llama_cpp/server/__init__.py +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/llama_cpp/server/app.py +4 -0
- llama_cpp_python-0.1.79/llama_cpp/utils.py +38 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/llama_cpp_python.egg-info/PKG-INFO +6 -3
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/llama_cpp_python.egg-info/SOURCES.txt +59 -13
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/poetry.lock +215 -91
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/pyproject.toml +12 -8
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/setup.py +2 -1
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/tests/test_llama.py +17 -5
- llama_cpp_python-0.1.79/vendor/llama.cpp/.devops/full-rocm.Dockerfile +44 -0
- llama_cpp_python-0.1.79/vendor/llama.cpp/.devops/lamma-cpp-clblast.srpm.spec +58 -0
- llama_cpp_python-0.1.79/vendor/llama.cpp/.devops/lamma-cpp-cublas.srpm.spec +59 -0
- llama_cpp_python-0.1.79/vendor/llama.cpp/.devops/llama-cpp.srpm.spec +58 -0
- llama_cpp_python-0.1.79/vendor/llama.cpp/.devops/main-rocm.Dockerfile +44 -0
- llama_cpp_python-0.1.79/vendor/llama.cpp/.dockerignore +17 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/.github/workflows/build.yml +48 -17
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/.gitignore +11 -17
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/CMakeLists.txt +77 -11
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/Makefile +95 -41
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/README.md +178 -100
- llama_cpp_python-0.1.79/vendor/llama.cpp/build.zig +121 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/ci/run.sh +23 -22
- llama_cpp_python-0.1.79/vendor/llama.cpp/common/CMakeLists.txt +20 -0
- {llama_cpp_python-0.1.77/vendor/llama.cpp/examples → llama_cpp_python-0.1.79/vendor/llama.cpp/common}/common.cpp +109 -406
- {llama_cpp_python-0.1.77/vendor/llama.cpp/examples → llama_cpp_python-0.1.79/vendor/llama.cpp/common}/common.h +25 -52
- llama_cpp_python-0.1.79/vendor/llama.cpp/common/console.cpp +500 -0
- llama_cpp_python-0.1.79/vendor/llama.cpp/common/console.h +19 -0
- {llama_cpp_python-0.1.77/vendor/llama.cpp/examples → llama_cpp_python-0.1.79/vendor/llama.cpp/common}/grammar-parser.cpp +1 -1
- llama_cpp_python-0.1.79/vendor/llama.cpp/convert-falcon-hf-to-gguf.py +279 -0
- llama_cpp_python-0.1.79/vendor/llama.cpp/convert-gptneox-hf-to-gguf.py +267 -0
- llama_cpp_python-0.1.79/vendor/llama.cpp/convert-llama-7b-pth-to-gguf.py +308 -0
- llama_cpp_python-0.1.79/vendor/llama.cpp/convert-llama-ggmlv3-to-gguf.py +345 -0
- llama_cpp_python-0.1.79/vendor/llama.cpp/convert-llama-hf-to-gguf.py +328 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/convert-lora-to-ggml.py +18 -17
- llama_cpp_python-0.1.79/vendor/llama.cpp/convert.py +1111 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/docs/token_generation_performance_tips.md +3 -3
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/examples/CMakeLists.txt +3 -19
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/examples/baby-llama/baby-llama.cpp +5 -1
- llama_cpp_python-0.1.79/vendor/llama.cpp/examples/beam_search/CMakeLists.txt +8 -0
- llama_cpp_python-0.1.79/vendor/llama.cpp/examples/beam_search/beam_search.cpp +188 -0
- llama_cpp_python-0.1.79/vendor/llama.cpp/examples/convert-llama2c-to-ggml/CMakeLists.txt +5 -0
- llama_cpp_python-0.1.79/vendor/llama.cpp/examples/convert-llama2c-to-ggml/README.md +30 -0
- llama_cpp_python-0.1.79/vendor/llama.cpp/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +863 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/examples/embd-input/embd-input-lib.cpp +4 -4
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/examples/embd-input/embd_input.py +1 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/examples/embd-input/llava.py +1 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/examples/embd-input/minigpt4.py +1 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/examples/embd-input/panda_gpt.py +1 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/examples/embedding/embedding.cpp +20 -13
- llama_cpp_python-0.1.79/vendor/llama.cpp/examples/gguf/gguf.cpp +246 -0
- llama_cpp_python-0.1.79/vendor/llama.cpp/examples/gptneox-wip/cmpnct_gpt2bpe.hpp +1133 -0
- llama_cpp_python-0.1.79/vendor/llama.cpp/examples/gptneox-wip/falcon-main.cpp +1111 -0
- llama_cpp_python-0.1.79/vendor/llama.cpp/examples/gptneox-wip/gptneox-main.cpp +1082 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/examples/jeopardy/graph.py +1 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/examples/jeopardy/jeopardy.sh +0 -0
- llama_cpp_python-0.1.79/vendor/llama.cpp/examples/json-schema-to-grammar.py +133 -0
- llama_cpp_python-0.1.79/vendor/llama.cpp/examples/llama-bench/CMakeLists.txt +8 -0
- llama_cpp_python-0.1.79/vendor/llama.cpp/examples/llama-bench/llama-bench.cpp +1011 -0
- llama_cpp_python-0.1.79/vendor/llama.cpp/examples/llama.vim +132 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/examples/llm.vim +5 -1
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/examples/main/README.md +18 -4
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/examples/main/main.cpp +74 -61
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/examples/make-ggml.py +1 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/examples/metal/metal.cpp +1 -1
- llama_cpp_python-0.1.79/vendor/llama.cpp/examples/perplexity/perplexity.cpp +602 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/examples/quantize/quantize.cpp +18 -18
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/examples/quantize-stats/quantize-stats.cpp +1 -1
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/examples/reason-act.sh +0 -1
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/examples/save-load-state/save-load-state.cpp +4 -5
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/examples/server/README.md +8 -6
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/examples/server/api_like_OAI.py +1 -0
- llama_cpp_python-0.1.79/vendor/llama.cpp/examples/server/chat-llama2.sh +109 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/examples/server/chat.mjs +30 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/examples/server/chat.sh +0 -0
- llama_cpp_python-0.1.79/vendor/llama.cpp/examples/server/completion.js.hpp +428 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/examples/server/deps.sh +5 -3
- llama_cpp_python-0.1.79/vendor/llama.cpp/examples/server/index.html.hpp +2272 -0
- llama_cpp_python-0.1.79/vendor/llama.cpp/examples/server/index.js.hpp +1876 -0
- llama_cpp_python-0.1.79/vendor/llama.cpp/examples/server/json-schema-to-grammar.mjs.hpp +311 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/examples/server/public/completion.js +37 -18
- llama_cpp_python-0.1.79/vendor/llama.cpp/examples/server/public/index.html +851 -0
- llama_cpp_python-0.1.79/vendor/llama.cpp/examples/server/public/index.js +1 -0
- llama_cpp_python-0.1.79/vendor/llama.cpp/examples/server/public/json-schema-to-grammar.mjs +112 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/examples/server/server.cpp +333 -93
- llama_cpp_python-0.1.79/vendor/llama.cpp/examples/server-llama2-13B.sh +26 -0
- llama_cpp_python-0.1.79/vendor/llama.cpp/examples/simple/simple.cpp +130 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/examples/train-text-from-scratch/train-text-from-scratch.cpp +70 -72
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/flake.nix +0 -2
- llama_cpp_python-0.1.79/vendor/llama.cpp/ggml-alloc.c +593 -0
- llama_cpp_python-0.1.79/vendor/llama.cpp/ggml-alloc.h +26 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/ggml-cuda.cu +3158 -472
- llama_cpp_python-0.1.79/vendor/llama.cpp/ggml-cuda.h +46 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/ggml-metal.h +13 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/ggml-metal.m +277 -165
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/ggml-metal.metal +635 -552
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/ggml.c +2638 -480
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/ggml.h +389 -48
- llama_cpp_python-0.1.79/vendor/llama.cpp/gguf-py/LICENSE +21 -0
- llama_cpp_python-0.1.79/vendor/llama.cpp/gguf-py/README.md +55 -0
- llama_cpp_python-0.1.79/vendor/llama.cpp/gguf-py/gguf/__init__.py +1 -0
- llama_cpp_python-0.1.79/vendor/llama.cpp/gguf-py/gguf/gguf.py +727 -0
- llama_cpp_python-0.1.79/vendor/llama.cpp/gguf-py/pyproject.toml +28 -0
- llama_cpp_python-0.1.79/vendor/llama.cpp/gguf-py/tests/test_gguf.py +7 -0
- llama_cpp_python-0.1.79/vendor/llama.cpp/grammars/README.md +91 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/grammars/json.gbnf +6 -10
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/k_quants.c +458 -75
- llama_cpp_python-0.1.79/vendor/llama.cpp/llama.cpp +6254 -0
- {llama_cpp_python-0.1.77/vendor/llama.cpp/spm-headers → llama_cpp_python-0.1.79/vendor/llama.cpp}/llama.h +176 -103
- llama_cpp_python-0.1.79/vendor/llama.cpp/models/.editorconfig +1 -0
- llama_cpp_python-0.1.79/vendor/llama.cpp/models/ggml-vocab-llama.gguf +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/requirements.txt +1 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/scripts/build-info.sh +3 -2
- llama_cpp_python-0.1.79/vendor/llama.cpp/scripts/get-wikitext-2.sh +3 -0
- llama_cpp_python-0.1.79/vendor/llama.cpp/scripts/qnt-all.sh +27 -0
- llama_cpp_python-0.1.79/vendor/llama.cpp/scripts/run-all-perf.sh +31 -0
- llama_cpp_python-0.1.79/vendor/llama.cpp/scripts/run-all-ppl.sh +27 -0
- llama_cpp_python-0.1.79/vendor/llama.cpp/scripts/sync-ggml.sh +16 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/spm-headers/ggml.h +389 -48
- {llama_cpp_python-0.1.77/vendor/llama.cpp → llama_cpp_python-0.1.79/vendor/llama.cpp/spm-headers}/llama.h +176 -103
- llama_cpp_python-0.1.79/vendor/llama.cpp/tests/CMakeLists.txt +37 -0
- llama_cpp_python-0.1.77/vendor/llama.cpp/tests/test-double-float.c → llama_cpp_python-0.1.79/vendor/llama.cpp/tests/test-double-float.cpp +7 -5
- llama_cpp_python-0.1.77/vendor/llama.cpp/tests/test-grad0.c → llama_cpp_python-0.1.79/vendor/llama.cpp/tests/test-grad0.cpp +16 -16
- llama_cpp_python-0.1.79/vendor/llama.cpp/tests/test-grammar-parser.cpp +250 -0
- llama_cpp_python-0.1.79/vendor/llama.cpp/tests/test-llama-grammar.cpp +403 -0
- llama_cpp_python-0.1.77/vendor/llama.cpp/tests/test-opt.c → llama_cpp_python-0.1.79/vendor/llama.cpp/tests/test-opt.cpp +8 -7
- llama_cpp_python-0.1.79/vendor/llama.cpp/tests/test-tokenizer-0.cpp +140 -0
- llama_cpp_python-0.1.79/vendor/llama.cpp/tests/test-tokenizer-1.cpp +116 -0
- llama_cpp_python-0.1.77/vendor/llama.cpp/.dockerignore +0 -24
- llama_cpp_python-0.1.77/vendor/llama.cpp/build.zig +0 -68
- llama_cpp_python-0.1.77/vendor/llama.cpp/convert-pth-to-ggml.py +0 -13
- llama_cpp_python-0.1.77/vendor/llama.cpp/convert.py +0 -1288
- llama_cpp_python-0.1.77/vendor/llama.cpp/examples/perplexity/perplexity.cpp +0 -256
- llama_cpp_python-0.1.77/vendor/llama.cpp/examples/server/completion.js.hpp +0 -375
- llama_cpp_python-0.1.77/vendor/llama.cpp/examples/server/index.html.hpp +0 -1145
- llama_cpp_python-0.1.77/vendor/llama.cpp/examples/server/index.js.hpp +0 -1851
- llama_cpp_python-0.1.77/vendor/llama.cpp/examples/server/public/index.html +0 -446
- llama_cpp_python-0.1.77/vendor/llama.cpp/examples/server/public/index.js +0 -1
- llama_cpp_python-0.1.77/vendor/llama.cpp/examples/simple/simple.cpp +0 -181
- llama_cpp_python-0.1.77/vendor/llama.cpp/ggml-cuda.h +0 -36
- llama_cpp_python-0.1.77/vendor/llama.cpp/llama-util.h +0 -504
- llama_cpp_python-0.1.77/vendor/llama.cpp/llama.cpp +0 -4128
- llama_cpp_python-0.1.77/vendor/llama.cpp/models/ggml-vocab.bin +0 -0
- llama_cpp_python-0.1.77/vendor/llama.cpp/scripts/perf-run-all.sh +0 -93
- llama_cpp_python-0.1.77/vendor/llama.cpp/scripts/ppl-run-all.sh +0 -39
- llama_cpp_python-0.1.77/vendor/llama.cpp/scripts/sync-ggml.sh +0 -14
- llama_cpp_python-0.1.77/vendor/llama.cpp/tests/CMakeLists.txt +0 -15
- llama_cpp_python-0.1.77/vendor/llama.cpp/tests/test-tokenizer-0.cpp +0 -105
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/.dockerignore +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/.github/ISSUE_TEMPLATE/bug_report.md +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/.github/ISSUE_TEMPLATE/feature_request.md +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/.github/dependabot.yml +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/.github/workflows/build-and-release.yaml +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/.github/workflows/build-docker.yaml +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/.github/workflows/publish-to-test.yaml +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/.github/workflows/publish.yaml +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/.github/workflows/test-pypi.yaml +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/.github/workflows/test.yaml +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/.gitignore +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/.gitmodules +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/.readthedocs.yaml +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/CMakeLists.txt +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/LICENSE.md +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/Makefile +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/docker/README.md +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/docker/cuda_simple/Dockerfile +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/docker/open_llama/Dockerfile +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/docker/open_llama/build.sh +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/docker/open_llama/hug_model.py +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/docker/open_llama/start.sh +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/docker/open_llama/start_server.sh +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/docker/simple/Dockerfile +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/docker/simple/run.sh +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/docs/api-reference.md +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/docs/index.md +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/docs/install/macos.md +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/docs/requirements.txt +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/examples/high_level_api/fastapi_server.py +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/examples/high_level_api/high_level_api_embedding.py +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/examples/high_level_api/high_level_api_inference.py +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/examples/high_level_api/high_level_api_streaming.py +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/examples/high_level_api/langchain_custom_llm.py +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/examples/low_level_api/Chat.py +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/examples/low_level_api/Miku.py +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/examples/low_level_api/ReasonAct.py +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/examples/low_level_api/common.py +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/examples/low_level_api/low_level_api_chat_cpp.py +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/examples/low_level_api/low_level_api_llama_cpp.py +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/examples/low_level_api/quantize.py +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/examples/low_level_api/util.py +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/examples/notebooks/Clients.ipynb +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/examples/notebooks/Guidance.ipynb +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/examples/notebooks/PerformanceTuning.ipynb +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/llama_cpp/__init__.py +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/llama_cpp/llama_types.py +0 -0
- /llama_cpp_python-0.1.77/llama_cpp/server/__init__.py → /llama_cpp_python-0.1.79/llama_cpp/py.typed +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/llama_cpp/server/__main__.py +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/llama_cpp_python.egg-info/dependency_links.txt +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/llama_cpp_python.egg-info/requires.txt +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/llama_cpp_python.egg-info/top_level.txt +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/mkdocs.yml +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/poetry.toml +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/setup.cfg +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/.clang-tidy +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/.devops/full-cuda.Dockerfile +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/.devops/full.Dockerfile +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/.devops/main-cuda.Dockerfile +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/.devops/main.Dockerfile +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/.devops/tools.sh +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/.ecrc +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/.editorconfig +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/.flake8 +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/.github/ISSUE_TEMPLATE/custom.md +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/.github/workflows/docker.yml +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/.github/workflows/editorconfig.yml +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/.github/workflows/tidy-post.yml +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/.github/workflows/tidy-review.yml +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/.pre-commit-config.yaml +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/LICENSE +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/Package.swift +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/SHA256SUMS +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/ci/README.md +0 -0
- {llama_cpp_python-0.1.77/vendor/llama.cpp/examples → llama_cpp_python-0.1.79/vendor/llama.cpp/common}/grammar-parser.h +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/docs/BLIS.md +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/examples/Miku.sh +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/examples/alpaca.sh +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/examples/baby-llama/CMakeLists.txt +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/examples/benchmark/CMakeLists.txt +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/examples/benchmark/benchmark-matmult.cpp +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/examples/chat-13B.bat +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/examples/chat-13B.sh +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/examples/chat-persistent.sh +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/examples/chat-vicuna.sh +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/examples/chat.sh +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/examples/embd-input/.gitignore +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/examples/embd-input/CMakeLists.txt +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/examples/embd-input/README.md +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/examples/embd-input/embd-input-test.cpp +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/examples/embd-input/embd-input.h +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/examples/embedding/CMakeLists.txt +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/examples/embedding/README.md +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/examples/gpt4all.sh +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/examples/jeopardy/README.md +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/examples/jeopardy/qasheet.csv +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/examples/jeopardy/questions.txt +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/examples/llama2-13b.sh +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/examples/llama2.sh +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/examples/main/CMakeLists.txt +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/examples/metal/CMakeLists.txt +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/examples/perplexity/CMakeLists.txt +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/examples/perplexity/README.md +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/examples/quantize/CMakeLists.txt +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/examples/quantize/README.md +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/examples/quantize-stats/CMakeLists.txt +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/examples/save-load-state/CMakeLists.txt +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/examples/server/CMakeLists.txt +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/examples/server/httplib.h +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/examples/server/json.hpp +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/examples/simple/CMakeLists.txt +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/examples/train-text-from-scratch/CMakeLists.txt +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/examples/train-text-from-scratch/README.md +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/flake.lock +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/ggml-mpi.c +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/ggml-mpi.h +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/ggml-opencl.cpp +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/ggml-opencl.h +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/grammars/arithmetic.gbnf +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/grammars/chess.gbnf +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/grammars/japanese.gbnf +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/grammars/list.gbnf +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/k_quants.h +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/media/llama-leader.jpeg +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/media/llama0-banner.png +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/media/llama0-logo.png +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/media/llama1-banner.png +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/media/llama1-logo.png +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/pocs/CMakeLists.txt +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/pocs/vdot/CMakeLists.txt +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/pocs/vdot/q8dot.cpp +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/pocs/vdot/vdot.cpp +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/prompts/alpaca.txt +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/prompts/chat-with-bob.txt +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/prompts/chat-with-vicuna-v0.txt +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/prompts/chat-with-vicuna-v1.txt +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/prompts/chat.txt +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/prompts/dan-modified.txt +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/prompts/dan.txt +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/prompts/reason-act.txt +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/scripts/build-info.cmake +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/scripts/build-info.h.in +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/scripts/verify-checksum-models.py +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/tests/test-quantize-fns.cpp +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/tests/test-quantize-perf.cpp +0 -0
- {llama_cpp_python-0.1.77 → llama_cpp_python-0.1.79}/vendor/llama.cpp/tests/test-sampling.cpp +0 -0
|
@@ -7,6 +7,19 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|
|
7
7
|
|
|
8
8
|
## [Unreleased]
|
|
9
9
|
|
|
10
|
+
## [0.1.79]
|
|
11
|
+
|
|
12
|
+
### Added
|
|
13
|
+
|
|
14
|
+
- (llama.cpp) GGUF support
|
|
15
|
+
|
|
16
|
+
## [0.1.78]
|
|
17
|
+
|
|
18
|
+
### Added
|
|
19
|
+
|
|
20
|
+
- Grammar based sampling via LlamaGrammar which can be passed to completions
|
|
21
|
+
- Make n_gpu_layers == -1 offload all layers
|
|
22
|
+
|
|
10
23
|
## [0.1.77]
|
|
11
24
|
|
|
12
25
|
- (llama.cpp) Update llama.cpp add support for LLaMa 2 70B
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: llama_cpp_python
|
|
3
|
-
Version: 0.1.77
|
|
3
|
+
Version: 0.1.79
|
|
4
4
|
Summary: A Python wrapper for llama.cpp
|
|
5
5
|
Author: Andrei Betlen
|
|
6
6
|
Author-email: abetlen@gmail.com
|
|
@@ -35,6 +35,9 @@ This package provides:
|
|
|
35
35
|
|
|
36
36
|
Documentation is available at [https://llama-cpp-python.readthedocs.io/en/latest](https://llama-cpp-python.readthedocs.io/en/latest).
|
|
37
37
|
|
|
38
|
+
> [!WARNING]
|
|
39
|
+
> Starting with version 0.1.79 the model format has changed from `ggmlv3` to `gguf`. Old model files can be converted using the `convert-llama-ggmlv3-to-gguf.py` script in [`llama.cpp`](https://github.com/ggerganov/llama.cpp)
|
|
40
|
+
|
|
38
41
|
|
|
39
42
|
## Installation from PyPI (recommended)
|
|
40
43
|
|
|
@@ -158,7 +161,7 @@ llm = Llama(model_path="./models/7B/ggml-model.bin", n_ctx=2048)
|
|
|
158
161
|
Llama2 70b must set the `n_gqa` parameter (grouped-query attention factor) to 8 when loading:
|
|
159
162
|
|
|
160
163
|
```python
|
|
161
|
-
llm = Llama(model_path="./models/
|
|
164
|
+
llm = Llama(model_path="./models/70B/ggml-model.bin", n_gqa=8)
|
|
162
165
|
```
|
|
163
166
|
|
|
164
167
|
## Web Server
|
|
@@ -187,7 +190,7 @@ docker run --rm -it -p 8000:8000 -v /path/to/models:/models -e MODEL=/models/ggm
|
|
|
187
190
|
## Low-level API
|
|
188
191
|
|
|
189
192
|
The low-level API is a direct [`ctypes`](https://docs.python.org/3/library/ctypes.html) binding to the C API provided by `llama.cpp`.
|
|
190
|
-
The entire
|
|
193
|
+
The entire low-level API can be found in [llama_cpp/llama_cpp.py](https://github.com/abetlen/llama-cpp-python/blob/master/llama_cpp/llama_cpp.py) and directly mirrors the C API in [llama.h](https://github.com/ggerganov/llama.cpp/blob/master/llama.h).
|
|
191
194
|
|
|
192
195
|
Below is a short example demonstrating how to use the low-level API to tokenize a prompt:
|
|
193
196
|
|
|
@@ -17,6 +17,9 @@ This package provides:
|
|
|
17
17
|
|
|
18
18
|
Documentation is available at [https://llama-cpp-python.readthedocs.io/en/latest](https://llama-cpp-python.readthedocs.io/en/latest).
|
|
19
19
|
|
|
20
|
+
> [!WARNING]
|
|
21
|
+
> Starting with version 0.1.79 the model format has changed from `ggmlv3` to `gguf`. Old model files can be converted using the `convert-llama-ggmlv3-to-gguf.py` script in [`llama.cpp`](https://github.com/ggerganov/llama.cpp)
|
|
22
|
+
|
|
20
23
|
|
|
21
24
|
## Installation from PyPI (recommended)
|
|
22
25
|
|
|
@@ -140,7 +143,7 @@ llm = Llama(model_path="./models/7B/ggml-model.bin", n_ctx=2048)
|
|
|
140
143
|
Llama2 70b must set the `n_gqa` parameter (grouped-query attention factor) to 8 when loading:
|
|
141
144
|
|
|
142
145
|
```python
|
|
143
|
-
llm = Llama(model_path="./models/
|
|
146
|
+
llm = Llama(model_path="./models/70B/ggml-model.bin", n_gqa=8)
|
|
144
147
|
```
|
|
145
148
|
|
|
146
149
|
## Web Server
|
|
@@ -169,7 +172,7 @@ docker run --rm -it -p 8000:8000 -v /path/to/models:/models -e MODEL=/models/ggm
|
|
|
169
172
|
## Low-level API
|
|
170
173
|
|
|
171
174
|
The low-level API is a direct [`ctypes`](https://docs.python.org/3/library/ctypes.html) binding to the C API provided by `llama.cpp`.
|
|
172
|
-
The entire
|
|
175
|
+
The entire low-level API can be found in [llama_cpp/llama_cpp.py](https://github.com/abetlen/llama-cpp-python/blob/master/llama_cpp/llama_cpp.py) and directly mirrors the C API in [llama.h](https://github.com/ggerganov/llama.cpp/blob/master/llama.h).
|
|
173
176
|
|
|
174
177
|
Below is a short example demonstrating how to use the low-level API to tokenize a prompt:
|
|
175
178
|
|
|
@@ -9,7 +9,7 @@ COPY . .
|
|
|
9
9
|
RUN apt update && apt install -y libopenblas-dev ninja-build build-essential
|
|
10
10
|
RUN python -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette pydantic-settings
|
|
11
11
|
|
|
12
|
-
RUN
|
|
12
|
+
RUN CMAKE_ARGS="-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS" pip install llama_cpp_python --verbose
|
|
13
13
|
|
|
14
14
|
# Run the server
|
|
15
15
|
CMD python3 -m llama_cpp.server
|
|
@@ -23,10 +23,13 @@ import ctypes
|
|
|
23
23
|
|
|
24
24
|
from . import llama_cpp
|
|
25
25
|
from .llama_types import *
|
|
26
|
+
from .llama_grammar import LlamaGrammar
|
|
26
27
|
|
|
27
28
|
import numpy as np
|
|
28
29
|
import numpy.typing as npt
|
|
29
30
|
|
|
31
|
+
from .utils import suppress_stdout_stderr
|
|
32
|
+
|
|
30
33
|
class BaseLlamaCache(ABC):
|
|
31
34
|
"""Base cache class for a llama.cpp model."""
|
|
32
35
|
|
|
@@ -224,7 +227,8 @@ class Llama:
|
|
|
224
227
|
rope_freq_base: float = 10000.0,
|
|
225
228
|
rope_freq_scale: float = 1.0,
|
|
226
229
|
n_gqa: Optional[int] = None, # (TEMPORARY) must be 8 for llama2 70b
|
|
227
|
-
rms_norm_eps: Optional[float] = None,
|
|
230
|
+
rms_norm_eps: Optional[float] = None, # (TEMPORARY)
|
|
231
|
+
mul_mat_q: Optional[bool] = None,
|
|
228
232
|
verbose: bool = True,
|
|
229
233
|
):
|
|
230
234
|
"""Load a llama.cpp model from `model_path`.
|
|
@@ -234,6 +238,7 @@ class Llama:
|
|
|
234
238
|
n_ctx: Maximum context size.
|
|
235
239
|
n_parts: Number of parts to split the model into. If -1, the number of parts is automatically determined.
|
|
236
240
|
seed: Random seed. -1 for random.
|
|
241
|
+
n_gpu_layers: Number of layers to offload to GPU (-ngl). If -1, all layers are offloaded.
|
|
237
242
|
f16_kv: Use half-precision for key/value cache.
|
|
238
243
|
logits_all: Return logits for all tokens, not just the last token.
|
|
239
244
|
vocab_only: Only load the vocabulary no weights.
|
|
@@ -262,7 +267,7 @@ class Llama:
|
|
|
262
267
|
|
|
263
268
|
self.params = llama_cpp.llama_context_default_params()
|
|
264
269
|
self.params.n_ctx = n_ctx
|
|
265
|
-
self.params.n_gpu_layers = n_gpu_layers
|
|
270
|
+
self.params.n_gpu_layers = 0x7FFFFFFF if n_gpu_layers == -1 else n_gpu_layers # 0x7FFFFFFF is INT32 max, will be auto set to all layers
|
|
266
271
|
self.params.seed = seed
|
|
267
272
|
self.params.f16_kv = f16_kv
|
|
268
273
|
self.params.logits_all = logits_all
|
|
@@ -273,22 +278,21 @@ class Llama:
|
|
|
273
278
|
self.params.low_vram = low_vram
|
|
274
279
|
|
|
275
280
|
self.tensor_split = tensor_split
|
|
276
|
-
self.
|
|
281
|
+
self._p_tensor_split = None
|
|
277
282
|
|
|
278
283
|
if self.tensor_split is not None:
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
284
|
+
FloatArray = (ctypes.c_float * len(self.tensor_split))(*self.tensor_split)
|
|
285
|
+
self._p_tensor_split = ctypes.POINTER(ctypes.c_float)(
|
|
286
|
+
FloatArray
|
|
287
|
+
) # keep a reference to the array so it is not gc'd
|
|
288
|
+
self.params.tensor_split = self._p_tensor_split
|
|
283
289
|
|
|
284
290
|
self.params.rope_freq_base = rope_freq_base
|
|
285
291
|
self.params.rope_freq_scale = rope_freq_scale
|
|
286
292
|
|
|
287
|
-
if n_gqa is not None:
|
|
288
|
-
self.params.n_gqa = n_gqa
|
|
289
293
|
|
|
290
|
-
if rms_norm_eps is not None:
|
|
291
|
-
self.params.rms_norm_eps = rms_norm_eps
|
|
294
|
+
if mul_mat_q is not None:
|
|
295
|
+
self.params.mul_mat_q = mul_mat_q
|
|
292
296
|
|
|
293
297
|
self.last_n_tokens_size = last_n_tokens_size
|
|
294
298
|
self.n_batch = min(n_ctx, n_batch)
|
|
@@ -307,12 +311,25 @@ class Llama:
|
|
|
307
311
|
if not os.path.exists(model_path):
|
|
308
312
|
raise ValueError(f"Model path does not exist: {model_path}")
|
|
309
313
|
|
|
310
|
-
|
|
311
|
-
self.
|
|
312
|
-
|
|
314
|
+
if verbose:
|
|
315
|
+
self.model = llama_cpp.llama_load_model_from_file(
|
|
316
|
+
self.model_path.encode("utf-8"), self.params
|
|
317
|
+
)
|
|
318
|
+
else:
|
|
319
|
+
with suppress_stdout_stderr():
|
|
320
|
+
self.model = llama_cpp.llama_load_model_from_file(
|
|
321
|
+
self.model_path.encode("utf-8"), self.params
|
|
322
|
+
)
|
|
313
323
|
assert self.model is not None
|
|
314
324
|
|
|
315
|
-
|
|
325
|
+
if verbose:
|
|
326
|
+
self.ctx = llama_cpp.llama_new_context_with_model(self.model, self.params)
|
|
327
|
+
else:
|
|
328
|
+
with suppress_stdout_stderr():
|
|
329
|
+
print("here")
|
|
330
|
+
self.ctx = llama_cpp.llama_new_context_with_model(
|
|
331
|
+
self.model, self.params
|
|
332
|
+
)
|
|
316
333
|
|
|
317
334
|
assert self.ctx is not None
|
|
318
335
|
|
|
@@ -349,8 +366,8 @@ class Llama:
|
|
|
349
366
|
sorted=sorted,
|
|
350
367
|
)
|
|
351
368
|
self._candidates = candidates
|
|
352
|
-
self._token_nl = Llama.token_nl()
|
|
353
|
-
self._token_eos = Llama.token_eos()
|
|
369
|
+
self._token_nl = self.token_nl()
|
|
370
|
+
self._token_eos = self.token_eos()
|
|
354
371
|
self._candidates_data_id = np.arange(self._n_vocab, dtype=np.intc) # type: ignore
|
|
355
372
|
self._candidates_data_p = np.zeros(self._n_vocab, dtype=np.single)
|
|
356
373
|
|
|
@@ -391,11 +408,11 @@ class Llama:
|
|
|
391
408
|
Returns:
|
|
392
409
|
A list of tokens.
|
|
393
410
|
"""
|
|
394
|
-
assert self.ctx is not None
|
|
411
|
+
assert self.model is not None
|
|
395
412
|
n_ctx = self._n_ctx
|
|
396
413
|
tokens = (llama_cpp.llama_token * n_ctx)()
|
|
397
|
-
n_tokens = llama_cpp.llama_tokenize(
|
|
398
|
-
self.ctx,
|
|
414
|
+
n_tokens = llama_cpp.llama_tokenize_with_model(
|
|
415
|
+
self.model,
|
|
399
416
|
text,
|
|
400
417
|
tokens,
|
|
401
418
|
llama_cpp.c_int(n_ctx),
|
|
@@ -404,8 +421,8 @@ class Llama:
|
|
|
404
421
|
if n_tokens < 0:
|
|
405
422
|
n_tokens = abs(n_tokens)
|
|
406
423
|
tokens = (llama_cpp.llama_token * n_tokens)()
|
|
407
|
-
n_tokens = llama_cpp.llama_tokenize(
|
|
408
|
-
self.ctx,
|
|
424
|
+
n_tokens = llama_cpp.llama_tokenize_with_model(
|
|
425
|
+
self.model,
|
|
409
426
|
text,
|
|
410
427
|
tokens,
|
|
411
428
|
llama_cpp.c_int(n_tokens),
|
|
@@ -426,13 +443,19 @@ class Llama:
|
|
|
426
443
|
Returns:
|
|
427
444
|
The detokenized string.
|
|
428
445
|
"""
|
|
429
|
-
assert self.ctx is not None
|
|
446
|
+
assert self.model is not None
|
|
430
447
|
output = b""
|
|
448
|
+
size = 8
|
|
449
|
+
buffer = (ctypes.c_char * size)()
|
|
431
450
|
for token in tokens:
|
|
432
|
-
|
|
433
|
-
self.
|
|
451
|
+
n = llama_cpp.llama_token_to_str_with_model(
|
|
452
|
+
self.model, llama_cpp.llama_token(token), buffer, size
|
|
434
453
|
)
|
|
435
|
-
|
|
454
|
+
assert n <= size
|
|
455
|
+
output += bytes(buffer[:n])
|
|
456
|
+
# NOTE: Llama1 models automatically added a space at the start of the prompt
|
|
457
|
+
# this line removes a leading space if the first token is a beginning of sentence token
|
|
458
|
+
return output[1:] if len(tokens) > 0 and tokens[0] == self.token_bos() else output
|
|
436
459
|
|
|
437
460
|
def set_cache(self, cache: Optional[BaseLlamaCache]):
|
|
438
461
|
"""Set the cache.
|
|
@@ -497,6 +520,7 @@ class Llama:
|
|
|
497
520
|
mirostat_eta: llama_cpp.c_float,
|
|
498
521
|
penalize_nl: bool = True,
|
|
499
522
|
logits_processor: Optional[LogitsProcessorList] = None,
|
|
523
|
+
grammar: Optional[LlamaGrammar] = None,
|
|
500
524
|
):
|
|
501
525
|
assert self.ctx is not None
|
|
502
526
|
assert self.n_tokens > 0
|
|
@@ -543,8 +567,16 @@ class Llama:
|
|
|
543
567
|
)
|
|
544
568
|
if not penalize_nl:
|
|
545
569
|
candidates.data[self._token_nl].logit = llama_cpp.c_float(nl_logit)
|
|
570
|
+
|
|
571
|
+
if grammar is not None:
|
|
572
|
+
llama_cpp.llama_sample_grammar(
|
|
573
|
+
ctx=self.ctx,
|
|
574
|
+
candidates=llama_cpp.ctypes.byref(candidates), # type: ignore
|
|
575
|
+
grammar=grammar.grammar,
|
|
576
|
+
)
|
|
577
|
+
|
|
546
578
|
if temp.value == 0.0:
|
|
547
|
-
|
|
579
|
+
id = llama_cpp.llama_sample_token_greedy(
|
|
548
580
|
ctx=self.ctx,
|
|
549
581
|
candidates=llama_cpp.ctypes.byref(candidates), # type: ignore
|
|
550
582
|
)
|
|
@@ -556,7 +588,7 @@ class Llama:
|
|
|
556
588
|
candidates=llama_cpp.ctypes.byref(candidates), # type: ignore
|
|
557
589
|
temp=temp,
|
|
558
590
|
)
|
|
559
|
-
|
|
591
|
+
id = llama_cpp.llama_sample_token_mirostat(
|
|
560
592
|
ctx=self.ctx,
|
|
561
593
|
candidates=llama_cpp.ctypes.byref(candidates), # type: ignore
|
|
562
594
|
tau=mirostat_tau,
|
|
@@ -571,7 +603,7 @@ class Llama:
|
|
|
571
603
|
candidates=llama_cpp.ctypes.byref(candidates), # type: ignore
|
|
572
604
|
temp=temp,
|
|
573
605
|
)
|
|
574
|
-
|
|
606
|
+
id = llama_cpp.llama_sample_token_mirostat_v2(
|
|
575
607
|
ctx=self.ctx,
|
|
576
608
|
candidates=llama_cpp.ctypes.byref(candidates), # type: ignore
|
|
577
609
|
tau=mirostat_tau,
|
|
@@ -608,10 +640,17 @@ class Llama:
|
|
|
608
640
|
candidates=llama_cpp.ctypes.byref(candidates), # type: ignore
|
|
609
641
|
temp=temp,
|
|
610
642
|
)
|
|
611
|
-
|
|
643
|
+
id = llama_cpp.llama_sample_token(
|
|
612
644
|
ctx=self.ctx,
|
|
613
645
|
candidates=llama_cpp.ctypes.byref(candidates), # type: ignore
|
|
614
646
|
)
|
|
647
|
+
if grammar is not None:
|
|
648
|
+
llama_cpp.llama_grammar_accept_token(
|
|
649
|
+
ctx=self.ctx,
|
|
650
|
+
grammar=grammar.grammar,
|
|
651
|
+
token=llama_cpp.ctypes.c_int(id),
|
|
652
|
+
)
|
|
653
|
+
return id
|
|
615
654
|
|
|
616
655
|
def sample(
|
|
617
656
|
self,
|
|
@@ -627,6 +666,7 @@ class Llama:
|
|
|
627
666
|
mirostat_tau: float = 5.0,
|
|
628
667
|
penalize_nl: bool = True,
|
|
629
668
|
logits_processor: Optional[LogitsProcessorList] = None,
|
|
669
|
+
grammar: Optional[LlamaGrammar] = None,
|
|
630
670
|
):
|
|
631
671
|
"""Sample a token from the model.
|
|
632
672
|
|
|
@@ -660,6 +700,7 @@ class Llama:
|
|
|
660
700
|
mirostat_eta=llama_cpp.c_float(mirostat_eta),
|
|
661
701
|
penalize_nl=penalize_nl,
|
|
662
702
|
logits_processor=logits_processor,
|
|
703
|
+
grammar=grammar,
|
|
663
704
|
)
|
|
664
705
|
|
|
665
706
|
def generate(
|
|
@@ -678,6 +719,7 @@ class Llama:
|
|
|
678
719
|
mirostat_eta: float = 0.1,
|
|
679
720
|
logits_processor: Optional[LogitsProcessorList] = None,
|
|
680
721
|
stopping_criteria: Optional[StoppingCriteriaList] = None,
|
|
722
|
+
grammar: Optional[LlamaGrammar] = None,
|
|
681
723
|
) -> Generator[int, Optional[Sequence[int]], None]:
|
|
682
724
|
"""Create a generator of tokens from a prompt.
|
|
683
725
|
|
|
@@ -699,7 +741,6 @@ class Llama:
|
|
|
699
741
|
The generated tokens.
|
|
700
742
|
"""
|
|
701
743
|
assert self.ctx is not None
|
|
702
|
-
|
|
703
744
|
if reset and len(self._input_ids) > 0:
|
|
704
745
|
longest_prefix = 0
|
|
705
746
|
for a, b in zip(self._input_ids, tokens[:-1]):
|
|
@@ -717,6 +758,9 @@ class Llama:
|
|
|
717
758
|
if reset:
|
|
718
759
|
self.reset()
|
|
719
760
|
|
|
761
|
+
if grammar is not None:
|
|
762
|
+
grammar.reset()
|
|
763
|
+
|
|
720
764
|
while True:
|
|
721
765
|
self.eval(tokens)
|
|
722
766
|
token = self.sample(
|
|
@@ -731,6 +775,7 @@ class Llama:
|
|
|
731
775
|
mirostat_tau=mirostat_tau,
|
|
732
776
|
mirostat_eta=mirostat_eta,
|
|
733
777
|
logits_processor=logits_processor,
|
|
778
|
+
grammar=grammar,
|
|
734
779
|
)
|
|
735
780
|
if stopping_criteria is not None and stopping_criteria(
|
|
736
781
|
self._input_ids.tolist(), self._scores[-1, :].tolist()
|
|
@@ -833,6 +878,7 @@ class Llama:
|
|
|
833
878
|
model: Optional[str] = None,
|
|
834
879
|
stopping_criteria: Optional[StoppingCriteriaList] = None,
|
|
835
880
|
logits_processor: Optional[LogitsProcessorList] = None,
|
|
881
|
+
grammar: Optional[LlamaGrammar] = None,
|
|
836
882
|
) -> Union[Iterator[Completion], Iterator[CompletionChunk]]:
|
|
837
883
|
assert self.ctx is not None
|
|
838
884
|
|
|
@@ -840,7 +886,7 @@ class Llama:
|
|
|
840
886
|
created: int = int(time.time())
|
|
841
887
|
completion_tokens: List[int] = []
|
|
842
888
|
# Add blank space to start of prompt to match OG llama tokenizer
|
|
843
|
-
prompt_tokens: List[int] = self.tokenize(
|
|
889
|
+
prompt_tokens: List[int] = self.tokenize(prompt.encode("utf-8")) if prompt != "" else [self.token_bos()]
|
|
844
890
|
text: bytes = b""
|
|
845
891
|
returned_tokens: int = 0
|
|
846
892
|
stop = (
|
|
@@ -910,6 +956,7 @@ class Llama:
|
|
|
910
956
|
repeat_penalty=repeat_penalty,
|
|
911
957
|
stopping_criteria=stopping_criteria,
|
|
912
958
|
logits_processor=logits_processor,
|
|
959
|
+
grammar=grammar,
|
|
913
960
|
):
|
|
914
961
|
if token == self._token_eos:
|
|
915
962
|
text = self.detokenize(completion_tokens)
|
|
@@ -960,9 +1007,7 @@ class Llama:
|
|
|
960
1007
|
for token in remaining_tokens:
|
|
961
1008
|
token_end_position += len(self.detokenize([token]))
|
|
962
1009
|
# Check if stop sequence is in the token
|
|
963
|
-
if token_end_position >= (
|
|
964
|
-
remaining_length - first_stop_position
|
|
965
|
-
):
|
|
1010
|
+
if token_end_position >= (remaining_length - first_stop_position):
|
|
966
1011
|
break
|
|
967
1012
|
logprobs_or_none: Optional[CompletionLogprobs] = None
|
|
968
1013
|
if logprobs is not None:
|
|
@@ -1256,6 +1301,7 @@ class Llama:
|
|
|
1256
1301
|
model: Optional[str] = None,
|
|
1257
1302
|
stopping_criteria: Optional[StoppingCriteriaList] = None,
|
|
1258
1303
|
logits_processor: Optional[LogitsProcessorList] = None,
|
|
1304
|
+
grammar: Optional[LlamaGrammar] = None,
|
|
1259
1305
|
) -> Union[Completion, Iterator[CompletionChunk]]:
|
|
1260
1306
|
"""Generate text from a prompt.
|
|
1261
1307
|
|
|
@@ -1300,6 +1346,7 @@ class Llama:
|
|
|
1300
1346
|
model=model,
|
|
1301
1347
|
stopping_criteria=stopping_criteria,
|
|
1302
1348
|
logits_processor=logits_processor,
|
|
1349
|
+
grammar=grammar
|
|
1303
1350
|
)
|
|
1304
1351
|
if stream:
|
|
1305
1352
|
chunks: Iterator[CompletionChunk] = completion_or_chunks
|
|
@@ -1329,6 +1376,7 @@ class Llama:
|
|
|
1329
1376
|
model: Optional[str] = None,
|
|
1330
1377
|
stopping_criteria: Optional[StoppingCriteriaList] = None,
|
|
1331
1378
|
logits_processor: Optional[LogitsProcessorList] = None,
|
|
1379
|
+
grammar: Optional[LlamaGrammar] = None,
|
|
1332
1380
|
) -> Union[Completion, Iterator[CompletionChunk]]:
|
|
1333
1381
|
"""Generate text from a prompt.
|
|
1334
1382
|
|
|
@@ -1373,6 +1421,7 @@ class Llama:
|
|
|
1373
1421
|
model=model,
|
|
1374
1422
|
stopping_criteria=stopping_criteria,
|
|
1375
1423
|
logits_processor=logits_processor,
|
|
1424
|
+
grammar=grammar,
|
|
1376
1425
|
)
|
|
1377
1426
|
|
|
1378
1427
|
def _convert_text_completion_to_chat(
|
|
@@ -1453,6 +1502,7 @@ class Llama:
|
|
|
1453
1502
|
mirostat_eta: float = 0.1,
|
|
1454
1503
|
model: Optional[str] = None,
|
|
1455
1504
|
logits_processor: Optional[LogitsProcessorList] = None,
|
|
1505
|
+
grammar: Optional[LlamaGrammar] = None,
|
|
1456
1506
|
) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
|
|
1457
1507
|
"""Generate a chat completion from a list of messages.
|
|
1458
1508
|
|
|
@@ -1495,6 +1545,7 @@ class Llama:
|
|
|
1495
1545
|
mirostat_eta=mirostat_eta,
|
|
1496
1546
|
model=model,
|
|
1497
1547
|
logits_processor=logits_processor,
|
|
1548
|
+
grammar=grammar,
|
|
1498
1549
|
)
|
|
1499
1550
|
if stream:
|
|
1500
1551
|
chunks: Iterator[CompletionChunk] = completion_or_chunks # type: ignore
|
|
@@ -1504,10 +1555,10 @@ class Llama:
|
|
|
1504
1555
|
return self._convert_text_completion_to_chat(completion)
|
|
1505
1556
|
|
|
1506
1557
|
def __del__(self):
|
|
1507
|
-
if self.model is not None:
|
|
1558
|
+
if hasattr(self, "model") and self.model is not None:
|
|
1508
1559
|
llama_cpp.llama_free_model(self.model)
|
|
1509
1560
|
self.model = None
|
|
1510
|
-
if self.ctx is not None:
|
|
1561
|
+
if hasattr(self, "ctx") and self.ctx is not None:
|
|
1511
1562
|
llama_cpp.llama_free(self.ctx)
|
|
1512
1563
|
self.ctx = None
|
|
1513
1564
|
|
|
@@ -1531,13 +1582,7 @@ class Llama:
|
|
|
1531
1582
|
lora_base=self.lora_base,
|
|
1532
1583
|
lora_path=self.lora_path,
|
|
1533
1584
|
tensor_split=self.tensor_split,
|
|
1534
|
-
|
|
1535
|
-
n_gqa=self.params.n_gqa,
|
|
1536
|
-
rms_norm_eps=self.params.rms_norm_eps,
|
|
1537
|
-
### TEMPORARY ###
|
|
1538
|
-
### DEPRECATED ###
|
|
1539
|
-
n_parts=self.n_parts,
|
|
1540
|
-
### DEPRECATED ###
|
|
1585
|
+
mul_mat_q=self.params.mul_mat_q,
|
|
1541
1586
|
)
|
|
1542
1587
|
|
|
1543
1588
|
def __setstate__(self, state):
|
|
@@ -1559,14 +1604,8 @@ class Llama:
|
|
|
1559
1604
|
lora_base=state["lora_base"],
|
|
1560
1605
|
lora_path=state["lora_path"],
|
|
1561
1606
|
tensor_split=state["tensor_split"],
|
|
1607
|
+
mul_mat_q=state["mul_mat_q"],
|
|
1562
1608
|
verbose=state["verbose"],
|
|
1563
|
-
### TEMPORARY ###
|
|
1564
|
-
n_gqa=state["n_gqa"],
|
|
1565
|
-
rms_norm_eps=state["rms_norm_eps"],
|
|
1566
|
-
### TEMPORARY ###
|
|
1567
|
-
### DEPRECATED ###
|
|
1568
|
-
n_parts=state["n_parts"],
|
|
1569
|
-
### DEPRECATED ###
|
|
1570
1609
|
)
|
|
1571
1610
|
|
|
1572
1611
|
def save_state(self) -> LlamaState:
|
|
@@ -1631,20 +1670,20 @@ class Llama:
|
|
|
1631
1670
|
assert self.ctx is not None
|
|
1632
1671
|
return LlamaTokenizer(self)
|
|
1633
1672
|
|
|
1634
|
-
|
|
1635
|
-
def token_eos() -> int:
|
|
1673
|
+
def token_eos(self) -> int:
|
|
1636
1674
|
"""Return the end-of-sequence token."""
|
|
1637
|
-
|
|
1675
|
+
assert self.ctx is not None
|
|
1676
|
+
return llama_cpp.llama_token_eos(self.ctx)
|
|
1638
1677
|
|
|
1639
|
-
|
|
1640
|
-
def token_bos() -> int:
|
|
1678
|
+
def token_bos(self) -> int:
|
|
1641
1679
|
"""Return the beginning-of-sequence token."""
|
|
1642
|
-
|
|
1680
|
+
assert self.ctx is not None
|
|
1681
|
+
return llama_cpp.llama_token_bos(self.ctx)
|
|
1643
1682
|
|
|
1644
|
-
|
|
1645
|
-
def token_nl() -> int:
|
|
1683
|
+
def token_nl(self) -> int:
|
|
1646
1684
|
"""Return the newline token."""
|
|
1647
|
-
|
|
1685
|
+
assert self.ctx is not None
|
|
1686
|
+
return llama_cpp.llama_token_nl(self.ctx)
|
|
1648
1687
|
|
|
1649
1688
|
@staticmethod
|
|
1650
1689
|
def logits_to_logprobs(logits: List[float]) -> List[float]:
|