llama-cpp-python 0.1.48__tar.gz → 0.1.50__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/PKG-INFO +5 -5
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/README.md +4 -4
- llama_cpp_python-0.1.50/examples/high_level_api/fastapi_server.py +37 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/llama_cpp/llama.py +114 -14
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/llama_cpp/llama_cpp.py +6 -5
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/llama_cpp/server/app.py +29 -5
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/llama_cpp_python.egg-info/PKG-INFO +5 -5
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/llama_cpp_python.egg-info/SOURCES.txt +10 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/poetry.lock +4 -4
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/pyproject.toml +2 -2
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/setup.py +1 -1
- llama_cpp_python-0.1.50/vendor/llama.cpp/.clang-tidy +18 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/.github/workflows/build.yml +65 -8
- llama_cpp_python-0.1.50/vendor/llama.cpp/.github/workflows/tidy-post.yml +20 -0
- llama_cpp_python-0.1.50/vendor/llama.cpp/.github/workflows/tidy-review.yml +23 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/.gitignore +3 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/Makefile +9 -8
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/README.md +71 -33
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/SHA256SUMS +16 -12
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/convert.py +7 -3
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/examples/CMakeLists.txt +1 -0
- llama_cpp_python-0.1.50/vendor/llama.cpp/examples/baby-llama/CMakeLists.txt +4 -0
- llama_cpp_python-0.1.50/vendor/llama.cpp/examples/baby-llama/baby-llama.cpp +1687 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/examples/common.cpp +392 -80
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/examples/common.h +28 -13
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/examples/embedding/embedding.cpp +0 -3
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/examples/main/README.md +2 -2
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/examples/main/main.cpp +34 -52
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/examples/perplexity/perplexity.cpp +47 -23
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/examples/quantize/quantize.cpp +5 -6
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/ggml-cuda.cu +291 -109
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/ggml-cuda.h +2 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/ggml-opencl.c +85 -122
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/ggml.c +3835 -2067
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/ggml.h +201 -11
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/llama.cpp +156 -75
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/llama.h +7 -6
- llama_cpp_python-0.1.50/vendor/llama.cpp/prompts/dan-modified.txt +1 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/prompts/dan.txt +1 -1
- llama_cpp_python-0.1.50/vendor/llama.cpp/scripts/perf-run-all.sh +93 -0
- llama_cpp_python-0.1.50/vendor/llama.cpp/scripts/ppl-run-all.sh +39 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/spm-headers/llama.h +7 -6
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/tests/CMakeLists.txt +2 -0
- llama_cpp_python-0.1.50/vendor/llama.cpp/tests/test-grad0.c +1131 -0
- llama_cpp_python-0.1.50/vendor/llama.cpp/tests/test-opt.c +205 -0
- llama_cpp_python-0.1.48/examples/high_level_api/fastapi_server.py +0 -262
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/.dockerignore +0 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/.github/ISSUE_TEMPLATE/bug_report.md +0 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/.github/ISSUE_TEMPLATE/feature_request.md +0 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/.github/dependabot.yml +0 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/.github/workflows/build-and-release.yaml +0 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/.github/workflows/build-docker.yaml +0 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/.github/workflows/publish-to-test.yaml +0 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/.github/workflows/publish.yaml +0 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/.github/workflows/test.yaml +0 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/.gitignore +0 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/.gitmodules +0 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/CMakeLists.txt +0 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/Dockerfile +0 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/Dockerfile.cuda +0 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/LICENSE.md +0 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/docs/index.md +0 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/examples/high_level_api/high_level_api_embedding.py +0 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/examples/high_level_api/high_level_api_inference.py +0 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/examples/high_level_api/high_level_api_streaming.py +0 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/examples/high_level_api/langchain_custom_llm.py +0 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/examples/low_level_api/Chat.py +0 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/examples/low_level_api/Miku.py +0 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/examples/low_level_api/ReasonAct.py +0 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/examples/low_level_api/common.py +0 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/examples/low_level_api/low_level_api_chat_cpp.py +0 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/examples/low_level_api/low_level_api_llama_cpp.py +0 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/examples/low_level_api/quantize.py +0 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/examples/low_level_api/util.py +0 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/examples/notebooks/Clients.ipynb +0 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/examples/notebooks/PerformanceTuning.ipynb +0 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/llama_cpp/__init__.py +0 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/llama_cpp/llama_types.py +0 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/llama_cpp/server/__init__.py +0 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/llama_cpp/server/__main__.py +0 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/llama_cpp_python.egg-info/dependency_links.txt +0 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/llama_cpp_python.egg-info/requires.txt +0 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/llama_cpp_python.egg-info/top_level.txt +0 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/mkdocs.yml +0 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/setup.cfg +0 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/tests/test_llama.py +0 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/.devops/full.Dockerfile +0 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/.devops/main.Dockerfile +0 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/.devops/tools.sh +0 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/.dockerignore +0 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/.ecrc +0 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/.editorconfig +0 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/.github/ISSUE_TEMPLATE/custom.md +0 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/.github/workflows/docker.yml +0 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/.github/workflows/editorconfig.yml +0 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/CMakeLists.txt +0 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/LICENSE +0 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/Package.swift +0 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/build.zig +0 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/convert-lora-to-ggml.py +0 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/convert-pth-to-ggml.py +0 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/examples/Miku.sh +0 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/examples/alpaca.sh +0 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/examples/benchmark/CMakeLists.txt +0 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/examples/benchmark/benchmark-matmult.cpp +0 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/examples/chat-13B.bat +0 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/examples/chat-13B.sh +0 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/examples/chat.sh +0 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/examples/embedding/CMakeLists.txt +0 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/examples/embedding/README.md +0 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/examples/gpt4all.sh +0 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/examples/jeopardy/README.md +0 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/examples/jeopardy/graph.py +0 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/examples/jeopardy/jeopardy.sh +0 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/examples/jeopardy/qasheet.csv +0 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/examples/jeopardy/questions.txt +0 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/examples/main/CMakeLists.txt +0 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/examples/perplexity/CMakeLists.txt +0 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/examples/perplexity/README.md +0 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/examples/quantize/CMakeLists.txt +0 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/examples/quantize/README.md +0 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/examples/quantize-stats/CMakeLists.txt +0 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/examples/quantize-stats/quantize-stats.cpp +0 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/examples/reason-act.sh +0 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/examples/save-load-state/CMakeLists.txt +0 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/examples/save-load-state/save-load-state.cpp +0 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/flake.lock +0 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/flake.nix +0 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/ggml-opencl.h +0 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/llama-util.h +0 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/media/llama-leader.jpeg +0 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/media/llama0-banner.png +0 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/media/llama0-logo.png +0 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/media/llama1-banner.png +0 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/media/llama1-logo.png +0 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/models/ggml-vocab.bin +0 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/pocs/CMakeLists.txt +0 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/pocs/vdot/CMakeLists.txt +0 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/pocs/vdot/q8dot.cpp +0 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/pocs/vdot/vdot.cpp +0 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/prompts/alpaca.txt +0 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/prompts/chat-with-bob.txt +0 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/prompts/chat-with-vicuna-v0.txt +0 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/prompts/chat-with-vicuna-v1.txt +0 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/prompts/chat.txt +0 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/prompts/reason-act.txt +0 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/requirements.txt +0 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/scripts/build-info.cmake +0 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/scripts/build-info.h.in +0 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/scripts/build-info.sh +0 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/scripts/sync-ggml.sh +0 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/scripts/verify-checksum-models.py +0 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/tests/test-double-float.c +0 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/tests/test-quantize-fns.cpp +0 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/tests/test-quantize-perf.cpp +0 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/tests/test-sampling.cpp +0 -0
- {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/tests/test-tokenizer-0.cpp +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: llama_cpp_python
|
|
3
|
-
Version: 0.1.
|
|
3
|
+
Version: 0.1.50
|
|
4
4
|
Summary: A Python wrapper for llama.cpp
|
|
5
5
|
Author: Andrei Betlen
|
|
6
6
|
Author-email: abetlen@gmail.com
|
|
@@ -53,19 +53,19 @@ Use the `FORCE_CMAKE=1` environment variable to force the use of `cmake` and ins
|
|
|
53
53
|
To install with OpenBLAS, set the `LLAMA_OPENBLAS=1` environment variable before installing:
|
|
54
54
|
|
|
55
55
|
```bash
|
|
56
|
-
|
|
56
|
+
CMAKE_ARGS="-DLLAMA_OPENBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python
|
|
57
57
|
```
|
|
58
58
|
|
|
59
59
|
To install with cuBLAS, set the `LLAMA_CUBLAS=1` environment variable before installing:
|
|
60
60
|
|
|
61
61
|
```bash
|
|
62
|
-
|
|
62
|
+
CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python
|
|
63
63
|
```
|
|
64
64
|
|
|
65
65
|
To install with CLBlast, set the `LLAMA_CLBLAST=1` environment variable before installing:
|
|
66
66
|
|
|
67
67
|
```bash
|
|
68
|
-
|
|
68
|
+
CMAKE_ARGS="-DLLAMA_CLBLAST=on" FORCE_CMAKE=1 pip install llama-cpp-python
|
|
69
69
|
```
|
|
70
70
|
|
|
71
71
|
|
|
@@ -120,7 +120,7 @@ Navigate to [http://localhost:8000/docs](http://localhost:8000/docs) to see the
|
|
|
120
120
|
A Docker image is available on [GHCR](https://ghcr.io/abetlen/llama-cpp-python). To run the server:
|
|
121
121
|
|
|
122
122
|
```bash
|
|
123
|
-
docker run --rm -it -
|
|
123
|
+
docker run --rm -it -p 8000:8000 -v /path/to/models:/models -e MODEL=/models/ggml-model-name.bin ghcr.io/abetlen/llama-cpp-python:latest
|
|
124
124
|
```
|
|
125
125
|
|
|
126
126
|
## Low-level API
|
|
@@ -35,19 +35,19 @@ Use the `FORCE_CMAKE=1` environment variable to force the use of `cmake` and ins
|
|
|
35
35
|
To install with OpenBLAS, set the `LLAMA_OPENBLAS=1` environment variable before installing:
|
|
36
36
|
|
|
37
37
|
```bash
|
|
38
|
-
|
|
38
|
+
CMAKE_ARGS="-DLLAMA_OPENBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python
|
|
39
39
|
```
|
|
40
40
|
|
|
41
41
|
To install with cuBLAS, set the `LLAMA_CUBLAS=1` environment variable before installing:
|
|
42
42
|
|
|
43
43
|
```bash
|
|
44
|
-
|
|
44
|
+
CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python
|
|
45
45
|
```
|
|
46
46
|
|
|
47
47
|
To install with CLBlast, set the `LLAMA_CLBLAST=1` environment variable before installing:
|
|
48
48
|
|
|
49
49
|
```bash
|
|
50
|
-
|
|
50
|
+
CMAKE_ARGS="-DLLAMA_CLBLAST=on" FORCE_CMAKE=1 pip install llama-cpp-python
|
|
51
51
|
```
|
|
52
52
|
|
|
53
53
|
|
|
@@ -102,7 +102,7 @@ Navigate to [http://localhost:8000/docs](http://localhost:8000/docs) to see the
|
|
|
102
102
|
A Docker image is available on [GHCR](https://ghcr.io/abetlen/llama-cpp-python). To run the server:
|
|
103
103
|
|
|
104
104
|
```bash
|
|
105
|
-
docker run --rm -it -
|
|
105
|
+
docker run --rm -it -p 8000:8000 -v /path/to/models:/models -e MODEL=/models/ggml-model-name.bin ghcr.io/abetlen/llama-cpp-python:latest
|
|
106
106
|
```
|
|
107
107
|
|
|
108
108
|
## Low-level API
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
"""Example FastAPI server for llama.cpp.
|
|
2
|
+
|
|
3
|
+
To run this example:
|
|
4
|
+
|
|
5
|
+
```bash
|
|
6
|
+
pip install fastapi uvicorn sse-starlette
|
|
7
|
+
export MODEL=../models/7B/...
|
|
8
|
+
```
|
|
9
|
+
|
|
10
|
+
Then run:
|
|
11
|
+
```
|
|
12
|
+
uvicorn llama_cpp.server.app:app --reload
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
or
|
|
16
|
+
|
|
17
|
+
```
|
|
18
|
+
python3 -m llama_cpp.server
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
Then visit http://localhost:8000/docs to see the interactive API docs.
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
To actually see the implementation of the server, see llama_cpp/server/app.py
|
|
25
|
+
|
|
26
|
+
"""
|
|
27
|
+
import os
|
|
28
|
+
import uvicorn
|
|
29
|
+
|
|
30
|
+
from llama_cpp.server.app import create_app
|
|
31
|
+
|
|
32
|
+
if __name__ == "__main__":
|
|
33
|
+
app = create_app()
|
|
34
|
+
|
|
35
|
+
uvicorn.run(
|
|
36
|
+
app, host=os.getenv("HOST", "localhost"), port=int(os.getenv("PORT", 8000))
|
|
37
|
+
)
|
|
@@ -83,6 +83,7 @@ class Llama:
|
|
|
83
83
|
# NOTE: These parameters are likely to change in the future.
|
|
84
84
|
n_ctx: int = 512,
|
|
85
85
|
n_parts: int = -1,
|
|
86
|
+
n_gpu_layers: int = 0,
|
|
86
87
|
seed: int = 1337,
|
|
87
88
|
f16_kv: bool = True,
|
|
88
89
|
logits_all: bool = False,
|
|
@@ -129,6 +130,7 @@ class Llama:
|
|
|
129
130
|
self.params = llama_cpp.llama_context_default_params()
|
|
130
131
|
self.params.n_ctx = n_ctx
|
|
131
132
|
self.params.n_parts = n_parts
|
|
133
|
+
self.params.n_gpu_layers = n_gpu_layers
|
|
132
134
|
self.params.seed = seed
|
|
133
135
|
self.params.f16_kv = f16_kv
|
|
134
136
|
self.params.logits_all = logits_all
|
|
@@ -174,7 +176,9 @@ class Llama:
|
|
|
174
176
|
if self.verbose:
|
|
175
177
|
print(llama_cpp.llama_print_system_info().decode("utf-8"), file=sys.stderr)
|
|
176
178
|
|
|
177
|
-
def tokenize(
|
|
179
|
+
def tokenize(
|
|
180
|
+
self, text: bytes, add_bos: bool = True
|
|
181
|
+
) -> List[llama_cpp.llama_token]:
|
|
178
182
|
"""Tokenize a string.
|
|
179
183
|
|
|
180
184
|
Args:
|
|
@@ -194,10 +198,22 @@ class Llama:
|
|
|
194
198
|
text,
|
|
195
199
|
tokens,
|
|
196
200
|
n_ctx,
|
|
197
|
-
llama_cpp.c_bool(
|
|
201
|
+
llama_cpp.c_bool(add_bos),
|
|
198
202
|
)
|
|
199
203
|
if int(n_tokens) < 0:
|
|
200
|
-
|
|
204
|
+
n_tokens = abs(n_tokens)
|
|
205
|
+
tokens = (llama_cpp.llama_token * int(n_tokens))()
|
|
206
|
+
n_tokens = llama_cpp.llama_tokenize(
|
|
207
|
+
self.ctx,
|
|
208
|
+
text,
|
|
209
|
+
tokens,
|
|
210
|
+
llama_cpp.c_int(n_tokens),
|
|
211
|
+
llama_cpp.c_bool(add_bos),
|
|
212
|
+
)
|
|
213
|
+
if n_tokens < 0:
|
|
214
|
+
raise RuntimeError(
|
|
215
|
+
f'Failed to tokenize: text="{text}" n_tokens={n_tokens}'
|
|
216
|
+
)
|
|
201
217
|
return list(tokens[:n_tokens])
|
|
202
218
|
|
|
203
219
|
def detokenize(self, tokens: List[llama_cpp.llama_token]) -> bytes:
|
|
@@ -268,9 +284,13 @@ class Llama:
|
|
|
268
284
|
top_k: llama_cpp.c_int,
|
|
269
285
|
top_p: llama_cpp.c_float,
|
|
270
286
|
temp: llama_cpp.c_float,
|
|
287
|
+
tfs_z: llama_cpp.c_float,
|
|
271
288
|
repeat_penalty: llama_cpp.c_float,
|
|
272
289
|
frequency_penalty: llama_cpp.c_float,
|
|
273
290
|
presence_penalty: llama_cpp.c_float,
|
|
291
|
+
mirostat_mode: llama_cpp.c_int,
|
|
292
|
+
mirostat_tau: llama_cpp.c_float,
|
|
293
|
+
mirostat_eta: llama_cpp.c_float,
|
|
274
294
|
):
|
|
275
295
|
assert self.ctx is not None
|
|
276
296
|
assert len(self.eval_logits) > 0
|
|
@@ -308,11 +328,41 @@ class Llama:
|
|
|
308
328
|
alpha_frequency=frequency_penalty,
|
|
309
329
|
alpha_presence=presence_penalty,
|
|
310
330
|
)
|
|
311
|
-
if
|
|
331
|
+
if temp.value == 0.0:
|
|
312
332
|
return llama_cpp.llama_sample_token_greedy(
|
|
313
333
|
ctx=self.ctx,
|
|
314
334
|
candidates=llama_cpp.ctypes.byref(candidates), # type: ignore
|
|
315
335
|
)
|
|
336
|
+
elif mirostat_mode.value == 1:
|
|
337
|
+
mirostat_mu = llama_cpp.c_float(2.0 * mirostat_tau.value)
|
|
338
|
+
mirostat_m = llama_cpp.c_int(100)
|
|
339
|
+
llama_cpp.llama_sample_temperature(
|
|
340
|
+
ctx=self.ctx,
|
|
341
|
+
candidates=llama_cpp.ctypes.byref(candidates), # type: ignore
|
|
342
|
+
temp=temp,
|
|
343
|
+
)
|
|
344
|
+
return llama_cpp.llama_sample_token_mirostat(
|
|
345
|
+
ctx=self.ctx,
|
|
346
|
+
candidates=llama_cpp.ctypes.byref(candidates), # type: ignore
|
|
347
|
+
tau=mirostat_tau,
|
|
348
|
+
eta=mirostat_eta,
|
|
349
|
+
mu=llama_cpp.ctypes.byref(mirostat_mu), # type: ignore
|
|
350
|
+
m=mirostat_m,
|
|
351
|
+
)
|
|
352
|
+
elif mirostat_mode.value == 2:
|
|
353
|
+
mirostat_mu = llama_cpp.c_float(2.0 * mirostat_tau.value)
|
|
354
|
+
llama_cpp.llama_sample_temperature(
|
|
355
|
+
ctx=self.ctx,
|
|
356
|
+
candidates=llama_cpp.ctypes.pointer(candidates),
|
|
357
|
+
temp=temp,
|
|
358
|
+
)
|
|
359
|
+
return llama_cpp.llama_sample_token_mirostat_v2(
|
|
360
|
+
ctx=self.ctx,
|
|
361
|
+
candidates=llama_cpp.ctypes.byref(candidates), # type: ignore
|
|
362
|
+
tau=mirostat_tau,
|
|
363
|
+
eta=mirostat_eta,
|
|
364
|
+
mu=llama_cpp.ctypes.byref(mirostat_mu), # type: ignore
|
|
365
|
+
)
|
|
316
366
|
else:
|
|
317
367
|
llama_cpp.llama_sample_top_k(
|
|
318
368
|
ctx=self.ctx,
|
|
@@ -323,7 +373,7 @@ class Llama:
|
|
|
323
373
|
llama_cpp.llama_sample_tail_free(
|
|
324
374
|
ctx=self.ctx,
|
|
325
375
|
candidates=llama_cpp.ctypes.byref(candidates), # type: ignore
|
|
326
|
-
z=
|
|
376
|
+
z=tfs_z,
|
|
327
377
|
min_keep=llama_cpp.c_size_t(1),
|
|
328
378
|
)
|
|
329
379
|
llama_cpp.llama_sample_typical(
|
|
@@ -350,12 +400,16 @@ class Llama:
|
|
|
350
400
|
|
|
351
401
|
def sample(
|
|
352
402
|
self,
|
|
353
|
-
top_k: int,
|
|
354
|
-
top_p: float,
|
|
355
|
-
temp: float,
|
|
356
|
-
repeat_penalty: float,
|
|
403
|
+
top_k: int = 40,
|
|
404
|
+
top_p: float = 0.95,
|
|
405
|
+
temp: float = 0.80,
|
|
406
|
+
repeat_penalty: float = 1.1,
|
|
357
407
|
frequency_penalty: float = 0.0,
|
|
358
408
|
presence_penalty: float = 0.0,
|
|
409
|
+
tfs_z: float = 1.0,
|
|
410
|
+
mirostat_mode: int = 0,
|
|
411
|
+
mirostat_eta: float = 0.1,
|
|
412
|
+
mirostat_tau: float = 5.0,
|
|
359
413
|
):
|
|
360
414
|
"""Sample a token from the model.
|
|
361
415
|
|
|
@@ -380,9 +434,13 @@ class Llama:
|
|
|
380
434
|
top_k=llama_cpp.c_int(top_k),
|
|
381
435
|
top_p=llama_cpp.c_float(top_p),
|
|
382
436
|
temp=llama_cpp.c_float(temp),
|
|
437
|
+
tfs_z=llama_cpp.c_float(tfs_z),
|
|
383
438
|
repeat_penalty=llama_cpp.c_float(repeat_penalty),
|
|
384
439
|
frequency_penalty=llama_cpp.c_float(frequency_penalty),
|
|
385
440
|
presence_penalty=llama_cpp.c_float(presence_penalty),
|
|
441
|
+
mirostat_mode=llama_cpp.c_int(mirostat_mode),
|
|
442
|
+
mirostat_tau=llama_cpp.c_float(mirostat_tau),
|
|
443
|
+
mirostat_eta=llama_cpp.c_float(mirostat_eta),
|
|
386
444
|
)
|
|
387
445
|
|
|
388
446
|
def generate(
|
|
@@ -392,9 +450,13 @@ class Llama:
|
|
|
392
450
|
top_p: float,
|
|
393
451
|
temp: float,
|
|
394
452
|
repeat_penalty: float,
|
|
453
|
+
reset: bool = True,
|
|
395
454
|
frequency_penalty: float = 0.0,
|
|
396
455
|
presence_penalty: float = 0.0,
|
|
397
|
-
|
|
456
|
+
tfs_z: float = 1.0,
|
|
457
|
+
mirostat_mode: int = 0,
|
|
458
|
+
mirostat_tau: float = 5.0,
|
|
459
|
+
mirostat_eta: float = 0.1,
|
|
398
460
|
) -> Generator[
|
|
399
461
|
llama_cpp.llama_token, Optional[Sequence[llama_cpp.llama_token]], None
|
|
400
462
|
]:
|
|
@@ -447,9 +509,13 @@ class Llama:
|
|
|
447
509
|
top_k=top_k,
|
|
448
510
|
top_p=top_p,
|
|
449
511
|
temp=temp,
|
|
512
|
+
repeat_penalty=repeat_penalty,
|
|
450
513
|
frequency_penalty=frequency_penalty,
|
|
451
514
|
presence_penalty=presence_penalty,
|
|
452
|
-
|
|
515
|
+
tfs_z=tfs_z,
|
|
516
|
+
mirostat_mode=mirostat_mode,
|
|
517
|
+
mirostat_tau=mirostat_tau,
|
|
518
|
+
mirostat_eta=mirostat_eta,
|
|
453
519
|
)
|
|
454
520
|
tokens_or_none = yield token
|
|
455
521
|
tokens = [token]
|
|
@@ -528,6 +594,10 @@ class Llama:
|
|
|
528
594
|
repeat_penalty: float = 1.1,
|
|
529
595
|
top_k: int = 40,
|
|
530
596
|
stream: bool = False,
|
|
597
|
+
tfs_z: float = 1.0,
|
|
598
|
+
mirostat_mode: int = 0,
|
|
599
|
+
mirostat_tau: float = 5.0,
|
|
600
|
+
mirostat_eta: float = 0.1,
|
|
531
601
|
) -> Union[Iterator[Completion], Iterator[CompletionChunk]]:
|
|
532
602
|
assert self.ctx is not None
|
|
533
603
|
completion_id: str = f"cmpl-{str(uuid.uuid4())}"
|
|
@@ -583,6 +653,10 @@ class Llama:
|
|
|
583
653
|
top_k=top_k,
|
|
584
654
|
top_p=top_p,
|
|
585
655
|
temp=temperature,
|
|
656
|
+
tfs_z=tfs_z,
|
|
657
|
+
mirostat_mode=mirostat_mode,
|
|
658
|
+
mirostat_tau=mirostat_tau,
|
|
659
|
+
mirostat_eta=mirostat_eta,
|
|
586
660
|
frequency_penalty=frequency_penalty,
|
|
587
661
|
presence_penalty=presence_penalty,
|
|
588
662
|
repeat_penalty=repeat_penalty,
|
|
@@ -655,6 +729,9 @@ class Llama:
|
|
|
655
729
|
print("Llama._create_completion: cache save", file=sys.stderr)
|
|
656
730
|
self.cache[prompt_tokens + completion_tokens] = self.save_state()
|
|
657
731
|
|
|
732
|
+
if self.verbose:
|
|
733
|
+
llama_cpp.llama_print_timings(self.ctx)
|
|
734
|
+
|
|
658
735
|
if stream:
|
|
659
736
|
yield {
|
|
660
737
|
"id": completion_id,
|
|
@@ -726,9 +803,6 @@ class Llama:
|
|
|
726
803
|
"top_logprobs": top_logprobs,
|
|
727
804
|
}
|
|
728
805
|
|
|
729
|
-
if self.verbose:
|
|
730
|
-
llama_cpp.llama_print_timings(self.ctx)
|
|
731
|
-
|
|
732
806
|
yield {
|
|
733
807
|
"id": completion_id,
|
|
734
808
|
"object": "text_completion",
|
|
@@ -764,6 +838,10 @@ class Llama:
|
|
|
764
838
|
repeat_penalty: float = 1.1,
|
|
765
839
|
top_k: int = 40,
|
|
766
840
|
stream: bool = False,
|
|
841
|
+
tfs_z: float = 1.0,
|
|
842
|
+
mirostat_mode: int = 0,
|
|
843
|
+
mirostat_tau: float = 5.0,
|
|
844
|
+
mirostat_eta: float = 0.1,
|
|
767
845
|
) -> Union[Completion, Iterator[CompletionChunk]]:
|
|
768
846
|
"""Generate text from a prompt.
|
|
769
847
|
|
|
@@ -801,6 +879,10 @@ class Llama:
|
|
|
801
879
|
repeat_penalty=repeat_penalty,
|
|
802
880
|
top_k=top_k,
|
|
803
881
|
stream=stream,
|
|
882
|
+
tfs_z=tfs_z,
|
|
883
|
+
mirostat_mode=mirostat_mode,
|
|
884
|
+
mirostat_tau=mirostat_tau,
|
|
885
|
+
mirostat_eta=mirostat_eta,
|
|
804
886
|
)
|
|
805
887
|
if stream:
|
|
806
888
|
chunks: Iterator[CompletionChunk] = completion_or_chunks
|
|
@@ -823,6 +905,10 @@ class Llama:
|
|
|
823
905
|
repeat_penalty: float = 1.1,
|
|
824
906
|
top_k: int = 40,
|
|
825
907
|
stream: bool = False,
|
|
908
|
+
tfs_z: float = 1.0,
|
|
909
|
+
mirostat_mode: int = 0,
|
|
910
|
+
mirostat_tau: float = 5.0,
|
|
911
|
+
mirostat_eta: float = 0.1,
|
|
826
912
|
) -> Union[Completion, Iterator[CompletionChunk]]:
|
|
827
913
|
"""Generate text from a prompt.
|
|
828
914
|
|
|
@@ -860,6 +946,10 @@ class Llama:
|
|
|
860
946
|
repeat_penalty=repeat_penalty,
|
|
861
947
|
top_k=top_k,
|
|
862
948
|
stream=stream,
|
|
949
|
+
tfs_z=tfs_z,
|
|
950
|
+
mirostat_mode=mirostat_mode,
|
|
951
|
+
mirostat_tau=mirostat_tau,
|
|
952
|
+
mirostat_eta=mirostat_eta,
|
|
863
953
|
)
|
|
864
954
|
|
|
865
955
|
def _convert_text_completion_to_chat(
|
|
@@ -932,6 +1022,10 @@ class Llama:
|
|
|
932
1022
|
presence_penalty: float = 0.0,
|
|
933
1023
|
frequency_penalty: float = 0.0,
|
|
934
1024
|
repeat_penalty: float = 1.1,
|
|
1025
|
+
tfs_z: float = 1.0,
|
|
1026
|
+
mirostat_mode: int = 0,
|
|
1027
|
+
mirostat_tau: float = 5.0,
|
|
1028
|
+
mirostat_eta: float = 0.1,
|
|
935
1029
|
) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
|
|
936
1030
|
"""Generate a chat completion from a list of messages.
|
|
937
1031
|
|
|
@@ -966,6 +1060,10 @@ class Llama:
|
|
|
966
1060
|
repeat_penalty=repeat_penalty,
|
|
967
1061
|
presence_penalty=presence_penalty,
|
|
968
1062
|
frequency_penalty=frequency_penalty,
|
|
1063
|
+
tfs_z=tfs_z,
|
|
1064
|
+
mirostat_mode=mirostat_mode,
|
|
1065
|
+
mirostat_tau=mirostat_tau,
|
|
1066
|
+
mirostat_eta=mirostat_eta,
|
|
969
1067
|
)
|
|
970
1068
|
if stream:
|
|
971
1069
|
chunks: Iterator[CompletionChunk] = completion_or_chunks # type: ignore
|
|
@@ -985,6 +1083,7 @@ class Llama:
|
|
|
985
1083
|
model_path=self.model_path,
|
|
986
1084
|
n_ctx=self.params.n_ctx,
|
|
987
1085
|
n_parts=self.params.n_parts,
|
|
1086
|
+
n_gpu_layers=self.params.n_gpu_layers,
|
|
988
1087
|
seed=self.params.seed,
|
|
989
1088
|
f16_kv=self.params.f16_kv,
|
|
990
1089
|
logits_all=self.params.logits_all,
|
|
@@ -1004,6 +1103,7 @@ class Llama:
|
|
|
1004
1103
|
model_path=state["model_path"],
|
|
1005
1104
|
n_ctx=state["n_ctx"],
|
|
1006
1105
|
n_parts=state["n_parts"],
|
|
1106
|
+
n_gpu_layers=state["n_gpu_layers"],
|
|
1007
1107
|
seed=state["seed"],
|
|
1008
1108
|
f16_kv=state["f16_kv"],
|
|
1009
1109
|
logits_all=state["logits_all"],
|
|
@@ -68,7 +68,7 @@ _lib_base_name = "llama"
|
|
|
68
68
|
_lib = _load_shared_library(_lib_base_name)
|
|
69
69
|
|
|
70
70
|
# C types
|
|
71
|
-
LLAMA_FILE_VERSION = c_int(
|
|
71
|
+
LLAMA_FILE_VERSION = c_int(2)
|
|
72
72
|
LLAMA_FILE_MAGIC = b"ggjt"
|
|
73
73
|
LLAMA_FILE_MAGIC_UNVERSIONED = b"ggml"
|
|
74
74
|
LLAMA_SESSION_MAGIC = b"ggsn"
|
|
@@ -109,6 +109,7 @@ class llama_context_params(Structure):
|
|
|
109
109
|
_fields_ = [
|
|
110
110
|
("n_ctx", c_int), # text context
|
|
111
111
|
("n_parts", c_int), # -1 for default
|
|
112
|
+
("n_gpu_layers", c_int), # number of layers to store in VRAM
|
|
112
113
|
("seed", c_int), # RNG seed, 0 for random
|
|
113
114
|
("f16_kv", c_bool), # use fp16 for KV cache
|
|
114
115
|
(
|
|
@@ -135,7 +136,7 @@ LLAMA_FTYPE_MOSTLY_Q4_1 = c_int(3) # except 1d tensors
|
|
|
135
136
|
LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = c_int(
|
|
136
137
|
4
|
|
137
138
|
) # tok_embeddings.weight and output.weight are F16
|
|
138
|
-
LLAMA_FTYPE_MOSTLY_Q4_2 = c_int(5) # except 1d tensors
|
|
139
|
+
# LLAMA_FTYPE_MOSTLY_Q4_2 = c_int(5) # except 1d tensors
|
|
139
140
|
# LLAMA_FTYPE_MOSTYL_Q4_3 = c_int(6) # except 1d tensors
|
|
140
141
|
LLAMA_FTYPE_MOSTLY_Q8_0 = c_int(7) # except 1d tensors
|
|
141
142
|
LLAMA_FTYPE_MOSTLY_Q5_0 = c_int(8) # except 1d tensors
|
|
@@ -259,9 +260,9 @@ _lib.llama_get_state_size.restype = c_size_t
|
|
|
259
260
|
# Destination needs to have allocated enough memory.
|
|
260
261
|
# Returns the number of bytes copied
|
|
261
262
|
def llama_copy_state_data(
|
|
262
|
-
ctx: llama_context_p,
|
|
263
|
+
ctx: llama_context_p, dst # type: Array[c_uint8]
|
|
263
264
|
) -> int:
|
|
264
|
-
return _lib.llama_copy_state_data(ctx,
|
|
265
|
+
return _lib.llama_copy_state_data(ctx, dst)
|
|
265
266
|
|
|
266
267
|
|
|
267
268
|
_lib.llama_copy_state_data.argtypes = [llama_context_p, c_uint8_p]
|
|
@@ -350,7 +351,7 @@ def llama_tokenize(
|
|
|
350
351
|
tokens, # type: Array[llama_token]
|
|
351
352
|
n_max_tokens: c_int,
|
|
352
353
|
add_bos: c_bool,
|
|
353
|
-
) ->
|
|
354
|
+
) -> int:
|
|
354
355
|
return _lib.llama_tokenize(ctx, text, tokens, n_max_tokens, add_bos)
|
|
355
356
|
|
|
356
357
|
|
|
@@ -17,6 +17,11 @@ class Settings(BaseSettings):
|
|
|
17
17
|
description="The path to the model to use for generating completions."
|
|
18
18
|
)
|
|
19
19
|
n_ctx: int = Field(default=2048, ge=1, description="The context size.")
|
|
20
|
+
n_gpu_layers: int = Field(
|
|
21
|
+
default=0,
|
|
22
|
+
ge=0,
|
|
23
|
+
description="The number of layers to put on the GPU. The rest will be on the CPU.",
|
|
24
|
+
)
|
|
20
25
|
n_batch: int = Field(
|
|
21
26
|
default=512, ge=1, description="The batch size to use per eval."
|
|
22
27
|
)
|
|
@@ -80,6 +85,7 @@ def create_app(settings: Optional[Settings] = None):
|
|
|
80
85
|
global llama
|
|
81
86
|
llama = llama_cpp.Llama(
|
|
82
87
|
model_path=settings.model,
|
|
88
|
+
n_gpu_layers=settings.n_gpu_layers,
|
|
83
89
|
f16_kv=settings.f16_kv,
|
|
84
90
|
use_mlock=settings.use_mlock,
|
|
85
91
|
use_mmap=settings.use_mmap,
|
|
@@ -152,9 +158,23 @@ repeat_penalty_field = Field(
|
|
|
152
158
|
+ "Repeat penalty is a hyperparameter used to penalize the repetition of token sequences during text generation. It helps prevent the model from generating repetitive or monotonous text. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient.",
|
|
153
159
|
)
|
|
154
160
|
|
|
161
|
+
presence_penalty_field = Field(
|
|
162
|
+
default=0.0,
|
|
163
|
+
ge=-2.0,
|
|
164
|
+
le=2.0,
|
|
165
|
+
description="Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics.",
|
|
166
|
+
)
|
|
167
|
+
|
|
168
|
+
frequency_penalty_field = Field(
|
|
169
|
+
default=0.0,
|
|
170
|
+
ge=-2.0,
|
|
171
|
+
le=2.0,
|
|
172
|
+
description="Positive values penalize new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim.",
|
|
173
|
+
)
|
|
174
|
+
|
|
155
175
|
|
|
156
176
|
class CreateCompletionRequest(BaseModel):
|
|
157
|
-
prompt:
|
|
177
|
+
prompt: Union[str, List[str]] = Field(
|
|
158
178
|
default="", description="The prompt to generate completions for."
|
|
159
179
|
)
|
|
160
180
|
suffix: Optional[str] = Field(
|
|
@@ -175,13 +195,13 @@ class CreateCompletionRequest(BaseModel):
|
|
|
175
195
|
ge=0,
|
|
176
196
|
description="The number of logprobs to generate. If None, no logprobs are generated.",
|
|
177
197
|
)
|
|
198
|
+
presence_penalty: Optional[float] = presence_penalty_field
|
|
199
|
+
frequency_penalty: Optional[float] = frequency_penalty_field
|
|
178
200
|
|
|
179
201
|
# ignored or currently unsupported
|
|
180
202
|
model: Optional[str] = model_field
|
|
181
203
|
n: Optional[int] = 1
|
|
182
204
|
logprobs: Optional[int] = Field(None)
|
|
183
|
-
presence_penalty: Optional[float] = 0
|
|
184
|
-
frequency_penalty: Optional[float] = 0
|
|
185
205
|
best_of: Optional[int] = 1
|
|
186
206
|
logit_bias: Optional[Dict[str, float]] = Field(None)
|
|
187
207
|
user: Optional[str] = Field(None)
|
|
@@ -209,6 +229,10 @@ CreateCompletionResponse = create_model_from_typeddict(llama_cpp.Completion)
|
|
|
209
229
|
def create_completion(
|
|
210
230
|
request: CreateCompletionRequest, llama: llama_cpp.Llama = Depends(get_llama)
|
|
211
231
|
):
|
|
232
|
+
if isinstance(request.prompt, list):
|
|
233
|
+
assert len(request.prompt) <= 1
|
|
234
|
+
request.prompt = request.prompt[0] if len(request.prompt) > 0 else ""
|
|
235
|
+
|
|
212
236
|
completion_or_chunks = llama(
|
|
213
237
|
**request.dict(
|
|
214
238
|
exclude={
|
|
@@ -269,12 +293,12 @@ class CreateChatCompletionRequest(BaseModel):
|
|
|
269
293
|
top_p: float = top_p_field
|
|
270
294
|
stop: Optional[List[str]] = stop_field
|
|
271
295
|
stream: bool = stream_field
|
|
296
|
+
presence_penalty: Optional[float] = presence_penalty_field
|
|
297
|
+
frequency_penalty: Optional[float] = frequency_penalty_field
|
|
272
298
|
|
|
273
299
|
# ignored or currently unsupported
|
|
274
300
|
model: Optional[str] = model_field
|
|
275
301
|
n: Optional[int] = 1
|
|
276
|
-
presence_penalty: Optional[float] = 0
|
|
277
|
-
frequency_penalty: Optional[float] = 0
|
|
278
302
|
logit_bias: Optional[Dict[str, float]] = Field(None)
|
|
279
303
|
user: Optional[str] = Field(None)
|
|
280
304
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: llama-cpp-python
|
|
3
|
-
Version: 0.1.
|
|
3
|
+
Version: 0.1.50
|
|
4
4
|
Summary: A Python wrapper for llama.cpp
|
|
5
5
|
Author: Andrei Betlen
|
|
6
6
|
Author-email: abetlen@gmail.com
|
|
@@ -53,19 +53,19 @@ Use the `FORCE_CMAKE=1` environment variable to force the use of `cmake` and ins
|
|
|
53
53
|
To install with OpenBLAS, set the `LLAMA_OPENBLAS=1` environment variable before installing:
|
|
54
54
|
|
|
55
55
|
```bash
|
|
56
|
-
|
|
56
|
+
CMAKE_ARGS="-DLLAMA_OPENBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python
|
|
57
57
|
```
|
|
58
58
|
|
|
59
59
|
To install with cuBLAS, set the `LLAMA_CUBLAS=1` environment variable before installing:
|
|
60
60
|
|
|
61
61
|
```bash
|
|
62
|
-
|
|
62
|
+
CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python
|
|
63
63
|
```
|
|
64
64
|
|
|
65
65
|
To install with CLBlast, set the `LLAMA_CLBLAST=1` environment variable before installing:
|
|
66
66
|
|
|
67
67
|
```bash
|
|
68
|
-
|
|
68
|
+
CMAKE_ARGS="-DLLAMA_CLBLAST=on" FORCE_CMAKE=1 pip install llama-cpp-python
|
|
69
69
|
```
|
|
70
70
|
|
|
71
71
|
|
|
@@ -120,7 +120,7 @@ Navigate to [http://localhost:8000/docs](http://localhost:8000/docs) to see the
|
|
|
120
120
|
A Docker image is available on [GHCR](https://ghcr.io/abetlen/llama-cpp-python). To run the server:
|
|
121
121
|
|
|
122
122
|
```bash
|
|
123
|
-
docker run --rm -it -
|
|
123
|
+
docker run --rm -it -p 8000:8000 -v /path/to/models:/models -e MODEL=/models/ggml-model-name.bin ghcr.io/abetlen/llama-cpp-python:latest
|
|
124
124
|
```
|
|
125
125
|
|
|
126
126
|
## Low-level API
|
|
@@ -47,6 +47,7 @@ llama_cpp_python.egg-info/dependency_links.txt
|
|
|
47
47
|
llama_cpp_python.egg-info/requires.txt
|
|
48
48
|
llama_cpp_python.egg-info/top_level.txt
|
|
49
49
|
tests/test_llama.py
|
|
50
|
+
vendor/llama.cpp/.clang-tidy
|
|
50
51
|
vendor/llama.cpp/.dockerignore
|
|
51
52
|
vendor/llama.cpp/.ecrc
|
|
52
53
|
vendor/llama.cpp/.editorconfig
|
|
@@ -80,6 +81,8 @@ vendor/llama.cpp/.github/ISSUE_TEMPLATE/custom.md
|
|
|
80
81
|
vendor/llama.cpp/.github/workflows/build.yml
|
|
81
82
|
vendor/llama.cpp/.github/workflows/docker.yml
|
|
82
83
|
vendor/llama.cpp/.github/workflows/editorconfig.yml
|
|
84
|
+
vendor/llama.cpp/.github/workflows/tidy-post.yml
|
|
85
|
+
vendor/llama.cpp/.github/workflows/tidy-review.yml
|
|
83
86
|
vendor/llama.cpp/examples/CMakeLists.txt
|
|
84
87
|
vendor/llama.cpp/examples/Miku.sh
|
|
85
88
|
vendor/llama.cpp/examples/alpaca.sh
|
|
@@ -90,6 +93,8 @@ vendor/llama.cpp/examples/common.cpp
|
|
|
90
93
|
vendor/llama.cpp/examples/common.h
|
|
91
94
|
vendor/llama.cpp/examples/gpt4all.sh
|
|
92
95
|
vendor/llama.cpp/examples/reason-act.sh
|
|
96
|
+
vendor/llama.cpp/examples/baby-llama/CMakeLists.txt
|
|
97
|
+
vendor/llama.cpp/examples/baby-llama/baby-llama.cpp
|
|
93
98
|
vendor/llama.cpp/examples/benchmark/CMakeLists.txt
|
|
94
99
|
vendor/llama.cpp/examples/benchmark/benchmark-matmult.cpp
|
|
95
100
|
vendor/llama.cpp/examples/embedding/CMakeLists.txt
|
|
@@ -128,16 +133,21 @@ vendor/llama.cpp/prompts/chat-with-bob.txt
|
|
|
128
133
|
vendor/llama.cpp/prompts/chat-with-vicuna-v0.txt
|
|
129
134
|
vendor/llama.cpp/prompts/chat-with-vicuna-v1.txt
|
|
130
135
|
vendor/llama.cpp/prompts/chat.txt
|
|
136
|
+
vendor/llama.cpp/prompts/dan-modified.txt
|
|
131
137
|
vendor/llama.cpp/prompts/dan.txt
|
|
132
138
|
vendor/llama.cpp/prompts/reason-act.txt
|
|
133
139
|
vendor/llama.cpp/scripts/build-info.cmake
|
|
134
140
|
vendor/llama.cpp/scripts/build-info.h.in
|
|
135
141
|
vendor/llama.cpp/scripts/build-info.sh
|
|
142
|
+
vendor/llama.cpp/scripts/perf-run-all.sh
|
|
143
|
+
vendor/llama.cpp/scripts/ppl-run-all.sh
|
|
136
144
|
vendor/llama.cpp/scripts/sync-ggml.sh
|
|
137
145
|
vendor/llama.cpp/scripts/verify-checksum-models.py
|
|
138
146
|
vendor/llama.cpp/spm-headers/llama.h
|
|
139
147
|
vendor/llama.cpp/tests/CMakeLists.txt
|
|
140
148
|
vendor/llama.cpp/tests/test-double-float.c
|
|
149
|
+
vendor/llama.cpp/tests/test-grad0.c
|
|
150
|
+
vendor/llama.cpp/tests/test-opt.c
|
|
141
151
|
vendor/llama.cpp/tests/test-quantize-fns.cpp
|
|
142
152
|
vendor/llama.cpp/tests/test-quantize-perf.cpp
|
|
143
153
|
vendor/llama.cpp/tests/test-sampling.cpp
|
|
@@ -773,14 +773,14 @@ mkdocs = ">=1.1"
|
|
|
773
773
|
|
|
774
774
|
[[package]]
|
|
775
775
|
name = "mkdocs-material"
|
|
776
|
-
version = "9.1.
|
|
776
|
+
version = "9.1.11"
|
|
777
777
|
description = "Documentation that simply works"
|
|
778
778
|
category = "dev"
|
|
779
779
|
optional = false
|
|
780
780
|
python-versions = ">=3.7"
|
|
781
781
|
files = [
|
|
782
|
-
{file = "mkdocs_material-9.1.
|
|
783
|
-
{file = "mkdocs_material-9.1.
|
|
782
|
+
{file = "mkdocs_material-9.1.11-py3-none-any.whl", hash = "sha256:fbc86d50ec2cf34d40d5c4365780f290ceedde23f1a0704323b34e7f16b0c0dd"},
|
|
783
|
+
{file = "mkdocs_material-9.1.11.tar.gz", hash = "sha256:f5d473eb79d6640a5e668d4b2ab5b9de5e76ae0a0e2d864112df0cfe9016dc1d"},
|
|
784
784
|
]
|
|
785
785
|
|
|
786
786
|
[package.dependencies]
|
|
@@ -1439,4 +1439,4 @@ testing = ["big-O", "flake8 (<5)", "jaraco.functools", "jaraco.itertools", "more
|
|
|
1439
1439
|
[metadata]
|
|
1440
1440
|
lock-version = "2.0"
|
|
1441
1441
|
python-versions = "^3.8.1"
|
|
1442
|
-
content-hash = "
|
|
1442
|
+
content-hash = "6bea74d847b958639276d4be527c2b65dafeb0a455b6e3d1f29fee5171ce73b2"
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[tool.poetry]
|
|
2
2
|
name = "llama_cpp_python"
|
|
3
|
-
version = "0.1.
|
|
3
|
+
version = "0.1.50"
|
|
4
4
|
description = "Python bindings for the llama.cpp library"
|
|
5
5
|
authors = ["Andrei Betlen <abetlen@gmail.com>"]
|
|
6
6
|
license = "MIT"
|
|
@@ -22,7 +22,7 @@ black = "^23.3.0"
|
|
|
22
22
|
twine = "^4.0.2"
|
|
23
23
|
mkdocs = "^1.4.3"
|
|
24
24
|
mkdocstrings = {extras = ["python"], version = "^0.21.2"}
|
|
25
|
-
mkdocs-material = "^9.1.
|
|
25
|
+
mkdocs-material = "^9.1.11"
|
|
26
26
|
pytest = "^7.3.1"
|
|
27
27
|
httpx = "^0.24.0"
|
|
28
28
|
|
|
@@ -10,7 +10,7 @@ setup(
|
|
|
10
10
|
description="A Python wrapper for llama.cpp",
|
|
11
11
|
long_description=long_description,
|
|
12
12
|
long_description_content_type="text/markdown",
|
|
13
|
-
version="0.1.
|
|
13
|
+
version="0.1.50",
|
|
14
14
|
author="Andrei Betlen",
|
|
15
15
|
author_email="abetlen@gmail.com",
|
|
16
16
|
license="MIT",
|