llama-cpp-python 0.1.48__tar.gz → 0.1.50__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (157)
  1. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/PKG-INFO +5 -5
  2. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/README.md +4 -4
  3. llama_cpp_python-0.1.50/examples/high_level_api/fastapi_server.py +37 -0
  4. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/llama_cpp/llama.py +114 -14
  5. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/llama_cpp/llama_cpp.py +6 -5
  6. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/llama_cpp/server/app.py +29 -5
  7. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/llama_cpp_python.egg-info/PKG-INFO +5 -5
  8. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/llama_cpp_python.egg-info/SOURCES.txt +10 -0
  9. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/poetry.lock +4 -4
  10. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/pyproject.toml +2 -2
  11. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/setup.py +1 -1
  12. llama_cpp_python-0.1.50/vendor/llama.cpp/.clang-tidy +18 -0
  13. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/.github/workflows/build.yml +65 -8
  14. llama_cpp_python-0.1.50/vendor/llama.cpp/.github/workflows/tidy-post.yml +20 -0
  15. llama_cpp_python-0.1.50/vendor/llama.cpp/.github/workflows/tidy-review.yml +23 -0
  16. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/.gitignore +3 -0
  17. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/Makefile +9 -8
  18. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/README.md +71 -33
  19. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/SHA256SUMS +16 -12
  20. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/convert.py +7 -3
  21. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/examples/CMakeLists.txt +1 -0
  22. llama_cpp_python-0.1.50/vendor/llama.cpp/examples/baby-llama/CMakeLists.txt +4 -0
  23. llama_cpp_python-0.1.50/vendor/llama.cpp/examples/baby-llama/baby-llama.cpp +1687 -0
  24. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/examples/common.cpp +392 -80
  25. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/examples/common.h +28 -13
  26. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/examples/embedding/embedding.cpp +0 -3
  27. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/examples/main/README.md +2 -2
  28. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/examples/main/main.cpp +34 -52
  29. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/examples/perplexity/perplexity.cpp +47 -23
  30. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/examples/quantize/quantize.cpp +5 -6
  31. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/ggml-cuda.cu +291 -109
  32. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/ggml-cuda.h +2 -0
  33. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/ggml-opencl.c +85 -122
  34. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/ggml.c +3835 -2067
  35. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/ggml.h +201 -11
  36. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/llama.cpp +156 -75
  37. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/llama.h +7 -6
  38. llama_cpp_python-0.1.50/vendor/llama.cpp/prompts/dan-modified.txt +1 -0
  39. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/prompts/dan.txt +1 -1
  40. llama_cpp_python-0.1.50/vendor/llama.cpp/scripts/perf-run-all.sh +93 -0
  41. llama_cpp_python-0.1.50/vendor/llama.cpp/scripts/ppl-run-all.sh +39 -0
  42. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/spm-headers/llama.h +7 -6
  43. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/tests/CMakeLists.txt +2 -0
  44. llama_cpp_python-0.1.50/vendor/llama.cpp/tests/test-grad0.c +1131 -0
  45. llama_cpp_python-0.1.50/vendor/llama.cpp/tests/test-opt.c +205 -0
  46. llama_cpp_python-0.1.48/examples/high_level_api/fastapi_server.py +0 -262
  47. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/.dockerignore +0 -0
  48. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/.github/ISSUE_TEMPLATE/bug_report.md +0 -0
  49. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/.github/ISSUE_TEMPLATE/feature_request.md +0 -0
  50. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/.github/dependabot.yml +0 -0
  51. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/.github/workflows/build-and-release.yaml +0 -0
  52. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/.github/workflows/build-docker.yaml +0 -0
  53. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/.github/workflows/publish-to-test.yaml +0 -0
  54. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/.github/workflows/publish.yaml +0 -0
  55. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/.github/workflows/test.yaml +0 -0
  56. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/.gitignore +0 -0
  57. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/.gitmodules +0 -0
  58. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/CMakeLists.txt +0 -0
  59. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/Dockerfile +0 -0
  60. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/Dockerfile.cuda +0 -0
  61. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/LICENSE.md +0 -0
  62. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/docs/index.md +0 -0
  63. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/examples/high_level_api/high_level_api_embedding.py +0 -0
  64. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/examples/high_level_api/high_level_api_inference.py +0 -0
  65. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/examples/high_level_api/high_level_api_streaming.py +0 -0
  66. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/examples/high_level_api/langchain_custom_llm.py +0 -0
  67. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/examples/low_level_api/Chat.py +0 -0
  68. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/examples/low_level_api/Miku.py +0 -0
  69. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/examples/low_level_api/ReasonAct.py +0 -0
  70. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/examples/low_level_api/common.py +0 -0
  71. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/examples/low_level_api/low_level_api_chat_cpp.py +0 -0
  72. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/examples/low_level_api/low_level_api_llama_cpp.py +0 -0
  73. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/examples/low_level_api/quantize.py +0 -0
  74. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/examples/low_level_api/util.py +0 -0
  75. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/examples/notebooks/Clients.ipynb +0 -0
  76. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/examples/notebooks/PerformanceTuning.ipynb +0 -0
  77. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/llama_cpp/__init__.py +0 -0
  78. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/llama_cpp/llama_types.py +0 -0
  79. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/llama_cpp/server/__init__.py +0 -0
  80. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/llama_cpp/server/__main__.py +0 -0
  81. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/llama_cpp_python.egg-info/dependency_links.txt +0 -0
  82. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/llama_cpp_python.egg-info/requires.txt +0 -0
  83. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/llama_cpp_python.egg-info/top_level.txt +0 -0
  84. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/mkdocs.yml +0 -0
  85. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/setup.cfg +0 -0
  86. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/tests/test_llama.py +0 -0
  87. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/.devops/full.Dockerfile +0 -0
  88. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/.devops/main.Dockerfile +0 -0
  89. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/.devops/tools.sh +0 -0
  90. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/.dockerignore +0 -0
  91. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/.ecrc +0 -0
  92. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/.editorconfig +0 -0
  93. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/.github/ISSUE_TEMPLATE/custom.md +0 -0
  94. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/.github/workflows/docker.yml +0 -0
  95. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/.github/workflows/editorconfig.yml +0 -0
  96. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/CMakeLists.txt +0 -0
  97. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/LICENSE +0 -0
  98. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/Package.swift +0 -0
  99. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/build.zig +0 -0
  100. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/convert-lora-to-ggml.py +0 -0
  101. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/convert-pth-to-ggml.py +0 -0
  102. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/examples/Miku.sh +0 -0
  103. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/examples/alpaca.sh +0 -0
  104. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/examples/benchmark/CMakeLists.txt +0 -0
  105. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/examples/benchmark/benchmark-matmult.cpp +0 -0
  106. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/examples/chat-13B.bat +0 -0
  107. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/examples/chat-13B.sh +0 -0
  108. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/examples/chat.sh +0 -0
  109. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/examples/embedding/CMakeLists.txt +0 -0
  110. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/examples/embedding/README.md +0 -0
  111. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/examples/gpt4all.sh +0 -0
  112. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/examples/jeopardy/README.md +0 -0
  113. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/examples/jeopardy/graph.py +0 -0
  114. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/examples/jeopardy/jeopardy.sh +0 -0
  115. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/examples/jeopardy/qasheet.csv +0 -0
  116. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/examples/jeopardy/questions.txt +0 -0
  117. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/examples/main/CMakeLists.txt +0 -0
  118. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/examples/perplexity/CMakeLists.txt +0 -0
  119. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/examples/perplexity/README.md +0 -0
  120. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/examples/quantize/CMakeLists.txt +0 -0
  121. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/examples/quantize/README.md +0 -0
  122. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/examples/quantize-stats/CMakeLists.txt +0 -0
  123. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/examples/quantize-stats/quantize-stats.cpp +0 -0
  124. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/examples/reason-act.sh +0 -0
  125. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/examples/save-load-state/CMakeLists.txt +0 -0
  126. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/examples/save-load-state/save-load-state.cpp +0 -0
  127. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/flake.lock +0 -0
  128. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/flake.nix +0 -0
  129. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/ggml-opencl.h +0 -0
  130. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/llama-util.h +0 -0
  131. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/media/llama-leader.jpeg +0 -0
  132. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/media/llama0-banner.png +0 -0
  133. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/media/llama0-logo.png +0 -0
  134. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/media/llama1-banner.png +0 -0
  135. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/media/llama1-logo.png +0 -0
  136. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/models/ggml-vocab.bin +0 -0
  137. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/pocs/CMakeLists.txt +0 -0
  138. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/pocs/vdot/CMakeLists.txt +0 -0
  139. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/pocs/vdot/q8dot.cpp +0 -0
  140. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/pocs/vdot/vdot.cpp +0 -0
  141. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/prompts/alpaca.txt +0 -0
  142. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/prompts/chat-with-bob.txt +0 -0
  143. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/prompts/chat-with-vicuna-v0.txt +0 -0
  144. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/prompts/chat-with-vicuna-v1.txt +0 -0
  145. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/prompts/chat.txt +0 -0
  146. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/prompts/reason-act.txt +0 -0
  147. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/requirements.txt +0 -0
  148. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/scripts/build-info.cmake +0 -0
  149. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/scripts/build-info.h.in +0 -0
  150. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/scripts/build-info.sh +0 -0
  151. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/scripts/sync-ggml.sh +0 -0
  152. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/scripts/verify-checksum-models.py +0 -0
  153. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/tests/test-double-float.c +0 -0
  154. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/tests/test-quantize-fns.cpp +0 -0
  155. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/tests/test-quantize-perf.cpp +0 -0
  156. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/tests/test-sampling.cpp +0 -0
  157. {llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/vendor/llama.cpp/tests/test-tokenizer-0.cpp +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: llama_cpp_python
3
- Version: 0.1.48
3
+ Version: 0.1.50
4
4
  Summary: A Python wrapper for llama.cpp
5
5
  Author: Andrei Betlen
6
6
  Author-email: abetlen@gmail.com
@@ -53,19 +53,19 @@ Use the `FORCE_CMAKE=1` environment variable to force the use of `cmake` and ins
53
53
  To install with OpenBLAS, enable the `LLAMA_OPENBLAS` build option when installing, as shown below:
54
54
 
55
55
  ```bash
56
- LLAMA_OPENBLAS=1 FORCE_CMAKE=1 pip install llama-cpp-python
56
+ CMAKE_ARGS="-DLLAMA_OPENBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python
57
57
  ```
58
58
 
59
59
  To install with cuBLAS, enable the `LLAMA_CUBLAS` build option when installing, as shown below:
60
60
 
61
61
  ```bash
62
- LLAMA_CUBLAS=1 FORCE_CMAKE=1 pip install llama-cpp-python
62
+ CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python
63
63
  ```
64
64
 
65
65
  To install with CLBlast, enable the `LLAMA_CLBLAST` build option when installing, as shown below:
66
66
 
67
67
  ```bash
68
- LLAMA_CLBLAST=1 FORCE_CMAKE=1 pip install llama-cpp-python
68
+ CMAKE_ARGS="-DLLAMA_CLBLAST=on" FORCE_CMAKE=1 pip install llama-cpp-python
69
69
  ```
70
70
 
71
71
 
@@ -120,7 +120,7 @@ Navigate to [http://localhost:8000/docs](http://localhost:8000/docs) to see the
120
120
  A Docker image is available on [GHCR](https://ghcr.io/abetlen/llama-cpp-python). To run the server:
121
121
 
122
122
  ```bash
123
- docker run --rm -it -p8000:8000 -v /path/to/models:/models -eMODEL=/models/ggml-model-name.bin ghcr.io/abetlen/llama-cpp-python:latest
123
+ docker run --rm -it -p 8000:8000 -v /path/to/models:/models -e MODEL=/models/ggml-model-name.bin ghcr.io/abetlen/llama-cpp-python:latest
124
124
  ```
125
125
 
126
126
  ## Low-level API
@@ -35,19 +35,19 @@ Use the `FORCE_CMAKE=1` environment variable to force the use of `cmake` and ins
35
35
  To install with OpenBLAS, enable the `LLAMA_OPENBLAS` build option when installing, as shown below:
36
36
 
37
37
  ```bash
38
- LLAMA_OPENBLAS=1 FORCE_CMAKE=1 pip install llama-cpp-python
38
+ CMAKE_ARGS="-DLLAMA_OPENBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python
39
39
  ```
40
40
 
41
41
  To install with cuBLAS, enable the `LLAMA_CUBLAS` build option when installing, as shown below:
42
42
 
43
43
  ```bash
44
- LLAMA_CUBLAS=1 FORCE_CMAKE=1 pip install llama-cpp-python
44
+ CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python
45
45
  ```
46
46
 
47
47
  To install with CLBlast, enable the `LLAMA_CLBLAST` build option when installing, as shown below:
48
48
 
49
49
  ```bash
50
- LLAMA_CLBLAST=1 FORCE_CMAKE=1 pip install llama-cpp-python
50
+ CMAKE_ARGS="-DLLAMA_CLBLAST=on" FORCE_CMAKE=1 pip install llama-cpp-python
51
51
  ```
52
52
 
53
53
 
@@ -102,7 +102,7 @@ Navigate to [http://localhost:8000/docs](http://localhost:8000/docs) to see the
102
102
  A Docker image is available on [GHCR](https://ghcr.io/abetlen/llama-cpp-python). To run the server:
103
103
 
104
104
  ```bash
105
- docker run --rm -it -p8000:8000 -v /path/to/models:/models -eMODEL=/models/ggml-model-name.bin ghcr.io/abetlen/llama-cpp-python:latest
105
+ docker run --rm -it -p 8000:8000 -v /path/to/models:/models -e MODEL=/models/ggml-model-name.bin ghcr.io/abetlen/llama-cpp-python:latest
106
106
  ```
107
107
 
108
108
  ## Low-level API
@@ -0,0 +1,37 @@
1
+ """Example FastAPI server for llama.cpp.
2
+
3
+ To run this example:
4
+
5
+ ```bash
6
+ pip install fastapi uvicorn sse-starlette
7
+ export MODEL=../models/7B/...
8
+ ```
9
+
10
+ Then run:
11
+ ```
12
+ uvicorn llama_cpp.server.app:app --reload
13
+ ```
14
+
15
+ or
16
+
17
+ ```
18
+ python3 -m llama_cpp.server
19
+ ```
20
+
21
+ Then visit http://localhost:8000/docs to see the interactive API docs.
22
+
23
+
24
+ To actually see the implementation of the server, see llama_cpp/server/app.py
25
+
26
+ """
27
+ import os
28
+ import uvicorn
29
+
30
+ from llama_cpp.server.app import create_app
31
+
32
+ if __name__ == "__main__":
33
+ app = create_app()
34
+
35
+ uvicorn.run(
36
+ app, host=os.getenv("HOST", "localhost"), port=int(os.getenv("PORT", 8000))
37
+ )
@@ -83,6 +83,7 @@ class Llama:
83
83
  # NOTE: These parameters are likely to change in the future.
84
84
  n_ctx: int = 512,
85
85
  n_parts: int = -1,
86
+ n_gpu_layers: int = 0,
86
87
  seed: int = 1337,
87
88
  f16_kv: bool = True,
88
89
  logits_all: bool = False,
@@ -129,6 +130,7 @@ class Llama:
129
130
  self.params = llama_cpp.llama_context_default_params()
130
131
  self.params.n_ctx = n_ctx
131
132
  self.params.n_parts = n_parts
133
+ self.params.n_gpu_layers = n_gpu_layers
132
134
  self.params.seed = seed
133
135
  self.params.f16_kv = f16_kv
134
136
  self.params.logits_all = logits_all
@@ -174,7 +176,9 @@ class Llama:
174
176
  if self.verbose:
175
177
  print(llama_cpp.llama_print_system_info().decode("utf-8"), file=sys.stderr)
176
178
 
177
- def tokenize(self, text: bytes) -> List[llama_cpp.llama_token]:
179
+ def tokenize(
180
+ self, text: bytes, add_bos: bool = True
181
+ ) -> List[llama_cpp.llama_token]:
178
182
  """Tokenize a string.
179
183
 
180
184
  Args:
@@ -194,10 +198,22 @@ class Llama:
194
198
  text,
195
199
  tokens,
196
200
  n_ctx,
197
- llama_cpp.c_bool(True),
201
+ llama_cpp.c_bool(add_bos),
198
202
  )
199
203
  if int(n_tokens) < 0:
200
- raise RuntimeError(f'Failed to tokenize: text="{text}" n_tokens={n_tokens}')
204
+ n_tokens = abs(n_tokens)
205
+ tokens = (llama_cpp.llama_token * int(n_tokens))()
206
+ n_tokens = llama_cpp.llama_tokenize(
207
+ self.ctx,
208
+ text,
209
+ tokens,
210
+ llama_cpp.c_int(n_tokens),
211
+ llama_cpp.c_bool(add_bos),
212
+ )
213
+ if n_tokens < 0:
214
+ raise RuntimeError(
215
+ f'Failed to tokenize: text="{text}" n_tokens={n_tokens}'
216
+ )
201
217
  return list(tokens[:n_tokens])
202
218
 
203
219
  def detokenize(self, tokens: List[llama_cpp.llama_token]) -> bytes:
@@ -268,9 +284,13 @@ class Llama:
268
284
  top_k: llama_cpp.c_int,
269
285
  top_p: llama_cpp.c_float,
270
286
  temp: llama_cpp.c_float,
287
+ tfs_z: llama_cpp.c_float,
271
288
  repeat_penalty: llama_cpp.c_float,
272
289
  frequency_penalty: llama_cpp.c_float,
273
290
  presence_penalty: llama_cpp.c_float,
291
+ mirostat_mode: llama_cpp.c_int,
292
+ mirostat_tau: llama_cpp.c_float,
293
+ mirostat_eta: llama_cpp.c_float,
274
294
  ):
275
295
  assert self.ctx is not None
276
296
  assert len(self.eval_logits) > 0
@@ -308,11 +328,41 @@ class Llama:
308
328
  alpha_frequency=frequency_penalty,
309
329
  alpha_presence=presence_penalty,
310
330
  )
311
- if float(temp.value) == 0.0:
331
+ if temp.value == 0.0:
312
332
  return llama_cpp.llama_sample_token_greedy(
313
333
  ctx=self.ctx,
314
334
  candidates=llama_cpp.ctypes.byref(candidates), # type: ignore
315
335
  )
336
+ elif mirostat_mode.value == 1:
337
+ mirostat_mu = llama_cpp.c_float(2.0 * mirostat_tau.value)
338
+ mirostat_m = llama_cpp.c_int(100)
339
+ llama_cpp.llama_sample_temperature(
340
+ ctx=self.ctx,
341
+ candidates=llama_cpp.ctypes.byref(candidates), # type: ignore
342
+ temp=temp,
343
+ )
344
+ return llama_cpp.llama_sample_token_mirostat(
345
+ ctx=self.ctx,
346
+ candidates=llama_cpp.ctypes.byref(candidates), # type: ignore
347
+ tau=mirostat_tau,
348
+ eta=mirostat_eta,
349
+ mu=llama_cpp.ctypes.byref(mirostat_mu), # type: ignore
350
+ m=mirostat_m,
351
+ )
352
+ elif mirostat_mode.value == 2:
353
+ mirostat_mu = llama_cpp.c_float(2.0 * mirostat_tau.value)
354
+ llama_cpp.llama_sample_temperature(
355
+ ctx=self.ctx,
356
+ candidates=llama_cpp.ctypes.pointer(candidates),
357
+ temp=temp,
358
+ )
359
+ return llama_cpp.llama_sample_token_mirostat_v2(
360
+ ctx=self.ctx,
361
+ candidates=llama_cpp.ctypes.byref(candidates), # type: ignore
362
+ tau=mirostat_tau,
363
+ eta=mirostat_eta,
364
+ mu=llama_cpp.ctypes.byref(mirostat_mu), # type: ignore
365
+ )
316
366
  else:
317
367
  llama_cpp.llama_sample_top_k(
318
368
  ctx=self.ctx,
@@ -323,7 +373,7 @@ class Llama:
323
373
  llama_cpp.llama_sample_tail_free(
324
374
  ctx=self.ctx,
325
375
  candidates=llama_cpp.ctypes.byref(candidates), # type: ignore
326
- z=llama_cpp.c_float(1.0),
376
+ z=tfs_z,
327
377
  min_keep=llama_cpp.c_size_t(1),
328
378
  )
329
379
  llama_cpp.llama_sample_typical(
@@ -350,12 +400,16 @@ class Llama:
350
400
 
351
401
  def sample(
352
402
  self,
353
- top_k: int,
354
- top_p: float,
355
- temp: float,
356
- repeat_penalty: float,
403
+ top_k: int = 40,
404
+ top_p: float = 0.95,
405
+ temp: float = 0.80,
406
+ repeat_penalty: float = 1.1,
357
407
  frequency_penalty: float = 0.0,
358
408
  presence_penalty: float = 0.0,
409
+ tfs_z: float = 1.0,
410
+ mirostat_mode: int = 0,
411
+ mirostat_eta: float = 0.1,
412
+ mirostat_tau: float = 5.0,
359
413
  ):
360
414
  """Sample a token from the model.
361
415
 
@@ -380,9 +434,13 @@ class Llama:
380
434
  top_k=llama_cpp.c_int(top_k),
381
435
  top_p=llama_cpp.c_float(top_p),
382
436
  temp=llama_cpp.c_float(temp),
437
+ tfs_z=llama_cpp.c_float(tfs_z),
383
438
  repeat_penalty=llama_cpp.c_float(repeat_penalty),
384
439
  frequency_penalty=llama_cpp.c_float(frequency_penalty),
385
440
  presence_penalty=llama_cpp.c_float(presence_penalty),
441
+ mirostat_mode=llama_cpp.c_int(mirostat_mode),
442
+ mirostat_tau=llama_cpp.c_float(mirostat_tau),
443
+ mirostat_eta=llama_cpp.c_float(mirostat_eta),
386
444
  )
387
445
 
388
446
  def generate(
@@ -392,9 +450,13 @@ class Llama:
392
450
  top_p: float,
393
451
  temp: float,
394
452
  repeat_penalty: float,
453
+ reset: bool = True,
395
454
  frequency_penalty: float = 0.0,
396
455
  presence_penalty: float = 0.0,
397
- reset: bool = True,
456
+ tfs_z: float = 1.0,
457
+ mirostat_mode: int = 0,
458
+ mirostat_tau: float = 5.0,
459
+ mirostat_eta: float = 0.1,
398
460
  ) -> Generator[
399
461
  llama_cpp.llama_token, Optional[Sequence[llama_cpp.llama_token]], None
400
462
  ]:
@@ -447,9 +509,13 @@ class Llama:
447
509
  top_k=top_k,
448
510
  top_p=top_p,
449
511
  temp=temp,
512
+ repeat_penalty=repeat_penalty,
450
513
  frequency_penalty=frequency_penalty,
451
514
  presence_penalty=presence_penalty,
452
- repeat_penalty=repeat_penalty,
515
+ tfs_z=tfs_z,
516
+ mirostat_mode=mirostat_mode,
517
+ mirostat_tau=mirostat_tau,
518
+ mirostat_eta=mirostat_eta,
453
519
  )
454
520
  tokens_or_none = yield token
455
521
  tokens = [token]
@@ -528,6 +594,10 @@ class Llama:
528
594
  repeat_penalty: float = 1.1,
529
595
  top_k: int = 40,
530
596
  stream: bool = False,
597
+ tfs_z: float = 1.0,
598
+ mirostat_mode: int = 0,
599
+ mirostat_tau: float = 5.0,
600
+ mirostat_eta: float = 0.1,
531
601
  ) -> Union[Iterator[Completion], Iterator[CompletionChunk]]:
532
602
  assert self.ctx is not None
533
603
  completion_id: str = f"cmpl-{str(uuid.uuid4())}"
@@ -583,6 +653,10 @@ class Llama:
583
653
  top_k=top_k,
584
654
  top_p=top_p,
585
655
  temp=temperature,
656
+ tfs_z=tfs_z,
657
+ mirostat_mode=mirostat_mode,
658
+ mirostat_tau=mirostat_tau,
659
+ mirostat_eta=mirostat_eta,
586
660
  frequency_penalty=frequency_penalty,
587
661
  presence_penalty=presence_penalty,
588
662
  repeat_penalty=repeat_penalty,
@@ -655,6 +729,9 @@ class Llama:
655
729
  print("Llama._create_completion: cache save", file=sys.stderr)
656
730
  self.cache[prompt_tokens + completion_tokens] = self.save_state()
657
731
 
732
+ if self.verbose:
733
+ llama_cpp.llama_print_timings(self.ctx)
734
+
658
735
  if stream:
659
736
  yield {
660
737
  "id": completion_id,
@@ -726,9 +803,6 @@ class Llama:
726
803
  "top_logprobs": top_logprobs,
727
804
  }
728
805
 
729
- if self.verbose:
730
- llama_cpp.llama_print_timings(self.ctx)
731
-
732
806
  yield {
733
807
  "id": completion_id,
734
808
  "object": "text_completion",
@@ -764,6 +838,10 @@ class Llama:
764
838
  repeat_penalty: float = 1.1,
765
839
  top_k: int = 40,
766
840
  stream: bool = False,
841
+ tfs_z: float = 1.0,
842
+ mirostat_mode: int = 0,
843
+ mirostat_tau: float = 5.0,
844
+ mirostat_eta: float = 0.1,
767
845
  ) -> Union[Completion, Iterator[CompletionChunk]]:
768
846
  """Generate text from a prompt.
769
847
 
@@ -801,6 +879,10 @@ class Llama:
801
879
  repeat_penalty=repeat_penalty,
802
880
  top_k=top_k,
803
881
  stream=stream,
882
+ tfs_z=tfs_z,
883
+ mirostat_mode=mirostat_mode,
884
+ mirostat_tau=mirostat_tau,
885
+ mirostat_eta=mirostat_eta,
804
886
  )
805
887
  if stream:
806
888
  chunks: Iterator[CompletionChunk] = completion_or_chunks
@@ -823,6 +905,10 @@ class Llama:
823
905
  repeat_penalty: float = 1.1,
824
906
  top_k: int = 40,
825
907
  stream: bool = False,
908
+ tfs_z: float = 1.0,
909
+ mirostat_mode: int = 0,
910
+ mirostat_tau: float = 5.0,
911
+ mirostat_eta: float = 0.1,
826
912
  ) -> Union[Completion, Iterator[CompletionChunk]]:
827
913
  """Generate text from a prompt.
828
914
 
@@ -860,6 +946,10 @@ class Llama:
860
946
  repeat_penalty=repeat_penalty,
861
947
  top_k=top_k,
862
948
  stream=stream,
949
+ tfs_z=tfs_z,
950
+ mirostat_mode=mirostat_mode,
951
+ mirostat_tau=mirostat_tau,
952
+ mirostat_eta=mirostat_eta,
863
953
  )
864
954
 
865
955
  def _convert_text_completion_to_chat(
@@ -932,6 +1022,10 @@ class Llama:
932
1022
  presence_penalty: float = 0.0,
933
1023
  frequency_penalty: float = 0.0,
934
1024
  repeat_penalty: float = 1.1,
1025
+ tfs_z: float = 1.0,
1026
+ mirostat_mode: int = 0,
1027
+ mirostat_tau: float = 5.0,
1028
+ mirostat_eta: float = 0.1,
935
1029
  ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
936
1030
  """Generate a chat completion from a list of messages.
937
1031
 
@@ -966,6 +1060,10 @@ class Llama:
966
1060
  repeat_penalty=repeat_penalty,
967
1061
  presence_penalty=presence_penalty,
968
1062
  frequency_penalty=frequency_penalty,
1063
+ tfs_z=tfs_z,
1064
+ mirostat_mode=mirostat_mode,
1065
+ mirostat_tau=mirostat_tau,
1066
+ mirostat_eta=mirostat_eta,
969
1067
  )
970
1068
  if stream:
971
1069
  chunks: Iterator[CompletionChunk] = completion_or_chunks # type: ignore
@@ -985,6 +1083,7 @@ class Llama:
985
1083
  model_path=self.model_path,
986
1084
  n_ctx=self.params.n_ctx,
987
1085
  n_parts=self.params.n_parts,
1086
+ n_gpu_layers=self.params.n_gpu_layers,
988
1087
  seed=self.params.seed,
989
1088
  f16_kv=self.params.f16_kv,
990
1089
  logits_all=self.params.logits_all,
@@ -1004,6 +1103,7 @@ class Llama:
1004
1103
  model_path=state["model_path"],
1005
1104
  n_ctx=state["n_ctx"],
1006
1105
  n_parts=state["n_parts"],
1106
+ n_gpu_layers=state["n_gpu_layers"],
1007
1107
  seed=state["seed"],
1008
1108
  f16_kv=state["f16_kv"],
1009
1109
  logits_all=state["logits_all"],
@@ -68,7 +68,7 @@ _lib_base_name = "llama"
68
68
  _lib = _load_shared_library(_lib_base_name)
69
69
 
70
70
  # C types
71
- LLAMA_FILE_VERSION = c_int(1)
71
+ LLAMA_FILE_VERSION = c_int(2)
72
72
  LLAMA_FILE_MAGIC = b"ggjt"
73
73
  LLAMA_FILE_MAGIC_UNVERSIONED = b"ggml"
74
74
  LLAMA_SESSION_MAGIC = b"ggsn"
@@ -109,6 +109,7 @@ class llama_context_params(Structure):
109
109
  _fields_ = [
110
110
  ("n_ctx", c_int), # text context
111
111
  ("n_parts", c_int), # -1 for default
112
+ ("n_gpu_layers", c_int), # number of layers to store in VRAM
112
113
  ("seed", c_int), # RNG seed, 0 for random
113
114
  ("f16_kv", c_bool), # use fp16 for KV cache
114
115
  (
@@ -135,7 +136,7 @@ LLAMA_FTYPE_MOSTLY_Q4_1 = c_int(3) # except 1d tensors
135
136
  LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = c_int(
136
137
  4
137
138
  ) # tok_embeddings.weight and output.weight are F16
138
- LLAMA_FTYPE_MOSTLY_Q4_2 = c_int(5) # except 1d tensors
139
+ # LLAMA_FTYPE_MOSTLY_Q4_2 = c_int(5) # except 1d tensors
139
140
  # LLAMA_FTYPE_MOSTYL_Q4_3 = c_int(6) # except 1d tensors
140
141
  LLAMA_FTYPE_MOSTLY_Q8_0 = c_int(7) # except 1d tensors
141
142
  LLAMA_FTYPE_MOSTLY_Q5_0 = c_int(8) # except 1d tensors
@@ -259,9 +260,9 @@ _lib.llama_get_state_size.restype = c_size_t
259
260
  # Destination needs to have allocated enough memory.
260
261
  # Returns the number of bytes copied
261
262
  def llama_copy_state_data(
262
- ctx: llama_context_p, dest # type: Array[c_uint8]
263
+ ctx: llama_context_p, dst # type: Array[c_uint8]
263
264
  ) -> int:
264
- return _lib.llama_copy_state_data(ctx, dest)
265
+ return _lib.llama_copy_state_data(ctx, dst)
265
266
 
266
267
 
267
268
  _lib.llama_copy_state_data.argtypes = [llama_context_p, c_uint8_p]
@@ -350,7 +351,7 @@ def llama_tokenize(
350
351
  tokens, # type: Array[llama_token]
351
352
  n_max_tokens: c_int,
352
353
  add_bos: c_bool,
353
- ) -> c_int:
354
+ ) -> int:
354
355
  return _lib.llama_tokenize(ctx, text, tokens, n_max_tokens, add_bos)
355
356
 
356
357
 
@@ -17,6 +17,11 @@ class Settings(BaseSettings):
17
17
  description="The path to the model to use for generating completions."
18
18
  )
19
19
  n_ctx: int = Field(default=2048, ge=1, description="The context size.")
20
+ n_gpu_layers: int = Field(
21
+ default=0,
22
+ ge=0,
23
+ description="The number of layers to put on the GPU. The rest will be on the CPU.",
24
+ )
20
25
  n_batch: int = Field(
21
26
  default=512, ge=1, description="The batch size to use per eval."
22
27
  )
@@ -80,6 +85,7 @@ def create_app(settings: Optional[Settings] = None):
80
85
  global llama
81
86
  llama = llama_cpp.Llama(
82
87
  model_path=settings.model,
88
+ n_gpu_layers=settings.n_gpu_layers,
83
89
  f16_kv=settings.f16_kv,
84
90
  use_mlock=settings.use_mlock,
85
91
  use_mmap=settings.use_mmap,
@@ -152,9 +158,23 @@ repeat_penalty_field = Field(
152
158
  + "Repeat penalty is a hyperparameter used to penalize the repetition of token sequences during text generation. It helps prevent the model from generating repetitive or monotonous text. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient.",
153
159
  )
154
160
 
161
+ presence_penalty_field = Field(
162
+ default=0.0,
163
+ ge=-2.0,
164
+ le=2.0,
165
+ description="Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics.",
166
+ )
167
+
168
+ frequency_penalty_field = Field(
169
+ default=0.0,
170
+ ge=-2.0,
171
+ le=2.0,
172
+ description="Positive values penalize new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim.",
173
+ )
174
+
155
175
 
156
176
  class CreateCompletionRequest(BaseModel):
157
- prompt: Optional[str] = Field(
177
+ prompt: Union[str, List[str]] = Field(
158
178
  default="", description="The prompt to generate completions for."
159
179
  )
160
180
  suffix: Optional[str] = Field(
@@ -175,13 +195,13 @@ class CreateCompletionRequest(BaseModel):
175
195
  ge=0,
176
196
  description="The number of logprobs to generate. If None, no logprobs are generated.",
177
197
  )
198
+ presence_penalty: Optional[float] = presence_penalty_field
199
+ frequency_penalty: Optional[float] = frequency_penalty_field
178
200
 
179
201
  # ignored or currently unsupported
180
202
  model: Optional[str] = model_field
181
203
  n: Optional[int] = 1
182
204
  logprobs: Optional[int] = Field(None)
183
- presence_penalty: Optional[float] = 0
184
- frequency_penalty: Optional[float] = 0
185
205
  best_of: Optional[int] = 1
186
206
  logit_bias: Optional[Dict[str, float]] = Field(None)
187
207
  user: Optional[str] = Field(None)
@@ -209,6 +229,10 @@ CreateCompletionResponse = create_model_from_typeddict(llama_cpp.Completion)
209
229
  def create_completion(
210
230
  request: CreateCompletionRequest, llama: llama_cpp.Llama = Depends(get_llama)
211
231
  ):
232
+ if isinstance(request.prompt, list):
233
+ assert len(request.prompt) <= 1
234
+ request.prompt = request.prompt[0] if len(request.prompt) > 0 else ""
235
+
212
236
  completion_or_chunks = llama(
213
237
  **request.dict(
214
238
  exclude={
@@ -269,12 +293,12 @@ class CreateChatCompletionRequest(BaseModel):
269
293
  top_p: float = top_p_field
270
294
  stop: Optional[List[str]] = stop_field
271
295
  stream: bool = stream_field
296
+ presence_penalty: Optional[float] = presence_penalty_field
297
+ frequency_penalty: Optional[float] = frequency_penalty_field
272
298
 
273
299
  # ignored or currently unsupported
274
300
  model: Optional[str] = model_field
275
301
  n: Optional[int] = 1
276
- presence_penalty: Optional[float] = 0
277
- frequency_penalty: Optional[float] = 0
278
302
  logit_bias: Optional[Dict[str, float]] = Field(None)
279
303
  user: Optional[str] = Field(None)
280
304
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: llama-cpp-python
3
- Version: 0.1.48
3
+ Version: 0.1.50
4
4
  Summary: A Python wrapper for llama.cpp
5
5
  Author: Andrei Betlen
6
6
  Author-email: abetlen@gmail.com
@@ -53,19 +53,19 @@ Use the `FORCE_CMAKE=1` environment variable to force the use of `cmake` and ins
53
53
  To install with OpenBLAS, pass `-DLLAMA_OPENBLAS=on` through the `CMAKE_ARGS` environment variable before installing:
54
54
 
55
55
  ```bash
56
- LLAMA_OPENBLAS=1 FORCE_CMAKE=1 pip install llama-cpp-python
56
+ CMAKE_ARGS="-DLLAMA_OPENBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python
57
57
  ```
58
58
 
59
59
  To install with cuBLAS, pass `-DLLAMA_CUBLAS=on` through the `CMAKE_ARGS` environment variable before installing:
60
60
 
61
61
  ```bash
62
- LLAMA_CUBLAS=1 FORCE_CMAKE=1 pip install llama-cpp-python
62
+ CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python
63
63
  ```
64
64
 
65
65
  To install with CLBlast, pass `-DLLAMA_CLBLAST=on` through the `CMAKE_ARGS` environment variable before installing:
66
66
 
67
67
  ```bash
68
- LLAMA_CLBLAST=1 FORCE_CMAKE=1 pip install llama-cpp-python
68
+ CMAKE_ARGS="-DLLAMA_CLBLAST=on" FORCE_CMAKE=1 pip install llama-cpp-python
69
69
  ```
70
70
 
71
71
 
@@ -120,7 +120,7 @@ Navigate to [http://localhost:8000/docs](http://localhost:8000/docs) to see the
120
120
  A Docker image is available on [GHCR](https://ghcr.io/abetlen/llama-cpp-python). To run the server:
121
121
 
122
122
  ```bash
123
- docker run --rm -it -p8000:8000 -v /path/to/models:/models -eMODEL=/models/ggml-model-name.bin ghcr.io/abetlen/llama-cpp-python:latest
123
+ docker run --rm -it -p 8000:8000 -v /path/to/models:/models -e MODEL=/models/ggml-model-name.bin ghcr.io/abetlen/llama-cpp-python:latest
124
124
  ```
125
125
 
126
126
  ## Low-level API
@@ -47,6 +47,7 @@ llama_cpp_python.egg-info/dependency_links.txt
47
47
  llama_cpp_python.egg-info/requires.txt
48
48
  llama_cpp_python.egg-info/top_level.txt
49
49
  tests/test_llama.py
50
+ vendor/llama.cpp/.clang-tidy
50
51
  vendor/llama.cpp/.dockerignore
51
52
  vendor/llama.cpp/.ecrc
52
53
  vendor/llama.cpp/.editorconfig
@@ -80,6 +81,8 @@ vendor/llama.cpp/.github/ISSUE_TEMPLATE/custom.md
80
81
  vendor/llama.cpp/.github/workflows/build.yml
81
82
  vendor/llama.cpp/.github/workflows/docker.yml
82
83
  vendor/llama.cpp/.github/workflows/editorconfig.yml
84
+ vendor/llama.cpp/.github/workflows/tidy-post.yml
85
+ vendor/llama.cpp/.github/workflows/tidy-review.yml
83
86
  vendor/llama.cpp/examples/CMakeLists.txt
84
87
  vendor/llama.cpp/examples/Miku.sh
85
88
  vendor/llama.cpp/examples/alpaca.sh
@@ -90,6 +93,8 @@ vendor/llama.cpp/examples/common.cpp
90
93
  vendor/llama.cpp/examples/common.h
91
94
  vendor/llama.cpp/examples/gpt4all.sh
92
95
  vendor/llama.cpp/examples/reason-act.sh
96
+ vendor/llama.cpp/examples/baby-llama/CMakeLists.txt
97
+ vendor/llama.cpp/examples/baby-llama/baby-llama.cpp
93
98
  vendor/llama.cpp/examples/benchmark/CMakeLists.txt
94
99
  vendor/llama.cpp/examples/benchmark/benchmark-matmult.cpp
95
100
  vendor/llama.cpp/examples/embedding/CMakeLists.txt
@@ -128,16 +133,21 @@ vendor/llama.cpp/prompts/chat-with-bob.txt
128
133
  vendor/llama.cpp/prompts/chat-with-vicuna-v0.txt
129
134
  vendor/llama.cpp/prompts/chat-with-vicuna-v1.txt
130
135
  vendor/llama.cpp/prompts/chat.txt
136
+ vendor/llama.cpp/prompts/dan-modified.txt
131
137
  vendor/llama.cpp/prompts/dan.txt
132
138
  vendor/llama.cpp/prompts/reason-act.txt
133
139
  vendor/llama.cpp/scripts/build-info.cmake
134
140
  vendor/llama.cpp/scripts/build-info.h.in
135
141
  vendor/llama.cpp/scripts/build-info.sh
142
+ vendor/llama.cpp/scripts/perf-run-all.sh
143
+ vendor/llama.cpp/scripts/ppl-run-all.sh
136
144
  vendor/llama.cpp/scripts/sync-ggml.sh
137
145
  vendor/llama.cpp/scripts/verify-checksum-models.py
138
146
  vendor/llama.cpp/spm-headers/llama.h
139
147
  vendor/llama.cpp/tests/CMakeLists.txt
140
148
  vendor/llama.cpp/tests/test-double-float.c
149
+ vendor/llama.cpp/tests/test-grad0.c
150
+ vendor/llama.cpp/tests/test-opt.c
141
151
  vendor/llama.cpp/tests/test-quantize-fns.cpp
142
152
  vendor/llama.cpp/tests/test-quantize-perf.cpp
143
153
  vendor/llama.cpp/tests/test-sampling.cpp
@@ -773,14 +773,14 @@ mkdocs = ">=1.1"
773
773
 
774
774
  [[package]]
775
775
  name = "mkdocs-material"
776
- version = "9.1.9"
776
+ version = "9.1.11"
777
777
  description = "Documentation that simply works"
778
778
  category = "dev"
779
779
  optional = false
780
780
  python-versions = ">=3.7"
781
781
  files = [
782
- {file = "mkdocs_material-9.1.9-py3-none-any.whl", hash = "sha256:7db24261cb17400e132c46d17eea712bfe71056d892a9beba32cf68210297141"},
783
- {file = "mkdocs_material-9.1.9.tar.gz", hash = "sha256:74d8da1371ab3a326868fe47bae3cbc4aa22e93c048b4ca5117e6817b88bd734"},
782
+ {file = "mkdocs_material-9.1.11-py3-none-any.whl", hash = "sha256:fbc86d50ec2cf34d40d5c4365780f290ceedde23f1a0704323b34e7f16b0c0dd"},
783
+ {file = "mkdocs_material-9.1.11.tar.gz", hash = "sha256:f5d473eb79d6640a5e668d4b2ab5b9de5e76ae0a0e2d864112df0cfe9016dc1d"},
784
784
  ]
785
785
 
786
786
  [package.dependencies]
@@ -1439,4 +1439,4 @@ testing = ["big-O", "flake8 (<5)", "jaraco.functools", "jaraco.itertools", "more
1439
1439
  [metadata]
1440
1440
  lock-version = "2.0"
1441
1441
  python-versions = "^3.8.1"
1442
- content-hash = "e87403dcd0a0b8484436b02c392326adfaf22b8d7e182d77e4a155c67a7435bc"
1442
+ content-hash = "6bea74d847b958639276d4be527c2b65dafeb0a455b6e3d1f29fee5171ce73b2"
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "llama_cpp_python"
3
- version = "0.1.48"
3
+ version = "0.1.50"
4
4
  description = "Python bindings for the llama.cpp library"
5
5
  authors = ["Andrei Betlen <abetlen@gmail.com>"]
6
6
  license = "MIT"
@@ -22,7 +22,7 @@ black = "^23.3.0"
22
22
  twine = "^4.0.2"
23
23
  mkdocs = "^1.4.3"
24
24
  mkdocstrings = {extras = ["python"], version = "^0.21.2"}
25
- mkdocs-material = "^9.1.9"
25
+ mkdocs-material = "^9.1.11"
26
26
  pytest = "^7.3.1"
27
27
  httpx = "^0.24.0"
28
28
 
@@ -10,7 +10,7 @@ setup(
10
10
  description="A Python wrapper for llama.cpp",
11
11
  long_description=long_description,
12
12
  long_description_content_type="text/markdown",
13
- version="0.1.48",
13
+ version="0.1.50",
14
14
  author="Andrei Betlen",
15
15
  author_email="abetlen@gmail.com",
16
16
  license="MIT",