llama-cpp-python 0.1.55__tar.gz → 0.1.56__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (171)
  1. llama_cpp_python-0.1.56/CHANGELOG.md +20 -0
  2. llama_cpp_python-0.1.56/Makefile +49 -0
  3. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/PKG-INFO +12 -1
  4. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/README.md +11 -0
  5. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/llama_cpp/llama.py +64 -36
  6. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/llama_cpp/server/app.py +88 -58
  7. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/llama_cpp_python.egg-info/PKG-INFO +12 -1
  8. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/llama_cpp_python.egg-info/SOURCES.txt +3 -0
  9. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/llama_cpp_python.egg-info/requires.txt +1 -0
  10. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/poetry.lock +379 -156
  11. llama_cpp_python-0.1.56/poetry.toml +3 -0
  12. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/pyproject.toml +12 -5
  13. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/setup.py +2 -4
  14. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/.dockerignore +0 -0
  15. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/.github/ISSUE_TEMPLATE/bug_report.md +0 -0
  16. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/.github/ISSUE_TEMPLATE/feature_request.md +0 -0
  17. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/.github/dependabot.yml +0 -0
  18. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/.github/workflows/build-and-release.yaml +0 -0
  19. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/.github/workflows/build-docker.yaml +0 -0
  20. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/.github/workflows/publish-to-test.yaml +0 -0
  21. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/.github/workflows/publish.yaml +0 -0
  22. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/.github/workflows/test.yaml +0 -0
  23. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/.gitignore +0 -0
  24. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/.gitmodules +0 -0
  25. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/CMakeLists.txt +0 -0
  26. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/LICENSE.md +0 -0
  27. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/docker/Dockerfile +0 -0
  28. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/docker/Dockerfile.cuda_simple +0 -0
  29. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/docker/Dockerfile.openblas_simple +0 -0
  30. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/docker/README.md +0 -0
  31. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/docker/hug_model.py +0 -0
  32. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/docker/start_server.sh +0 -0
  33. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/docs/index.md +0 -0
  34. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/examples/high_level_api/fastapi_server.py +0 -0
  35. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/examples/high_level_api/high_level_api_embedding.py +0 -0
  36. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/examples/high_level_api/high_level_api_inference.py +0 -0
  37. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/examples/high_level_api/high_level_api_streaming.py +0 -0
  38. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/examples/high_level_api/langchain_custom_llm.py +0 -0
  39. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/examples/low_level_api/Chat.py +0 -0
  40. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/examples/low_level_api/Miku.py +0 -0
  41. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/examples/low_level_api/ReasonAct.py +0 -0
  42. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/examples/low_level_api/common.py +0 -0
  43. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/examples/low_level_api/low_level_api_chat_cpp.py +0 -0
  44. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/examples/low_level_api/low_level_api_llama_cpp.py +0 -0
  45. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/examples/low_level_api/quantize.py +0 -0
  46. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/examples/low_level_api/util.py +0 -0
  47. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/examples/notebooks/Clients.ipynb +0 -0
  48. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/examples/notebooks/Guidance.ipynb +0 -0
  49. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/examples/notebooks/PerformanceTuning.ipynb +0 -0
  50. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/llama_cpp/__init__.py +0 -0
  51. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/llama_cpp/llama_cpp.py +0 -0
  52. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/llama_cpp/llama_types.py +0 -0
  53. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/llama_cpp/server/__init__.py +0 -0
  54. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/llama_cpp/server/__main__.py +0 -0
  55. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/llama_cpp_python.egg-info/dependency_links.txt +0 -0
  56. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/llama_cpp_python.egg-info/top_level.txt +0 -0
  57. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/mkdocs.yml +0 -0
  58. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/setup.cfg +0 -0
  59. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/tests/test_llama.py +0 -0
  60. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/.clang-tidy +0 -0
  61. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/.devops/full.Dockerfile +0 -0
  62. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/.devops/main.Dockerfile +0 -0
  63. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/.devops/tools.sh +0 -0
  64. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/.dockerignore +0 -0
  65. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/.ecrc +0 -0
  66. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/.editorconfig +0 -0
  67. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/.github/ISSUE_TEMPLATE/custom.md +0 -0
  68. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/.github/workflows/build.yml +0 -0
  69. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/.github/workflows/docker.yml +0 -0
  70. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/.github/workflows/editorconfig.yml +0 -0
  71. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/.github/workflows/tidy-post.yml +0 -0
  72. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/.github/workflows/tidy-review.yml +0 -0
  73. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/.gitignore +0 -0
  74. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/BLIS.md +0 -0
  75. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/CMakeLists.txt +0 -0
  76. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/LICENSE +0 -0
  77. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/Makefile +0 -0
  78. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/Package.swift +0 -0
  79. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/README.md +0 -0
  80. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/SHA256SUMS +0 -0
  81. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/build.zig +0 -0
  82. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/convert-lora-to-ggml.py +0 -0
  83. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/convert-pth-to-ggml.py +0 -0
  84. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/convert.py +0 -0
  85. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/CMakeLists.txt +0 -0
  86. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/Miku.sh +0 -0
  87. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/alpaca.sh +0 -0
  88. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/baby-llama/CMakeLists.txt +0 -0
  89. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -0
  90. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/benchmark/CMakeLists.txt +0 -0
  91. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/benchmark/benchmark-matmult.cpp +0 -0
  92. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/chat-13B.bat +0 -0
  93. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/chat-13B.sh +0 -0
  94. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/chat-persistent.sh +0 -0
  95. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/chat.sh +0 -0
  96. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/common.cpp +0 -0
  97. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/common.h +0 -0
  98. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/embedding/CMakeLists.txt +0 -0
  99. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/embedding/README.md +0 -0
  100. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/embedding/embedding.cpp +0 -0
  101. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/gpt4all.sh +0 -0
  102. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/jeopardy/README.md +0 -0
  103. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/jeopardy/graph.py +0 -0
  104. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/jeopardy/jeopardy.sh +0 -0
  105. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/jeopardy/qasheet.csv +0 -0
  106. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/jeopardy/questions.txt +0 -0
  107. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/main/CMakeLists.txt +0 -0
  108. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/main/README.md +0 -0
  109. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/main/main.cpp +0 -0
  110. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/perplexity/CMakeLists.txt +0 -0
  111. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/perplexity/README.md +0 -0
  112. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/perplexity/perplexity.cpp +0 -0
  113. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/quantize/CMakeLists.txt +0 -0
  114. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/quantize/README.md +0 -0
  115. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/quantize/quantize.cpp +0 -0
  116. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/quantize-stats/CMakeLists.txt +0 -0
  117. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/quantize-stats/quantize-stats.cpp +0 -0
  118. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/reason-act.sh +0 -0
  119. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/save-load-state/CMakeLists.txt +0 -0
  120. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/save-load-state/save-load-state.cpp +0 -0
  121. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/server/CMakeLists.txt +0 -0
  122. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/server/README.md +0 -0
  123. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/server/httplib.h +0 -0
  124. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/server/json.hpp +0 -0
  125. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/server/server.cpp +0 -0
  126. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/flake.lock +0 -0
  127. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/flake.nix +0 -0
  128. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/ggml-cuda.cu +0 -0
  129. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/ggml-cuda.h +0 -0
  130. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/ggml-opencl.cpp +0 -0
  131. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/ggml-opencl.h +0 -0
  132. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/ggml.c +0 -0
  133. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/ggml.h +0 -0
  134. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/llama-util.h +0 -0
  135. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/llama.cpp +0 -0
  136. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/llama.h +0 -0
  137. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/media/llama-leader.jpeg +0 -0
  138. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/media/llama0-banner.png +0 -0
  139. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/media/llama0-logo.png +0 -0
  140. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/media/llama1-banner.png +0 -0
  141. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/media/llama1-logo.png +0 -0
  142. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/models/ggml-vocab.bin +0 -0
  143. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/pocs/CMakeLists.txt +0 -0
  144. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/pocs/vdot/CMakeLists.txt +0 -0
  145. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/pocs/vdot/q8dot.cpp +0 -0
  146. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/pocs/vdot/vdot.cpp +0 -0
  147. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/prompts/alpaca.txt +0 -0
  148. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/prompts/chat-with-bob.txt +0 -0
  149. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/prompts/chat-with-vicuna-v0.txt +0 -0
  150. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/prompts/chat-with-vicuna-v1.txt +0 -0
  151. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/prompts/chat.txt +0 -0
  152. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/prompts/dan-modified.txt +0 -0
  153. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/prompts/dan.txt +0 -0
  154. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/prompts/reason-act.txt +0 -0
  155. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/requirements.txt +0 -0
  156. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/scripts/build-info.cmake +0 -0
  157. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/scripts/build-info.h.in +0 -0
  158. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/scripts/build-info.sh +0 -0
  159. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/scripts/perf-run-all.sh +0 -0
  160. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/scripts/ppl-run-all.sh +0 -0
  161. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/scripts/sync-ggml.sh +0 -0
  162. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/scripts/verify-checksum-models.py +0 -0
  163. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/spm-headers/llama.h +0 -0
  164. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/tests/CMakeLists.txt +0 -0
  165. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/tests/test-double-float.c +0 -0
  166. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/tests/test-grad0.c +0 -0
  167. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/tests/test-opt.c +0 -0
  168. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/tests/test-quantize-fns.cpp +0 -0
  169. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/tests/test-quantize-perf.cpp +0 -0
  170. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/tests/test-sampling.cpp +0 -0
  171. {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/tests/test-tokenizer-0.cpp +0 -0
@@ -0,0 +1,20 @@
1
+ # Changelog
2
+
3
+ All notable changes to this project will be documented in this file.
4
+
5
+ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
6
+ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
+
8
+ ## [Unreleased]
9
+
10
+ ## [v0.1.56]
11
+
12
+ ### Added
13
+
14
+ - Added first version of the changelog
15
+ - Server: Use async routes
16
+ - Use numpy for internal buffers to reduce memory usage and improve performance.
17
+
18
+ ### Fixed
19
+
20
+ - Performance bug in stop sequence check slowing down streaming.
@@ -0,0 +1,49 @@
1
+ update:
2
+ poetry install
3
+ git submodule update --init --recursive
4
+
5
+ update.vendor:
6
+ cd vendor/llama.cpp && git pull origin master
7
+
8
+ build:
9
+ python3 setup.py develop
10
+
11
+ build.cuda:
12
+ CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 python3 setup.py develop
13
+
14
+ build.opencl:
15
+ CMAKE_ARGS="-DLLAMA_CLBLAST=on" FORCE_CMAKE=1 python3 setup.py develop
16
+
17
+ build.openblas:
18
+ CMAKE_ARGS="-DLLAMA_OPENBLAS=on" FORCE_CMAKE=1 python3 setup.py develop
19
+
20
+ build.blis:
21
+ CMAKE_ARGS="-DLLAMA_OPENBLAS=on -DLLAMA_OPENBLAS_VENDOR=blis" FORCE_CMAKE=1 python3 setup.py develop
22
+
23
+ build.sdist:
24
+ python3 setup.py sdist
25
+
26
+ deploy.pypi:
27
+ python3 -m twine upload dist/*
28
+
29
+ deploy.gh-docs:
30
+ mkdocs build
31
+ mkdocs gh-deploy
32
+
33
+ clean:
34
+ - cd vendor/llama.cpp && make clean
35
+ - cd vendor/llama.cpp && rm libllama.so
36
+ - rm -rf _skbuild
37
+ - rm llama_cpp/libllama.so
38
+
39
+ .PHONY: \
40
+ update \
41
+ update.vendor \
42
+ build \
43
+ build.cuda \
44
+ build.opencl \
45
+ build.openblas \
46
+ build.sdist \
47
+ deploy.pypi \
48
+ deploy.gh-docs \
49
+ clean
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: llama_cpp_python
3
- Version: 0.1.55
3
+ Version: 0.1.56
4
4
  Summary: A Python wrapper for llama.cpp
5
5
  Author: Andrei Betlen
6
6
  Author-email: abetlen@gmail.com
@@ -173,6 +173,17 @@ To get started, clone the repository and install the package in development mode
173
173
 
174
174
  ```bash
175
175
  git clone --recurse-submodules git@github.com:abetlen/llama-cpp-python.git
176
+
177
+ # Install with pip
178
+ pip install -e .
179
+
180
+ # if you want to use the fastapi / openapi server
181
+ pip install -e .[server]
182
+
183
+ # If you're a poetry user, installing will also include a virtual environment
184
+ poetry install --all-extras
185
+ . .venv/bin/activate
186
+
176
187
  # Will need to be re-run any time vendor/llama.cpp is updated
177
188
  python3 setup.py develop
178
189
  ```
@@ -155,6 +155,17 @@ To get started, clone the repository and install the package in development mode
155
155
 
156
156
  ```bash
157
157
  git clone --recurse-submodules git@github.com:abetlen/llama-cpp-python.git
158
+
159
+ # Install with pip
160
+ pip install -e .
161
+
162
+ # if you want to use the fastapi / openapi server
163
+ pip install -e .[server]
164
+
165
+ # If you're a poetry user, installing will also include a virtual environment
166
+ poetry install --all-extras
167
+ . .venv/bin/activate
168
+
158
169
  # Will need to be re-run any time vendor/llama.cpp is updated
159
170
  python3 setup.py develop
160
171
  ```
@@ -20,6 +20,9 @@ from collections import deque, OrderedDict
20
20
  from . import llama_cpp
21
21
  from .llama_types import *
22
22
 
23
+ import numpy as np
24
+ import numpy.typing as npt
25
+
23
26
 
24
27
  class LlamaCache:
25
28
  """Cache for a llama.cpp model."""
@@ -73,11 +76,15 @@ class LlamaState:
73
76
  self,
74
77
  eval_tokens: Deque[int],
75
78
  eval_logits: Deque[List[float]],
79
+ input_ids: npt.NDArray[np.intc],
80
+ scores: npt.NDArray[np.single],
76
81
  llama_state, # type: llama_cpp.Array[llama_cpp.c_uint8]
77
82
  llama_state_size: int,
78
83
  ):
79
84
  self.eval_tokens = eval_tokens
80
85
  self.eval_logits = eval_logits
86
+ self.input_ids = input_ids
87
+ self.scores = scores
81
88
  self.llama_state = llama_state
82
89
  self.llama_state_size = llama_state_size
83
90
 
@@ -207,20 +214,17 @@ class Llama:
207
214
 
208
215
  self._n_vocab = self.n_vocab()
209
216
  self._n_ctx = self.n_ctx()
210
- data = (llama_cpp.llama_token_data * self._n_vocab)(
211
- *[
212
- llama_cpp.llama_token_data(
213
- id=llama_cpp.llama_token(i),
214
- logit=llama_cpp.c_float(0.0),
215
- p=llama_cpp.c_float(0.0),
216
- )
217
- for i in range(self._n_vocab)
218
- ]
219
- )
220
217
  size = llama_cpp.c_size_t(self._n_vocab)
221
- sorted = False
218
+ sorted = llama_cpp.c_bool(False)
219
+ self._candidates_data = np.array(
220
+ [],
221
+ dtype=np.dtype(
222
+ [("id", np.intc), ("logit", np.single), ("p", np.single)], align=True
223
+ ),
224
+ )
225
+ self._candidates_data.resize(3, self._n_vocab)
222
226
  candidates = llama_cpp.llama_token_data_array(
223
- data=data,
227
+ data=self._candidates_data.ctypes.data_as(llama_cpp.llama_token_data_p),
224
228
  size=size,
225
229
  sorted=sorted,
226
230
  )
@@ -228,6 +232,9 @@ class Llama:
228
232
  self._token_nl = Llama.token_nl()
229
233
  self._token_eos = Llama.token_eos()
230
234
 
235
+ self._input_ids = np.array([], dtype=np.intc)
236
+ self._scores = np.ndarray((0, self._n_vocab), dtype=np.single)
237
+
231
238
  def tokenize(self, text: bytes, add_bos: bool = True) -> List[int]:
232
239
  """Tokenize a string.
233
240
 
@@ -295,6 +302,8 @@ class Llama:
295
302
  """Reset the model state."""
296
303
  self.eval_tokens.clear()
297
304
  self.eval_logits.clear()
305
+ self._input_ids = np.array([], dtype=np.intc)
306
+ self._scores = np.ndarray((0, self._n_vocab), dtype=np.single)
298
307
 
299
308
  def eval(self, tokens: Sequence[int]):
300
309
  """Evaluate a list of tokens.
@@ -306,7 +315,7 @@ class Llama:
306
315
  n_ctx = self._n_ctx
307
316
  for i in range(0, len(tokens), self.n_batch):
308
317
  batch = tokens[i : min(len(tokens), i + self.n_batch)]
309
- n_past = min(n_ctx - len(batch), len(self.eval_tokens))
318
+ n_past = min(n_ctx - len(batch), len(self._input_ids))
310
319
  n_tokens = len(batch)
311
320
  return_code = llama_cpp.llama_eval(
312
321
  ctx=self.ctx,
@@ -319,6 +328,9 @@ class Llama:
319
328
  raise RuntimeError(f"llama_eval returned {return_code}")
320
329
  # Save tokens
321
330
  self.eval_tokens.extend(batch)
331
+ self._input_ids: npt.NDArray[np.intc] = np.concatenate(
332
+ (self._input_ids, np.array(batch, dtype=np.intc)), axis=0
333
+ )
322
334
  # Save logits
323
335
  rows = n_tokens if self.params.logits_all else 1
324
336
  n_vocab = self._n_vocab
@@ -326,6 +338,9 @@ class Llama:
326
338
  logits_view = llama_cpp.llama_get_logits(self.ctx)
327
339
  logits = [logits_view[i * cols : (i + 1) * cols] for i in range(rows)]
328
340
  self.eval_logits.extend(logits)
341
+ self._scores: npt.NDArray[np.single] = np.concatenate(
342
+ (self._scores, np.array(logits, dtype=np.single)), axis=0
343
+ )
329
344
 
330
345
  def _sample(
331
346
  self,
@@ -346,6 +361,7 @@ class Llama:
346
361
  ):
347
362
  assert self.ctx is not None
348
363
  assert len(self.eval_logits) > 0
364
+ assert self._scores.shape[0] > 0
349
365
  n_vocab = self._n_vocab
350
366
  n_ctx = self._n_ctx
351
367
  top_k = llama_cpp.c_int(n_vocab) if top_k.value <= 0 else top_k
@@ -354,18 +370,23 @@ class Llama:
354
370
  if last_n_tokens_size.value < 0
355
371
  else last_n_tokens_size
356
372
  )
357
- logits = self.eval_logits[-1]
373
+ logits: npt.NDArray[np.single] = self._scores[-1, :]
358
374
 
359
375
  if logits_processor is not None:
360
- logits = logits_processor(list(self.eval_tokens), logits)
361
- self.eval_logits[-1] = logits
376
+ logits = np.array(
377
+ logits_processor(self._input_ids.tolist(), logits.tolist()),
378
+ dtype=np.single,
379
+ )
380
+ self._scores[-1, :] = logits
381
+ self.eval_logits[-1] = logits.tolist()
362
382
 
363
383
  nl_logit = logits[self._token_nl]
364
384
  candidates = self._candidates
365
- for i, logit in enumerate(logits):
366
- candidates.data[i].id = llama_cpp.llama_token(i)
367
- candidates.data[i].logit = llama_cpp.c_float(logit)
368
- candidates.data[i].p = llama_cpp.c_float(0.0)
385
+ candidates_data = self._candidates_data
386
+ candidates_data["id"] = np.arange(n_vocab, dtype=np.intc) # type: ignore
387
+ candidates_data["logit"] = logits
388
+ candidates_data["p"] = np.zeros(n_vocab, dtype=np.single)
389
+ candidates.data = candidates_data.ctypes.data_as(llama_cpp.llama_token_data_p)
369
390
  candidates.sorted = llama_cpp.c_bool(False)
370
391
  candidates.size = llama_cpp.c_size_t(n_vocab)
371
392
  llama_cpp.llama_sample_repetition_penalty(
@@ -483,8 +504,8 @@ class Llama:
483
504
  """
484
505
  assert self.ctx is not None
485
506
  last_n_tokens_data = [llama_cpp.llama_token(0)] * max(
486
- 0, self.last_n_tokens_size - len(self.eval_tokens)
487
- ) + list(self.eval_tokens)[-self.last_n_tokens_size :]
507
+ 0, self.last_n_tokens_size - len(self._input_ids)
508
+ ) + self._input_ids[-self.last_n_tokens_size :].tolist()
488
509
  return self._sample(
489
510
  last_n_tokens_data=(llama_cpp.llama_token * self.last_n_tokens_size)(
490
511
  *last_n_tokens_data
@@ -542,9 +563,9 @@ class Llama:
542
563
  """
543
564
  assert self.ctx is not None
544
565
 
545
- if reset and len(self.eval_tokens) > 0:
566
+ if reset and len(self._input_ids) > 0:
546
567
  longest_prefix = 0
547
- for a, b in zip(self.eval_tokens, tokens[:-1]):
568
+ for a, b in zip(self._input_ids, tokens[:-1]):
548
569
  if a == b:
549
570
  longest_prefix += 1
550
571
  else:
@@ -554,6 +575,8 @@ class Llama:
554
575
  print("Llama.generate: prefix-match hit", file=sys.stderr)
555
576
  reset = False
556
577
  tokens = tokens[longest_prefix:]
578
+ self._input_ids = self._input_ids[:longest_prefix]
579
+ self._scores = self._scores[:longest_prefix, :]
557
580
  for _ in range(len(self.eval_tokens) - longest_prefix):
558
581
  self.eval_tokens.pop()
559
582
  try:
@@ -580,7 +603,7 @@ class Llama:
580
603
  logits_processor=logits_processor,
581
604
  )
582
605
  if stopping_criteria is not None and stopping_criteria(
583
- list(self.eval_tokens), self.eval_logits[-1]
606
+ self._input_ids.tolist(), self._scores[-1, :].tolist()
584
607
  ):
585
608
  return
586
609
  tokens_or_none = yield token
@@ -715,10 +738,10 @@ class Llama:
715
738
  try:
716
739
  cache_item = self.cache[prompt_tokens]
717
740
  cache_prefix_len = Llama.longest_token_prefix(
718
- cache_item.eval_tokens, prompt_tokens
741
+ cache_item.input_ids.tolist(), prompt_tokens
719
742
  )
720
743
  eval_prefix_len = Llama.longest_token_prefix(
721
- self.eval_tokens, prompt_tokens
744
+ self._input_ids.tolist(), prompt_tokens
722
745
  )
723
746
  if cache_prefix_len > eval_prefix_len:
724
747
  self.load_state(cache_item)
@@ -775,20 +798,22 @@ class Llama:
775
798
  break
776
799
 
777
800
  if stream:
801
+ remaining_tokens = completion_tokens[returned_tokens:]
802
+ remaining_text = self.detokenize(remaining_tokens)
803
+ remaining_length = len(remaining_text)
804
+
778
805
  # We want to avoid yielding any characters from
779
806
  # the generated text if they are part of a stop
780
807
  # sequence.
781
808
  first_stop_position = 0
782
809
  for s in stop_sequences:
783
- for i in range(len(s), 0, -1):
784
- if all_text.endswith(s[:i]):
810
+ for i in range(min(len(s), remaining_length), 0, -1):
811
+ if remaining_text.endswith(s[:i]):
785
812
  if i > first_stop_position:
786
813
  first_stop_position = i
787
814
  break
788
815
 
789
816
  token_end_position = 0
790
- remaining_tokens = completion_tokens[returned_tokens:]
791
- remaining_length = len(self.detokenize(remaining_tokens))
792
817
  for token in remaining_tokens:
793
818
  token_end_position += len(self.detokenize([token]))
794
819
  # Check if stop sequence is in the token
@@ -805,7 +830,7 @@ class Llama:
805
830
  self.detokenize(completion_tokens[:returned_tokens])
806
831
  )
807
832
  token_offset = len(prompt_tokens) + returned_tokens
808
- logits = self.eval_logits[token_offset - 1]
833
+ logits = self._scores[token_offset - 1, :].tolist()
809
834
  current_logprobs = Llama.logits_to_logprobs(logits)
810
835
  sorted_logprobs = list(
811
836
  sorted(
@@ -854,7 +879,7 @@ class Llama:
854
879
  break
855
880
 
856
881
  if stopping_criteria is not None and stopping_criteria(
857
- list(self.eval_tokens), self.eval_logits[-1]
882
+ self._input_ids.tolist(), self._scores[-1, :].tolist()
858
883
  ):
859
884
  text = self.detokenize(completion_tokens)
860
885
  finish_reason = "stop"
@@ -884,7 +909,7 @@ class Llama:
884
909
  self.detokenize(completion_tokens[:returned_tokens])
885
910
  )
886
911
  token_offset = len(prompt_tokens) + returned_tokens - 1
887
- logits = self.eval_logits[token_offset]
912
+ logits = self._scores[token_offset, :].tolist()
888
913
  current_logprobs = Llama.logits_to_logprobs(logits)
889
914
  sorted_logprobs = list(
890
915
  sorted(
@@ -986,8 +1011,7 @@ class Llama:
986
1011
  for token in all_tokens
987
1012
  ]
988
1013
  all_logprobs = [
989
- Llama.logits_to_logprobs(list(map(float, row)))
990
- for row in self.eval_logits
1014
+ Llama.logits_to_logprobs(row.tolist()) for row in self._scores
991
1015
  ][token_offset:]
992
1016
  for token, token_str, logprobs_token in zip(
993
1017
  all_tokens, all_token_strs, all_logprobs
@@ -1371,6 +1395,8 @@ class Llama:
1371
1395
  return LlamaState(
1372
1396
  eval_tokens=self.eval_tokens.copy(),
1373
1397
  eval_logits=self.eval_logits.copy(),
1398
+ scores=self._scores.copy(),
1399
+ input_ids=self._input_ids.copy(),
1374
1400
  llama_state=llama_state_compact,
1375
1401
  llama_state_size=n_bytes,
1376
1402
  )
@@ -1379,6 +1405,8 @@ class Llama:
1379
1405
  assert self.ctx is not None
1380
1406
  self.eval_tokens = state.eval_tokens.copy()
1381
1407
  self.eval_logits = state.eval_logits.copy()
1408
+ self._scores = state.scores.copy()
1409
+ self._input_ids = state.input_ids.copy()
1382
1410
  state_size = state.llama_state_size
1383
1411
  if llama_cpp.llama_set_state_data(self.ctx, state.llama_state) != state_size:
1384
1412
  raise RuntimeError("Failed to set llama state data")
@@ -1,13 +1,16 @@
1
1
  import json
2
- import logging
3
2
  import multiprocessing
4
3
  from threading import Lock
5
- from typing import List, Optional, Union, Iterator, Dict
4
+ from functools import partial
5
+ from typing import Iterator, List, Optional, Union, Dict
6
6
  from typing_extensions import TypedDict, Literal
7
7
 
8
8
  import llama_cpp
9
9
 
10
- from fastapi import Depends, FastAPI, APIRouter
10
+ import anyio
11
+ from anyio.streams.memory import MemoryObjectSendStream
12
+ from starlette.concurrency import run_in_threadpool, iterate_in_threadpool
13
+ from fastapi import Depends, FastAPI, APIRouter, Request
11
14
  from fastapi.middleware.cors import CORSMiddleware
12
15
  from pydantic import BaseModel, BaseSettings, Field, create_model_from_typeddict
13
16
  from sse_starlette.sse import EventSourceResponse
@@ -242,35 +245,49 @@ CreateCompletionResponse = create_model_from_typeddict(llama_cpp.Completion)
242
245
  "/v1/completions",
243
246
  response_model=CreateCompletionResponse,
244
247
  )
245
- def create_completion(
246
- request: CreateCompletionRequest, llama: llama_cpp.Llama = Depends(get_llama)
248
+ async def create_completion(
249
+ request: Request,
250
+ body: CreateCompletionRequest,
251
+ llama: llama_cpp.Llama = Depends(get_llama),
247
252
  ):
248
- if isinstance(request.prompt, list):
249
- assert len(request.prompt) <= 1
250
- request.prompt = request.prompt[0] if len(request.prompt) > 0 else ""
251
-
252
- completion_or_chunks = llama(
253
- **request.dict(
254
- exclude={
255
- "n",
256
- "best_of",
257
- "logit_bias",
258
- "user",
259
- }
260
- )
261
- )
262
- if request.stream:
263
-
264
- async def server_sent_events(
265
- chunks: Iterator[llama_cpp.CompletionChunk],
266
- ):
267
- for chunk in chunks:
268
- yield dict(data=json.dumps(chunk))
253
+ if isinstance(body.prompt, list):
254
+ assert len(body.prompt) <= 1
255
+ body.prompt = body.prompt[0] if len(body.prompt) > 0 else ""
256
+
257
+ exclude = {
258
+ "n",
259
+ "best_of",
260
+ "logit_bias",
261
+ "user",
262
+ }
263
+ kwargs = body.dict(exclude=exclude)
264
+ if body.stream:
265
+ send_chan, recv_chan = anyio.create_memory_object_stream(10)
266
+
267
+ async def event_publisher(inner_send_chan: MemoryObjectSendStream):
268
+ async with inner_send_chan:
269
+ try:
270
+ iterator: Iterator[llama_cpp.CompletionChunk] = await run_in_threadpool(llama, **kwargs) # type: ignore
271
+ async for chunk in iterate_in_threadpool(iterator):
272
+ await inner_send_chan.send(dict(data=json.dumps(chunk)))
273
+ if await request.is_disconnected():
274
+ raise anyio.get_cancelled_exc_class()()
275
+ await inner_send_chan.send(dict(data="[DONE]"))
276
+ except anyio.get_cancelled_exc_class() as e:
277
+ print("disconnected")
278
+ with anyio.move_on_after(1, shield=True):
279
+ print(
280
+ f"Disconnected from client (via refresh/close) {request.client}"
281
+ )
282
+ await inner_send_chan.send(dict(closing=True))
283
+ raise e
269
284
 
270
- chunks: Iterator[llama_cpp.CompletionChunk] = completion_or_chunks # type: ignore
271
- return EventSourceResponse(server_sent_events(chunks))
272
- completion: llama_cpp.Completion = completion_or_chunks # type: ignore
273
- return completion
285
+ return EventSourceResponse(
286
+ recv_chan, data_sender_callable=partial(event_publisher, send_chan)
287
+ )
288
+ else:
289
+ completion: llama_cpp.Completion = await run_in_threadpool(llama, **kwargs) # type: ignore
290
+ return completion
274
291
 
275
292
 
276
293
  class CreateEmbeddingRequest(BaseModel):
@@ -293,10 +310,12 @@ CreateEmbeddingResponse = create_model_from_typeddict(llama_cpp.Embedding)
293
310
  "/v1/embeddings",
294
311
  response_model=CreateEmbeddingResponse,
295
312
  )
296
- def create_embedding(
313
+ async def create_embedding(
297
314
  request: CreateEmbeddingRequest, llama: llama_cpp.Llama = Depends(get_llama)
298
315
  ):
299
- return llama.create_embedding(**request.dict(exclude={"user"}))
316
+ return await run_in_threadpool(
317
+ llama.create_embedding, **request.dict(exclude={"user"})
318
+ )
300
319
 
301
320
 
302
321
  class ChatCompletionRequestMessage(BaseModel):
@@ -350,36 +369,47 @@ CreateChatCompletionResponse = create_model_from_typeddict(llama_cpp.ChatComplet
350
369
  "/v1/chat/completions",
351
370
  response_model=CreateChatCompletionResponse,
352
371
  )
353
- def create_chat_completion(
354
- request: CreateChatCompletionRequest,
372
+ async def create_chat_completion(
373
+ request: Request,
374
+ body: CreateChatCompletionRequest,
355
375
  llama: llama_cpp.Llama = Depends(get_llama),
356
376
  ) -> Union[llama_cpp.ChatCompletion, EventSourceResponse]:
357
- completion_or_chunks = llama.create_chat_completion(
358
- **request.dict(
359
- exclude={
360
- "n",
361
- "logit_bias",
362
- "user",
363
- }
364
- ),
365
- )
366
-
367
- if request.stream:
368
-
369
- async def server_sent_events(
370
- chat_chunks: Iterator[llama_cpp.ChatCompletionChunk],
371
- ):
372
- for chat_chunk in chat_chunks:
373
- yield dict(data=json.dumps(chat_chunk))
374
- yield dict(data="[DONE]")
375
-
376
- chunks: Iterator[llama_cpp.ChatCompletionChunk] = completion_or_chunks # type: ignore
377
+ exclude = {
378
+ "n",
379
+ "logit_bias",
380
+ "user",
381
+ }
382
+ kwargs = body.dict(exclude=exclude)
383
+ if body.stream:
384
+ send_chan, recv_chan = anyio.create_memory_object_stream(10)
385
+
386
+ async def event_publisher(inner_send_chan: MemoryObjectSendStream):
387
+ async with inner_send_chan:
388
+ try:
389
+ iterator: Iterator[llama_cpp.ChatCompletionChunk] = await run_in_threadpool(llama.create_chat_completion, **kwargs) # type: ignore
390
+ async for chat_chunk in iterate_in_threadpool(iterator):
391
+ await inner_send_chan.send(dict(data=json.dumps(chat_chunk)))
392
+ if await request.is_disconnected():
393
+ raise anyio.get_cancelled_exc_class()()
394
+ await inner_send_chan.send(dict(data="[DONE]"))
395
+ except anyio.get_cancelled_exc_class() as e:
396
+ print("disconnected")
397
+ with anyio.move_on_after(1, shield=True):
398
+ print(
399
+ f"Disconnected from client (via refresh/close) {request.client}"
400
+ )
401
+ await inner_send_chan.send(dict(closing=True))
402
+ raise e
377
403
 
378
404
  return EventSourceResponse(
379
- server_sent_events(chunks),
405
+ recv_chan,
406
+ data_sender_callable=partial(event_publisher, send_chan),
407
+ )
408
+ else:
409
+ completion: llama_cpp.ChatCompletion = await run_in_threadpool(
410
+ llama.create_chat_completion, **kwargs # type: ignore
380
411
  )
381
- completion: llama_cpp.ChatCompletion = completion_or_chunks # type: ignore
382
- return completion
412
+ return completion
383
413
 
384
414
 
385
415
  class ModelData(TypedDict):
@@ -398,7 +428,7 @@ GetModelResponse = create_model_from_typeddict(ModelList)
398
428
 
399
429
 
400
430
  @router.get("/v1/models", response_model=GetModelResponse)
401
- def get_models(
431
+ async def get_models(
402
432
  settings: Settings = Depends(get_settings),
403
433
  llama: llama_cpp.Llama = Depends(get_llama),
404
434
  ) -> ModelList:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: llama-cpp-python
3
- Version: 0.1.55
3
+ Version: 0.1.56
4
4
  Summary: A Python wrapper for llama.cpp
5
5
  Author: Andrei Betlen
6
6
  Author-email: abetlen@gmail.com
@@ -173,6 +173,17 @@ To get started, clone the repository and install the package in development mode
173
173
 
174
174
  ```bash
175
175
  git clone --recurse-submodules git@github.com:abetlen/llama-cpp-python.git
176
+
177
+ # Install with pip
178
+ pip install -e .
179
+
180
+ # if you want to use the fastapi / openapi server
181
+ pip install -e .[server]
182
+
183
+ # If you're a poetry user, installing will also include a virtual environment
184
+ poetry install --all-extras
185
+ . .venv/bin/activate
186
+
176
187
  # Will need to be re-run any time vendor/llama.cpp is updated
177
188
  python3 setup.py develop
178
189
  ```
@@ -1,11 +1,14 @@
1
1
  .dockerignore
2
2
  .gitignore
3
3
  .gitmodules
4
+ CHANGELOG.md
4
5
  CMakeLists.txt
5
6
  LICENSE.md
7
+ Makefile
6
8
  README.md
7
9
  mkdocs.yml
8
10
  poetry.lock
11
+ poetry.toml
9
12
  pyproject.toml
10
13
  setup.py
11
14
  .github/dependabot.yml
@@ -1,4 +1,5 @@
1
1
  typing-extensions>=4.5.0
2
+ numpy>=1.20.0
2
3
 
3
4
  [server]
4
5
  uvicorn>=0.21.1