llama-cpp-python 0.1.55__tar.gz → 0.1.56__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- llama_cpp_python-0.1.56/CHANGELOG.md +20 -0
- llama_cpp_python-0.1.56/Makefile +49 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/PKG-INFO +12 -1
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/README.md +11 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/llama_cpp/llama.py +64 -36
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/llama_cpp/server/app.py +88 -58
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/llama_cpp_python.egg-info/PKG-INFO +12 -1
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/llama_cpp_python.egg-info/SOURCES.txt +3 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/llama_cpp_python.egg-info/requires.txt +1 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/poetry.lock +379 -156
- llama_cpp_python-0.1.56/poetry.toml +3 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/pyproject.toml +12 -5
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/setup.py +2 -4
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/.dockerignore +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/.github/ISSUE_TEMPLATE/bug_report.md +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/.github/ISSUE_TEMPLATE/feature_request.md +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/.github/dependabot.yml +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/.github/workflows/build-and-release.yaml +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/.github/workflows/build-docker.yaml +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/.github/workflows/publish-to-test.yaml +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/.github/workflows/publish.yaml +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/.github/workflows/test.yaml +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/.gitignore +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/.gitmodules +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/CMakeLists.txt +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/LICENSE.md +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/docker/Dockerfile +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/docker/Dockerfile.cuda_simple +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/docker/Dockerfile.openblas_simple +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/docker/README.md +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/docker/hug_model.py +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/docker/start_server.sh +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/docs/index.md +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/examples/high_level_api/fastapi_server.py +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/examples/high_level_api/high_level_api_embedding.py +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/examples/high_level_api/high_level_api_inference.py +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/examples/high_level_api/high_level_api_streaming.py +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/examples/high_level_api/langchain_custom_llm.py +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/examples/low_level_api/Chat.py +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/examples/low_level_api/Miku.py +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/examples/low_level_api/ReasonAct.py +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/examples/low_level_api/common.py +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/examples/low_level_api/low_level_api_chat_cpp.py +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/examples/low_level_api/low_level_api_llama_cpp.py +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/examples/low_level_api/quantize.py +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/examples/low_level_api/util.py +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/examples/notebooks/Clients.ipynb +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/examples/notebooks/Guidance.ipynb +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/examples/notebooks/PerformanceTuning.ipynb +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/llama_cpp/__init__.py +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/llama_cpp/llama_cpp.py +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/llama_cpp/llama_types.py +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/llama_cpp/server/__init__.py +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/llama_cpp/server/__main__.py +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/llama_cpp_python.egg-info/dependency_links.txt +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/llama_cpp_python.egg-info/top_level.txt +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/mkdocs.yml +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/setup.cfg +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/tests/test_llama.py +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/.clang-tidy +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/.devops/full.Dockerfile +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/.devops/main.Dockerfile +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/.devops/tools.sh +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/.dockerignore +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/.ecrc +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/.editorconfig +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/.github/ISSUE_TEMPLATE/custom.md +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/.github/workflows/build.yml +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/.github/workflows/docker.yml +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/.github/workflows/editorconfig.yml +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/.github/workflows/tidy-post.yml +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/.github/workflows/tidy-review.yml +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/.gitignore +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/BLIS.md +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/CMakeLists.txt +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/LICENSE +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/Makefile +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/Package.swift +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/README.md +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/SHA256SUMS +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/build.zig +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/convert-lora-to-ggml.py +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/convert-pth-to-ggml.py +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/convert.py +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/CMakeLists.txt +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/Miku.sh +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/alpaca.sh +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/baby-llama/CMakeLists.txt +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/benchmark/CMakeLists.txt +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/benchmark/benchmark-matmult.cpp +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/chat-13B.bat +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/chat-13B.sh +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/chat-persistent.sh +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/chat.sh +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/common.cpp +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/common.h +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/embedding/CMakeLists.txt +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/embedding/README.md +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/embedding/embedding.cpp +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/gpt4all.sh +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/jeopardy/README.md +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/jeopardy/graph.py +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/jeopardy/jeopardy.sh +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/jeopardy/qasheet.csv +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/jeopardy/questions.txt +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/main/CMakeLists.txt +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/main/README.md +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/main/main.cpp +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/perplexity/CMakeLists.txt +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/perplexity/README.md +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/perplexity/perplexity.cpp +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/quantize/CMakeLists.txt +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/quantize/README.md +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/quantize/quantize.cpp +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/quantize-stats/CMakeLists.txt +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/quantize-stats/quantize-stats.cpp +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/reason-act.sh +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/save-load-state/CMakeLists.txt +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/save-load-state/save-load-state.cpp +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/server/CMakeLists.txt +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/server/README.md +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/server/httplib.h +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/server/json.hpp +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/server/server.cpp +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/flake.lock +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/flake.nix +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/ggml-cuda.cu +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/ggml-cuda.h +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/ggml-opencl.cpp +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/ggml-opencl.h +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/ggml.c +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/ggml.h +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/llama-util.h +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/llama.cpp +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/llama.h +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/media/llama-leader.jpeg +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/media/llama0-banner.png +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/media/llama0-logo.png +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/media/llama1-banner.png +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/media/llama1-logo.png +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/models/ggml-vocab.bin +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/pocs/CMakeLists.txt +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/pocs/vdot/CMakeLists.txt +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/pocs/vdot/q8dot.cpp +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/pocs/vdot/vdot.cpp +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/prompts/alpaca.txt +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/prompts/chat-with-bob.txt +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/prompts/chat-with-vicuna-v0.txt +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/prompts/chat-with-vicuna-v1.txt +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/prompts/chat.txt +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/prompts/dan-modified.txt +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/prompts/dan.txt +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/prompts/reason-act.txt +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/requirements.txt +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/scripts/build-info.cmake +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/scripts/build-info.h.in +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/scripts/build-info.sh +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/scripts/perf-run-all.sh +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/scripts/ppl-run-all.sh +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/scripts/sync-ggml.sh +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/scripts/verify-checksum-models.py +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/spm-headers/llama.h +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/tests/CMakeLists.txt +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/tests/test-double-float.c +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/tests/test-grad0.c +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/tests/test-opt.c +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/tests/test-quantize-fns.cpp +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/tests/test-quantize-perf.cpp +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/tests/test-sampling.cpp +0 -0
- {llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/vendor/llama.cpp/tests/test-tokenizer-0.cpp +0 -0
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to this project will be documented in this file.
|
|
4
|
+
|
|
5
|
+
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
|
6
|
+
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
|
+
|
|
8
|
+
## [Unreleased]
|
|
9
|
+
|
|
10
|
+
## [v0.1.56]
|
|
11
|
+
|
|
12
|
+
### Added
|
|
13
|
+
|
|
14
|
+
- Added first version of the changelog
|
|
15
|
+
- Server: Use async routes
|
|
16
|
+
- Use numpy for internal buffers to reduce memory usage and improve performance.
|
|
17
|
+
|
|
18
|
+
### Fixed
|
|
19
|
+
|
|
20
|
+
- Performance bug in stop sequence check slowing down streaming.
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
update:
|
|
2
|
+
poetry install
|
|
3
|
+
git submodule update --init --recursive
|
|
4
|
+
|
|
5
|
+
update.vendor:
|
|
6
|
+
cd vendor/llama.cpp && git pull origin master
|
|
7
|
+
|
|
8
|
+
build:
|
|
9
|
+
python3 setup.py develop
|
|
10
|
+
|
|
11
|
+
build.cuda:
|
|
12
|
+
CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 python3 setup.py develop
|
|
13
|
+
|
|
14
|
+
build.opencl:
|
|
15
|
+
CMAKE_ARGS="-DLLAMA_CLBLAST=on" FORCE_CMAKE=1 python3 setup.py develop
|
|
16
|
+
|
|
17
|
+
build.openblas:
|
|
18
|
+
CMAKE_ARGS="-DLLAMA_OPENBLAS=on" FORCE_CMAKE=1 python3 setup.py develop
|
|
19
|
+
|
|
20
|
+
build.blis:
|
|
21
|
+
CMAKE_ARGS="-DLLAMA_OPENBLAS=on -DLLAMA_OPENBLAS_VENDOR=blis" FORCE_CMAKE=1 python3 setup.py develop
|
|
22
|
+
|
|
23
|
+
build.sdist:
|
|
24
|
+
python3 setup.py sdist
|
|
25
|
+
|
|
26
|
+
deploy.pypi:
|
|
27
|
+
python3 -m twine upload dist/*
|
|
28
|
+
|
|
29
|
+
deploy.gh-docs:
|
|
30
|
+
mkdocs build
|
|
31
|
+
mkdocs gh-deploy
|
|
32
|
+
|
|
33
|
+
clean:
|
|
34
|
+
- cd vendor/llama.cpp && make clean
|
|
35
|
+
- cd vendor/llama.cpp && rm libllama.so
|
|
36
|
+
- rm -rf _skbuild
|
|
37
|
+
- rm llama_cpp/libllama.so
|
|
38
|
+
|
|
39
|
+
.PHONY: \
|
|
40
|
+
update \
|
|
41
|
+
update.vendor \
|
|
42
|
+
build \
|
|
43
|
+
build.cuda \
|
|
44
|
+
build.opencl \
|
|
45
|
+
build.openblas \
|
|
46
|
+
build.sdist \
|
|
47
|
+
deploy.pypi \
|
|
48
|
+
deploy.gh-docs \
|
|
49
|
+
clean
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: llama_cpp_python
|
|
3
|
-
Version: 0.1.55
|
|
3
|
+
Version: 0.1.56
|
|
4
4
|
Summary: A Python wrapper for llama.cpp
|
|
5
5
|
Author: Andrei Betlen
|
|
6
6
|
Author-email: abetlen@gmail.com
|
|
@@ -173,6 +173,17 @@ To get started, clone the repository and install the package in development mode
|
|
|
173
173
|
|
|
174
174
|
```bash
|
|
175
175
|
git clone --recurse-submodules git@github.com:abetlen/llama-cpp-python.git
|
|
176
|
+
|
|
177
|
+
# Install with pip
|
|
178
|
+
pip install -e .
|
|
179
|
+
|
|
180
|
+
# if you want to use the fastapi / openapi server
|
|
181
|
+
pip install -e .[server]
|
|
182
|
+
|
|
183
|
+
# If you're a poetry user, installing will also include a virtual environment
|
|
184
|
+
poetry install --all-extras
|
|
185
|
+
. .venv/bin/activate
|
|
186
|
+
|
|
176
187
|
# Will need to be re-run any time vendor/llama.cpp is updated
|
|
177
188
|
python3 setup.py develop
|
|
178
189
|
```
|
|
@@ -155,6 +155,17 @@ To get started, clone the repository and install the package in development mode
|
|
|
155
155
|
|
|
156
156
|
```bash
|
|
157
157
|
git clone --recurse-submodules git@github.com:abetlen/llama-cpp-python.git
|
|
158
|
+
|
|
159
|
+
# Install with pip
|
|
160
|
+
pip install -e .
|
|
161
|
+
|
|
162
|
+
# if you want to use the fastapi / openapi server
|
|
163
|
+
pip install -e .[server]
|
|
164
|
+
|
|
165
|
+
# If you're a poetry user, installing will also include a virtual environment
|
|
166
|
+
poetry install --all-extras
|
|
167
|
+
. .venv/bin/activate
|
|
168
|
+
|
|
158
169
|
# Will need to be re-run any time vendor/llama.cpp is updated
|
|
159
170
|
python3 setup.py develop
|
|
160
171
|
```
|
|
@@ -20,6 +20,9 @@ from collections import deque, OrderedDict
|
|
|
20
20
|
from . import llama_cpp
|
|
21
21
|
from .llama_types import *
|
|
22
22
|
|
|
23
|
+
import numpy as np
|
|
24
|
+
import numpy.typing as npt
|
|
25
|
+
|
|
23
26
|
|
|
24
27
|
class LlamaCache:
|
|
25
28
|
"""Cache for a llama.cpp model."""
|
|
@@ -73,11 +76,15 @@ class LlamaState:
|
|
|
73
76
|
self,
|
|
74
77
|
eval_tokens: Deque[int],
|
|
75
78
|
eval_logits: Deque[List[float]],
|
|
79
|
+
input_ids: npt.NDArray[np.intc],
|
|
80
|
+
scores: npt.NDArray[np.single],
|
|
76
81
|
llama_state, # type: llama_cpp.Array[llama_cpp.c_uint8]
|
|
77
82
|
llama_state_size: int,
|
|
78
83
|
):
|
|
79
84
|
self.eval_tokens = eval_tokens
|
|
80
85
|
self.eval_logits = eval_logits
|
|
86
|
+
self.input_ids = input_ids
|
|
87
|
+
self.scores = scores
|
|
81
88
|
self.llama_state = llama_state
|
|
82
89
|
self.llama_state_size = llama_state_size
|
|
83
90
|
|
|
@@ -207,20 +214,17 @@ class Llama:
|
|
|
207
214
|
|
|
208
215
|
self._n_vocab = self.n_vocab()
|
|
209
216
|
self._n_ctx = self.n_ctx()
|
|
210
|
-
data = (llama_cpp.llama_token_data * self._n_vocab)(
|
|
211
|
-
*[
|
|
212
|
-
llama_cpp.llama_token_data(
|
|
213
|
-
id=llama_cpp.llama_token(i),
|
|
214
|
-
logit=llama_cpp.c_float(0.0),
|
|
215
|
-
p=llama_cpp.c_float(0.0),
|
|
216
|
-
)
|
|
217
|
-
for i in range(self._n_vocab)
|
|
218
|
-
]
|
|
219
|
-
)
|
|
220
217
|
size = llama_cpp.c_size_t(self._n_vocab)
|
|
221
|
-
sorted = False
|
|
218
|
+
sorted = llama_cpp.c_bool(False)
|
|
219
|
+
self._candidates_data = np.array(
|
|
220
|
+
[],
|
|
221
|
+
dtype=np.dtype(
|
|
222
|
+
[("id", np.intc), ("logit", np.single), ("p", np.single)], align=True
|
|
223
|
+
),
|
|
224
|
+
)
|
|
225
|
+
self._candidates_data.resize(3, self._n_vocab)
|
|
222
226
|
candidates = llama_cpp.llama_token_data_array(
|
|
223
|
-
data=
|
|
227
|
+
data=self._candidates_data.ctypes.data_as(llama_cpp.llama_token_data_p),
|
|
224
228
|
size=size,
|
|
225
229
|
sorted=sorted,
|
|
226
230
|
)
|
|
@@ -228,6 +232,9 @@ class Llama:
|
|
|
228
232
|
self._token_nl = Llama.token_nl()
|
|
229
233
|
self._token_eos = Llama.token_eos()
|
|
230
234
|
|
|
235
|
+
self._input_ids = np.array([], dtype=np.intc)
|
|
236
|
+
self._scores = np.ndarray((0, self._n_vocab), dtype=np.single)
|
|
237
|
+
|
|
231
238
|
def tokenize(self, text: bytes, add_bos: bool = True) -> List[int]:
|
|
232
239
|
"""Tokenize a string.
|
|
233
240
|
|
|
@@ -295,6 +302,8 @@ class Llama:
|
|
|
295
302
|
"""Reset the model state."""
|
|
296
303
|
self.eval_tokens.clear()
|
|
297
304
|
self.eval_logits.clear()
|
|
305
|
+
self._input_ids = np.array([], dtype=np.intc)
|
|
306
|
+
self._scores = np.ndarray((0, self._n_vocab), dtype=np.single)
|
|
298
307
|
|
|
299
308
|
def eval(self, tokens: Sequence[int]):
|
|
300
309
|
"""Evaluate a list of tokens.
|
|
@@ -306,7 +315,7 @@ class Llama:
|
|
|
306
315
|
n_ctx = self._n_ctx
|
|
307
316
|
for i in range(0, len(tokens), self.n_batch):
|
|
308
317
|
batch = tokens[i : min(len(tokens), i + self.n_batch)]
|
|
309
|
-
n_past = min(n_ctx - len(batch), len(self.eval_tokens))
|
|
318
|
+
n_past = min(n_ctx - len(batch), len(self._input_ids))
|
|
310
319
|
n_tokens = len(batch)
|
|
311
320
|
return_code = llama_cpp.llama_eval(
|
|
312
321
|
ctx=self.ctx,
|
|
@@ -319,6 +328,9 @@ class Llama:
|
|
|
319
328
|
raise RuntimeError(f"llama_eval returned {return_code}")
|
|
320
329
|
# Save tokens
|
|
321
330
|
self.eval_tokens.extend(batch)
|
|
331
|
+
self._input_ids: npt.NDArray[np.intc] = np.concatenate(
|
|
332
|
+
(self._input_ids, np.array(batch, dtype=np.intc)), axis=0
|
|
333
|
+
)
|
|
322
334
|
# Save logits
|
|
323
335
|
rows = n_tokens if self.params.logits_all else 1
|
|
324
336
|
n_vocab = self._n_vocab
|
|
@@ -326,6 +338,9 @@ class Llama:
|
|
|
326
338
|
logits_view = llama_cpp.llama_get_logits(self.ctx)
|
|
327
339
|
logits = [logits_view[i * cols : (i + 1) * cols] for i in range(rows)]
|
|
328
340
|
self.eval_logits.extend(logits)
|
|
341
|
+
self._scores: npt.NDArray[np.single] = np.concatenate(
|
|
342
|
+
(self._scores, np.array(logits, dtype=np.single)), axis=0
|
|
343
|
+
)
|
|
329
344
|
|
|
330
345
|
def _sample(
|
|
331
346
|
self,
|
|
@@ -346,6 +361,7 @@ class Llama:
|
|
|
346
361
|
):
|
|
347
362
|
assert self.ctx is not None
|
|
348
363
|
assert len(self.eval_logits) > 0
|
|
364
|
+
assert self._scores.shape[0] > 0
|
|
349
365
|
n_vocab = self._n_vocab
|
|
350
366
|
n_ctx = self._n_ctx
|
|
351
367
|
top_k = llama_cpp.c_int(n_vocab) if top_k.value <= 0 else top_k
|
|
@@ -354,18 +370,23 @@ class Llama:
|
|
|
354
370
|
if last_n_tokens_size.value < 0
|
|
355
371
|
else last_n_tokens_size
|
|
356
372
|
)
|
|
357
|
-
logits = self.
|
|
373
|
+
logits: npt.NDArray[np.single] = self._scores[-1, :]
|
|
358
374
|
|
|
359
375
|
if logits_processor is not None:
|
|
360
|
-
logits =
|
|
361
|
-
|
|
376
|
+
logits = np.array(
|
|
377
|
+
logits_processor(self._input_ids.tolist(), logits.tolist()),
|
|
378
|
+
dtype=np.single,
|
|
379
|
+
)
|
|
380
|
+
self._scores[-1, :] = logits
|
|
381
|
+
self.eval_logits[-1] = logits.tolist()
|
|
362
382
|
|
|
363
383
|
nl_logit = logits[self._token_nl]
|
|
364
384
|
candidates = self._candidates
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
385
|
+
candidates_data = self._candidates_data
|
|
386
|
+
candidates_data["id"] = np.arange(n_vocab, dtype=np.intc) # type: ignore
|
|
387
|
+
candidates_data["logit"] = logits
|
|
388
|
+
candidates_data["p"] = np.zeros(n_vocab, dtype=np.single)
|
|
389
|
+
candidates.data = candidates_data.ctypes.data_as(llama_cpp.llama_token_data_p)
|
|
369
390
|
candidates.sorted = llama_cpp.c_bool(False)
|
|
370
391
|
candidates.size = llama_cpp.c_size_t(n_vocab)
|
|
371
392
|
llama_cpp.llama_sample_repetition_penalty(
|
|
@@ -483,8 +504,8 @@ class Llama:
|
|
|
483
504
|
"""
|
|
484
505
|
assert self.ctx is not None
|
|
485
506
|
last_n_tokens_data = [llama_cpp.llama_token(0)] * max(
|
|
486
|
-
0, self.last_n_tokens_size - len(self.
|
|
487
|
-
) +
|
|
507
|
+
0, self.last_n_tokens_size - len(self._input_ids)
|
|
508
|
+
) + self._input_ids[-self.last_n_tokens_size :].tolist()
|
|
488
509
|
return self._sample(
|
|
489
510
|
last_n_tokens_data=(llama_cpp.llama_token * self.last_n_tokens_size)(
|
|
490
511
|
*last_n_tokens_data
|
|
@@ -542,9 +563,9 @@ class Llama:
|
|
|
542
563
|
"""
|
|
543
564
|
assert self.ctx is not None
|
|
544
565
|
|
|
545
|
-
if reset and len(self.eval_tokens) > 0:
|
|
566
|
+
if reset and len(self._input_ids) > 0:
|
|
546
567
|
longest_prefix = 0
|
|
547
|
-
for a, b in zip(self.eval_tokens, tokens[:-1]):
|
|
568
|
+
for a, b in zip(self._input_ids, tokens[:-1]):
|
|
548
569
|
if a == b:
|
|
549
570
|
longest_prefix += 1
|
|
550
571
|
else:
|
|
@@ -554,6 +575,8 @@ class Llama:
|
|
|
554
575
|
print("Llama.generate: prefix-match hit", file=sys.stderr)
|
|
555
576
|
reset = False
|
|
556
577
|
tokens = tokens[longest_prefix:]
|
|
578
|
+
self._input_ids = self._input_ids[:longest_prefix]
|
|
579
|
+
self._scores = self._scores[:longest_prefix, :]
|
|
557
580
|
for _ in range(len(self.eval_tokens) - longest_prefix):
|
|
558
581
|
self.eval_tokens.pop()
|
|
559
582
|
try:
|
|
@@ -580,7 +603,7 @@ class Llama:
|
|
|
580
603
|
logits_processor=logits_processor,
|
|
581
604
|
)
|
|
582
605
|
if stopping_criteria is not None and stopping_criteria(
|
|
583
|
-
|
|
606
|
+
self._input_ids.tolist(), self._scores[-1, :].tolist()
|
|
584
607
|
):
|
|
585
608
|
return
|
|
586
609
|
tokens_or_none = yield token
|
|
@@ -715,10 +738,10 @@ class Llama:
|
|
|
715
738
|
try:
|
|
716
739
|
cache_item = self.cache[prompt_tokens]
|
|
717
740
|
cache_prefix_len = Llama.longest_token_prefix(
|
|
718
|
-
cache_item.
|
|
741
|
+
cache_item.input_ids.tolist(), prompt_tokens
|
|
719
742
|
)
|
|
720
743
|
eval_prefix_len = Llama.longest_token_prefix(
|
|
721
|
-
self.
|
|
744
|
+
self._input_ids.tolist(), prompt_tokens
|
|
722
745
|
)
|
|
723
746
|
if cache_prefix_len > eval_prefix_len:
|
|
724
747
|
self.load_state(cache_item)
|
|
@@ -775,20 +798,22 @@ class Llama:
|
|
|
775
798
|
break
|
|
776
799
|
|
|
777
800
|
if stream:
|
|
801
|
+
remaining_tokens = completion_tokens[returned_tokens:]
|
|
802
|
+
remaining_text = self.detokenize(remaining_tokens)
|
|
803
|
+
remaining_length = len(remaining_text)
|
|
804
|
+
|
|
778
805
|
# We want to avoid yielding any characters from
|
|
779
806
|
# the generated text if they are part of a stop
|
|
780
807
|
# sequence.
|
|
781
808
|
first_stop_position = 0
|
|
782
809
|
for s in stop_sequences:
|
|
783
|
-
for i in range(len(s), 0, -1):
|
|
784
|
-
if
|
|
810
|
+
for i in range(min(len(s), remaining_length), 0, -1):
|
|
811
|
+
if remaining_text.endswith(s[:i]):
|
|
785
812
|
if i > first_stop_position:
|
|
786
813
|
first_stop_position = i
|
|
787
814
|
break
|
|
788
815
|
|
|
789
816
|
token_end_position = 0
|
|
790
|
-
remaining_tokens = completion_tokens[returned_tokens:]
|
|
791
|
-
remaining_length = len(self.detokenize(remaining_tokens))
|
|
792
817
|
for token in remaining_tokens:
|
|
793
818
|
token_end_position += len(self.detokenize([token]))
|
|
794
819
|
# Check if stop sequence is in the token
|
|
@@ -805,7 +830,7 @@ class Llama:
|
|
|
805
830
|
self.detokenize(completion_tokens[:returned_tokens])
|
|
806
831
|
)
|
|
807
832
|
token_offset = len(prompt_tokens) + returned_tokens
|
|
808
|
-
logits = self.
|
|
833
|
+
logits = self._scores[token_offset - 1, :].tolist()
|
|
809
834
|
current_logprobs = Llama.logits_to_logprobs(logits)
|
|
810
835
|
sorted_logprobs = list(
|
|
811
836
|
sorted(
|
|
@@ -854,7 +879,7 @@ class Llama:
|
|
|
854
879
|
break
|
|
855
880
|
|
|
856
881
|
if stopping_criteria is not None and stopping_criteria(
|
|
857
|
-
|
|
882
|
+
self._input_ids.tolist(), self._scores[-1, :].tolist()
|
|
858
883
|
):
|
|
859
884
|
text = self.detokenize(completion_tokens)
|
|
860
885
|
finish_reason = "stop"
|
|
@@ -884,7 +909,7 @@ class Llama:
|
|
|
884
909
|
self.detokenize(completion_tokens[:returned_tokens])
|
|
885
910
|
)
|
|
886
911
|
token_offset = len(prompt_tokens) + returned_tokens - 1
|
|
887
|
-
logits = self.
|
|
912
|
+
logits = self._scores[token_offset, :].tolist()
|
|
888
913
|
current_logprobs = Llama.logits_to_logprobs(logits)
|
|
889
914
|
sorted_logprobs = list(
|
|
890
915
|
sorted(
|
|
@@ -986,8 +1011,7 @@ class Llama:
|
|
|
986
1011
|
for token in all_tokens
|
|
987
1012
|
]
|
|
988
1013
|
all_logprobs = [
|
|
989
|
-
Llama.logits_to_logprobs(
|
|
990
|
-
for row in self.eval_logits
|
|
1014
|
+
Llama.logits_to_logprobs(row.tolist()) for row in self._scores
|
|
991
1015
|
][token_offset:]
|
|
992
1016
|
for token, token_str, logprobs_token in zip(
|
|
993
1017
|
all_tokens, all_token_strs, all_logprobs
|
|
@@ -1371,6 +1395,8 @@ class Llama:
|
|
|
1371
1395
|
return LlamaState(
|
|
1372
1396
|
eval_tokens=self.eval_tokens.copy(),
|
|
1373
1397
|
eval_logits=self.eval_logits.copy(),
|
|
1398
|
+
scores=self._scores.copy(),
|
|
1399
|
+
input_ids=self._input_ids.copy(),
|
|
1374
1400
|
llama_state=llama_state_compact,
|
|
1375
1401
|
llama_state_size=n_bytes,
|
|
1376
1402
|
)
|
|
@@ -1379,6 +1405,8 @@ class Llama:
|
|
|
1379
1405
|
assert self.ctx is not None
|
|
1380
1406
|
self.eval_tokens = state.eval_tokens.copy()
|
|
1381
1407
|
self.eval_logits = state.eval_logits.copy()
|
|
1408
|
+
self._scores = state.scores.copy()
|
|
1409
|
+
self._input_ids = state.input_ids.copy()
|
|
1382
1410
|
state_size = state.llama_state_size
|
|
1383
1411
|
if llama_cpp.llama_set_state_data(self.ctx, state.llama_state) != state_size:
|
|
1384
1412
|
raise RuntimeError("Failed to set llama state data")
|
|
@@ -1,13 +1,16 @@
|
|
|
1
1
|
import json
|
|
2
|
-
import logging
|
|
3
2
|
import multiprocessing
|
|
4
3
|
from threading import Lock
|
|
5
|
-
from
|
|
4
|
+
from functools import partial
|
|
5
|
+
from typing import Iterator, List, Optional, Union, Dict
|
|
6
6
|
from typing_extensions import TypedDict, Literal
|
|
7
7
|
|
|
8
8
|
import llama_cpp
|
|
9
9
|
|
|
10
|
-
|
|
10
|
+
import anyio
|
|
11
|
+
from anyio.streams.memory import MemoryObjectSendStream
|
|
12
|
+
from starlette.concurrency import run_in_threadpool, iterate_in_threadpool
|
|
13
|
+
from fastapi import Depends, FastAPI, APIRouter, Request
|
|
11
14
|
from fastapi.middleware.cors import CORSMiddleware
|
|
12
15
|
from pydantic import BaseModel, BaseSettings, Field, create_model_from_typeddict
|
|
13
16
|
from sse_starlette.sse import EventSourceResponse
|
|
@@ -242,35 +245,49 @@ CreateCompletionResponse = create_model_from_typeddict(llama_cpp.Completion)
|
|
|
242
245
|
"/v1/completions",
|
|
243
246
|
response_model=CreateCompletionResponse,
|
|
244
247
|
)
|
|
245
|
-
def create_completion(
|
|
246
|
-
request:
|
|
248
|
+
async def create_completion(
|
|
249
|
+
request: Request,
|
|
250
|
+
body: CreateCompletionRequest,
|
|
251
|
+
llama: llama_cpp.Llama = Depends(get_llama),
|
|
247
252
|
):
|
|
248
|
-
if isinstance(
|
|
249
|
-
assert len(
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
)
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
253
|
+
if isinstance(body.prompt, list):
|
|
254
|
+
assert len(body.prompt) <= 1
|
|
255
|
+
body.prompt = body.prompt[0] if len(body.prompt) > 0 else ""
|
|
256
|
+
|
|
257
|
+
exclude = {
|
|
258
|
+
"n",
|
|
259
|
+
"best_of",
|
|
260
|
+
"logit_bias",
|
|
261
|
+
"user",
|
|
262
|
+
}
|
|
263
|
+
kwargs = body.dict(exclude=exclude)
|
|
264
|
+
if body.stream:
|
|
265
|
+
send_chan, recv_chan = anyio.create_memory_object_stream(10)
|
|
266
|
+
|
|
267
|
+
async def event_publisher(inner_send_chan: MemoryObjectSendStream):
|
|
268
|
+
async with inner_send_chan:
|
|
269
|
+
try:
|
|
270
|
+
iterator: Iterator[llama_cpp.CompletionChunk] = await run_in_threadpool(llama, **kwargs) # type: ignore
|
|
271
|
+
async for chunk in iterate_in_threadpool(iterator):
|
|
272
|
+
await inner_send_chan.send(dict(data=json.dumps(chunk)))
|
|
273
|
+
if await request.is_disconnected():
|
|
274
|
+
raise anyio.get_cancelled_exc_class()()
|
|
275
|
+
await inner_send_chan.send(dict(data="[DONE]"))
|
|
276
|
+
except anyio.get_cancelled_exc_class() as e:
|
|
277
|
+
print("disconnected")
|
|
278
|
+
with anyio.move_on_after(1, shield=True):
|
|
279
|
+
print(
|
|
280
|
+
f"Disconnected from client (via refresh/close) {request.client}"
|
|
281
|
+
)
|
|
282
|
+
await inner_send_chan.send(dict(closing=True))
|
|
283
|
+
raise e
|
|
269
284
|
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
285
|
+
return EventSourceResponse(
|
|
286
|
+
recv_chan, data_sender_callable=partial(event_publisher, send_chan)
|
|
287
|
+
)
|
|
288
|
+
else:
|
|
289
|
+
completion: llama_cpp.Completion = await run_in_threadpool(llama, **kwargs) # type: ignore
|
|
290
|
+
return completion
|
|
274
291
|
|
|
275
292
|
|
|
276
293
|
class CreateEmbeddingRequest(BaseModel):
|
|
@@ -293,10 +310,12 @@ CreateEmbeddingResponse = create_model_from_typeddict(llama_cpp.Embedding)
|
|
|
293
310
|
"/v1/embeddings",
|
|
294
311
|
response_model=CreateEmbeddingResponse,
|
|
295
312
|
)
|
|
296
|
-
def create_embedding(
|
|
313
|
+
async def create_embedding(
|
|
297
314
|
request: CreateEmbeddingRequest, llama: llama_cpp.Llama = Depends(get_llama)
|
|
298
315
|
):
|
|
299
|
-
return
|
|
316
|
+
return await run_in_threadpool(
|
|
317
|
+
llama.create_embedding, **request.dict(exclude={"user"})
|
|
318
|
+
)
|
|
300
319
|
|
|
301
320
|
|
|
302
321
|
class ChatCompletionRequestMessage(BaseModel):
|
|
@@ -350,36 +369,47 @@ CreateChatCompletionResponse = create_model_from_typeddict(llama_cpp.ChatComplet
|
|
|
350
369
|
"/v1/chat/completions",
|
|
351
370
|
response_model=CreateChatCompletionResponse,
|
|
352
371
|
)
|
|
353
|
-
def create_chat_completion(
|
|
354
|
-
request:
|
|
372
|
+
async def create_chat_completion(
|
|
373
|
+
request: Request,
|
|
374
|
+
body: CreateChatCompletionRequest,
|
|
355
375
|
llama: llama_cpp.Llama = Depends(get_llama),
|
|
356
376
|
) -> Union[llama_cpp.ChatCompletion, EventSourceResponse]:
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
)
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
+
exclude = {
|
|
378
|
+
"n",
|
|
379
|
+
"logit_bias",
|
|
380
|
+
"user",
|
|
381
|
+
}
|
|
382
|
+
kwargs = body.dict(exclude=exclude)
|
|
383
|
+
if body.stream:
|
|
384
|
+
send_chan, recv_chan = anyio.create_memory_object_stream(10)
|
|
385
|
+
|
|
386
|
+
async def event_publisher(inner_send_chan: MemoryObjectSendStream):
|
|
387
|
+
async with inner_send_chan:
|
|
388
|
+
try:
|
|
389
|
+
iterator: Iterator[llama_cpp.ChatCompletionChunk] = await run_in_threadpool(llama.create_chat_completion, **kwargs) # type: ignore
|
|
390
|
+
async for chat_chunk in iterate_in_threadpool(iterator):
|
|
391
|
+
await inner_send_chan.send(dict(data=json.dumps(chat_chunk)))
|
|
392
|
+
if await request.is_disconnected():
|
|
393
|
+
raise anyio.get_cancelled_exc_class()()
|
|
394
|
+
await inner_send_chan.send(dict(data="[DONE]"))
|
|
395
|
+
except anyio.get_cancelled_exc_class() as e:
|
|
396
|
+
print("disconnected")
|
|
397
|
+
with anyio.move_on_after(1, shield=True):
|
|
398
|
+
print(
|
|
399
|
+
f"Disconnected from client (via refresh/close) {request.client}"
|
|
400
|
+
)
|
|
401
|
+
await inner_send_chan.send(dict(closing=True))
|
|
402
|
+
raise e
|
|
377
403
|
|
|
378
404
|
return EventSourceResponse(
|
|
379
|
-
|
|
405
|
+
recv_chan,
|
|
406
|
+
data_sender_callable=partial(event_publisher, send_chan),
|
|
407
|
+
)
|
|
408
|
+
else:
|
|
409
|
+
completion: llama_cpp.ChatCompletion = await run_in_threadpool(
|
|
410
|
+
llama.create_chat_completion, **kwargs # type: ignore
|
|
380
411
|
)
|
|
381
|
-
|
|
382
|
-
return completion
|
|
412
|
+
return completion
|
|
383
413
|
|
|
384
414
|
|
|
385
415
|
class ModelData(TypedDict):
|
|
@@ -398,7 +428,7 @@ GetModelResponse = create_model_from_typeddict(ModelList)
|
|
|
398
428
|
|
|
399
429
|
|
|
400
430
|
@router.get("/v1/models", response_model=GetModelResponse)
|
|
401
|
-
def get_models(
|
|
431
|
+
async def get_models(
|
|
402
432
|
settings: Settings = Depends(get_settings),
|
|
403
433
|
llama: llama_cpp.Llama = Depends(get_llama),
|
|
404
434
|
) -> ModelList:
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: llama-cpp-python
|
|
3
|
-
Version: 0.1.55
|
|
3
|
+
Version: 0.1.56
|
|
4
4
|
Summary: A Python wrapper for llama.cpp
|
|
5
5
|
Author: Andrei Betlen
|
|
6
6
|
Author-email: abetlen@gmail.com
|
|
@@ -173,6 +173,17 @@ To get started, clone the repository and install the package in development mode
|
|
|
173
173
|
|
|
174
174
|
```bash
|
|
175
175
|
git clone --recurse-submodules git@github.com:abetlen/llama-cpp-python.git
|
|
176
|
+
|
|
177
|
+
# Install with pip
|
|
178
|
+
pip install -e .
|
|
179
|
+
|
|
180
|
+
# if you want to use the fastapi / openapi server
|
|
181
|
+
pip install -e .[server]
|
|
182
|
+
|
|
183
|
+
# If you're a poetry user, installing will also include a virtual environment
|
|
184
|
+
poetry install --all-extras
|
|
185
|
+
. .venv/bin/activate
|
|
186
|
+
|
|
176
187
|
# Will need to be re-run any time vendor/llama.cpp is updated
|
|
177
188
|
python3 setup.py develop
|
|
178
189
|
```
|