llama-cpp-python 0.1.54__tar.gz → 0.1.56__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- llama_cpp_python-0.1.56/CHANGELOG.md +20 -0
- llama_cpp_python-0.1.56/Makefile +49 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/PKG-INFO +12 -1
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/README.md +11 -0
- llama_cpp_python-0.1.56/docker/Dockerfile +51 -0
- llama_cpp_python-0.1.56/docker/README.md +46 -0
- llama_cpp_python-0.1.56/docker/hug_model.py +116 -0
- llama_cpp_python-0.1.56/docker/start_server.sh +11 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/examples/low_level_api/low_level_api_chat_cpp.py +11 -8
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/llama_cpp/llama.py +168 -56
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/llama_cpp/server/app.py +88 -58
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/llama_cpp_python.egg-info/PKG-INFO +12 -1
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/llama_cpp_python.egg-info/SOURCES.txt +9 -2
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/llama_cpp_python.egg-info/requires.txt +1 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/poetry.lock +379 -156
- llama_cpp_python-0.1.56/poetry.toml +3 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/pyproject.toml +12 -5
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/setup.py +2 -4
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/.github/workflows/build.yml +5 -5
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/CMakeLists.txt +29 -25
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/Makefile +11 -1
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/README.md +19 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/chat-persistent.sh +2 -2
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/main/README.md +1 -1
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/main/main.cpp +14 -4
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/ggml-cuda.cu +71 -39
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/ggml-opencl.cpp +1 -1
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/.dockerignore +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/.github/ISSUE_TEMPLATE/bug_report.md +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/.github/ISSUE_TEMPLATE/feature_request.md +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/.github/dependabot.yml +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/.github/workflows/build-and-release.yaml +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/.github/workflows/build-docker.yaml +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/.github/workflows/publish-to-test.yaml +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/.github/workflows/publish.yaml +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/.github/workflows/test.yaml +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/.gitignore +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/.gitmodules +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/CMakeLists.txt +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/LICENSE.md +0 -0
- /llama_cpp_python-0.1.54/Dockerfile.cuda → /llama_cpp_python-0.1.56/docker/Dockerfile.cuda_simple +0 -0
- /llama_cpp_python-0.1.54/Dockerfile → /llama_cpp_python-0.1.56/docker/Dockerfile.openblas_simple +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/docs/index.md +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/examples/high_level_api/fastapi_server.py +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/examples/high_level_api/high_level_api_embedding.py +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/examples/high_level_api/high_level_api_inference.py +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/examples/high_level_api/high_level_api_streaming.py +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/examples/high_level_api/langchain_custom_llm.py +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/examples/low_level_api/Chat.py +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/examples/low_level_api/Miku.py +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/examples/low_level_api/ReasonAct.py +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/examples/low_level_api/common.py +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/examples/low_level_api/low_level_api_llama_cpp.py +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/examples/low_level_api/quantize.py +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/examples/low_level_api/util.py +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/examples/notebooks/Clients.ipynb +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/examples/notebooks/Guidance.ipynb +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/examples/notebooks/PerformanceTuning.ipynb +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/llama_cpp/__init__.py +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/llama_cpp/llama_cpp.py +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/llama_cpp/llama_types.py +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/llama_cpp/server/__init__.py +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/llama_cpp/server/__main__.py +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/llama_cpp_python.egg-info/dependency_links.txt +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/llama_cpp_python.egg-info/top_level.txt +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/mkdocs.yml +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/setup.cfg +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/tests/test_llama.py +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/.clang-tidy +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/.devops/full.Dockerfile +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/.devops/main.Dockerfile +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/.devops/tools.sh +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/.dockerignore +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/.ecrc +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/.editorconfig +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/.github/ISSUE_TEMPLATE/custom.md +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/.github/workflows/docker.yml +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/.github/workflows/editorconfig.yml +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/.github/workflows/tidy-post.yml +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/.github/workflows/tidy-review.yml +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/.gitignore +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/BLIS.md +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/LICENSE +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/Package.swift +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/SHA256SUMS +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/build.zig +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/convert-lora-to-ggml.py +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/convert-pth-to-ggml.py +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/convert.py +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/CMakeLists.txt +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/Miku.sh +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/alpaca.sh +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/baby-llama/CMakeLists.txt +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/benchmark/CMakeLists.txt +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/benchmark/benchmark-matmult.cpp +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/chat-13B.bat +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/chat-13B.sh +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/chat.sh +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/common.cpp +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/common.h +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/embedding/CMakeLists.txt +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/embedding/README.md +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/embedding/embedding.cpp +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/gpt4all.sh +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/jeopardy/README.md +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/jeopardy/graph.py +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/jeopardy/jeopardy.sh +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/jeopardy/qasheet.csv +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/jeopardy/questions.txt +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/main/CMakeLists.txt +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/perplexity/CMakeLists.txt +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/perplexity/README.md +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/perplexity/perplexity.cpp +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/quantize/CMakeLists.txt +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/quantize/README.md +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/quantize/quantize.cpp +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/quantize-stats/CMakeLists.txt +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/quantize-stats/quantize-stats.cpp +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/reason-act.sh +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/save-load-state/CMakeLists.txt +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/save-load-state/save-load-state.cpp +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/server/CMakeLists.txt +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/server/README.md +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/server/httplib.h +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/server/json.hpp +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/examples/server/server.cpp +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/flake.lock +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/flake.nix +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/ggml-cuda.h +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/ggml-opencl.h +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/ggml.c +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/ggml.h +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/llama-util.h +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/llama.cpp +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/llama.h +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/media/llama-leader.jpeg +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/media/llama0-banner.png +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/media/llama0-logo.png +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/media/llama1-banner.png +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/media/llama1-logo.png +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/models/ggml-vocab.bin +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/pocs/CMakeLists.txt +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/pocs/vdot/CMakeLists.txt +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/pocs/vdot/q8dot.cpp +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/pocs/vdot/vdot.cpp +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/prompts/alpaca.txt +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/prompts/chat-with-bob.txt +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/prompts/chat-with-vicuna-v0.txt +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/prompts/chat-with-vicuna-v1.txt +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/prompts/chat.txt +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/prompts/dan-modified.txt +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/prompts/dan.txt +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/prompts/reason-act.txt +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/requirements.txt +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/scripts/build-info.cmake +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/scripts/build-info.h.in +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/scripts/build-info.sh +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/scripts/perf-run-all.sh +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/scripts/ppl-run-all.sh +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/scripts/sync-ggml.sh +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/scripts/verify-checksum-models.py +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/spm-headers/llama.h +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/tests/CMakeLists.txt +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/tests/test-double-float.c +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/tests/test-grad0.c +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/tests/test-opt.c +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/tests/test-quantize-fns.cpp +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/tests/test-quantize-perf.cpp +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/tests/test-sampling.cpp +0 -0
- {llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/vendor/llama.cpp/tests/test-tokenizer-0.cpp +0 -0
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to this project will be documented in this file.
|
|
4
|
+
|
|
5
|
+
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
|
6
|
+
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
|
+
|
|
8
|
+
## [Unreleased]
|
|
9
|
+
|
|
10
|
+
## [v0.1.56]
|
|
11
|
+
|
|
12
|
+
### Added
|
|
13
|
+
|
|
14
|
+
- Added first version of the changelog
|
|
15
|
+
- Server: Use async routes
|
|
16
|
+
- Use numpy for internal buffers to reduce memory usage and improve performance.
|
|
17
|
+
|
|
18
|
+
### Fixed
|
|
19
|
+
|
|
20
|
+
- Performance bug in stop sequence check slowing down streaming.
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
# Developer convenience targets: dependency sync, native builds for the
# supported BLAS back ends, packaging, publishing and cleanup.

# Install Python dependencies and sync the vendored llama.cpp submodule.
update:
	poetry install
	git submodule update --init --recursive

# Pull the latest upstream llama.cpp into the vendored checkout.
update.vendor:
	cd vendor/llama.cpp && git pull origin master

# Plain development build.
build:
	python3 setup.py develop

# Development builds with a specific acceleration back end enabled via CMake.
build.cuda:
	CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 python3 setup.py develop

build.opencl:
	CMAKE_ARGS="-DLLAMA_CLBLAST=on" FORCE_CMAKE=1 python3 setup.py develop

build.openblas:
	CMAKE_ARGS="-DLLAMA_OPENBLAS=on" FORCE_CMAKE=1 python3 setup.py develop

build.blis:
	CMAKE_ARGS="-DLLAMA_OPENBLAS=on -DLLAMA_OPENBLAS_VENDOR=blis" FORCE_CMAKE=1 python3 setup.py develop

# Build the source distribution.
build.sdist:
	python3 setup.py sdist

# Upload everything in dist/ to PyPI.
deploy.pypi:
	python3 -m twine upload dist/*

# Build the documentation and publish it to GitHub Pages.
deploy.gh-docs:
	mkdocs build
	mkdocs gh-deploy

# Remove build artefacts. The leading '-' makes make ignore failures so a
# missing file does not abort the remaining cleanup steps.
clean:
	- cd vendor/llama.cpp && make clean
	- cd vendor/llama.cpp && rm libllama.so
	- rm -rf _skbuild
	- rm llama_cpp/libllama.so

# None of the targets produce a file with their own name; build.blis is listed
# too so it keeps running even if a file of that name appears.
.PHONY: \
	update \
	update.vendor \
	build \
	build.cuda \
	build.opencl \
	build.openblas \
	build.blis \
	build.sdist \
	deploy.pypi \
	deploy.gh-docs \
	clean
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: llama_cpp_python
|
|
3
|
-
Version: 0.1.54
|
|
3
|
+
Version: 0.1.56
|
|
4
4
|
Summary: A Python wrapper for llama.cpp
|
|
5
5
|
Author: Andrei Betlen
|
|
6
6
|
Author-email: abetlen@gmail.com
|
|
@@ -173,6 +173,17 @@ To get started, clone the repository and install the package in development mode
|
|
|
173
173
|
|
|
174
174
|
```bash
|
|
175
175
|
git clone --recurse-submodules git@github.com:abetlen/llama-cpp-python.git
|
|
176
|
+
|
|
177
|
+
# Install with pip
|
|
178
|
+
pip install -e .
|
|
179
|
+
|
|
180
|
+
# if you want to use the fastapi / openapi server
|
|
181
|
+
pip install -e .[server]
|
|
182
|
+
|
|
183
|
+
# If you're a poetry user, installing will also include a virtual environment
|
|
184
|
+
poetry install --all-extras
|
|
185
|
+
. .venv/bin/activate
|
|
186
|
+
|
|
176
187
|
# Will need to be re-run any time vendor/llama.cpp is updated
|
|
177
188
|
python3 setup.py develop
|
|
178
189
|
```
|
|
@@ -155,6 +155,17 @@ To get started, clone the repository and install the package in development mode
|
|
|
155
155
|
|
|
156
156
|
```bash
|
|
157
157
|
git clone --recurse-submodules git@github.com:abetlen/llama-cpp-python.git
|
|
158
|
+
|
|
159
|
+
# Install with pip
|
|
160
|
+
pip install -e .
|
|
161
|
+
|
|
162
|
+
# if you want to use the fastapi / openapi server
|
|
163
|
+
pip install -e .[server]
|
|
164
|
+
|
|
165
|
+
# If you're a poetry user, installing will also include a virtual environment
|
|
166
|
+
poetry install --all-extras
|
|
167
|
+
. .venv/bin/activate
|
|
168
|
+
|
|
158
169
|
# Will need to be re-run any time vendor/llama.cpp is updated
|
|
159
170
|
python3 setup.py develop
|
|
160
171
|
```
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
# Define the image argument and provide a default value
ARG IMAGE=python:3-slim-bullseye

# Use the image as specified
FROM ${IMAGE}

# Re-declare the ARG after FROM so it is visible to the build steps below
ARG IMAGE

# Persist the base image name into the container environment: ARG values only
# exist at build time, but start_server.sh compares $IMAGE at run time to pick
# between the OpenBLAS and CuBLAS launch commands.
ENV IMAGE=${IMAGE}

# Update and upgrade the existing packages
RUN apt-get update && apt-get upgrade -y && apt-get install -y --no-install-recommends \
    python3 \
    python3-pip \
    ninja-build \
    build-essential

RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette

# Perform the conditional installations based on the image
RUN echo "Image: ${IMAGE}" && \
    if [ "${IMAGE}" = "python:3-slim-bullseye" ] ; then \
    echo "OpenBLAS install:" && \
    apt-get install -y --no-install-recommends libopenblas-dev && \
    LLAMA_OPENBLAS=1 pip install llama-cpp-python --verbose; \
    else \
    echo "CuBLAS install:" && \
    LLAMA_CUBLAS=1 pip install llama-cpp-python --verbose; \
    fi

# Clean up apt cache
RUN rm -rf /var/lib/apt/lists/*

# Set a working directory for better clarity
WORKDIR /app

# Copy the model and the start script to the app directory
RUN echo "Installing model...this can take some time..."
COPY ./model.bin /app/model.bin
COPY ./start_server.sh /app/start_server.sh

# Make the server start script executable
RUN chmod +x /app/start_server.sh

# Set environment variable for the host
ENV HOST=0.0.0.0

# Expose a port for the server
EXPOSE 8000

# Run the server start script
CMD ["/bin/sh", "/app/start_server.sh"]
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
# Dockerfiles for building the llama-cpp-python server
|
|
2
|
+
- `Dockerfile.openblas_simple` - a simple Dockerfile for non-GPU OpenBLAS
|
|
3
|
+
- `Dockerfile.cuda_simple` - a simple Dockerfile for CUDA accelerated CuBLAS
|
|
4
|
+
- `hug_model.py` - a Python utility for interactively choosing and downloading the latest `5_1` quantized models from [huggingface.co/TheBloke](https://huggingface.co/TheBloke)
|
|
5
|
+
- `Dockerfile` - a single OpenBLAS and CuBLAS combined Dockerfile that automatically installs a previously downloaded model `model.bin`
|
|
6
|
+
|
|
7
|
+
# Get model from Hugging Face
|
|
8
|
+
`python3 ./hug_model.py`
|
|
9
|
+
|
|
10
|
+
You should now have a model in the current directory and `model.bin` symlinked to it for the subsequent Docker build and copy step. e.g.
|
|
11
|
+
```
|
|
12
|
+
docker $ ls -lh *.bin
|
|
13
|
+
-rw-rw-r-- 1 user user 4.8G May 23 18:30 <downloaded-model-file>.q5_1.bin
|
|
14
|
+
lrwxrwxrwx 1 user user 24 May 23 18:30 model.bin -> <downloaded-model-file>.q5_1.bin
|
|
15
|
+
```
|
|
16
|
+
**Note #1:** Make sure you have enough disk space to download the model. As the model is then copied into the image you will need at least
|
|
17
|
+
**TWICE** as much disk space as the size of the model:
|
|
18
|
+
|
|
19
|
+
| Model | Quantized size |
|
|
20
|
+
|------:|----------------:|
|
|
21
|
+
| 7B | 5 GB |
|
|
22
|
+
| 13B | 10 GB |
|
|
23
|
+
| 30B | 25 GB |
|
|
24
|
+
| 65B | 50 GB |
|
|
25
|
+
|
|
26
|
+
**Note #2:** If you want to pass or tune additional parameters, customise `./start_server.sh` before running `docker build ...`
|
|
27
|
+
|
|
28
|
+
# Install Docker Server
|
|
29
|
+
|
|
30
|
+
**Note #3:** This was tested with Docker running on Linux. If you can get it working on Windows or MacOS, please update this `README.md` with a PR!
|
|
31
|
+
|
|
32
|
+
[Install Docker Engine](https://docs.docker.com/engine/install)
|
|
33
|
+
|
|
34
|
+
# Use OpenBLAS
|
|
35
|
+
Use this if you don't have an NVIDIA GPU. It defaults to the `python:3-slim-bullseye` Docker base image and OpenBLAS:
|
|
36
|
+
## Build:
|
|
37
|
+
`docker build -t openblas .`
|
|
38
|
+
## Run:
|
|
39
|
+
`docker run --cap-add SYS_RESOURCE -t openblas`
|
|
40
|
+
|
|
41
|
+
# Use CuBLAS
|
|
42
|
+
Requires an NVIDIA GPU with sufficient VRAM (approximately as much as the quantized size listed above) and Docker NVIDIA support (see [container-toolkit/install-guide](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html))
|
|
43
|
+
## Build:
|
|
44
|
+
`docker build --build-arg IMAGE=nvidia/cuda:12.1.1-devel-ubuntu22.04 -t cublas .`
|
|
45
|
+
## Run:
|
|
46
|
+
`docker run --cap-add SYS_RESOURCE -t cublas`
|
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
import requests
|
|
2
|
+
import json
|
|
3
|
+
import os
|
|
4
|
+
import struct
|
|
5
|
+
|
|
6
|
+
def make_request(url, params=None, timeout=30):
    """Issue an HTTP GET and return the decoded JSON body.

    Args:
        url: Address to request.
        params: Optional query-string parameters passed to ``requests.get``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The parsed JSON document on HTTP 200, otherwise ``None``. Every
        failure (non-200 status, network error, body that is not JSON) is
        reported on stdout and mapped to ``None`` so callers have a single
        failure signal to check.
    """
    print(f"Making request to {url}...")
    try:
        response = requests.get(url, params=params, timeout=timeout)
    except requests.RequestException as exc:
        # Connection problems and timeouts follow the same "print and return
        # None" path as HTTP errors instead of aborting with a traceback.
        print(f"Request failed: {exc}")
        return None

    if response.status_code != 200:
        print(f"Request failed with status code {response.status_code}")
        return None

    try:
        return json.loads(response.text)
    except json.JSONDecodeError as exc:
        print(f"Request failed: response is not valid JSON ({exc})")
        return None
|
|
14
|
+
|
|
15
|
+
def check_magic_and_version(filename):
    """Read, print and return the magic number and version of a model file.

    The first 6 bytes of *filename* are decoded as a little-endian unsigned
    32-bit integer (magic) followed by a little-endian unsigned 16-bit
    integer (version).

    Args:
        filename: Path of the file to inspect.

    Returns:
        A ``(magic, version)`` tuple of ints.

    Raises:
        OSError: If the file cannot be opened.
        struct.error: If the file holds fewer bytes than the header needs.
    """
    # '<' selects little-endian with no padding, so the header is exactly
    # 4 + 2 bytes; deriving the size keeps it in step with the format.
    header_format = '<IH'
    header_size = struct.calcsize(header_format)

    with open(filename, 'rb') as f:
        data = f.read(header_size)

    if len(data) != header_size:
        # Same exception type as struct.unpack would raise, but naming the
        # file so a truncated download is easy to identify.
        raise struct.error(
            f"{filename}: expected a {header_size}-byte header, "
            f"got {len(data)} byte(s)"
        )

    magic, version = struct.unpack(header_format, data)
    print(f"magic: 0x{magic:08x}, version: 0x{version:04x}, file: {filename}")
    return magic, version
|
|
27
|
+
|
|
28
|
+
def download_file(url, destination):
    """Stream *url* into *destination* and point ``model.bin`` at it.

    Progress is shown as one dot per 10 MiB written. On a non-200 response a
    message is printed and nothing is written or linked.

    Args:
        url: Address of the file to download.
        destination: Local path the body is written to; also the target of
            the ``model.bin`` symlink created in the current directory.
    """
    print(f"Downloading {url} to {destination}...")

    # stream=True avoids loading the whole body into memory; the context
    # manager releases the connection even if writing fails part-way.
    with requests.get(url, stream=True) as response:
        if response.status_code != 200:
            print(f"Download failed with status code {response.status_code}")
            return

        with open(destination, 'wb') as f:
            pending = 0  # bytes written since the last progress dot
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                if chunk:  # filter out keep-alive new chunks
                    f.write(chunk)
                    pending += len(chunk)
                    if pending >= 10485760:  # 10 MB
                        print('.', end='', flush=True)
                        pending = 0
    print("\nDownload complete.")

    # Replace any previous "model.bin". islink() is checked as well because
    # isfile() follows symlinks and is False for a link whose target was
    # deleted, which would make os.symlink fail with FileExistsError.
    if os.path.islink("model.bin") or os.path.isfile("model.bin"):
        os.remove("model.bin")
    os.symlink(destination, "model.bin")
|
|
49
|
+
|
|
50
|
+
def get_user_choice(model_list):
    """Show a numbered menu of models and return the entry the user picks.

    Args:
        model_list: Sequence of ``(model_id, rfilename)`` tuples.

    Returns:
        The selected tuple, or ``None`` when the list is empty or the input
        is not a number within the menu's range.
    """
    if not model_list:
        # Nothing to choose from: do not prompt for a selection that can
        # never be valid.
        print("No matching models found.")
        return None

    print("\n")
    for number, (model_id, rfilename) in enumerate(model_list, start=1):
        print(f"{number}: Model ID: {model_id}, RFilename: {rfilename}")

    choice = input("Choose a model to download by entering the corresponding number: ")
    try:
        index = int(choice) - 1  # menu is 1-based, the list is 0-based
    except ValueError:
        print("Invalid input. Please enter a number corresponding to a model.")
        return None

    if 0 <= index < len(model_list):
        return model_list[index]

    print("Invalid choice.")
    return None
|
|
71
|
+
|
|
72
|
+
import argparse
|
|
73
|
+
|
|
74
|
+
def main():
    """Let the user pick and download a q5_1 model published by TheBloke.

    Lists TheBloke's models tagged ``llama`` through the Hugging Face API,
    collects every repository file whose name contains ``q5_1``, asks the
    user to choose one, downloads it and compares the version stored in the
    file header with the one given by ``-v/--version`` (default 3).
    """
    parser = argparse.ArgumentParser(description='Process the model version.')
    parser.add_argument('-v', '--version', type=int, default=0x0003,
                        help='an integer for the version to be used')
    args = parser.parse_args()

    params = {
        "author": "TheBloke",  # Filter by author
        "tags": "llama"
    }

    models = make_request('https://huggingface.co/api/models', params=params)
    if models is None:
        return

    # Collect (model_id, rfilename) pairs for every q5_1 file of every model.
    model_list = []
    for model in models:
        model_id = model['id']
        model_info = make_request(f'https://huggingface.co/api/models/{model_id}')
        if model_info is None:
            continue

        for sibling in model_info.get('siblings', []):
            rfilename = sibling.get('rfilename')
            if rfilename and 'q5_1' in rfilename:
                model_list.append((model_id, rfilename))

    model_choice = get_user_choice(model_list)
    if model_choice is None:
        return

    model_id, rfilename = model_choice
    url = f"https://huggingface.co/{model_id}/resolve/main/{rfilename}"
    download_file(url, rfilename)

    if not os.path.isfile(rfilename):
        # download_file prints its own message when the download fails; with
        # no file on disk there is no header to inspect.
        return

    _, version = check_magic_and_version(rfilename)
    if version != args.version:
        print(f"Warning: Expected version {args.version}, but found version {version} in the file.")
|
|
114
|
+
|
|
115
|
+
if __name__ == '__main__':
|
|
116
|
+
main()
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
#!/bin/sh

# Lift the locked-memory limit (the limit `ulimit -l` controls) so the server
# can keep the model resident in RAM. This needs the SYS_RESOURCE capability,
# which is why the README runs the container with --cap-add SYS_RESOURCE.
ulimit -l unlimited

# exec replaces this shell with the server process, so signals sent to the
# container (e.g. on `docker stop`) reach the server directly.
if [ "$IMAGE" = "python:3-slim-bullseye" ]; then
    exec python3 -B -m llama_cpp.server --model /app/model.bin
else
    # You may have to reduce --n_gpu_layers=1000 to 20 or less if you don't have enough VRAM
    exec python3 -B -m llama_cpp.server --model /app/model.bin --n_gpu_layers=1000
fi
|
{llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/examples/low_level_api/low_level_api_chat_cpp.py
RENAMED
|
@@ -368,10 +368,10 @@ n_keep = {self.params.n_keep}
|
|
|
368
368
|
id = llama_cpp.llama_sample_token_mirostat_v2(self.ctx, candidates_p, llama_cpp.c_float(self.params.mirostat_tau), llama_cpp.c_float(self.params.mirostat_eta), llama_cpp.c_float(mirostat_mu))
|
|
369
369
|
else:
|
|
370
370
|
# Temperature sampling
|
|
371
|
-
llama_cpp.llama_sample_top_k(self.ctx, candidates_p, top_k)
|
|
372
|
-
llama_cpp.llama_sample_tail_free(self.ctx, candidates_p, llama_cpp.c_float(self.params.tfs_z))
|
|
373
|
-
llama_cpp.llama_sample_typical(self.ctx, candidates_p, llama_cpp.c_float(self.params.typical_p))
|
|
374
|
-
llama_cpp.llama_sample_top_p(self.ctx, candidates_p, llama_cpp.c_float(self.params.top_p))
|
|
371
|
+
llama_cpp.llama_sample_top_k(self.ctx, candidates_p, top_k, min_keep=llama_cpp.c_size_t(1))
|
|
372
|
+
llama_cpp.llama_sample_tail_free(self.ctx, candidates_p, llama_cpp.c_float(self.params.tfs_z), min_keep=llama_cpp.c_size_t(1))
|
|
373
|
+
llama_cpp.llama_sample_typical(self.ctx, candidates_p, llama_cpp.c_float(self.params.typical_p), min_keep=llama_cpp.c_size_t(1))
|
|
374
|
+
llama_cpp.llama_sample_top_p(self.ctx, candidates_p, llama_cpp.c_float(self.params.top_p), min_keep=llama_cpp.c_size_t(1))
|
|
375
375
|
llama_cpp.llama_sample_temperature(self.ctx, candidates_p, llama_cpp.c_float(self.params.temp))
|
|
376
376
|
id = llama_cpp.llama_sample_token(self.ctx, candidates_p)
|
|
377
377
|
# print("`{}`".format(candidates_p.size))
|
|
@@ -382,12 +382,15 @@ n_keep = {self.params.n_keep}
|
|
|
382
382
|
# replace end of text token with newline token when in interactive mode
|
|
383
383
|
if (id == llama_cpp.llama_token_eos() and self.params.interactive and not self.params.instruct):
|
|
384
384
|
id = self.llama_token_newline[0]
|
|
385
|
+
self.embd.append(id)
|
|
385
386
|
if (self.use_antiprompt()):
|
|
386
387
|
# tokenize and inject first reverse prompt
|
|
387
388
|
self.embd_inp += self.first_antiprompt[0]
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
|
|
389
|
+
for id in self.first_antiprompt[0]:
|
|
390
|
+
self.embd.append(id)
|
|
391
|
+
else:
|
|
392
|
+
# add it to the context
|
|
393
|
+
self.embd.append(id)
|
|
391
394
|
|
|
392
395
|
# echo this to console
|
|
393
396
|
self.output_echo = True
|
|
@@ -493,7 +496,7 @@ n_keep = {self.params.n_keep}
|
|
|
493
496
|
# Contains multi-byte UTF8
|
|
494
497
|
for num, pattern in [(2, 192), (3, 224), (4, 240)]:
|
|
495
498
|
# Bitwise AND check
|
|
496
|
-
if pattern & int.from_bytes(cur_char) == pattern:
|
|
499
|
+
if pattern & int.from_bytes(cur_char, 'little') == pattern:
|
|
497
500
|
self.multibyte_fix = [cur_char] + ([None] * (num-1))
|
|
498
501
|
|
|
499
502
|
# Stop incomplete bytes from passing
|