llama-cpp-python 0.1.57__tar.gz → 0.1.59__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/.gitignore +3 -0
- llama_cpp_python-0.1.59/CHANGELOG.md +43 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/CMakeLists.txt +2 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/Makefile +6 -1
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/PKG-INFO +1 -1
- llama_cpp_python-0.1.59/docker/README.md +66 -0
- llama_cpp_python-0.1.57/docker/Dockerfile.cuda_simple → llama_cpp_python-0.1.59/docker/cuda_simple/Dockerfile +2 -2
- llama_cpp_python-0.1.59/docker/open_llama/build.sh +14 -0
- {llama_cpp_python-0.1.57/docker → llama_cpp_python-0.1.59/docker/open_llama}/hug_model.py +34 -11
- llama_cpp_python-0.1.59/docker/open_llama/start.sh +28 -0
- {llama_cpp_python-0.1.57/docker → llama_cpp_python-0.1.59/docker/open_llama}/start_server.sh +1 -1
- llama_cpp_python-0.1.57/docker/Dockerfile.openblas_simple → llama_cpp_python-0.1.59/docker/openblas_simple/Dockerfile +1 -1
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/llama_cpp/llama.py +264 -191
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/llama_cpp/llama_cpp.py +47 -15
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/llama_cpp/server/app.py +27 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/llama_cpp_python.egg-info/PKG-INFO +1 -1
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/llama_cpp_python.egg-info/SOURCES.txt +16 -6
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/llama_cpp_python.egg-info/requires.txt +1 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/poetry.lock +56 -18
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/pyproject.toml +6 -5
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/setup.py +2 -2
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/vendor/llama.cpp/.devops/tools.sh +2 -2
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/vendor/llama.cpp/.github/workflows/tidy-post.yml +1 -1
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/vendor/llama.cpp/.gitignore +3 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/vendor/llama.cpp/CMakeLists.txt +53 -15
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/vendor/llama.cpp/Makefile +48 -14
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/vendor/llama.cpp/README.md +45 -14
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/vendor/llama.cpp/convert-pth-to-ggml.py +3 -1
- llama_cpp_python-0.1.59/vendor/llama.cpp/docs/token_generation_performance_tips.md +40 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/vendor/llama.cpp/examples/CMakeLists.txt +4 -1
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/vendor/llama.cpp/examples/common.cpp +47 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/vendor/llama.cpp/examples/common.h +11 -7
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/vendor/llama.cpp/examples/main/README.md +2 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/vendor/llama.cpp/examples/main/main.cpp +16 -8
- llama_cpp_python-0.1.59/vendor/llama.cpp/examples/metal/CMakeLists.txt +3 -0
- llama_cpp_python-0.1.59/vendor/llama.cpp/examples/metal/metal.cpp +102 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/vendor/llama.cpp/examples/quantize/quantize.cpp +17 -5
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/vendor/llama.cpp/examples/quantize-stats/quantize-stats.cpp +3 -2
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/vendor/llama.cpp/examples/server/README.md +2 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/vendor/llama.cpp/examples/server/server.cpp +48 -0
- llama_cpp_python-0.1.59/vendor/llama.cpp/flake.lock +61 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/vendor/llama.cpp/flake.nix +18 -8
- llama_cpp_python-0.1.59/vendor/llama.cpp/ggml-cuda.cu +1907 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/vendor/llama.cpp/ggml-cuda.h +15 -2
- llama_cpp_python-0.1.59/vendor/llama.cpp/ggml-metal.h +63 -0
- llama_cpp_python-0.1.59/vendor/llama.cpp/ggml-metal.m +691 -0
- llama_cpp_python-0.1.59/vendor/llama.cpp/ggml-metal.metal +505 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/vendor/llama.cpp/ggml-opencl.cpp +226 -39
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/vendor/llama.cpp/ggml-opencl.h +2 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/vendor/llama.cpp/ggml.c +326 -106
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/vendor/llama.cpp/ggml.h +44 -6
- llama_cpp_python-0.1.59/vendor/llama.cpp/k_quants.c +2246 -0
- llama_cpp_python-0.1.59/vendor/llama.cpp/k_quants.h +122 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/vendor/llama.cpp/llama-util.h +16 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/vendor/llama.cpp/llama.cpp +395 -120
- {llama_cpp_python-0.1.57/vendor/llama.cpp/spm-headers → llama_cpp_python-0.1.59/vendor/llama.cpp}/llama.h +29 -4
- {llama_cpp_python-0.1.57/vendor/llama.cpp → llama_cpp_python-0.1.59/vendor/llama.cpp/spm-headers}/llama.h +29 -4
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/vendor/llama.cpp/tests/test-quantize-fns.cpp +6 -1
- llama_cpp_python-0.1.57/CHANGELOG.md +0 -20
- llama_cpp_python-0.1.57/docker/README.md +0 -46
- llama_cpp_python-0.1.57/vendor/llama.cpp/flake.lock +0 -43
- llama_cpp_python-0.1.57/vendor/llama.cpp/ggml-cuda.cu +0 -957
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/.dockerignore +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/.github/ISSUE_TEMPLATE/bug_report.md +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/.github/ISSUE_TEMPLATE/feature_request.md +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/.github/dependabot.yml +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/.github/workflows/build-and-release.yaml +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/.github/workflows/build-docker.yaml +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/.github/workflows/publish-to-test.yaml +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/.github/workflows/publish.yaml +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/.github/workflows/test.yaml +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/.gitmodules +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/LICENSE.md +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/README.md +0 -0
- {llama_cpp_python-0.1.57/docker → llama_cpp_python-0.1.59/docker/open_llama}/Dockerfile +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/docs/index.md +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/examples/high_level_api/fastapi_server.py +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/examples/high_level_api/high_level_api_embedding.py +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/examples/high_level_api/high_level_api_inference.py +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/examples/high_level_api/high_level_api_streaming.py +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/examples/high_level_api/langchain_custom_llm.py +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/examples/low_level_api/Chat.py +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/examples/low_level_api/Miku.py +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/examples/low_level_api/ReasonAct.py +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/examples/low_level_api/common.py +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/examples/low_level_api/low_level_api_chat_cpp.py +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/examples/low_level_api/low_level_api_llama_cpp.py +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/examples/low_level_api/quantize.py +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/examples/low_level_api/util.py +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/examples/notebooks/Clients.ipynb +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/examples/notebooks/Guidance.ipynb +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/examples/notebooks/PerformanceTuning.ipynb +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/llama_cpp/__init__.py +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/llama_cpp/llama_types.py +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/llama_cpp/server/__init__.py +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/llama_cpp/server/__main__.py +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/llama_cpp_python.egg-info/dependency_links.txt +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/llama_cpp_python.egg-info/top_level.txt +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/mkdocs.yml +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/poetry.toml +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/setup.cfg +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/tests/test_llama.py +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/vendor/llama.cpp/.clang-tidy +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/vendor/llama.cpp/.devops/full.Dockerfile +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/vendor/llama.cpp/.devops/main.Dockerfile +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/vendor/llama.cpp/.dockerignore +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/vendor/llama.cpp/.ecrc +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/vendor/llama.cpp/.editorconfig +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/vendor/llama.cpp/.github/ISSUE_TEMPLATE/custom.md +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/vendor/llama.cpp/.github/workflows/build.yml +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/vendor/llama.cpp/.github/workflows/docker.yml +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/vendor/llama.cpp/.github/workflows/editorconfig.yml +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/vendor/llama.cpp/.github/workflows/tidy-review.yml +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/vendor/llama.cpp/LICENSE +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/vendor/llama.cpp/Package.swift +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/vendor/llama.cpp/SHA256SUMS +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/vendor/llama.cpp/build.zig +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/vendor/llama.cpp/convert-lora-to-ggml.py +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/vendor/llama.cpp/convert.py +0 -0
- {llama_cpp_python-0.1.57/vendor/llama.cpp → llama_cpp_python-0.1.59/vendor/llama.cpp/docs}/BLIS.md +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/vendor/llama.cpp/examples/Miku.sh +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/vendor/llama.cpp/examples/alpaca.sh +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/vendor/llama.cpp/examples/baby-llama/CMakeLists.txt +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/vendor/llama.cpp/examples/baby-llama/baby-llama.cpp +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/vendor/llama.cpp/examples/benchmark/CMakeLists.txt +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/vendor/llama.cpp/examples/benchmark/benchmark-matmult.cpp +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/vendor/llama.cpp/examples/chat-13B.bat +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/vendor/llama.cpp/examples/chat-13B.sh +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/vendor/llama.cpp/examples/chat-persistent.sh +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/vendor/llama.cpp/examples/chat.sh +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/vendor/llama.cpp/examples/embedding/CMakeLists.txt +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/vendor/llama.cpp/examples/embedding/README.md +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/vendor/llama.cpp/examples/embedding/embedding.cpp +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/vendor/llama.cpp/examples/gpt4all.sh +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/vendor/llama.cpp/examples/jeopardy/README.md +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/vendor/llama.cpp/examples/jeopardy/graph.py +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/vendor/llama.cpp/examples/jeopardy/jeopardy.sh +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/vendor/llama.cpp/examples/jeopardy/qasheet.csv +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/vendor/llama.cpp/examples/jeopardy/questions.txt +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/vendor/llama.cpp/examples/main/CMakeLists.txt +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/vendor/llama.cpp/examples/perplexity/CMakeLists.txt +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/vendor/llama.cpp/examples/perplexity/README.md +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/vendor/llama.cpp/examples/perplexity/perplexity.cpp +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/vendor/llama.cpp/examples/quantize/CMakeLists.txt +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/vendor/llama.cpp/examples/quantize/README.md +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/vendor/llama.cpp/examples/quantize-stats/CMakeLists.txt +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/vendor/llama.cpp/examples/reason-act.sh +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/vendor/llama.cpp/examples/save-load-state/CMakeLists.txt +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/vendor/llama.cpp/examples/save-load-state/save-load-state.cpp +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/vendor/llama.cpp/examples/server/CMakeLists.txt +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/vendor/llama.cpp/examples/server/httplib.h +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/vendor/llama.cpp/examples/server/json.hpp +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/vendor/llama.cpp/media/llama-leader.jpeg +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/vendor/llama.cpp/media/llama0-banner.png +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/vendor/llama.cpp/media/llama0-logo.png +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/vendor/llama.cpp/media/llama1-banner.png +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/vendor/llama.cpp/media/llama1-logo.png +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/vendor/llama.cpp/models/ggml-vocab.bin +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/vendor/llama.cpp/pocs/CMakeLists.txt +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/vendor/llama.cpp/pocs/vdot/CMakeLists.txt +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/vendor/llama.cpp/pocs/vdot/q8dot.cpp +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/vendor/llama.cpp/pocs/vdot/vdot.cpp +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/vendor/llama.cpp/prompts/alpaca.txt +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/vendor/llama.cpp/prompts/chat-with-bob.txt +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/vendor/llama.cpp/prompts/chat-with-vicuna-v0.txt +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/vendor/llama.cpp/prompts/chat-with-vicuna-v1.txt +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/vendor/llama.cpp/prompts/chat.txt +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/vendor/llama.cpp/prompts/dan-modified.txt +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/vendor/llama.cpp/prompts/dan.txt +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/vendor/llama.cpp/prompts/reason-act.txt +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/vendor/llama.cpp/requirements.txt +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/vendor/llama.cpp/scripts/build-info.cmake +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/vendor/llama.cpp/scripts/build-info.h.in +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/vendor/llama.cpp/scripts/build-info.sh +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/vendor/llama.cpp/scripts/perf-run-all.sh +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/vendor/llama.cpp/scripts/ppl-run-all.sh +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/vendor/llama.cpp/scripts/sync-ggml.sh +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/vendor/llama.cpp/scripts/verify-checksum-models.py +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/vendor/llama.cpp/tests/CMakeLists.txt +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/vendor/llama.cpp/tests/test-double-float.c +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/vendor/llama.cpp/tests/test-grad0.c +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/vendor/llama.cpp/tests/test-opt.c +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/vendor/llama.cpp/tests/test-quantize-perf.cpp +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/vendor/llama.cpp/tests/test-sampling.cpp +0 -0
- {llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/vendor/llama.cpp/tests/test-tokenizer-0.cpp +0 -0
|
@@ -164,3 +164,6 @@ cython_debug/
|
|
|
164
164
|
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
|
165
165
|
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
|
166
166
|
.idea/
|
|
167
|
+
|
|
168
|
+
# downloaded model .bin files
|
|
169
|
+
docker/open_llama/*.bin
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to this project will be documented in this file.
|
|
4
|
+
|
|
5
|
+
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
|
6
|
+
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
|
+
|
|
8
|
+
## [Unreleased]
|
|
9
|
+
|
|
10
|
+
## [v0.1.59]
|
|
11
|
+
|
|
12
|
+
### Added
|
|
13
|
+
|
|
14
|
+
- (llama.cpp) k-quants support
|
|
15
|
+
- (server) mirostat sampling parameters to server
|
|
16
|
+
|
|
17
|
+
### Fixed
|
|
18
|
+
|
|
19
|
+
- Support both `.so` and `.dylib` for `libllama` on MacOS
|
|
20
|
+
|
|
21
|
+
## [v0.1.58]
|
|
22
|
+
|
|
23
|
+
### Added
|
|
24
|
+
|
|
25
|
+
- (llama.cpp) Metal Silicon support
|
|
26
|
+
|
|
27
|
+
## [v0.1.57]
|
|
28
|
+
|
|
29
|
+
### Added
|
|
30
|
+
|
|
31
|
+
- (llama.cpp) OpenLlama 3B support
|
|
32
|
+
|
|
33
|
+
## [v0.1.56]
|
|
34
|
+
|
|
35
|
+
### Added
|
|
36
|
+
|
|
37
|
+
- (misc) Added first version of the changelog
|
|
38
|
+
- (server) Use async routes
|
|
39
|
+
- (python-api) Use numpy for internal buffers to reduce memory usage and improve performance.
|
|
40
|
+
|
|
41
|
+
### Fixed
|
|
42
|
+
|
|
43
|
+
- (python-api) Performance bug in stop sequence check slowing down streaming.
|
|
@@ -20,6 +20,9 @@ build.openblas:
|
|
|
20
20
|
build.blis:
|
|
21
21
|
CMAKE_ARGS="-DLLAMA_OPENBLAS=on -DLLAMA_OPENBLAS_VENDOR=blis" FORCE_CMAKE=1 python3 setup.py develop
|
|
22
22
|
|
|
23
|
+
build.metal:
|
|
24
|
+
CMAKE_ARGS="-DLLAMA_METAL=on" FORCE_CMAKE=1 python3 setup.py develop
|
|
25
|
+
|
|
23
26
|
build.sdist:
|
|
24
27
|
python3 setup.py sdist
|
|
25
28
|
|
|
@@ -34,7 +37,9 @@ clean:
|
|
|
34
37
|
- cd vendor/llama.cpp && make clean
|
|
35
38
|
- cd vendor/llama.cpp && rm libllama.so
|
|
36
39
|
- rm -rf _skbuild
|
|
37
|
-
- rm llama_cpp
|
|
40
|
+
- rm llama_cpp/*.so
|
|
41
|
+
- rm llama_cpp/*.dylib
|
|
42
|
+
- rm llama_cpp/*.dll
|
|
38
43
|
|
|
39
44
|
.PHONY: \
|
|
40
45
|
update \
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
# Install Docker Server
|
|
2
|
+
|
|
3
|
+
**Note #1:** This was tested with Docker running on Linux. If you can get it working on Windows or MacOS, please update this `README.md` with a PR!
|
|
4
|
+
|
|
5
|
+
[Install Docker Engine](https://docs.docker.com/engine/install)
|
|
6
|
+
|
|
7
|
+
**Note #2:** NVidia GPU CuBLAS support requires an NVidia GPU with sufficient VRAM (approximately as much as the size in the table below) and Docker NVidia support (see [container-toolkit/install-guide](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html)).
|
|
8
|
+
|
|
9
|
+
# Simple Dockerfiles for building the llama-cpp-python server with external model bin files
|
|
10
|
+
## openblas_simple - a simple Dockerfile for non-GPU OpenBLAS, where the model is located outside the Docker image
|
|
11
|
+
```
|
|
12
|
+
cd ./openblas_simple
|
|
13
|
+
docker build -t openblas_simple .
|
|
14
|
+
docker run -e USE_MLOCK=0 -e MODEL=/var/model/<model-path> -v <model-root-path>:/var/model -t openblas_simple
|
|
15
|
+
```
|
|
16
|
+
where `<model-root-path>/<model-path>` is the full path to the model file on the Docker host system.
|
|
17
|
+
|
|
18
|
+
## cuda_simple - a simple Dockerfile for CUDA accelerated CuBLAS, where the model is located outside the Docker image
|
|
19
|
+
```
|
|
20
|
+
cd ./cuda_simple
|
|
21
|
+
docker build -t cuda_simple .
|
|
22
|
+
docker run -e USE_MLOCK=0 -e MODEL=/var/model/<model-path> -v <model-root-path>:/var/model -t cuda_simple
|
|
23
|
+
```
|
|
24
|
+
where `<model-root-path>/<model-path>` is the full path to the model file on the Docker host system.
|
|
25
|
+
|
|
26
|
+
# "Open-Llama-in-a-box"
|
|
27
|
+
## Download an Apache V2.0 licensed 3B parameter Open Llama model and install it into a Docker image that runs an OpenBLAS-enabled llama-cpp-python server
|
|
28
|
+
```
|
|
29
|
+
$ cd ./open_llama
|
|
30
|
+
./build.sh
|
|
31
|
+
./start.sh
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
# Manually choose your own Llama model from Hugging Face
|
|
35
|
+
`python3 ./hug_model.py -a TheBloke -t llama`
|
|
36
|
+
You should now have a model in the current directory and `model.bin` symlinked to it for the subsequent Docker build and copy step, e.g.:
|
|
37
|
+
```
|
|
38
|
+
docker $ ls -lh *.bin
|
|
39
|
+
-rw-rw-r-- 1 user user 4.8G May 23 18:30 <downloaded-model-file>q5_1.bin
|
|
40
|
+
lrwxrwxrwx 1 user user 24 May 23 18:30 model.bin -> <downloaded-model-file>q5_1.bin
|
|
41
|
+
```
|
|
42
|
+
**Note #1:** Make sure you have enough disk space to download the model. As the model is then copied into the image you will need at least
|
|
43
|
+
**TWICE** as much disk space as the size of the model:
|
|
44
|
+
|
|
45
|
+
| Model | Quantized size |
|
|
46
|
+
|------:|----------------:|
|
|
47
|
+
| 3B | 3 GB |
|
|
48
|
+
| 7B | 5 GB |
|
|
49
|
+
| 13B | 10 GB |
|
|
50
|
+
| 33B | 25 GB |
|
|
51
|
+
| 65B | 50 GB |
|
|
52
|
+
|
|
53
|
+
**Note #2:** If you want to pass or tune additional parameters, customise `./start_server.sh` before running `docker build ...`
|
|
54
|
+
|
|
55
|
+
## Use OpenBLAS
|
|
56
|
+
Use this if you don't have an NVidia GPU. Defaults to `python:3-slim-bullseye` Docker base image and OpenBLAS:
|
|
57
|
+
### Build:
|
|
58
|
+
`docker build -t openblas .`
|
|
59
|
+
### Run:
|
|
60
|
+
`docker run --cap-add SYS_RESOURCE -t openblas`
|
|
61
|
+
|
|
62
|
+
## Use CuBLAS
|
|
63
|
+
### Build:
|
|
64
|
+
`docker build --build-arg IMAGE=nvidia/cuda:12.1.1-devel-ubuntu22.04 -t cublas .`
|
|
65
|
+
### Run:
|
|
66
|
+
`docker run --cap-add SYS_RESOURCE -t cublas`
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
ARG CUDA_IMAGE="12.1.1-devel-ubuntu22.04"
|
|
2
|
-
FROM
|
|
2
|
+
FROM nvidia/cuda:${CUDA_IMAGE}
|
|
3
3
|
|
|
4
4
|
# We need to set the host to 0.0.0.0 to allow outside access
|
|
5
5
|
ENV HOST 0.0.0.0
|
|
@@ -10,7 +10,7 @@ COPY . .
|
|
|
10
10
|
RUN apt update && apt install -y python3 python3-pip
|
|
11
11
|
RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette
|
|
12
12
|
|
|
13
|
-
RUN LLAMA_CUBLAS=1
|
|
13
|
+
RUN LLAMA_CUBLAS=1 pip install llama-cpp-python
|
|
14
14
|
|
|
15
15
|
# Run the server
|
|
16
16
|
CMD python3 -m llama_cpp.server
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
#!/bin/sh
|
|
2
|
+
|
|
3
|
+
MODEL="open_llama_3b"
|
|
4
|
+
# Get open_llama_3b_ggml q5_1 quantization
|
|
5
|
+
python3 ./hug_model.py -a SlyEcho -s ${MODEL} -f "q5_1"
|
|
6
|
+
ls -lh *.bin
|
|
7
|
+
|
|
8
|
+
# Build the default OpenBLAS image
|
|
9
|
+
docker build -t $MODEL .
|
|
10
|
+
docker images | egrep "^(REPOSITORY|$MODEL)"
|
|
11
|
+
|
|
12
|
+
echo
|
|
13
|
+
echo "To start the docker container run:"
|
|
14
|
+
echo "docker run -t -p 8000:8000 $MODEL"
|
|
@@ -2,6 +2,7 @@ import requests
|
|
|
2
2
|
import json
|
|
3
3
|
import os
|
|
4
4
|
import struct
|
|
5
|
+
import argparse
|
|
5
6
|
|
|
6
7
|
def make_request(url, params=None):
|
|
7
8
|
print(f"Making request to {url}...")
|
|
@@ -69,21 +70,30 @@ def get_user_choice(model_list):
|
|
|
69
70
|
|
|
70
71
|
return None
|
|
71
72
|
|
|
72
|
-
import argparse
|
|
73
|
-
|
|
74
73
|
def main():
|
|
75
74
|
# Create an argument parser
|
|
76
|
-
parser = argparse.ArgumentParser(description='Process
|
|
75
|
+
parser = argparse.ArgumentParser(description='Process some parameters.')
|
|
76
|
+
|
|
77
|
+
# Arguments
|
|
77
78
|
parser.add_argument('-v', '--version', type=int, default=0x0003,
|
|
78
|
-
help='
|
|
79
|
+
help='hexadecimal version number of ggml file')
|
|
80
|
+
parser.add_argument('-a', '--author', type=str, default='TheBloke',
|
|
81
|
+
help='HuggingFace author filter')
|
|
82
|
+
parser.add_argument('-t', '--tag', type=str, default='llama',
|
|
83
|
+
help='HuggingFace tag filter')
|
|
84
|
+
parser.add_argument('-s', '--search', type=str, default='',
|
|
85
|
+
help='HuggingFace search filter')
|
|
86
|
+
parser.add_argument('-f', '--filename', type=str, default='q5_1',
|
|
87
|
+
help='HuggingFace model repository filename substring match')
|
|
79
88
|
|
|
80
89
|
# Parse the arguments
|
|
81
90
|
args = parser.parse_args()
|
|
82
91
|
|
|
83
92
|
# Define the parameters
|
|
84
93
|
params = {
|
|
85
|
-
"author":
|
|
86
|
-
"tags":
|
|
94
|
+
"author": args.author,
|
|
95
|
+
"tags": args.tag,
|
|
96
|
+
"search": args.search
|
|
87
97
|
}
|
|
88
98
|
|
|
89
99
|
models = make_request('https://huggingface.co/api/models', params=params)
|
|
@@ -100,17 +110,30 @@ def main():
|
|
|
100
110
|
|
|
101
111
|
for sibling in model_info.get('siblings', []):
|
|
102
112
|
rfilename = sibling.get('rfilename')
|
|
103
|
-
if rfilename and
|
|
113
|
+
if rfilename and args.filename in rfilename:
|
|
104
114
|
model_list.append((model_id, rfilename))
|
|
105
115
|
|
|
106
|
-
|
|
116
|
+
# Choose the model
|
|
117
|
+
model_list.sort(key=lambda x: x[0])
|
|
118
|
+
if len(model_list) == 0:
|
|
119
|
+
print("No models found")
|
|
120
|
+
exit(1)
|
|
121
|
+
elif len(model_list) == 1:
|
|
122
|
+
model_choice = model_list[0]
|
|
123
|
+
else:
|
|
124
|
+
model_choice = get_user_choice(model_list)
|
|
125
|
+
|
|
107
126
|
if model_choice is not None:
|
|
108
127
|
model_id, rfilename = model_choice
|
|
109
128
|
url = f"https://huggingface.co/{model_id}/resolve/main/{rfilename}"
|
|
110
|
-
|
|
111
|
-
|
|
129
|
+
dest = f"{model_id.replace('/', '_')}_{rfilename}"
|
|
130
|
+
download_file(url, dest)
|
|
131
|
+
_, version = check_magic_and_version(dest)
|
|
112
132
|
if version != args.version:
|
|
113
|
-
|
|
133
|
+
print(f"Warning: Expected version {args.version}, but found different version in the file.")
|
|
134
|
+
else:
|
|
135
|
+
print("Error - model choice was None")
|
|
136
|
+
exit(2)
|
|
114
137
|
|
|
115
138
|
if __name__ == '__main__':
|
|
116
139
|
main()
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
#!/bin/sh
|
|
2
|
+
|
|
3
|
+
MODEL="open_llama_3b"
|
|
4
|
+
|
|
5
|
+
# Start Docker container
|
|
6
|
+
docker run --cap-add SYS_RESOURCE -p 8000:8000 -t $MODEL &
|
|
7
|
+
sleep 10
|
|
8
|
+
echo
|
|
9
|
+
docker ps | egrep "(^CONTAINER|$MODEL)"
|
|
10
|
+
|
|
11
|
+
# Test the model works
|
|
12
|
+
echo
|
|
13
|
+
curl -X 'POST' 'http://localhost:8000/v1/completions' -H 'accept: application/json' -H 'Content-Type: application/json' -d '{
|
|
14
|
+
"prompt": "\n\n### Instructions:\nWhat is the capital of France?\n\n### Response:\n",
|
|
15
|
+
"stop": [
|
|
16
|
+
"\n",
|
|
17
|
+
"###"
|
|
18
|
+
]
|
|
19
|
+
}' | grep Paris
|
|
20
|
+
if [ $? -eq 0 ]
|
|
21
|
+
then
|
|
22
|
+
echo
|
|
23
|
+
echo "$MODEL is working!!"
|
|
24
|
+
else
|
|
25
|
+
echo
|
|
26
|
+
echo "ERROR: $MODEL not replying."
|
|
27
|
+
exit 1
|
|
28
|
+
fi
|
|
@@ -9,7 +9,7 @@ COPY . .
|
|
|
9
9
|
RUN apt update && apt install -y libopenblas-dev ninja-build build-essential
|
|
10
10
|
RUN python -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette
|
|
11
11
|
|
|
12
|
-
RUN LLAMA_OPENBLAS=1
|
|
12
|
+
RUN LLAMA_OPENBLAS=1 pip install llama_cpp_python --verbose
|
|
13
13
|
|
|
14
14
|
# Run the server
|
|
15
15
|
CMD python3 -m llama_cpp.server
|