yta-fastapi-docker-llamacpp 0.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- yta_fastapi_docker_llamacpp-0.0.1/LICENSE +19 -0
- yta_fastapi_docker_llamacpp-0.0.1/PKG-INFO +46 -0
- yta_fastapi_docker_llamacpp-0.0.1/README.md +27 -0
- yta_fastapi_docker_llamacpp-0.0.1/pyproject.toml +37 -0
- yta_fastapi_docker_llamacpp-0.0.1/src/yta_fastapi_docker_llamacpp/__init__.py +0 -0
- yta_fastapi_docker_llamacpp-0.0.1/src/yta_fastapi_docker_llamacpp/app/__init__.py +0 -0
- yta_fastapi_docker_llamacpp-0.0.1/src/yta_fastapi_docker_llamacpp/app/llamacpp/__init__.py +0 -0
- yta_fastapi_docker_llamacpp-0.0.1/src/yta_fastapi_docker_llamacpp/app/llamacpp/gemma4.py +66 -0
- yta_fastapi_docker_llamacpp-0.0.1/src/yta_fastapi_docker_llamacpp/app/main.py +10 -0
- yta_fastapi_docker_llamacpp-0.0.1/src/yta_fastapi_docker_llamacpp/app/routers/__init__.py +24 -0
- yta_fastapi_docker_llamacpp-0.0.1/src/yta_fastapi_docker_llamacpp/app/routers/llamacpp.py +36 -0
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
Copyright (c) 2018 The Python Packaging Authority
|
|
2
|
+
|
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
4
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
5
|
+
in the Software without restriction, including without limitation the rights
|
|
6
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
7
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
8
|
+
furnished to do so, subject to the following conditions:
|
|
9
|
+
|
|
10
|
+
The above copyright notice and this permission notice shall be included in all
|
|
11
|
+
copies or substantial portions of the Software.
|
|
12
|
+
|
|
13
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
14
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
15
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
16
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
17
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
18
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
19
|
+
SOFTWARE.
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: yta-fastapi-docker-llamacpp
|
|
3
|
+
Version: 0.0.1
|
|
4
|
+
Summary: Youtube Autonomous FastAPI Docker Llama.cpp Module
|
|
5
|
+
License-File: LICENSE
|
|
6
|
+
Author: danialcala94
|
|
7
|
+
Author-email: danielalcalavalera@gmail.com
|
|
8
|
+
Requires-Python: >=3.10,<3.14
|
|
9
|
+
Classifier: Programming Language :: Python :: 3
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
14
|
+
Requires-Dist: fastapi (>=0.0.1,<9999.0.0)
|
|
15
|
+
Requires-Dist: uvicorn (>=0.0.1,<9999.0.0)
|
|
16
|
+
Requires-Dist: yta_fastapi_docker_pydantic_models (>=0.0.4,<1.0.0)
|
|
17
|
+
Requires-Dist: yta_httpx (>=0.0.27,<1.0.0)
|
|
18
|
+
Description-Content-Type: text/markdown
|
|
19
|
+
|
|
20
|
+
# Youtube Autonomous FastAPI Docker Llama.cpp Module
|
|
21
|
+
|
|
22
|
+
The module that is providing the functionality related to the Llama.cpp models hub (having the models and using them) through a FastAPI that is included and isolated in a Docker container.
|
|
23
|
+
|
|
24
|
+
This module is meant to be exposed as a container inside the internal network, to be connected with its own FastAPI that is exposing the functionality outside.
|
|
25
|
+
|
|
26
|
+
### Endpoints
|
|
27
|
+
|
|
28
|
+
#### GET
|
|
29
|
+
- `/check-status`: returns a fixed JSON status message.
|
|
30
|
+
|
|
31
|
+
#### POST
|
|
32
|
+
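- `/llamacpp/gemma4`: receives a mandatory `prompt` and an optional `schema`, asks the Gemma4 model and returns its answer.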
|
|
33
|
+
|
|
34
|
+
## Instructions
|
|
35
|
+
I followed these steps to make `llama.cpp` available on my laptop as a container running with CUDA, and I've adapted this workflow to this project so it's done automatically:
|
|
36
|
+
|
|
37
|
+
1. Nos aseguramos de tener la imagen de Nvidia en docker:
|
|
38
|
+
`$docker run --rm --gpus all nvidia/cuda:12.9.1-runtime-ubuntu24.04 nvidia-smi`
|
|
39
|
+
|
|
40
|
+
2. Creamos una carpeta `models` para tener los modelos ahí guardados (en mi caso en un SSD externo para ahorrar espacio) en `D:/llama/models`.
|
|
41
|
+
|
|
42
|
+
3. Descargamos el modelo GGUF que necesitemos (para ello, ver qué tipo y qué características en función de nuestro PC), en cmd desde la carpeta `models` del paso 2:
|
|
43
|
+
`$huggingface-cli download unsloth/gemma-4-E2B-it-GGUF gemma-4-E2B-it-UD-Q4_K_XL.gguf --local-dir ./`
|
|
44
|
+
|
|
45
|
+
4. Descargamos el contenedor 'llama.cpp' adaptado a CUDA, estando en el cmd de la carpeta `models`:
|
|
46
|
+
`$docker run --rm --gpus all -p 8080:8080 -v "${PWD}:/models" ghcr.io/ggml-org/llama.cpp:server-cuda -m /models/gemma-4-E2B-it-UD-Q4_K_XL.gguf --host 0.0.0.0 -ngl 999`
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
# Youtube Autonomous FastAPI Docker Llama.cpp Module
|
|
2
|
+
|
|
3
|
+
The module that is providing the functionality related to the Llama.cpp models hub (having the models and using them) through a FastAPI that is included and isolated in a Docker container.
|
|
4
|
+
|
|
5
|
+
This module is meant to be exposed as a container inside the internal network, to be connected with its own FastAPI that is exposing the functionality outside.
|
|
6
|
+
|
|
7
|
+
### Endpoints
|
|
8
|
+
|
|
9
|
+
#### GET
|
|
10
|
+
- `/check-status`: returns a fixed JSON status message.
|
|
11
|
+
|
|
12
|
+
#### POST
|
|
13
|
+
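- `/llamacpp/gemma4`: receives a mandatory `prompt` and an optional `schema`, asks the Gemma4 model and returns its answer.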
|
|
14
|
+
|
|
15
|
+
## Instructions
|
|
16
|
+
I followed these steps to make `llama.cpp` available on my laptop as a container running with CUDA, and I've adapted this workflow to this project so it's done automatically:
|
|
17
|
+
|
|
18
|
+
1. Nos aseguramos de tener la imagen de Nvidia en docker:
|
|
19
|
+
`$docker run --rm --gpus all nvidia/cuda:12.9.1-runtime-ubuntu24.04 nvidia-smi`
|
|
20
|
+
|
|
21
|
+
2. Creamos una carpeta `models` para tener los modelos ahí guardados (en mi caso en un SSD externo para ahorrar espacio) en `D:/llama/models`.
|
|
22
|
+
|
|
23
|
+
3. Descargamos el modelo GGUF que necesitemos (para ello, ver qué tipo y qué características en función de nuestro PC), en cmd desde la carpeta `models` del paso 2:
|
|
24
|
+
`$huggingface-cli download unsloth/gemma-4-E2B-it-GGUF gemma-4-E2B-it-UD-Q4_K_XL.gguf --local-dir ./`
|
|
25
|
+
|
|
26
|
+
4. Descargamos el contenedor 'llama.cpp' adaptado a CUDA, estando en el cmd de la carpeta `models`:
|
|
27
|
+
`$docker run --rm --gpus all -p 8080:8080 -v "${PWD}:/models" ghcr.io/ggml-org/llama.cpp:server-cuda -m /models/gemma-4-E2B-it-UD-Q4_K_XL.gguf --host 0.0.0.0 -ngl 999`
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "yta-fastapi-docker-llamacpp"
|
|
3
|
+
version = "0.0.1"
|
|
4
|
+
description = "Youtube Autonomous FastAPI Docker Llama.cpp Module"
|
|
5
|
+
authors = [
|
|
6
|
+
{name = "danialcala94", email = "danielalcalavalera@gmail.com"}
|
|
7
|
+
]
|
|
8
|
+
readme = "README.md"
|
|
9
|
+
# 3.12.11 is working
|
|
10
|
+
requires-python = ">=3.10,<3.14"
|
|
11
|
+
|
|
12
|
+
[tool.poetry.dependencies]
|
|
13
|
+
# Mandatory
|
|
14
|
+
fastapi = { version = ">=0.0.1,<9999.0.0", optional = false }
|
|
15
|
+
uvicorn = { version = ">=0.0.1,<9999.0.0", optional = false }
|
|
16
|
+
yta_httpx = { version = ">=0.0.27,<1.0.0", optional = false }
|
|
17
|
+
yta_fastapi_docker_pydantic_models = { version = ">=0.0.4,<1.0.0", optional = false }
|
|
18
|
+
# Optional
|
|
19
|
+
|
|
20
|
+
[tool.poetry]
|
|
21
|
+
packages = [{include = "yta_fastapi_docker_llamacpp", from = "src"}]
|
|
22
|
+
|
|
23
|
+
[tool.poetry.group.dev.dependencies]
|
|
24
|
+
pytest = "^8.3.5"
|
|
25
|
+
pytest-asyncio = ">=1.3.0"
|
|
26
|
+
httpx = ">=0.0.1"
|
|
27
|
+
yta_testing = ">=0.0.1"
|
|
28
|
+
|
|
29
|
+
[tool.pytest.ini_options]
|
|
30
|
+
markers = [
|
|
31
|
+
"mandatory: mandatory tests for release",
|
|
32
|
+
"additional: exhaustive and demanding tests"
|
|
33
|
+
]
|
|
34
|
+
|
|
35
|
+
[build-system]
|
|
36
|
+
requires = ["poetry-core>=2.0.0,<3.0.0"]
|
|
37
|
+
build-backend = "poetry.core.masonry.api"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
"""
|
|
2
|
+
This module is using the Gemma4 model to handle
|
|
3
|
+
the requests:
|
|
4
|
+
- `gemma-4-E2B-it-UD-Q4_K_XL.gguf`
|
|
5
|
+
"""
|
|
6
|
+
from yta_httpx.client import HttpClient
|
|
7
|
+
from typing import Union
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
# Base URL of the llama.cpp server API. It must keep the trailing
# slash: the request URL is built by appending
# `v1/chat/completions` directly to this value.
LLAMACPP_DOCKER_API_ENDPOINT = 'http://localhost:8080/'
|
|
11
|
+
|
|
12
|
+
class Gemma4Llamacpp:
    """
    Class to wrap the functionality of Gemma4 model,
    that is reached through the chat completions
    endpoint of the llama.cpp server.
    """

    @staticmethod
    def _build_payload(
        prompt: str,
        schema: Union[dict, None] = None,
        schema_name: str = 'scene_analysis'
    ) -> dict:
        """
        Build the chat completions request body for a
        single user message with the `prompt` given.

        The `response_format` entry is only included
        when a `schema` is provided, and it is labelled
        with the `schema_name` given.
        """
        payload = {
            "temperature": 0,
            "top_p": 1,
            # TODO: Setting "max_tokens" (256 was tried)
            # is rejecting some answers, so it is not sent
            "messages": [
                {
                    "role": "user",
                    "content": prompt
                }
            ],
        }

        if schema is not None:
            payload['response_format'] = {
                "type": "json_schema",
                "json_schema": {
                    "name": schema_name,
                    "schema": schema
                }
            }

        return payload

    async def ask(
        self,
        prompt: str,
        schema: Union[dict, None] = None,
        schema_name: str = 'scene_analysis'
    ) -> str:
        """
        Create an ephemeral chat to ask the `prompt` given
        and wait for a response (that will match the
        `schema`, if given), that will be returned as a
        string.

        The `schema_name` is the name sent together with
        the `schema` in the request, and it is ignored if
        no `schema` is given. Its default value is the
        name that was previously hard-coded.

        If you are providing a `schema` that expects a json
        response, parse the `str` this method is returning.

        A `KeyError` or `IndexError` is raised if the
        response body has no `choices[0].message.content`.
        """
        payload = self._build_payload(
            prompt = prompt,
            schema = schema,
            schema_name = schema_name
        )

        url = f'{LLAMACPP_DOCKER_API_ENDPOINT}v1/chat/completions'

        async with HttpClient() as http_client:
            response = await http_client.post.complete(
                url = url,
                json = payload,
                timeout = 300
            )
            result = response.json()

        return result['choices'][0]['message']['content']
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
from yta_fastapi_docker_llamacpp.app.routers import router as general_router
|
|
2
|
+
from yta_fastapi_docker_llamacpp.app.routers.llamacpp import router as llamacpp_router
|
|
3
|
+
from fastapi import FastAPI
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
# The application instance that exposes the endpoints
app = FastAPI()

# Register the routers, keeping the general one first
for _router in (general_router, llamacpp_router):
    app.include_router(_router)
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Check this project to get inspiration about FastAPI:
|
|
3
|
+
- https://github.com/Implosiv3/render-fastapi/blob/master/render-fastapi/routers/download/__init__.py
|
|
4
|
+
"""
|
|
5
|
+
from fastapi import APIRouter
|
|
6
|
+
from fastapi.responses import JSONResponse
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
# The general routes are mounted with no prefix
PREFIX = ''
|
|
10
|
+
|
|
11
|
+
# Router holding the general routes, mounted under `PREFIX`
router = APIRouter(prefix = PREFIX)
|
|
14
|
+
|
|
15
|
+
@router.get('/check-status')
def route_check_status() -> JSONResponse:
    """
    Answer with a fixed, non-error JSON message and a
    200 status code.
    """
    body = {
        'error': False,
        'message': 'Hello World!'
    }

    return JSONResponse(
        content = body,
        status_code = 200
    )
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
from yta_fastapi_docker_llamacpp.app.llamacpp.gemma4 import Gemma4Llamacpp
|
|
2
|
+
from yta_fastapi_docker_pydantic_models.llamacpp import GemmaRequest
|
|
3
|
+
from fastapi.responses import JSONResponse
|
|
4
|
+
from fastapi import APIRouter
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
# All the routes of this router are mounted under this prefix
PREFIX = '/llamacpp'
|
|
8
|
+
|
|
9
|
+
# Router holding the llama.cpp routes, mounted under `PREFIX`
router = APIRouter(prefix = PREFIX)
|
|
12
|
+
|
|
13
|
+
# TODO: Maybe this should be in another router
|
|
14
|
+
@router.post('/gemma4')
async def post_gemma4(
    request: GemmaRequest
) -> JSONResponse:
    """
    Ask the Gemma4 model and return its answer in the
    `data` field of a JSON response with a 200 status
    code.

    The request must include a `prompt`, that is
    mandatory, and could include a `schema` that
    is optional.
    """
    gemma4_agent = Gemma4Llamacpp()

    # `ask` is a coroutine function, so it has to be
    # awaited to obtain the answer. Without the `await`
    # a coroutine object, that is not JSON serializable,
    # would be placed in the response.
    # NOTE(review): `schema` is also the name of a
    # deprecated pydantic `BaseModel` method; confirm
    # that `GemmaRequest` declares it as a field.
    response = await gemma4_agent.ask(
        prompt = request.prompt,
        schema = request.schema
    )

    return JSONResponse(
        {
            'error': False,
            'data': response
        },
        status_code = 200
    )
|