yta-fastapi-docker-llamacpp 0.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,19 @@
1
+ Copyright (c) 2018 The Python Packaging Authority
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a copy
4
+ of this software and associated documentation files (the "Software"), to deal
5
+ in the Software without restriction, including without limitation the rights
6
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7
+ copies of the Software, and to permit persons to whom the Software is
8
+ furnished to do so, subject to the following conditions:
9
+
10
+ The above copyright notice and this permission notice shall be included in all
11
+ copies or substantial portions of the Software.
12
+
13
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
19
+ SOFTWARE.
@@ -0,0 +1,46 @@
1
+ Metadata-Version: 2.4
2
+ Name: yta-fastapi-docker-llamacpp
3
+ Version: 0.0.1
4
+ Summary: Youtube Autonomous FastAPI Docker Llama.cpp Module
5
+ License-File: LICENSE
6
+ Author: danialcala94
7
+ Author-email: danielalcalavalera@gmail.com
8
+ Requires-Python: >=3.10,<3.14
9
+ Classifier: Programming Language :: Python :: 3
10
+ Classifier: Programming Language :: Python :: 3.10
11
+ Classifier: Programming Language :: Python :: 3.11
12
+ Classifier: Programming Language :: Python :: 3.12
13
+ Classifier: Programming Language :: Python :: 3.13
14
+ Requires-Dist: fastapi (>=0.0.1,<9999.0.0)
15
+ Requires-Dist: uvicorn (>=0.0.1,<9999.0.0)
16
+ Requires-Dist: yta_fastapi_docker_pydantic_models (>=0.0.4,<1.0.0)
17
+ Requires-Dist: yta_httpx (>=0.0.27,<1.0.0)
18
+ Description-Content-Type: text/markdown
19
+
20
+ # Youtube Autonomous FastAPI Docker Llama.cpp Module
21
+
22
+ This module provides the functionality related to the Llama.cpp models hub (hosting the models and using them) through a FastAPI application that is included and isolated in a Docker container.
23
+
24
+ This module is meant to be exposed as a container inside the internal network, to be connected with its own FastAPI that is exposing the functionality outside.
25
+
26
+ ### Endpoints
27
+
28
+ #### GET
29
+ - `/check-status`: returns a fixed status message, useful to verify that the API is running.
30
+
31
+ #### POST
32
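+ - `/llamacpp/gemma4`: receives a `prompt` (mandatory) and a `schema` (optional) and asks the Gemma4 model for an answer.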
33
+
34
+ ## Instructions
35
+ I've followed these steps to make `llama.cpp` available on my laptop as a container running with CUDA, and I've adapted this workflow to this project so it's done automatically:
36
+
37
+ 1. Nos aseguramos de tener la imagen de Nvidia en docker:
38
+ `$docker run --rm --gpus all nvidia/cuda:12.9.1-runtime-ubuntu24.04 nvidia-smi`
39
+
40
+ 2. Creamos una carpeta `models` para tener los modelos ahí guardados (en mi caso en un SSD externo para ahorrar espacio) en `D:/llama/models`.
41
+
42
+ 3. Descargamos el modelo GGUF que necesitemos (para ello, ver qué tipo y qué características en función de nuestro PC), en cmd desde la carpeta `models` del paso 2:
43
+ `$huggingface-cli download unsloth/gemma-4-E2B-it-GGUF gemma-4-E2B-it-UD-Q4_K_XL.gguf --local-dir ./`
44
+
45
+ 4. Descargamos el contenedor 'llama.cpp' adaptado a CUDA, estando en el cmd de la carpeta `models`:
46
+ `$docker run --rm --gpus all -p 8080:8080 -v "${PWD}:/models" ghcr.io/ggml-org/llama.cpp:server-cuda -m /models/gemma-4-E2B-it-UD-Q4_K_XL.gguf --host 0.0.0.0 -ngl 999`
@@ -0,0 +1,27 @@
1
+ # Youtube Autonomous FastAPI Docker Llama.cpp Module
2
+
3
+ This module provides the functionality related to the Llama.cpp models hub (hosting the models and using them) through a FastAPI application that is included and isolated in a Docker container.
4
+
5
+ This module is meant to be exposed as a container inside the internal network, to be connected with its own FastAPI that is exposing the functionality outside.
6
+
7
+ ### Endpoints
8
+
9
+ #### GET
10
+ - `/check-status`: returns a fixed status message, useful to verify that the API is running.
11
+
12
+ #### POST
13
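+ - `/llamacpp/gemma4`: receives a `prompt` (mandatory) and a `schema` (optional) and asks the Gemma4 model for an answer.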
14
+
15
+ ## Instructions
16
+ I've followed these steps to make `llama.cpp` available on my laptop as a container running with CUDA, and I've adapted this workflow to this project so it's done automatically:
17
+
18
+ 1. Nos aseguramos de tener la imagen de Nvidia en docker:
19
+ `$docker run --rm --gpus all nvidia/cuda:12.9.1-runtime-ubuntu24.04 nvidia-smi`
20
+
21
+ 2. Creamos una carpeta `models` para tener los modelos ahí guardados (en mi caso en un SSD externo para ahorrar espacio) en `D:/llama/models`.
22
+
23
+ 3. Descargamos el modelo GGUF que necesitemos (para ello, ver qué tipo y qué características en función de nuestro PC), en cmd desde la carpeta `models` del paso 2:
24
+ `$huggingface-cli download unsloth/gemma-4-E2B-it-GGUF gemma-4-E2B-it-UD-Q4_K_XL.gguf --local-dir ./`
25
+
26
+ 4. Descargamos el contenedor 'llama.cpp' adaptado a CUDA, estando en el cmd de la carpeta `models`:
27
+ `$docker run --rm --gpus all -p 8080:8080 -v "${PWD}:/models" ghcr.io/ggml-org/llama.cpp:server-cuda -m /models/gemma-4-E2B-it-UD-Q4_K_XL.gguf --host 0.0.0.0 -ngl 999`
@@ -0,0 +1,37 @@
1
+ [project]
2
+ name = "yta-fastapi-docker-llamacpp"
3
+ version = "0.0.1"
4
+ description = "Youtube Autonomous FastAPI Docker Llama.cpp Module"
5
+ authors = [
6
+ {name = "danialcala94", email = "danielalcalavalera@gmail.com"}
7
+ ]
8
+ readme = "README.md"
9
+ # 3.12.11 is working
10
+ requires-python = ">=3.10,<3.14"
11
+
12
+ [tool.poetry.dependencies]
13
+ # Mandatory
14
+ fastapi = { version = ">=0.0.1,<9999.0.0", optional = false }
15
+ uvicorn = { version = ">=0.0.1,<9999.0.0", optional = false }
16
+ yta_httpx = { version = ">=0.0.27,<1.0.0", optional = false }
17
+ yta_fastapi_docker_pydantic_models = { version = ">=0.0.4,<1.0.0", optional = false }
18
+ # Optional
19
+
20
+ [tool.poetry]
21
+ packages = [{include = "yta_fastapi_docker_llamacpp", from = "src"}]
22
+
23
+ [tool.poetry.group.dev.dependencies]
24
+ pytest = "^8.3.5"
25
+ pytest-asyncio = ">=1.3.0"
26
+ httpx = ">=0.0.1"
27
+ yta_testing = ">=0.0.1"
28
+
29
+ [tool.pytest.ini_options]
30
+ markers = [
31
+ "mandatory: mandatory tests for release",
32
+ "additional: exhaustive and demanding tests"
33
+ ]
34
+
35
+ [build-system]
36
+ requires = ["poetry-core>=2.0.0,<3.0.0"]
37
+ build-backend = "poetry.core.masonry.api"
@@ -0,0 +1,66 @@
1
+ """
2
+ This module is using the Gemma4 model to handle
3
+ the requests:
4
+ - `gemma-4-E2B-it-UD-Q4_K_XL.gguf`
5
+ """
6
+ from yta_httpx.client import HttpClient
7
+ from typing import Union
8
+
9
+
10
# Base url (ending in a slash) of the llama.cpp server
# API that receives the chat completion requests.
LLAMACPP_DOCKER_API_ENDPOINT = 'http://localhost:8080/'


class Gemma4Llamacpp:
    """
    Wrapper to interact with the Gemma4 model through
    the chat completions endpoint of the llama.cpp
    server.
    """

    async def ask(
        self,
        prompt: str,
        schema: Union[dict, None] = None
    ) -> str:
        """
        Send the `prompt` given as the only message of
        an ephemeral chat and return, as a string, the
        content of the first choice of the response.

        When a `schema` is given it is sent as a
        `json_schema` response format, so the string
        returned has to be parsed by the caller to
        obtain the json.
        """
        user_message = {
            "role": "user",
            "content": prompt
        }

        body = {
            "temperature": 0,
            "top_p": 1,
            # TODO: Sending "max_tokens" (256) is rejecting
            # some answers, so it is not included by now
            "messages": [user_message],
        }

        if schema is not None:
            # TODO: What about this schema name (?)
            json_schema = {
                "name": "scene_analysis",
                "schema": schema
            }
            body['response_format'] = {
                "type": "json_schema",
                "json_schema": json_schema
            }

        completions_url = f'{LLAMACPP_DOCKER_API_ENDPOINT}v1/chat/completions'

        async with HttpClient() as client:
            raw_response = await client.post.complete(
                url = completions_url,
                json = body,
                timeout = 300
            )
            parsed = raw_response.json()
            # Only the first choice is considered
            answer = parsed['choices'][0]['message']['content']

        return answer
@@ -0,0 +1,10 @@
1
+ from yta_fastapi_docker_llamacpp.app.routers import router as general_router
2
+ from yta_fastapi_docker_llamacpp.app.routers.llamacpp import router as llamacpp_router
3
+ from fastapi import FastAPI
4
+
5
+
6
# The FastAPI application that exposes all the routes
app = FastAPI()

# Register each one of the routers we have
app.include_router(router = general_router)
app.include_router(router = llamacpp_router)
@@ -0,0 +1,24 @@
1
+ """
2
+ Check this project to get inspiration about FastAPI:
3
+ - https://github.com/Implosiv3/render-fastapi/blob/master/render-fastapi/routers/download/__init__.py
4
+ """
5
+ from fastapi import APIRouter
6
+ from fastapi.responses import JSONResponse
7
+
8
+
9
# The general routes are exposed with no prefix
PREFIX = ''

router = APIRouter(prefix = PREFIX)
14
+
15
@router.get('/check-status')
def route_check_status() -> JSONResponse:
    """
    Return a fixed payload, with a 200 status code,
    that can be used to check that the API is up.
    """
    payload = {
        'error': False,
        'message': 'Hello World!'
    }

    return JSONResponse(
        content = payload,
        status_code = 200
    )
@@ -0,0 +1,36 @@
1
+ from yta_fastapi_docker_llamacpp.app.llamacpp.gemma4 import Gemma4Llamacpp
2
+ from yta_fastapi_docker_pydantic_models.llamacpp import GemmaRequest
3
+ from fastapi.responses import JSONResponse
4
+ from fastapi import APIRouter
5
+
6
+
7
# All the routes of this router hang from this prefix
PREFIX = '/llamacpp'

router = APIRouter(prefix = PREFIX)
12
+
13
# TODO: Maybe this should be in another router
@router.post('/gemma4')
async def post_gemma4(
    request: GemmaRequest
) -> JSONResponse:
    """
    Ask the Gemma4 model and return its answer in the
    `data` field of a json response with a 200 status
    code.

    The request must include a `prompt`, that is
    mandatory, and could include a `schema` that
    is optional.
    """
    gemma4_agent = Gemma4Llamacpp()

    # `ask` is a coroutine function, so it must be
    # awaited to obtain the answer. Without the `await`
    # we would get a coroutine object, which is not json
    # serializable, and the request to the model would
    # never be sent.
    response = await gemma4_agent.ask(
        prompt = request.prompt,
        schema = request.schema
    )

    return JSONResponse(
        {
            'error': False,
            'data': response
        },
        status_code = 200
    )