gh-ai-runner 0.1.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gh_ai_runner-0.1.3/LICENSE +21 -0
- gh_ai_runner-0.1.3/PKG-INFO +235 -0
- gh_ai_runner-0.1.3/README.md +214 -0
- gh_ai_runner-0.1.3/gh_ai_runner.egg-info/PKG-INFO +235 -0
- gh_ai_runner-0.1.3/gh_ai_runner.egg-info/SOURCES.txt +17 -0
- gh_ai_runner-0.1.3/gh_ai_runner.egg-info/dependency_links.txt +1 -0
- gh_ai_runner-0.1.3/gh_ai_runner.egg-info/requires.txt +4 -0
- gh_ai_runner-0.1.3/gh_ai_runner.egg-info/top_level.txt +1 -0
- gh_ai_runner-0.1.3/pyproject.toml +31 -0
- gh_ai_runner-0.1.3/servai/__init__.py +5 -0
- gh_ai_runner-0.1.3/servai/artifact.py +44 -0
- gh_ai_runner-0.1.3/servai/core.py +112 -0
- gh_ai_runner-0.1.3/servai/logger.py +17 -0
- gh_ai_runner-0.1.3/servai/models.py +24 -0
- gh_ai_runner-0.1.3/servai/polling.py +60 -0
- gh_ai_runner-0.1.3/servai/repo.py +85 -0
- gh_ai_runner-0.1.3/servai/runner.py +116 -0
- gh_ai_runner-0.1.3/servai/validation.py +51 -0
- gh_ai_runner-0.1.3/setup.cfg +4 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Tanish Chauhan
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,235 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: gh-ai-runner
|
|
3
|
+
Version: 0.1.3
|
|
4
|
+
Summary: Serverless AI inference via GitHub Actions — no server required
|
|
5
|
+
Author: Tanish Chauhan
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/TanishC4444/servai
|
|
8
|
+
Project-URL: Repository, https://github.com/TanishC4444/servai
|
|
9
|
+
Keywords: ai,llm,inference,github-actions,llama,serverless
|
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
12
|
+
Classifier: Operating System :: OS Independent
|
|
13
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
14
|
+
Requires-Python: >=3.9
|
|
15
|
+
Description-Content-Type: text/markdown
|
|
16
|
+
License-File: LICENSE
|
|
17
|
+
Requires-Dist: requests>=2.28
|
|
18
|
+
Provides-Extra: dev
|
|
19
|
+
Requires-Dist: pytest; extra == "dev"
|
|
20
|
+
Dynamic: license-file
|
|
21
|
+
|
|
22
|
+
# Servai
|
|
23
|
+
|
|
24
|
+
[](https://badge.fury.io/py/servai)
|
|
25
|
+
[](https://opensource.org/licenses/MIT)
|
|
26
|
+
[](https://www.python.org/downloads/)
|
|
27
|
+
|
|
28
|
+
**Serverless AI inference via GitHub Actions. No server. No GPU. No infrastructure.**
|
|
29
|
+
|
|
30
|
+
Run open-source LLMs directly through GitHub's free CI/CD runners — just a GitHub token and a prompt. `servai` handles everything else: repo creation, workflow setup, model downloading, caching, and output retrieval.
|
|
31
|
+
|
|
32
|
+
Built by [Tanish Chauhan](https://github.com/TanishC4444)
|
|
33
|
+
|
|
34
|
+
---
|
|
35
|
+
|
|
36
|
+
## How it works
|
|
37
|
+
|
|
38
|
+
When you call `ai_call()`:
|
|
39
|
+
|
|
40
|
+
1. Creates a private GitHub repo (once, automatically)
|
|
41
|
+
2. Commits a workflow + inference script into it
|
|
42
|
+
3. Dispatches a `workflow_dispatch` GitHub Actions run
|
|
43
|
+
4. The runner downloads and caches the model (GGUF quantized, ~0.6 GB)
|
|
44
|
+
5. Runs inference via `llama-cpp-python`
|
|
45
|
+
6. Uploads the output as an artifact
|
|
46
|
+
7. Downloads and returns the output to you as a string
|
|
47
|
+
|
|
48
|
+
No server is ever running between calls. Each call spins up a fresh GitHub Actions runner, runs inference, and shuts down.
|
|
49
|
+
|
|
50
|
+
---
|
|
51
|
+
|
|
52
|
+
## Install
|
|
53
|
+
```bash
|
|
54
|
+
pip install gh-ai-runner
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
**Requirements:**
|
|
58
|
+
- Python 3.9+
|
|
59
|
+
- A GitHub account with a personal access token (PAT)
|
|
60
|
+
- Token scopes needed: `repo`, `workflow`
|
|
61
|
+
|
|
62
|
+
---
|
|
63
|
+
|
|
64
|
+
## Getting a GitHub Token
|
|
65
|
+
|
|
66
|
+
You need a GitHub Personal Access Token (PAT) with `repo` and `workflow` scopes.
|
|
67
|
+
|
|
68
|
+
**Classic token (recommended):**
|
|
69
|
+
|
|
70
|
+
1. Go to [github.com/settings/tokens](https://github.com/settings/tokens)
|
|
71
|
+
2. Click **Generate new token > Generate new token (classic)**
|
|
72
|
+
3. Give it a name (e.g. `servai`)
|
|
73
|
+
4. Set an expiration
|
|
74
|
+
5. Check these scopes:
|
|
75
|
+
- `repo`, `workflow`
|
|
76
|
+
```python
|
|
77
|
+
from servai import ai_call
|
|
78
|
+
|
|
79
|
+
result = ai_call(
|
|
80
|
+
github_token="ghp_...",
|
|
81
|
+
prompt="explain recursion in simple terms",
|
|
82
|
+
)
|
|
83
|
+
|
|
84
|
+
print(result)
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
That's it. On first run, `servai` will:
|
|
88
|
+
- Create a repo called `ai-inference-runner` (the default `repo_name`) on your GitHub account
|
|
89
|
+
- Set up the workflow automatically
|
|
90
|
+
- Download and cache TinyLlama 1.1B (~0.6 GB)
|
|
91
|
+
|
|
92
|
+
Subsequent calls reuse the cached repo and model.
|
|
93
|
+
|
|
94
|
+
---
|
|
95
|
+
|
|
96
|
+
## Examples
|
|
97
|
+
|
|
98
|
+
**Basic question**
|
|
99
|
+
```python
|
|
100
|
+
from servai import ai_call
|
|
101
|
+
|
|
102
|
+
result = ai_call(
|
|
103
|
+
github_token="ghp_...",
|
|
104
|
+
prompt="what is the difference between a list and a tuple in Python?",
|
|
105
|
+
)
|
|
106
|
+
print(result)
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
**Custom system prompt**
|
|
110
|
+
```python
|
|
111
|
+
result = ai_call(
|
|
112
|
+
github_token="ghp_...",
|
|
113
|
+
prompt="explain black holes",
|
|
114
|
+
system="You are a physics professor. Be precise and use analogies.",
|
|
115
|
+
model="llama",
|
|
116
|
+
max_tokens=1024,
|
|
117
|
+
)
|
|
118
|
+
print(result)
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
**Deterministic output (temperature=0)**
|
|
122
|
+
```python
|
|
123
|
+
result = ai_call(
|
|
124
|
+
github_token="ghp_...",
|
|
125
|
+
prompt="what is 144 divided by 12?",
|
|
126
|
+
temperature=0.0,
|
|
127
|
+
max_tokens=16,
|
|
128
|
+
)
|
|
129
|
+
print(result) # always returns the same answer
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
**Silent mode (no logs)**
|
|
133
|
+
```python
|
|
134
|
+
result = ai_call(
|
|
135
|
+
github_token="ghp_...",
|
|
136
|
+
prompt="summarize the theory of evolution",
|
|
137
|
+
verbose=False,
|
|
138
|
+
)
|
|
139
|
+
print(result)
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
**Large output**
|
|
143
|
+
```python
|
|
144
|
+
result = ai_call(
|
|
145
|
+
github_token="ghp_...",
|
|
146
|
+
prompt="write a detailed essay on the causes of World War I",
|
|
147
|
+
max_tokens=2048,
|
|
148
|
+
temperature=0.5,
|
|
149
|
+
model="llama",
|
|
150
|
+
)
|
|
151
|
+
print(result)
|
|
152
|
+
```
|
|
153
|
+
|
|
154
|
+
**Multiple calls in sequence**
|
|
155
|
+
```python
|
|
156
|
+
from servai import ai_call
|
|
157
|
+
|
|
158
|
+
TOKEN = "ghp_..."
|
|
159
|
+
|
|
160
|
+
questions = [
|
|
161
|
+
"what is a neural network?",
|
|
162
|
+
"what is gradient descent?",
|
|
163
|
+
"what is backpropagation?",
|
|
164
|
+
]
|
|
165
|
+
|
|
166
|
+
for q in questions:
|
|
167
|
+
answer = ai_call(github_token=TOKEN, prompt=q, verbose=False)
|
|
168
|
+
print(f"Q: {q}\nA: {answer}\n")
|
|
169
|
+
```
|
|
170
|
+
|
|
171
|
+
**Parallel calls (use separate repo per call)**
|
|
172
|
+
```python
|
|
173
|
+
import threading
|
|
174
|
+
from servai import ai_call
|
|
175
|
+
|
|
176
|
+
TOKEN = "ghp_..."
|
|
177
|
+
results = {}
|
|
178
|
+
|
|
179
|
+
def run(key, prompt, repo):
|
|
180
|
+
results[key] = ai_call(
|
|
181
|
+
github_token=TOKEN,
|
|
182
|
+
prompt=prompt,
|
|
183
|
+
repo_name=repo,
|
|
184
|
+
verbose=False,
|
|
185
|
+
)
|
|
186
|
+
|
|
187
|
+
t1 = threading.Thread(target=run, args=("q1", "explain DNA", "servai-repo-1"))
|
|
188
|
+
t2 = threading.Thread(target=run, args=("q2", "explain RNA", "servai-repo-2"))
|
|
189
|
+
|
|
190
|
+
t1.start(); t2.start()
|
|
191
|
+
t1.join(); t2.join()
|
|
192
|
+
|
|
193
|
+
print(results["q1"])
|
|
194
|
+
print(results["q2"])
|
|
195
|
+
```
|
|
196
|
+
|
|
197
|
+
> **Note:** Parallel calls must use different `repo_name` values. Each repo has its own independent run queue, so calls never interfere with each other.
|
|
198
|
+
|
|
199
|
+
---
|
|
200
|
+
|
|
201
|
+
## Parameters
|
|
202
|
+
|
|
203
|
+
| Parameter | Type | Default | Required | Description |
|
|
204
|
+
|---|---|---|---|---|
|
|
205
|
+
| `github_token` | `str` | — | Yes | GitHub PAT with `repo` and `workflow` scopes |
|
|
206
|
+
| `prompt` | `str` | — | Yes | The message or question to send to the model |
|
|
207
|
+
| `model` | `str` | `"tinyllama"` | No | Which model to use. See Models section below |
|
|
208
|
+
| `system` | `str` | `"You are a helpful assistant."` | No | System prompt that controls model behavior |
|
|
209
|
+
| `max_tokens` | `int` | `512` | No | Max tokens to generate. Hard limit: 4096 |
|
|
210
|
+
| `temperature` | `float` | `0.7` | No | Randomness. `0.0` = deterministic, `2.0` = very creative |
|
|
211
|
+
| `cache` | `bool` | `True` | No | Cache model weights between runs. Strongly recommended |
|
|
212
|
+
| `n_ctx` | `int` | `None` | No | Context window size. Defaults to model built-in. Max: 8192 |
|
|
213
|
+
| `repo_name` | `str` | `"ai-inference-runner"` | No | GitHub repo to create or reuse for running inference |
|
|
214
|
+
| `verbose` | `bool` | `True` | No | Print step-by-step logs. Set `False` for silent mode |
|
|
215
|
+
|
|
216
|
+
---
|
|
217
|
+
|
|
218
|
+
## Models
|
|
219
|
+
|
|
220
|
+
| Key | Model | Size | Default Context | Max Safe Context |
|
|
221
|
+
|---|---|---|---|---|
|
|
222
|
+
| `tinyllama` | TinyLlama 1.1B Chat Q4_K_M | 0.6 GB | 2048 tokens | 8192 tokens |
|
|
223
|
+
| `llama` | Llama 3.2 1B Instruct Q4_K_M | 0.7 GB | 4096 tokens | 8192 tokens |
|
|
224
|
+
|
|
225
|
+
Both models are quantized GGUF files hosted publicly on HuggingFace. No HuggingFace token required.
|
|
226
|
+
|
|
227
|
+
**Which model should I use?**
|
|
228
|
+
- `tinyllama` — faster, smaller, good for short factual answers and simple tasks
|
|
229
|
+
- `llama` — better instruction-following, better for longer structured outputs and reasoning
|
|
230
|
+
|
|
231
|
+
---
|
|
232
|
+
|
|
233
|
+
## Context and output size
|
|
234
|
+
|
|
235
|
+
The context window (`n_ctx`) is the total number of tokens the model can see at once — this includes your system prompt, your prompt, and the generated output combined.
|
|
@@ -0,0 +1,214 @@
|
|
|
1
|
+
# Servai
|
|
2
|
+
|
|
3
|
+
[](https://badge.fury.io/py/servai)
|
|
4
|
+
[](https://opensource.org/licenses/MIT)
|
|
5
|
+
[](https://www.python.org/downloads/)
|
|
6
|
+
|
|
7
|
+
**Serverless AI inference via GitHub Actions. No server. No GPU. No infrastructure.**
|
|
8
|
+
|
|
9
|
+
Run open-source LLMs directly through GitHub's free CI/CD runners — just a GitHub token and a prompt. `servai` handles everything else: repo creation, workflow setup, model downloading, caching, and output retrieval.
|
|
10
|
+
|
|
11
|
+
Built by [Tanish Chauhan](https://github.com/TanishC4444)
|
|
12
|
+
|
|
13
|
+
---
|
|
14
|
+
|
|
15
|
+
## How it works
|
|
16
|
+
|
|
17
|
+
When you call `ai_call()`:
|
|
18
|
+
|
|
19
|
+
1. Creates a private GitHub repo (once, automatically)
|
|
20
|
+
2. Commits a workflow + inference script into it
|
|
21
|
+
3. Dispatches a `workflow_dispatch` GitHub Actions run
|
|
22
|
+
4. The runner downloads and caches the model (GGUF quantized, ~0.6 GB)
|
|
23
|
+
5. Runs inference via `llama-cpp-python`
|
|
24
|
+
6. Uploads the output as an artifact
|
|
25
|
+
7. Downloads and returns the output to you as a string
|
|
26
|
+
|
|
27
|
+
No server is ever running between calls. Each call spins up a fresh GitHub Actions runner, runs inference, and shuts down.
|
|
28
|
+
|
|
29
|
+
---
|
|
30
|
+
|
|
31
|
+
## Install
|
|
32
|
+
```bash
|
|
33
|
+
pip install gh-ai-runner
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
**Requirements:**
|
|
37
|
+
- Python 3.9+
|
|
38
|
+
- A GitHub account with a personal access token (PAT)
|
|
39
|
+
- Token scopes needed: `repo`, `workflow`
|
|
40
|
+
|
|
41
|
+
---
|
|
42
|
+
|
|
43
|
+
## Getting a GitHub Token
|
|
44
|
+
|
|
45
|
+
You need a GitHub Personal Access Token (PAT) with `repo` and `workflow` scopes.
|
|
46
|
+
|
|
47
|
+
**Classic token (recommended):**
|
|
48
|
+
|
|
49
|
+
1. Go to [github.com/settings/tokens](https://github.com/settings/tokens)
|
|
50
|
+
2. Click **Generate new token > Generate new token (classic)**
|
|
51
|
+
3. Give it a name (e.g. `servai`)
|
|
52
|
+
4. Set an expiration
|
|
53
|
+
5. Check these scopes:
|
|
54
|
+
- `repo`, `workflow`
|
|
55
|
+
```python
|
|
56
|
+
from servai import ai_call
|
|
57
|
+
|
|
58
|
+
result = ai_call(
|
|
59
|
+
github_token="ghp_...",
|
|
60
|
+
prompt="explain recursion in simple terms",
|
|
61
|
+
)
|
|
62
|
+
|
|
63
|
+
print(result)
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
That's it. On first run, `servai` will:
|
|
67
|
+
- Create a repo called `ai-inference-runner` (the default `repo_name`) on your GitHub account
|
|
68
|
+
- Set up the workflow automatically
|
|
69
|
+
- Download and cache TinyLlama 1.1B (~0.6 GB)
|
|
70
|
+
|
|
71
|
+
Subsequent calls reuse the cached repo and model.
|
|
72
|
+
|
|
73
|
+
---
|
|
74
|
+
|
|
75
|
+
## Examples
|
|
76
|
+
|
|
77
|
+
**Basic question**
|
|
78
|
+
```python
|
|
79
|
+
from servai import ai_call
|
|
80
|
+
|
|
81
|
+
result = ai_call(
|
|
82
|
+
github_token="ghp_...",
|
|
83
|
+
prompt="what is the difference between a list and a tuple in Python?",
|
|
84
|
+
)
|
|
85
|
+
print(result)
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
**Custom system prompt**
|
|
89
|
+
```python
|
|
90
|
+
result = ai_call(
|
|
91
|
+
github_token="ghp_...",
|
|
92
|
+
prompt="explain black holes",
|
|
93
|
+
system="You are a physics professor. Be precise and use analogies.",
|
|
94
|
+
model="llama",
|
|
95
|
+
max_tokens=1024,
|
|
96
|
+
)
|
|
97
|
+
print(result)
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
**Deterministic output (temperature=0)**
|
|
101
|
+
```python
|
|
102
|
+
result = ai_call(
|
|
103
|
+
github_token="ghp_...",
|
|
104
|
+
prompt="what is 144 divided by 12?",
|
|
105
|
+
temperature=0.0,
|
|
106
|
+
max_tokens=16,
|
|
107
|
+
)
|
|
108
|
+
print(result) # always returns the same answer
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
**Silent mode (no logs)**
|
|
112
|
+
```python
|
|
113
|
+
result = ai_call(
|
|
114
|
+
github_token="ghp_...",
|
|
115
|
+
prompt="summarize the theory of evolution",
|
|
116
|
+
verbose=False,
|
|
117
|
+
)
|
|
118
|
+
print(result)
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
**Large output**
|
|
122
|
+
```python
|
|
123
|
+
result = ai_call(
|
|
124
|
+
github_token="ghp_...",
|
|
125
|
+
prompt="write a detailed essay on the causes of World War I",
|
|
126
|
+
max_tokens=2048,
|
|
127
|
+
temperature=0.5,
|
|
128
|
+
model="llama",
|
|
129
|
+
)
|
|
130
|
+
print(result)
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
**Multiple calls in sequence**
|
|
134
|
+
```python
|
|
135
|
+
from servai import ai_call
|
|
136
|
+
|
|
137
|
+
TOKEN = "ghp_..."
|
|
138
|
+
|
|
139
|
+
questions = [
|
|
140
|
+
"what is a neural network?",
|
|
141
|
+
"what is gradient descent?",
|
|
142
|
+
"what is backpropagation?",
|
|
143
|
+
]
|
|
144
|
+
|
|
145
|
+
for q in questions:
|
|
146
|
+
answer = ai_call(github_token=TOKEN, prompt=q, verbose=False)
|
|
147
|
+
print(f"Q: {q}\nA: {answer}\n")
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
**Parallel calls (use separate repo per call)**
|
|
151
|
+
```python
|
|
152
|
+
import threading
|
|
153
|
+
from servai import ai_call
|
|
154
|
+
|
|
155
|
+
TOKEN = "ghp_..."
|
|
156
|
+
results = {}
|
|
157
|
+
|
|
158
|
+
def run(key, prompt, repo):
|
|
159
|
+
results[key] = ai_call(
|
|
160
|
+
github_token=TOKEN,
|
|
161
|
+
prompt=prompt,
|
|
162
|
+
repo_name=repo,
|
|
163
|
+
verbose=False,
|
|
164
|
+
)
|
|
165
|
+
|
|
166
|
+
t1 = threading.Thread(target=run, args=("q1", "explain DNA", "servai-repo-1"))
|
|
167
|
+
t2 = threading.Thread(target=run, args=("q2", "explain RNA", "servai-repo-2"))
|
|
168
|
+
|
|
169
|
+
t1.start(); t2.start()
|
|
170
|
+
t1.join(); t2.join()
|
|
171
|
+
|
|
172
|
+
print(results["q1"])
|
|
173
|
+
print(results["q2"])
|
|
174
|
+
```
|
|
175
|
+
|
|
176
|
+
> **Note:** Parallel calls must use different `repo_name` values. Each repo has its own independent run queue, so calls never interfere with each other.
|
|
177
|
+
|
|
178
|
+
---
|
|
179
|
+
|
|
180
|
+
## Parameters
|
|
181
|
+
|
|
182
|
+
| Parameter | Type | Default | Required | Description |
|
|
183
|
+
|---|---|---|---|---|
|
|
184
|
+
| `github_token` | `str` | — | Yes | GitHub PAT with `repo` and `workflow` scopes |
|
|
185
|
+
| `prompt` | `str` | — | Yes | The message or question to send to the model |
|
|
186
|
+
| `model` | `str` | `"tinyllama"` | No | Which model to use. See Models section below |
|
|
187
|
+
| `system` | `str` | `"You are a helpful assistant."` | No | System prompt that controls model behavior |
|
|
188
|
+
| `max_tokens` | `int` | `512` | No | Max tokens to generate. Hard limit: 4096 |
|
|
189
|
+
| `temperature` | `float` | `0.7` | No | Randomness. `0.0` = deterministic, `2.0` = very creative |
|
|
190
|
+
| `cache` | `bool` | `True` | No | Cache model weights between runs. Strongly recommended |
|
|
191
|
+
| `n_ctx` | `int` | `None` | No | Context window size. Defaults to model built-in. Max: 8192 |
|
|
192
|
+
| `repo_name` | `str` | `"ai-inference-runner"` | No | GitHub repo to create or reuse for running inference |
|
|
193
|
+
| `verbose` | `bool` | `True` | No | Print step-by-step logs. Set `False` for silent mode |
|
|
194
|
+
|
|
195
|
+
---
|
|
196
|
+
|
|
197
|
+
## Models
|
|
198
|
+
|
|
199
|
+
| Key | Model | Size | Default Context | Max Safe Context |
|
|
200
|
+
|---|---|---|---|---|
|
|
201
|
+
| `tinyllama` | TinyLlama 1.1B Chat Q4_K_M | 0.6 GB | 2048 tokens | 8192 tokens |
|
|
202
|
+
| `llama` | Llama 3.2 1B Instruct Q4_K_M | 0.7 GB | 4096 tokens | 8192 tokens |
|
|
203
|
+
|
|
204
|
+
Both models are quantized GGUF files hosted publicly on HuggingFace. No HuggingFace token required.
|
|
205
|
+
|
|
206
|
+
**Which model should I use?**
|
|
207
|
+
- `tinyllama` — faster, smaller, good for short factual answers and simple tasks
|
|
208
|
+
- `llama` — better instruction-following, better for longer structured outputs and reasoning
|
|
209
|
+
|
|
210
|
+
---
|
|
211
|
+
|
|
212
|
+
## Context and output size
|
|
213
|
+
|
|
214
|
+
The context window (`n_ctx`) is the total number of tokens the model can see at once — this includes your system prompt, your prompt, and the generated output combined.
|
|
@@ -0,0 +1,235 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: gh-ai-runner
|
|
3
|
+
Version: 0.1.3
|
|
4
|
+
Summary: Serverless AI inference via GitHub Actions — no server required
|
|
5
|
+
Author: Tanish Chauhan
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/TanishC4444/servai
|
|
8
|
+
Project-URL: Repository, https://github.com/TanishC4444/servai
|
|
9
|
+
Keywords: ai,llm,inference,github-actions,llama,serverless
|
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
12
|
+
Classifier: Operating System :: OS Independent
|
|
13
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
14
|
+
Requires-Python: >=3.9
|
|
15
|
+
Description-Content-Type: text/markdown
|
|
16
|
+
License-File: LICENSE
|
|
17
|
+
Requires-Dist: requests>=2.28
|
|
18
|
+
Provides-Extra: dev
|
|
19
|
+
Requires-Dist: pytest; extra == "dev"
|
|
20
|
+
Dynamic: license-file
|
|
21
|
+
|
|
22
|
+
# Servai
|
|
23
|
+
|
|
24
|
+
[](https://badge.fury.io/py/servai)
|
|
25
|
+
[](https://opensource.org/licenses/MIT)
|
|
26
|
+
[](https://www.python.org/downloads/)
|
|
27
|
+
|
|
28
|
+
**Serverless AI inference via GitHub Actions. No server. No GPU. No infrastructure.**
|
|
29
|
+
|
|
30
|
+
Run open-source LLMs directly through GitHub's free CI/CD runners — just a GitHub token and a prompt. `servai` handles everything else: repo creation, workflow setup, model downloading, caching, and output retrieval.
|
|
31
|
+
|
|
32
|
+
Built by [Tanish Chauhan](https://github.com/TanishC4444)
|
|
33
|
+
|
|
34
|
+
---
|
|
35
|
+
|
|
36
|
+
## How it works
|
|
37
|
+
|
|
38
|
+
When you call `ai_call()`:
|
|
39
|
+
|
|
40
|
+
1. Creates a private GitHub repo (once, automatically)
|
|
41
|
+
2. Commits a workflow + inference script into it
|
|
42
|
+
3. Dispatches a `workflow_dispatch` GitHub Actions run
|
|
43
|
+
4. The runner downloads and caches the model (GGUF quantized, ~0.6 GB)
|
|
44
|
+
5. Runs inference via `llama-cpp-python`
|
|
45
|
+
6. Uploads the output as an artifact
|
|
46
|
+
7. Downloads and returns the output to you as a string
|
|
47
|
+
|
|
48
|
+
No server is ever running between calls. Each call spins up a fresh GitHub Actions runner, runs inference, and shuts down.
|
|
49
|
+
|
|
50
|
+
---
|
|
51
|
+
|
|
52
|
+
## Install
|
|
53
|
+
```bash
|
|
54
|
+
pip install gh-ai-runner
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
**Requirements:**
|
|
58
|
+
- Python 3.9+
|
|
59
|
+
- A GitHub account with a personal access token (PAT)
|
|
60
|
+
- Token scopes needed: `repo`, `workflow`
|
|
61
|
+
|
|
62
|
+
---
|
|
63
|
+
|
|
64
|
+
## Getting a GitHub Token
|
|
65
|
+
|
|
66
|
+
You need a GitHub Personal Access Token (PAT) with `repo` and `workflow` scopes.
|
|
67
|
+
|
|
68
|
+
**Classic token (recommended):**
|
|
69
|
+
|
|
70
|
+
1. Go to [github.com/settings/tokens](https://github.com/settings/tokens)
|
|
71
|
+
2. Click **Generate new token > Generate new token (classic)**
|
|
72
|
+
3. Give it a name (e.g. `servai`)
|
|
73
|
+
4. Set an expiration
|
|
74
|
+
5. Check these scopes:
|
|
75
|
+
- `repo`, `workflow`
|
|
76
|
+
```python
|
|
77
|
+
from servai import ai_call
|
|
78
|
+
|
|
79
|
+
result = ai_call(
|
|
80
|
+
github_token="ghp_...",
|
|
81
|
+
prompt="explain recursion in simple terms",
|
|
82
|
+
)
|
|
83
|
+
|
|
84
|
+
print(result)
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
That's it. On first run, `servai` will:
|
|
88
|
+
- Create a repo called `ai-inference-runner` (the default `repo_name`) on your GitHub account
|
|
89
|
+
- Set up the workflow automatically
|
|
90
|
+
- Download and cache TinyLlama 1.1B (~0.6 GB)
|
|
91
|
+
|
|
92
|
+
Subsequent calls reuse the cached repo and model.
|
|
93
|
+
|
|
94
|
+
---
|
|
95
|
+
|
|
96
|
+
## Examples
|
|
97
|
+
|
|
98
|
+
**Basic question**
|
|
99
|
+
```python
|
|
100
|
+
from servai import ai_call
|
|
101
|
+
|
|
102
|
+
result = ai_call(
|
|
103
|
+
github_token="ghp_...",
|
|
104
|
+
prompt="what is the difference between a list and a tuple in Python?",
|
|
105
|
+
)
|
|
106
|
+
print(result)
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
**Custom system prompt**
|
|
110
|
+
```python
|
|
111
|
+
result = ai_call(
|
|
112
|
+
github_token="ghp_...",
|
|
113
|
+
prompt="explain black holes",
|
|
114
|
+
system="You are a physics professor. Be precise and use analogies.",
|
|
115
|
+
model="llama",
|
|
116
|
+
max_tokens=1024,
|
|
117
|
+
)
|
|
118
|
+
print(result)
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
**Deterministic output (temperature=0)**
|
|
122
|
+
```python
|
|
123
|
+
result = ai_call(
|
|
124
|
+
github_token="ghp_...",
|
|
125
|
+
prompt="what is 144 divided by 12?",
|
|
126
|
+
temperature=0.0,
|
|
127
|
+
max_tokens=16,
|
|
128
|
+
)
|
|
129
|
+
print(result) # always returns the same answer
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
**Silent mode (no logs)**
|
|
133
|
+
```python
|
|
134
|
+
result = ai_call(
|
|
135
|
+
github_token="ghp_...",
|
|
136
|
+
prompt="summarize the theory of evolution",
|
|
137
|
+
verbose=False,
|
|
138
|
+
)
|
|
139
|
+
print(result)
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
**Large output**
|
|
143
|
+
```python
|
|
144
|
+
result = ai_call(
|
|
145
|
+
github_token="ghp_...",
|
|
146
|
+
prompt="write a detailed essay on the causes of World War I",
|
|
147
|
+
max_tokens=2048,
|
|
148
|
+
temperature=0.5,
|
|
149
|
+
model="llama",
|
|
150
|
+
)
|
|
151
|
+
print(result)
|
|
152
|
+
```
|
|
153
|
+
|
|
154
|
+
**Multiple calls in sequence**
|
|
155
|
+
```python
|
|
156
|
+
from servai import ai_call
|
|
157
|
+
|
|
158
|
+
TOKEN = "ghp_..."
|
|
159
|
+
|
|
160
|
+
questions = [
|
|
161
|
+
"what is a neural network?",
|
|
162
|
+
"what is gradient descent?",
|
|
163
|
+
"what is backpropagation?",
|
|
164
|
+
]
|
|
165
|
+
|
|
166
|
+
for q in questions:
|
|
167
|
+
answer = ai_call(github_token=TOKEN, prompt=q, verbose=False)
|
|
168
|
+
print(f"Q: {q}\nA: {answer}\n")
|
|
169
|
+
```
|
|
170
|
+
|
|
171
|
+
**Parallel calls (use separate repo per call)**
|
|
172
|
+
```python
|
|
173
|
+
import threading
|
|
174
|
+
from servai import ai_call
|
|
175
|
+
|
|
176
|
+
TOKEN = "ghp_..."
|
|
177
|
+
results = {}
|
|
178
|
+
|
|
179
|
+
def run(key, prompt, repo):
|
|
180
|
+
results[key] = ai_call(
|
|
181
|
+
github_token=TOKEN,
|
|
182
|
+
prompt=prompt,
|
|
183
|
+
repo_name=repo,
|
|
184
|
+
verbose=False,
|
|
185
|
+
)
|
|
186
|
+
|
|
187
|
+
t1 = threading.Thread(target=run, args=("q1", "explain DNA", "servai-repo-1"))
|
|
188
|
+
t2 = threading.Thread(target=run, args=("q2", "explain RNA", "servai-repo-2"))
|
|
189
|
+
|
|
190
|
+
t1.start(); t2.start()
|
|
191
|
+
t1.join(); t2.join()
|
|
192
|
+
|
|
193
|
+
print(results["q1"])
|
|
194
|
+
print(results["q2"])
|
|
195
|
+
```
|
|
196
|
+
|
|
197
|
+
> **Note:** Parallel calls must use different `repo_name` values. Each repo has its own independent run queue, so calls never interfere with each other.
|
|
198
|
+
|
|
199
|
+
---
|
|
200
|
+
|
|
201
|
+
## Parameters
|
|
202
|
+
|
|
203
|
+
| Parameter | Type | Default | Required | Description |
|
|
204
|
+
|---|---|---|---|---|
|
|
205
|
+
| `github_token` | `str` | — | Yes | GitHub PAT with `repo` and `workflow` scopes |
|
|
206
|
+
| `prompt` | `str` | — | Yes | The message or question to send to the model |
|
|
207
|
+
| `model` | `str` | `"tinyllama"` | No | Which model to use. See Models section below |
|
|
208
|
+
| `system` | `str` | `"You are a helpful assistant."` | No | System prompt that controls model behavior |
|
|
209
|
+
| `max_tokens` | `int` | `512` | No | Max tokens to generate. Hard limit: 4096 |
|
|
210
|
+
| `temperature` | `float` | `0.7` | No | Randomness. `0.0` = deterministic, `2.0` = very creative |
|
|
211
|
+
| `cache` | `bool` | `True` | No | Cache model weights between runs. Strongly recommended |
|
|
212
|
+
| `n_ctx` | `int` | `None` | No | Context window size. Defaults to model built-in. Max: 8192 |
|
|
213
|
+
| `repo_name` | `str` | `"ai-inference-runner"` | No | GitHub repo to create or reuse for running inference |
|
|
214
|
+
| `verbose` | `bool` | `True` | No | Print step-by-step logs. Set `False` for silent mode |
|
|
215
|
+
|
|
216
|
+
---
|
|
217
|
+
|
|
218
|
+
## Models
|
|
219
|
+
|
|
220
|
+
| Key | Model | Size | Default Context | Max Safe Context |
|
|
221
|
+
|---|---|---|---|---|
|
|
222
|
+
| `tinyllama` | TinyLlama 1.1B Chat Q4_K_M | 0.6 GB | 2048 tokens | 8192 tokens |
|
|
223
|
+
| `llama` | Llama 3.2 1B Instruct Q4_K_M | 0.7 GB | 4096 tokens | 8192 tokens |
|
|
224
|
+
|
|
225
|
+
Both models are quantized GGUF files hosted publicly on HuggingFace. No HuggingFace token required.
|
|
226
|
+
|
|
227
|
+
**Which model should I use?**
|
|
228
|
+
- `tinyllama` — faster, smaller, good for short factual answers and simple tasks
|
|
229
|
+
- `llama` — better instruction-following, better for longer structured outputs and reasoning
|
|
230
|
+
|
|
231
|
+
---
|
|
232
|
+
|
|
233
|
+
## Context and output size
|
|
234
|
+
|
|
235
|
+
The context window (`n_ctx`) is the total number of tokens the model can see at once — this includes your system prompt, your prompt, and the generated output combined.
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
README.md
|
|
3
|
+
pyproject.toml
|
|
4
|
+
gh_ai_runner.egg-info/PKG-INFO
|
|
5
|
+
gh_ai_runner.egg-info/SOURCES.txt
|
|
6
|
+
gh_ai_runner.egg-info/dependency_links.txt
|
|
7
|
+
gh_ai_runner.egg-info/requires.txt
|
|
8
|
+
gh_ai_runner.egg-info/top_level.txt
|
|
9
|
+
servai/__init__.py
|
|
10
|
+
servai/artifact.py
|
|
11
|
+
servai/core.py
|
|
12
|
+
servai/logger.py
|
|
13
|
+
servai/models.py
|
|
14
|
+
servai/polling.py
|
|
15
|
+
servai/repo.py
|
|
16
|
+
servai/runner.py
|
|
17
|
+
servai/validation.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
servai
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "gh-ai-runner"
|
|
7
|
+
version = "0.1.3"
|
|
8
|
+
description = "Serverless AI inference via GitHub Actions — no server required"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = { text = "MIT" }
|
|
11
|
+
requires-python = ">=3.9"
|
|
12
|
+
authors = [{ name = "Tanish Chauhan" }]
|
|
13
|
+
keywords = ["ai", "llm", "inference", "github-actions", "llama", "serverless"]
|
|
14
|
+
classifiers = [
|
|
15
|
+
"Programming Language :: Python :: 3",
|
|
16
|
+
"License :: OSI Approved :: MIT License",
|
|
17
|
+
"Operating System :: OS Independent",
|
|
18
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
19
|
+
]
|
|
20
|
+
dependencies = ["requests>=2.28"]
|
|
21
|
+
|
|
22
|
+
[project.urls]
|
|
23
|
+
Homepage = "https://github.com/TanishC4444/servai"
|
|
24
|
+
Repository = "https://github.com/TanishC4444/servai"
|
|
25
|
+
|
|
26
|
+
[project.optional-dependencies]
|
|
27
|
+
dev = ["pytest"]
|
|
28
|
+
|
|
29
|
+
[tool.setuptools.packages.find]
|
|
30
|
+
where = ["."]
|
|
31
|
+
include = ["servai*"]
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
import io
|
|
2
|
+
import time
|
|
3
|
+
import zipfile
|
|
4
|
+
|
|
5
|
+
import requests
|
|
6
|
+
|
|
7
|
+
from .logger import _log
|
|
8
|
+
from .repo import API, _headers
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def _download_output(token, username, repo_name, run_id, verbose, timeout=60):
    """
    Download the text output produced by a workflow run.

    Polls the run's artifact list every 5 seconds until an artifact named
    "ai-output" appears or `timeout` seconds have elapsed, downloads that
    artifact's zip archive, and returns the decoded contents of the first
    archive member whose name ends with "output.txt".

    Args:
        token: GitHub token passed to `_headers` for the API requests.
        username: Owner of the repository, as used in the API path.
        repo_name: Name of the repository, as used in the API path.
        run_id: Id of the workflow run whose artifacts are listed.
        verbose: Forwarded to `_log` to control progress output.
        timeout: Seconds to keep polling for the artifact. Default: 60.

    Returns:
        The contents of output.txt decoded as text.

    Raises:
        RuntimeError: If no "ai-output" artifact appears in time, if the
            archive download returns an HTTP error status, or if the
            archive contains no output.txt.
    """
    # Per-request limit so a stalled connection cannot block forever,
    # independent of the overall polling budget given by `timeout`.
    request_timeout = 30

    artifacts_url = f"{API}/repos/{username}/{repo_name}/actions/runs/{run_id}/artifacts"
    start = time.time()
    target = None

    while time.time() - start < timeout:
        r = requests.get(
            artifacts_url,
            headers=_headers(token),
            timeout=request_timeout,
        )
        # A response without an "artifacts" key is treated as "nothing yet"
        # and polling simply continues.
        artifacts = r.json().get("artifacts", [])
        target = next((a for a in artifacts if a["name"] == "ai-output"), None)
        if target:
            break
        _log("Waiting for artifact...", verbose=verbose)
        time.sleep(5)

    if not target:
        raise RuntimeError("No ai-output artifact found after waiting.")

    # Redirects are handled manually: the Location target is fetched with a
    # plain request that does not carry the GitHub auth headers.
    r = requests.get(
        f"{API}/repos/{username}/{repo_name}/actions/artifacts/{target['id']}/zip",
        headers=_headers(token),
        allow_redirects=False,
        timeout=request_timeout,
    )
    location = r.headers.get("Location", "")
    if location:
        r = requests.get(location, timeout=request_timeout)

    # Fail with a clear message instead of handing an error body to zipfile,
    # which would otherwise surface as an unrelated BadZipFile.
    if not r.ok:
        raise RuntimeError(
            f"Failed to download ai-output artifact (HTTP {r.status_code})."
        )
    zip_bytes = r.content

    with zipfile.ZipFile(io.BytesIO(zip_bytes)) as z:
        _log(f"Artifact contents: {z.namelist()}", verbose=verbose)
        name = next((n for n in z.namelist() if n.endswith("output.txt")), None)
        if not name:
            raise RuntimeError(f"output.txt not found in artifact. Contents: {z.namelist()}")
        with z.open(name) as f:
            return f.read().decode()
|
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
import time
|
|
2
|
+
|
|
3
|
+
import requests
|
|
4
|
+
|
|
5
|
+
from .artifact import _download_output
|
|
6
|
+
from .logger import _elapsed, _log
|
|
7
|
+
from .models import MODELS, RAM_PER_1K_CTX_GB, RUNNER_RAM_GB
|
|
8
|
+
from .polling import _snapshot_run_ids, _wait_for_completion, _wait_for_run
|
|
9
|
+
from .repo import API, _ensure_repo, _get_username, _headers, _sync_files
|
|
10
|
+
from .validation import _validate
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def ai_call(
    github_token: str,
    prompt: str,
    model: str = "tinyllama",
    system: str = "You are a helpful assistant.",
    max_tokens: int = 512,
    temperature: float = 0.7,
    cache: bool = True,
    n_ctx: "int | None" = None,
    repo_name: str = "ai-inference-runner",
    verbose: bool = True,
) -> str:
    """
    Run AI inference on GitHub Actions and return the model output.

    Required:
    github_token: Your GitHub personal access token (needs repo + workflow scopes).
    prompt: The message / question to send to the model.

    Optional:
    model: "tinyllama" (default) or "llama".
    system: System prompt. Default: "You are a helpful assistant."
    max_tokens: Max tokens to generate. Default: 512. Hard limit: 4096.
    temperature: 0.0 = deterministic, 2.0 = very creative. Default: 0.7.
    cache: Cache model weights between runs. Default: True.
    n_ctx: Context window size. Default: 2048 (tinyllama) / 4096 (llama). Max: 8192.
    repo_name: GitHub repo to create/reuse. Default: "ai-inference-runner".
    verbose: Print detailed logs. Default: True. Set False for silent mode.

    Returns:
    Model response as a string.

    Raises:
    ValueError: If any parameter is out of safe range.
    requests.HTTPError: If the workflow dispatch request is rejected.
    """
    if model not in MODELS:
        raise ValueError(f"model must be one of: {list(MODELS.keys())}")

    _validate(model, max_tokens, temperature, n_ctx)

    call_start = time.time()
    cfg = MODELS[model]
    username = _get_username(github_token)

    # RAM estimate shown in the banner: model size, plus a surcharge for
    # context beyond the model default, plus a fixed 1.0 GB.
    effective_n_ctx = n_ctx or cfg["n_ctx"]
    extra_ctx = max(0, effective_n_ctx - cfg["n_ctx"])
    est_ram = cfg["size_gb"] + (extra_ctx / 1000) * RAM_PER_1K_CTX_GB + 1.0

    _log("=" * 50, verbose=verbose)
    _log("ai_call started", verbose=verbose)
    _log(f"Model : {cfg['name']} ({cfg['size_gb']} GB GGUF)", verbose=verbose)
    _log(f"n_ctx : {effective_n_ctx} | Est. RAM: {est_ram:.1f} GB", verbose=verbose)
    _log(f"Max tokens : {max_tokens} | Temp: {temperature} | Cache: {cache}", verbose=verbose)
    _log(f"Prompt : {prompt[:80]}{'...' if len(prompt) > 80 else ''}", verbose=verbose)
    _log("=" * 50, verbose=verbose)

    _ensure_repo(github_token, username, repo_name, verbose)
    _sync_files(github_token, username, repo_name, verbose)

    # Record the run ids that exist before dispatching so the run started
    # below can be told apart from earlier ones.
    seen_ids = _snapshot_run_ids(github_token, username, repo_name)

    t = time.time()
    _log("Dispatching workflow...", verbose=verbose)
    r = requests.post(
        f"{API}/repos/{username}/{repo_name}/actions/workflows/inference.yml/dispatches",
        headers=_headers(github_token),
        json={
            "ref": "main",
            # Every workflow input is sent as a string.
            "inputs": {
                "prompt": prompt,
                "system": system,
                "model": model,
                "cache": "true" if cache else "false",
                "max_tokens": str(max_tokens),
                "temperature": str(temperature),
                # Empty string lets the runner script use the model default.
                "n_ctx": str(n_ctx) if n_ctx else "",
            },
        },
        # requests has no default timeout; do not hang on a stalled connection.
        timeout=30,
    )
    r.raise_for_status()
    _log("Workflow dispatched", since=t, verbose=verbose)

    _log("Waiting for runner...", verbose=verbose)
    run_id = _wait_for_run(github_token, username, repo_name, seen_ids, verbose)

    t = time.time()
    _log("Runner working (first run ~5 min, cached ~1-2 min)...", verbose=verbose)
    _wait_for_completion(github_token, username, repo_name, run_id, verbose)
    _log("Runner finished", since=t, verbose=verbose)

    t = time.time()
    _log("Downloading output...", verbose=verbose)
    output = _download_output(github_token, username, repo_name, run_id, verbose)
    _log("Output downloaded", since=t, verbose=verbose)

    _log("=" * 50, verbose=verbose)
    _log(f"Total time : {_elapsed(call_start)}", verbose=verbose)
    _log("=" * 50, verbose=verbose)

    return output
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
import time
|
|
2
|
+
from datetime import datetime
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
def _ts():
    """Return the current local time formatted as ``HH:MM:SS``."""
    return datetime.now().strftime("%H:%M:%S")
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def _elapsed(since):
    """Return the time elapsed since ``since`` as text such as ``"1.5s"``.

    Args:
        since: A start timestamp on the ``time.time()`` clock.

    Returns:
        The elapsed seconds with one decimal place, followed by ``s``.
    """
    return f"{time.time() - since:.1f}s"
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def _log(msg, since=None, verbose=True):
    """Print ``msg`` prefixed with a ``[HH:MM:SS]`` timestamp.

    Args:
        msg: Text to print.
        since: Optional ``time.time()`` start value; when given, the elapsed
            time is appended as `` (+N.Ns)``.
        verbose: When false, nothing is printed.
    """
    if not verbose:
        return
    # Compare against None explicitly so a start value of 0 still counts
    # as "given".
    suffix = f" (+{_elapsed(since)})" if since is not None else ""
    print(f"[{_ts()}] {msg}{suffix}", flush=True)
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
# Supported models, keyed by the name callers pass as ``model``.
# Per entry:
#   name      - display name
#   size_gb   - size of the GGUF weights file in GB
#   n_ctx     - default context window
#   max_n_ctx - largest context window accepted by validation
#   url       - download location of the GGUF file
#   filename  - file name of the GGUF file
MODELS = {
    "tinyllama": {
        "name": "TinyLlama 1.1B Chat",
        "size_gb": 0.6,
        "n_ctx": 2048,
        "max_n_ctx": 8192,
        "url": "https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF/resolve/main/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
        "filename": "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
    },
    "llama": {
        "name": "Llama 3.2 1B Instruct",
        "size_gb": 0.7,
        "n_ctx": 4096,
        "max_n_ctx": 8192,
        "url": "https://huggingface.co/bartowski/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-Q4_K_M.gguf",
        "filename": "Llama-3.2-1B-Instruct-Q4_K_M.gguf",
    },
}

# Upper bound, in GB, for the estimated RAM of a run.
RUNNER_RAM_GB = 7
# Estimated extra RAM, in GB, per 1000 context tokens above a model's default.
RAM_PER_1K_CTX_GB = 0.2
# Hard cap on the number of tokens to generate.
MAX_TOKENS_LIMIT = 4096
# Accepted sampling-temperature range (inclusive).
MAX_TEMPERATURE = 2.0
MIN_TEMPERATURE = 0.0
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
import time
|
|
2
|
+
|
|
3
|
+
import requests
|
|
4
|
+
|
|
5
|
+
from .logger import _log
|
|
6
|
+
from .repo import API, _headers
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def _snapshot_run_ids(token, username, repo_name):
    """Return the ids of the repository's most recently listed workflow runs.

    Requests one page of up to 20 runs. The result is meant to be passed to
    ``_wait_for_run`` as the set of runs that already existed.

    Args:
        token: GitHub token, passed to ``_headers``.
        username: Owner of the repository.
        repo_name: Name of the repository.

    Returns:
        A set of run ids.

    Raises:
        requests.HTTPError: If the request returns a 4xx/5xx status.
    """
    r = requests.get(
        f"{API}/repos/{username}/{repo_name}/actions/runs",
        headers=_headers(token),
        params={"per_page": 20},
        timeout=30,
    )
    # An error body has no "workflow_runs" key and would otherwise yield an
    # empty snapshot, making every existing run look new.
    r.raise_for_status()
    return {run["id"] for run in r.json().get("workflow_runs", [])}
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def _wait_for_run(token, username, repo_name, seen_ids, verbose, timeout=180):
    """Wait for a workflow run whose id is not in ``seen_ids`` and return its id.

    Sleeps 4 seconds, lists up to 10 runs, and returns the first listed run
    that is not in ``seen_ids``; repeats until ``timeout`` seconds have passed.

    Args:
        token: GitHub token, passed to ``_headers``.
        username: Owner of the repository.
        repo_name: Name of the repository.
        seen_ids: Run ids that existed before the dispatch.
        verbose: Forwarded to ``_log``.
        timeout: Seconds to keep polling.

    Returns:
        The id of the newly appeared run.

    Raises:
        TimeoutError: If no new run appears in time.
        requests.HTTPError: If a listing request returns a 4xx/5xx status.
    """
    start = time.time()
    while time.time() - start < timeout:
        time.sleep(4)
        r = requests.get(
            f"{API}/repos/{username}/{repo_name}/actions/runs",
            headers=_headers(token),
            params={"per_page": 10},
            timeout=30,
        )
        # Fail with the real HTTP error rather than polling an error body
        # until the misleading timeout below.
        r.raise_for_status()
        for run in r.json().get("workflow_runs", []):
            if run["id"] not in seen_ids:
                _log(f"Runner picked up job (run #{run['id']})", verbose=verbose)
                return run["id"]
    raise TimeoutError("Timed out waiting for workflow run to appear.")
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def _wait_for_completion(token, username, repo_name, run_id, verbose, timeout=900, poll=12):
    """Block until the workflow run reports ``completed``.

    The run is fetched every ``poll`` seconds; each change of status is logged.

    Args:
        token: GitHub token, passed to ``_headers``.
        username: Owner of the repository.
        repo_name: Name of the repository.
        run_id: Id of the run to watch.
        verbose: Forwarded to ``_log``.
        timeout: Seconds to keep polling.
        poll: Seconds to sleep between polls.

    Raises:
        RuntimeError: If the run completes with a conclusion other than
            ``success``.
        TimeoutError: If the run does not complete in time.
        requests.HTTPError: If a request returns a 4xx/5xx status.
    """
    start = time.time()
    last_status = None
    while time.time() - start < timeout:
        r = requests.get(
            f"{API}/repos/{username}/{repo_name}/actions/runs/{run_id}",
            headers=_headers(token),
            timeout=30,
        )
        # Without this an error body fails below with KeyError: 'status'.
        r.raise_for_status()
        data = r.json()
        status = data["status"]
        conclusion = data.get("conclusion")

        # Log only on transitions to keep the output short.
        if status != last_status:
            _log(f"Runner: {status}{' -> ' + conclusion if conclusion else ''}",
                 verbose=verbose)
            last_status = status

        if status == "completed":
            if conclusion != "success":
                raise RuntimeError(
                    f"Workflow failed ({conclusion}) — "
                    f"https://github.com/{username}/{repo_name}/actions/runs/{run_id}"
                )
            return

        time.sleep(poll)
    raise TimeoutError("Timed out waiting for workflow to complete.")
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
import base64
|
|
2
|
+
import time
|
|
3
|
+
|
|
4
|
+
import requests
|
|
5
|
+
|
|
6
|
+
from .logger import _log
|
|
7
|
+
from .runner import INFERENCE_SCRIPT, WORKFLOW_YAML
|
|
8
|
+
|
|
9
|
+
# Base URL of the GitHub REST API; request URLs are built by appending a path to it.
API = "https://api.github.com"
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def _headers(token):
    """Build the HTTP headers sent with every GitHub API request.

    Args:
        token: GitHub token placed in the ``Authorization`` header.

    Returns:
        A dict with the ``Authorization``, ``Accept`` and
        ``X-GitHub-Api-Version`` headers.
    """
    return {
        "Authorization": f"token {token}",
        "Accept": "application/vnd.github+json",
        "X-GitHub-Api-Version": "2022-11-28",
    }
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _get_username(token):
    """Return the login name of the account that owns ``token``.

    Raises:
        requests.HTTPError: If the ``/user`` request returns a 4xx/5xx status.
    """
    # requests has no default timeout; bound the call explicitly.
    r = requests.get(f"{API}/user", headers=_headers(token), timeout=30)
    r.raise_for_status()
    return r.json()["login"]
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _repo_exists(token, username, repo_name):
    """Return whether the repository ``username/repo_name`` can be fetched.

    Returns:
        True for a 200 response, False for a 404 response.

    Raises:
        requests.HTTPError: For any other 4xx/5xx response. Treating an auth
            or server error as "repo missing" would make the caller try to
            create a repository that may already exist.
    """
    r = requests.get(
        f"{API}/repos/{username}/{repo_name}",
        headers=_headers(token),
        timeout=30,
    )
    if r.status_code == 404:
        return False
    r.raise_for_status()
    return r.status_code == 200
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def _commit_file(token, username, repo_name, path, content, message):
    """Create or update one file in the repository via the contents API.

    Nothing is committed when the file already exists with identical content.

    Args:
        token: GitHub token, passed to ``_headers``.
        username: Owner of the repository.
        repo_name: Name of the repository.
        path: Path of the file inside the repository.
        content: New file content as text.
        message: Commit message.

    Raises:
        requests.HTTPError: If the write request returns a 4xx/5xx status.
    """
    url = f"{API}/repos/{username}/{repo_name}/contents/{path}"
    headers = _headers(token)
    payload = content.encode()

    existing = requests.get(url, headers=headers, timeout=30)
    sha = None
    if existing.status_code == 200:
        current = existing.json()
        sha = current.get("sha")
        # Skip the write when nothing changed, so repeated syncs do not pile
        # up identical commits.
        if (
            current.get("encoding") == "base64"
            and base64.b64decode(current.get("content") or "") == payload
        ):
            return

    body = {"message": message, "content": base64.b64encode(payload).decode()}
    # The existing blob sha is sent along when the file is being updated.
    if sha:
        body["sha"] = sha
    requests.put(url, headers=headers, json=body, timeout=30).raise_for_status()
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def _wait_for_workflow(token, username, repo_name, timeout=60):
    """Wait until the repository lists the ``inference.yml`` workflow.

    The workflow list is polled every 3 seconds until an entry with path
    ``.github/workflows/inference.yml`` appears.

    Args:
        token: GitHub token, passed to ``_headers``.
        username: Owner of the repository.
        repo_name: Name of the repository.
        timeout: Seconds to keep polling.

    Raises:
        TimeoutError: If the workflow is not listed in time.
    """
    start = time.time()
    while time.time() - start < timeout:
        r = requests.get(
            f"{API}/repos/{username}/{repo_name}/actions/workflows",
            headers=_headers(token),
            # requests has no default timeout; keep one stalled poll from
            # outliving the overall deadline indefinitely.
            timeout=30,
        )
        workflows = r.json().get("workflows", [])
        if any(w["path"] == ".github/workflows/inference.yml" for w in workflows):
            return
        time.sleep(3)
    raise TimeoutError("Workflow never registered.")
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def _ensure_repo(token, username, repo_name, verbose):
    """Make sure the runner repository exists, creating it if necessary.

    When the repository is missing it is created, the inference script and
    the workflow file are committed, and the call waits for the workflow to
    be listed.

    NOTE: the repository is created with ``"private": False``, i.e. public.

    Args:
        token: GitHub token, passed to ``_headers``.
        username: Owner of the repository.
        repo_name: Name of the repository.
        verbose: Forwarded to ``_log``.

    Raises:
        requests.HTTPError: If creating the repository fails.
    """
    if _repo_exists(token, username, repo_name):
        _log("Repo ready", verbose=verbose)
        return

    t = time.time()
    _log("Creating repo...", verbose=verbose)
    r = requests.post(
        f"{API}/user/repos",
        headers=_headers(token),
        json={
            "name": repo_name, "private": False, "auto_init": True,
            "description": "GitHub AI Inference Runner",
        },
        timeout=30,
    )
    r.raise_for_status()
    # Short pause between creating the repository and the first commit.
    time.sleep(2)

    _commit_file(token, username, repo_name,
                 "run_inference.py", INFERENCE_SCRIPT, "Add inference script")
    _commit_file(token, username, repo_name,
                 ".github/workflows/inference.yml", WORKFLOW_YAML, "Add inference workflow")

    _wait_for_workflow(token, username, repo_name)
    _log("Repo created and ready", since=t, verbose=verbose)
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def _sync_files(token, username, repo_name, verbose):
    """Commit the packaged inference script and workflow file to the repository.

    Args:
        token: GitHub token, passed to ``_commit_file``.
        username: Owner of the repository.
        repo_name: Name of the repository.
        verbose: Forwarded to ``_log``.
    """
    _log("Syncing runner files...", verbose=verbose)
    _commit_file(token, username, repo_name,
                 "run_inference.py", INFERENCE_SCRIPT, "Sync inference script")
    _commit_file(token, username, repo_name,
                 ".github/workflows/inference.yml", WORKFLOW_YAML, "Sync workflow")
    _log("Runner files synced", verbose=verbose)
|
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
# Source of the script committed to the runner repository as run_inference.py.
# It reads its settings from environment variables, downloads the GGUF file
# into model_cache/ when it is not already there, runs one chat completion
# and writes the reply to output.txt. The file is written as UTF-8 so its
# encoding does not depend on the runner's locale.
INFERENCE_SCRIPT = r'''
import os, urllib.request, warnings
warnings.filterwarnings("ignore")
from llama_cpp import Llama

MODEL_MAP = {
    "tinyllama": {
        "url": "https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF/resolve/main/tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
        "filename": "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
        "n_ctx": 2048,
    },
    "llama": {
        "url": "https://huggingface.co/bartowski/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-Q4_K_M.gguf",
        "filename": "Llama-3.2-1B-Instruct-Q4_K_M.gguf",
        "n_ctx": 4096,
    },
}

model_key = os.environ["MODEL"]
prompt = os.environ["PROMPT"]
system = os.environ.get("SYSTEM", "You are a helpful assistant.")
max_tokens = int(os.environ.get("MAX_TOKENS", "512"))
temperature = float(os.environ.get("TEMPERATURE", "0.7"))
n_ctx_env = os.environ.get("N_CTX", "").strip()

cfg = MODEL_MAP[model_key]
n_ctx = int(n_ctx_env) if n_ctx_env else cfg["n_ctx"]
model_path = f"model_cache/{cfg['filename']}"

if not os.path.exists(model_path):
    os.makedirs("model_cache", exist_ok=True)
    print(f"Downloading {cfg['filename']}...")
    urllib.request.urlretrieve(cfg["url"], model_path)

print(f"Loading {cfg['filename']} (n_ctx={n_ctx})...")
llm = Llama(model_path=model_path, n_ctx=n_ctx, n_threads=4, verbose=False)

print("Running inference...")
response = llm.create_chat_completion(
    messages=[
        {"role": "system", "content": system},
        {"role": "user", "content": prompt},
    ],
    max_tokens=max_tokens,
    temperature=temperature,
)

output = response["choices"][0]["message"]["content"]
print("\n=== OUTPUT ===")
print(output)

with open("output.txt", "w", encoding="utf-8") as f:
    f.write(output)
'''
|
|
55
|
+
|
|
56
|
+
# GitHub Actions workflow committed to the runner repository as
# .github/workflows/inference.yml. It is started through workflow_dispatch,
# hands its inputs to run_inference.py as environment variables, and uploads
# output.txt as the "ai-output" artifact.
WORKFLOW_YAML = """\
name: AI Inference
on:
  workflow_dispatch:
    inputs:
      prompt: { description: "User prompt", required: true }
      system: { description: "System prompt", required: false, default: "You are a helpful assistant." }
      model: { description: "tinyllama|llama", required: false, default: "tinyllama" }
      cache: { description: "Cache weights", required: false, default: "true" }
      max_tokens: { description: "Max new tokens", required: false, default: "512" }
      temperature: { description: "Temperature", required: false, default: "0.7" }
      n_ctx: { description: "Context window", required: false, default: "" }

jobs:
  inference:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 1

      - uses: actions/setup-python@v5
        with:
          python-version: "3.11"

      - name: Cache venv
        id: cache-venv
        uses: actions/cache@v4
        with:
          path: .venv
          key: venv-llama-cpp-v1

      - name: Cache model weights
        if: ${{ inputs.cache == 'true' }}
        uses: actions/cache@v4
        with:
          path: model_cache
          key: gguf-${{ inputs.model }}-v1

      - name: Install dependencies
        if: steps.cache-venv.outputs.cache-hit != 'true'
        run: |
          python -m venv .venv
          CMAKE_ARGS="-DGGML_METAL=off" .venv/bin/pip install llama-cpp-python -q

      - name: Run inference
        env:
          PROMPT: ${{ inputs.prompt }}
          SYSTEM: ${{ inputs.system }}
          MODEL: ${{ inputs.model }}
          MAX_TOKENS: ${{ inputs.max_tokens }}
          TEMPERATURE: ${{ inputs.temperature }}
          N_CTX: ${{ inputs.n_ctx }}
        run: .venv/bin/python run_inference.py

      - uses: actions/upload-artifact@v4
        with:
          name: ai-output
          path: output.txt
          retention-days: 1
"""
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
from .models import (
|
|
2
|
+
MODELS,
|
|
3
|
+
RUNNER_RAM_GB,
|
|
4
|
+
RAM_PER_1K_CTX_GB,
|
|
5
|
+
MAX_TOKENS_LIMIT,
|
|
6
|
+
MAX_TEMPERATURE,
|
|
7
|
+
MIN_TEMPERATURE,
|
|
8
|
+
)
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def _validate(model, max_tokens, temperature, n_ctx):
    """Check generation parameters against the limits for ``model``.

    All problems are collected and reported together. After a successful
    check, ``[warn]`` lines are printed for large or unusual values.

    Args:
        model: Key into ``MODELS``; must already be known to be valid.
        max_tokens: Number of tokens to generate.
        temperature: Sampling temperature.
        n_ctx: Requested context window; a falsy value selects the model
            default.

    Raises:
        ValueError: If any check fails; the message lists every failure.
    """
    cfg = MODELS[model]
    errors = []

    if not (MIN_TEMPERATURE <= temperature <= MAX_TEMPERATURE):
        errors.append(
            f"temperature must be between {MIN_TEMPERATURE} and {MAX_TEMPERATURE}, got {temperature}"
        )

    if max_tokens < 1:
        errors.append(f"max_tokens must be at least 1, got {max_tokens}")
    if max_tokens > MAX_TOKENS_LIMIT:
        errors.append(f"max_tokens={max_tokens} exceeds hard limit of {MAX_TOKENS_LIMIT}.")

    effective_n_ctx = n_ctx or cfg["n_ctx"]
    if effective_n_ctx < 1:
        # Report a non-positive window as such, instead of only as a
        # confusing "max_tokens must be less than n_ctx" error.
        errors.append(f"n_ctx must be a positive integer, got {effective_n_ctx}")
    else:
        if effective_n_ctx > cfg["max_n_ctx"]:
            errors.append(
                f"n_ctx={effective_n_ctx} exceeds safe limit of {cfg['max_n_ctx']}. Risk of OOM."
            )
        if max_tokens >= effective_n_ctx:
            errors.append(
                f"max_tokens={max_tokens} must be less than n_ctx={effective_n_ctx}."
            )

    # RAM estimate: model size, plus a surcharge for context beyond the
    # model default, plus a fixed 1.0 GB.
    base_ram = cfg["size_gb"]
    extra_ctx = max(0, effective_n_ctx - cfg["n_ctx"])
    total_ram = base_ram + (extra_ctx / 1000) * RAM_PER_1K_CTX_GB + 1.0
    if total_ram > RUNNER_RAM_GB:
        errors.append(
            f"Estimated RAM ({total_ram:.1f} GB) exceeds runner limit ({RUNNER_RAM_GB} GB)."
        )

    if errors:
        raise ValueError("Validation failed:\n" + "\n".join(f" - {e}" for e in errors))

    # Advisory output only; none of these stop the call.
    if max_tokens > 2048:
        print(f"[warn] max_tokens={max_tokens} is large — expect slower inference.")
    if effective_n_ctx > 4096:
        print(f"[warn] n_ctx={effective_n_ctx} is large — estimated RAM: {total_ram:.1f} GB.")
    if temperature > 1.2:
        print(f"[warn] temperature={temperature} is high — output may be incoherent.")
|