blazeinfer-0.0.1.tar.gz
This diff shows the contents of a publicly available package version as released to one of the supported registries. It is provided for informational purposes only and reflects the package files as they appear in their respective public registries.
- blazeinfer-0.0.1/LICENSE +21 -0
- blazeinfer-0.0.1/PKG-INFO +40 -0
- blazeinfer-0.0.1/README.md +18 -0
- blazeinfer-0.0.1/blazeinfer/__init__.py +0 -0
- blazeinfer-0.0.1/blazeinfer/executor/__init__.py +0 -0
- blazeinfer-0.0.1/blazeinfer/executor/model_executor.py +67 -0
- blazeinfer-0.0.1/blazeinfer/generate.py +75 -0
- blazeinfer-0.0.1/blazeinfer/kernels/__init__.py +0 -0
- blazeinfer-0.0.1/blazeinfer/main.py +41 -0
- blazeinfer-0.0.1/blazeinfer/models/__init__.py +0 -0
- blazeinfer-0.0.1/blazeinfer/utils/__init__.py +0 -0
- blazeinfer-0.0.1/blazeinfer.egg-info/PKG-INFO +40 -0
- blazeinfer-0.0.1/blazeinfer.egg-info/SOURCES.txt +17 -0
- blazeinfer-0.0.1/blazeinfer.egg-info/dependency_links.txt +1 -0
- blazeinfer-0.0.1/blazeinfer.egg-info/entry_points.txt +2 -0
- blazeinfer-0.0.1/blazeinfer.egg-info/requires.txt +9 -0
- blazeinfer-0.0.1/blazeinfer.egg-info/top_level.txt +8 -0
- blazeinfer-0.0.1/pyproject.toml +47 -0
- blazeinfer-0.0.1/setup.cfg +4 -0
blazeinfer-0.0.1/LICENSE
ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2025 Wenyi Xu

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
blazeinfer-0.0.1/PKG-INFO
ADDED
@@ -0,0 +1,40 @@
Metadata-Version: 2.4
Name: blazeinfer
Version: 0.0.1
Summary: A high-performance, light-weight llm inference framework.
Author-email: Wenyi Xu <wenyixu101@email.com>
Project-URL: Homepage, https://github.com/xuwenyihust/BlazeInfer
Project-URL: Bug Tracker, https://github.com/xuwenyihust/BlazeInfer/issues
Classifier: Programming Language :: Python :: 3
Classifier: License :: OSI Approved :: MIT License
Requires-Python: >=3.10
Description-Content-Type: text/markdown
License-File: LICENSE
Requires-Dist: torch<2.10.0,>=2.9.0
Requires-Dist: numpy>=1.26.0
Requires-Dist: transformers>=4.40.0
Requires-Dist: accelerate>=0.32.0
Requires-Dist: tqdm
Provides-Extra: dev
Requires-Dist: pytest; extra == "dev"
Requires-Dist: ruff; extra == "dev"
Dynamic: license-file

<div align="center">


A high-performance, light-weight llm inference framework.

</div>

--------------------------------------------------------------------------------

## About

## Features
- Optimized Attention Kernels

## Getting Started

## Acknowledgement
- [lite_llama](https://github.com/harleyszhang/lite_llama/tree/main)
blazeinfer-0.0.1/README.md
ADDED
@@ -0,0 +1,18 @@
<div align="center">


A high-performance, light-weight llm inference framework.

</div>

--------------------------------------------------------------------------------

## About

## Features
- Optimized Attention Kernels

## Getting Started

## Acknowledgement
- [lite_llama](https://github.com/harleyszhang/lite_llama/tree/main)
blazeinfer-0.0.1/blazeinfer/__init__.py
File without changes
blazeinfer-0.0.1/blazeinfer/executor/__init__.py
File without changes
blazeinfer-0.0.1/blazeinfer/executor/model_executor.py
ADDED
@@ -0,0 +1,67 @@
import torch
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer
import logging

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(name)s - %(message)s')


class SimpleModelExecutor:
    """
    A minimal, naive ModelExecutor that loads a standard Hugging Face
    model and runs a forward pass.

    It does NOT use a KV cache.
    """

    def __init__(self, model_id: str):
        """
        Initializes and loads the model and tokenizer.

        Args:
            model_id (str): The model identifier from Hugging Face
                            (e.g., "meta-llama/Llama-3.1-8B-Instruct")
        """
        logger.info(f"Loading model '{model_id}'... This may take a moment.")
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        # Load the model and tokenizer
        self.model, self.tokenizer = self.load_model_and_tokenizer(model_id)
        logger.info(f"Model loaded successfully on {self.device}.")

    def load_model_and_tokenizer(self, model_id: str):
        """
        Loads the model and tokenizer from Hugging Face.
        This is the "minimum" way to load.
        """
        tokenizer = AutoTokenizer.from_pretrained(model_id)

        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            dtype=torch.float16,    # Use float16 to save memory
            device_map=self.device  # Automatically load to GPU
        )

        # Set to evaluation mode (disables dropout, etc.)
        model.eval()

        return model, tokenizer

    def forward(self, input_ids: torch.Tensor) -> torch.Tensor:
        """
        Runs a single, simple forward pass.

        Args:
            input_ids (torch.Tensor): A tensor of token IDs.

        Returns:
            torch.Tensor: The logits (raw predictions) from the model.
        """
        # We wrap this in torch.no_grad() to tell PyTorch
        # not to calculate gradients, which saves memory and is faster.
        with torch.no_grad():
            outputs = self.model(input_ids=input_ids)
            # The model's output is a complex object.
            # We just want the logits.
            return outputs.logits
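For orientation, here is a minimal sketch of driving `SimpleModelExecutor` directly, assuming blazeinfer 0.0.1 is installed. The model id and prompt are only illustrative; the packaged entry point in `main.py` below is the intended way to run it.

```python
# Illustrative standalone use of SimpleModelExecutor (not shipped in the package).
import torch
from blazeinfer.executor.model_executor import SimpleModelExecutor

executor = SimpleModelExecutor(model_id="gpt2")  # "gpt2" is just an example model id
input_ids = executor.tokenizer.encode(
    "Hello, my name is", return_tensors="pt"
).to(executor.device)

logits = executor.forward(input_ids)                    # shape: [1, seq_len, vocab_size]
next_token_id = torch.argmax(logits[:, -1, :], dim=-1)  # greedy choice of the next token
print(executor.tokenizer.decode(next_token_id))
```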
blazeinfer-0.0.1/blazeinfer/generate.py
ADDED
@@ -0,0 +1,75 @@
from .executor.model_executor import SimpleModelExecutor
import torch
import logging

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(name)s - %(message)s')


def generate_text_naively(
    executor: SimpleModelExecutor,
    prompt: str,
    max_new_tokens: int = 50
):
    """
    Generates text autoregressively WITHOUT a KV cache.
    This is the "naive" implementation.
    """
    tokenizer = executor.tokenizer
    device = executor.device

    logger.info(f"\nPrompt: '{prompt}'")

    # 1. Tokenize the input prompt
    #    The tokenizer returns a batched tensor of shape [1, seq_len]
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)

    generated_token_ids = []

    # 2. The autoregressive loop
    for _ in range(max_new_tokens):
        # ------------------------------------------------------------------
        # This is the core "naive" part:
        # In every loop, we pass the *entire* history of tokens
        # (original prompt + generated tokens) back into the model.
        # ------------------------------------------------------------------
        current_ids_to_process = input_ids

        # 3. Run the forward pass using our executor
        #    'logits' will have shape [batch_size, sequence_length, vocab_size]
        logits = executor.forward(current_ids_to_process)

        # 4. Get the logits for the *very last* token
        #    This tells us the model's prediction for the *next* token
        #    Shape: [batch_size, vocab_size]
        #    Example: next_token_logits: tensor([[-68.4375, -69.0625, -73.3125, ..., -77.0000, -77.1250, -70.0625]], dtype=torch.float16)
        next_token_logits = logits[:, -1, :]
        logger.debug(f"next_token_logits: {next_token_logits}")

        # 5. Get the most likely token (this is "greedy sampling")
        #    Shape: [batch_size]
        #    Example: tensor([1757])
        next_token_id = torch.argmax(next_token_logits, dim=-1)
        logger.debug(f"next_token_id: {next_token_id}")

        # 6. Check for the End-of-Sequence token
        if next_token_id == tokenizer.eos_token_id:
            logger.info("\n[End of sequence reached]")
            break

        # 7. Add the new token to our full sequence
        #    This is the "autoregressive" part: the new token is now
        #    part of the input for the next loop.
        #    input_ids example: tensor([[15496, 11, 616, 1438, 318, 1757]])
        input_ids = torch.cat([input_ids, next_token_id.unsqueeze(0)], dim=-1)
        logger.debug(f"input_ids: {input_ids}")

        # Also store it for decoding later
        generated_token_ids.append(next_token_id.item())

        # (Optional) Print the new token as it's generated
        print(tokenizer.decode(next_token_id), end="", flush=True)

    # 8. Decode the final generated text
    final_text = tokenizer.decode(generated_token_ids)
    return final_text
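`generate_text_naively` feeds the full token history through the model on every step, which is what the "WITHOUT a KV cache" docstring refers to. For comparison, here is a rough sketch of a cached loop using the standard `transformers` `past_key_values` mechanism; this function is not part of blazeinfer 0.0.1 and only illustrates the idea.

```python
# Hypothetical KV-cache variant, for comparison only -- NOT shipped in blazeinfer 0.0.1.
import torch

def generate_with_kv_cache(executor, prompt: str, max_new_tokens: int = 50):
    tokenizer, device = executor.tokenizer, executor.device
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)

    past_key_values = None
    generated = []
    with torch.no_grad():
        for _ in range(max_new_tokens):
            # After the first step, only the newest token is fed in; the attention
            # keys/values of all earlier tokens come from the cache instead of
            # being recomputed.
            outputs = executor.model(
                input_ids=input_ids if past_key_values is None else input_ids[:, -1:],
                past_key_values=past_key_values,
                use_cache=True,
            )
            past_key_values = outputs.past_key_values
            next_token_id = torch.argmax(outputs.logits[:, -1, :], dim=-1)
            if next_token_id.item() == tokenizer.eos_token_id:
                break
            input_ids = torch.cat([input_ids, next_token_id.unsqueeze(0)], dim=-1)
            generated.append(next_token_id.item())
    return tokenizer.decode(generated)
```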
blazeinfer-0.0.1/blazeinfer/kernels/__init__.py
File without changes
blazeinfer-0.0.1/blazeinfer/main.py
ADDED
@@ -0,0 +1,41 @@
from .executor.model_executor import SimpleModelExecutor
from .generate import generate_text_naively
import logging


logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(name)s - %(message)s')


def main():
    model_id = "gpt2"  # Using gpt2 as it's small and requires no login

    try:
        # Step 1: Create the executor. This will load the model.
        executor = SimpleModelExecutor(model_id=model_id)

        # Step 2: Start a conversational loop
        while True:
            # Get input from the user
            prompt = input("\nEnter your prompt (or type 'exit' to quit): ")

            # Check for exit condition
            if prompt.strip().lower() == "exit":
                print("Exiting BlazeInfer. Goodbye!")
                break

            # Run the naive generation loop with the user's prompt
            print("\nBlazeInfer: ", end="", flush=True)
            generate_text_naively(executor, prompt, max_new_tokens=50)
            print("\n" + "="*50)
    except ImportError:
        logger.info("\nError: Please install transformers and torch.")
        logger.info("Run: pip install transformers torch")
    except Exception as e:
        logger.info(f"\nAn error occurred: {e}")
        logger.info("If using a gated model (like Llama),")
        logger.info("please ensure you are logged in: `huggingface-cli login`")

if __name__ == "__main__":
    logger.info("Starting BlazeInfer...")
    main()
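`main()` is also what the `blazeinfer` console script declared in pyproject.toml resolves to. The same flow can be driven non-interactively from Python; a minimal sketch, assuming the package is installed:

```python
# Illustrative non-interactive driver; mirrors what main() does without the input() loop.
from blazeinfer.executor.model_executor import SimpleModelExecutor
from blazeinfer.generate import generate_text_naively

executor = SimpleModelExecutor(model_id="gpt2")  # "gpt2" mirrors the default in main()
text = generate_text_naively(executor, "Once upon a time", max_new_tokens=20)
print(text)
```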
blazeinfer-0.0.1/blazeinfer/models/__init__.py
File without changes
blazeinfer-0.0.1/blazeinfer/utils/__init__.py
File without changes
blazeinfer-0.0.1/blazeinfer.egg-info/PKG-INFO
ADDED
@@ -0,0 +1,40 @@
Metadata-Version: 2.4
Name: blazeinfer
Version: 0.0.1
Summary: A high-performance, light-weight llm inference framework.
Author-email: Wenyi Xu <wenyixu101@email.com>
Project-URL: Homepage, https://github.com/xuwenyihust/BlazeInfer
Project-URL: Bug Tracker, https://github.com/xuwenyihust/BlazeInfer/issues
Classifier: Programming Language :: Python :: 3
Classifier: License :: OSI Approved :: MIT License
Requires-Python: >=3.10
Description-Content-Type: text/markdown
License-File: LICENSE
Requires-Dist: torch<2.10.0,>=2.9.0
Requires-Dist: numpy>=1.26.0
Requires-Dist: transformers>=4.40.0
Requires-Dist: accelerate>=0.32.0
Requires-Dist: tqdm
Provides-Extra: dev
Requires-Dist: pytest; extra == "dev"
Requires-Dist: ruff; extra == "dev"
Dynamic: license-file

<div align="center">


A high-performance, light-weight llm inference framework.

</div>

--------------------------------------------------------------------------------

## About

## Features
- Optimized Attention Kernels

## Getting Started

## Acknowledgement
- [lite_llama](https://github.com/harleyszhang/lite_llama/tree/main)
blazeinfer-0.0.1/blazeinfer.egg-info/SOURCES.txt
ADDED
@@ -0,0 +1,17 @@
LICENSE
README.md
pyproject.toml
blazeinfer/__init__.py
blazeinfer/generate.py
blazeinfer/main.py
blazeinfer.egg-info/PKG-INFO
blazeinfer.egg-info/SOURCES.txt
blazeinfer.egg-info/dependency_links.txt
blazeinfer.egg-info/entry_points.txt
blazeinfer.egg-info/requires.txt
blazeinfer.egg-info/top_level.txt
blazeinfer/executor/__init__.py
blazeinfer/executor/model_executor.py
blazeinfer/kernels/__init__.py
blazeinfer/models/__init__.py
blazeinfer/utils/__init__.py
blazeinfer-0.0.1/blazeinfer.egg-info/dependency_links.txt
ADDED
@@ -0,0 +1 @@

blazeinfer-0.0.1/pyproject.toml
ADDED
@@ -0,0 +1,47 @@
[build-system]
requires = ["setuptools>=61.0"]
build-backend = "setuptools.build_meta"

[project]
name = "blazeinfer"
version = "0.0.1"
authors = [
  { name="Wenyi Xu", email="wenyixu101@email.com" },
]
description = "A high-performance, light-weight llm inference framework."
readme = "README.md"
requires-python = ">=3.10"
classifiers = [
    "Programming Language :: Python :: 3",
    "License :: OSI Approved :: MIT License",
]

# == Core Dependencies ==
# Pinned torch to version 2.9 as requested.
# Note: Installing PyTorch with specific CUDA versions may require using --index-url.
# e.g., pip install -r requirements.txt --index-url https://download.pytorch.org/whl/cu121
dependencies = [
    "torch>=2.9.0,<2.10.0",
    "numpy>=1.26.0",
    "transformers>=4.40.0",  # For tokenizers and model configurations
    "accelerate>=0.32.0",    # For device_map and advanced model loading
    "tqdm",                  # For progress bars
]

[project.urls]
"Homepage" = "https://github.com/xuwenyihust/BlazeInfer"
"Bug Tracker" = "https://github.com/xuwenyihust/BlazeInfer/issues"

# == Optional Dependencies ==
# For developers of the framework
[project.optional-dependencies]
dev = [
    "pytest",
    "ruff",  # For linting and formatting
]

[project.scripts]
blazeinfer = "blazeinfer.main:main"

[tool.setuptools]
packages = { find = {} }