blazeinfer 0.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2025 Wenyi Xu
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
@@ -0,0 +1,40 @@
+ Metadata-Version: 2.4
+ Name: blazeinfer
+ Version: 0.0.1
+ Summary: A high-performance, light-weight llm inference framework.
+ Author-email: Wenyi Xu <wenyixu101@email.com>
+ Project-URL: Homepage, https://github.com/xuwenyihust/BlazeInfer
+ Project-URL: Bug Tracker, https://github.com/xuwenyihust/BlazeInfer/issues
+ Classifier: Programming Language :: Python :: 3
+ Classifier: License :: OSI Approved :: MIT License
+ Requires-Python: >=3.10
+ Description-Content-Type: text/markdown
+ License-File: LICENSE
+ Requires-Dist: torch<2.10.0,>=2.9.0
+ Requires-Dist: numpy>=1.26.0
+ Requires-Dist: transformers>=4.40.0
+ Requires-Dist: accelerate>=0.32.0
+ Requires-Dist: tqdm
+ Provides-Extra: dev
+ Requires-Dist: pytest; extra == "dev"
+ Requires-Dist: ruff; extra == "dev"
+ Dynamic: license-file
+
+ <div align="center">
+
+ ![# BlazeInfer](assets/logo.png)
+ A high-performance, light-weight llm inference framework.
+
+ </div>
+
+ --------------------------------------------------------------------------------
+
+ ## About
+
+ ## Features
+ - Optimized Attention Kernels
+
+ ## Getting Started
+
+ ## Acknowledgement
+ - [lite_llama](https://github.com/harleyszhang/lite_llama/tree/main)
@@ -0,0 +1,18 @@
+ <div align="center">
+
+ ![# BlazeInfer](assets/logo.png)
+ A high-performance, light-weight llm inference framework.
+
+ </div>
+
+ --------------------------------------------------------------------------------
+
+ ## About
+
+ ## Features
+ - Optimized Attention Kernels
+
+ ## Getting Started
+
+ ## Acknowledgement
+ - [lite_llama](https://github.com/harleyszhang/lite_llama/tree/main)
File without changes
File without changes
@@ -0,0 +1,67 @@
+ import torch
+ import transformers
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+ import logging
+
+ logger = logging.getLogger(__name__)
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(name)s - %(message)s')
+
+
+ class SimpleModelExecutor:
+     """
+     A minimal, naive ModelExecutor that loads a standard Hugging Face
+     model and runs a forward pass.
+
+     It does NOT use a KV cache.
+     """
+
+     def __init__(self, model_id: str):
+         """
+         Initializes and loads the model and tokenizer.
+
+         Args:
+             model_id (str): The model identifier from Hugging Face
+                 (e.g., "meta-llama/Llama-3.1-8B-Instruct")
+         """
+         logger.info(f"Loading model '{model_id}'... This may take a moment.")
+         self.device = "cuda" if torch.cuda.is_available() else "cpu"
+
+         # Load the model and tokenizer
+         self.model, self.tokenizer = self.load_model_and_tokenizer(model_id)
+         logger.info(f"Model loaded successfully on {self.device}.")
+
+     def load_model_and_tokenizer(self, model_id: str):
+         """
+         Loads the model and tokenizer from Hugging Face.
+         This is the "minimum" way to load.
+         """
+         tokenizer = AutoTokenizer.from_pretrained(model_id)
+
+         model = AutoModelForCausalLM.from_pretrained(
+             model_id,
+             dtype=torch.float16,  # Use float16 to save memory
+             device_map=self.device  # Automatically load to GPU
+         )
+
+         # Set to evaluation mode (disables dropout, etc.)
+         model.eval()
+
+         return model, tokenizer
+
+     def forward(self, input_ids: torch.Tensor) -> torch.Tensor:
+         """
+         Runs a single, simple forward pass.
+
+         Args:
+             input_ids (torch.Tensor): A tensor of token IDs.
+
+         Returns:
+             torch.Tensor: The logits (raw predictions) from the model.
+         """
+         # We wrap this in torch.no_grad() to tell PyTorch
+         # not to calculate gradients, which saves memory and is faster.
+         with torch.no_grad():
+             outputs = self.model(input_ids=input_ids)
+             # The model's output is a complex object.
+             # We just want the logits.
+             return outputs.logits
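
For orientation, a minimal usage sketch of the SimpleModelExecutor defined above. This is not part of the package contents; it only assumes the module path blazeinfer.executor.model_executor from the SOURCES.txt listing later in this diff, and uses the small, ungated gpt2 checkpoint purely as an example model id.

import torch
from blazeinfer.executor.model_executor import SimpleModelExecutor

# Load a small model and run a single forward pass over a prompt.
executor = SimpleModelExecutor(model_id="gpt2")
input_ids = executor.tokenizer.encode("Hello, my name is", return_tensors="pt").to(executor.device)

logits = executor.forward(input_ids)              # shape: [1, seq_len, vocab_size]
next_token_id = torch.argmax(logits[:, -1, :], dim=-1)   # greedy pick for the next token
print(executor.tokenizer.decode(next_token_id))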
@@ -0,0 +1,75 @@
+ from .executor.model_executor import SimpleModelExecutor
+ import torch
+ import logging
+
+ logger = logging.getLogger(__name__)
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(name)s - %(message)s')
+
+
+ def generate_text_naively(
+     executor: SimpleModelExecutor,
+     prompt: str,
+     max_new_tokens: int = 50
+ ):
+     """
+     Generates text autoregressively WITHOUT a KV cache.
+     This is the "naive" implementation.
+     """
+     tokenizer = executor.tokenizer
+     device = executor.device
+
+     logger.info(f"\nPrompt: '{prompt}'")
+
+     # 1. Tokenize the input prompt
+     # encode(..., return_tensors="pt") returns a [1, seq_len] batch tensor
+     input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)
+
+     generated_token_ids = []
+
+     # 2. The autoregressive loop
+     for _ in range(max_new_tokens):
+         # ------------------------------------------------------------------
+         # This is the core "naive" part:
+         # In every loop, we pass the *entire* history of tokens
+         # (original prompt + generated tokens) back into the model.
+         # ------------------------------------------------------------------
+         current_ids_to_process = input_ids
+
+         # 3. Run the forward pass using our executor
+         # 'logits' will have shape [batch_size, sequence_length, vocab_size]
+         logits = executor.forward(current_ids_to_process)
+
+         # 4. Get the logits for the *very last* token
+         # This tells us the model's prediction for the *next* token
+         # Shape: [batch_size, vocab_size]
+         # Example: next_token_logits: tensor([[-68.4375, -69.0625, -73.3125, ..., -77.0000, -77.1250, -70.0625]], dtype=torch.float16)
+         next_token_logits = logits[:, -1, :]
+         logger.debug(f"next_token_logits: {next_token_logits}")
+
+         # 5. Get the most likely token (this is "greedy sampling")
+         # Shape: [batch_size]
+         # Example: tensor([1757])
+         next_token_id = torch.argmax(next_token_logits, dim=-1)
+         logger.debug(f"next_token_id: {next_token_id}")
+
+         # 6. Check for the End-of-Sequence token
+         if next_token_id == tokenizer.eos_token_id:
+             logger.info("\n[End of sequence reached]")
+             break
+
+         # 7. Add the new token to our full sequence
+         # This is the "autoregressive" part: the new token is now
+         # part of the input for the next loop.
+         # input_ids example: tensor([[15496, 11, 616, 1438, 318, 1757]])
+         input_ids = torch.cat([input_ids, next_token_id.unsqueeze(0)], dim=-1)
+         logger.debug(f"input_ids: {input_ids}")
+
+         # Also store it for decoding later
+         generated_token_ids.append(next_token_id.item())
+
+         # (Optional) Print the new token as it's generated
+         print(tokenizer.decode(next_token_id), end="", flush=True)
+
+     # 8. Decode the final generated text
+     final_text = tokenizer.decode(generated_token_ids)
+     return final_text
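
Because the loop above re-encodes the entire prompt-plus-generated sequence on every step, the per-token cost grows with the history length (roughly quadratic work over a full generation). For contrast, here is a hedged sketch of the cached variant using the stock Hugging Face past_key_values/use_cache mechanism. This function is not part of blazeinfer 0.0.1; it only illustrates the optimization the "naive" label alludes to, reusing the executor's model, tokenizer, and device attributes.

import torch

def generate_with_kv_cache(executor, prompt: str, max_new_tokens: int = 50):
    """Illustrative only: greedy decoding that reuses the KV cache instead of
    re-running the model over the full history on every step."""
    tokenizer, model, device = executor.tokenizer, executor.model, executor.device
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)

    generated = []
    past_key_values = None
    next_input = input_ids                       # first step: feed the whole prompt
    with torch.no_grad():
        for _ in range(max_new_tokens):
            outputs = model(input_ids=next_input,
                            past_key_values=past_key_values,
                            use_cache=True)
            past_key_values = outputs.past_key_values        # cached K/V for all past tokens
            next_token_id = torch.argmax(outputs.logits[:, -1, :], dim=-1)
            if next_token_id.item() == tokenizer.eos_token_id:
                break
            generated.append(next_token_id.item())
            next_input = next_token_id.unsqueeze(0)          # later steps: only the new token
    return tokenizer.decode(generated)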
File without changes
@@ -0,0 +1,41 @@
+ from .executor.model_executor import SimpleModelExecutor
+ from .generate import generate_text_naively
+ import logging
+
+
+ logger = logging.getLogger(__name__)
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(name)s - %(message)s')
+
+
+ def main():
+     model_id = "gpt2"  # Using gpt2 as it's small and requires no login
+
+     try:
+         # Step 1: Create the executor. This will load the model.
+         executor = SimpleModelExecutor(model_id=model_id)
+
+         # Step 2: Start a conversational loop
+         while True:
+             # Get input from the user
+             prompt = input("\nEnter your prompt (or type 'exit' to quit): ")
+
+             # Check for exit condition
+             if prompt.strip().lower() == "exit":
+                 print("Exiting BlazeInfer. Goodbye!")
+                 break
+
+             # Run the naive generation loop with the user's prompt
+             print("\nBlazeInfer: ", end="", flush=True)
+             generate_text_naively(executor, prompt, max_new_tokens=50)
+             print("\n" + "="*50)
+     except ImportError:
+         logger.info("\nError: Please install transformers and torch.")
+         logger.info("Run: pip install transformers torch")
+     except Exception as e:
+         logger.info(f"\nAn error occurred: {e}")
+         logger.info("If using a gated model (like Llama),")
+         logger.info("please ensure you are logged in: `huggingface-cli login`")
+
+ if __name__ == "__main__":
+     logger.info("Starting BlazeInfer...")
+     main()
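
The console_scripts entry later in this diff (blazeinfer = blazeinfer.main:main) exposes this interactive loop as the blazeinfer command once the package is installed. A one-shot, non-interactive sketch wiring the same pieces together, assuming only the import paths listed in SOURCES.txt:

from blazeinfer.executor.model_executor import SimpleModelExecutor
from blazeinfer.generate import generate_text_naively

# Equivalent of one pass through the REPL in main.py, without the input() loop.
executor = SimpleModelExecutor(model_id="gpt2")
text = generate_text_naively(executor, "The capital of France is", max_new_tokens=20)
print("\nFull completion:", text)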
File without changes
File without changes
@@ -0,0 +1,40 @@
+ Metadata-Version: 2.4
+ Name: blazeinfer
+ Version: 0.0.1
+ Summary: A high-performance, light-weight llm inference framework.
+ Author-email: Wenyi Xu <wenyixu101@email.com>
+ Project-URL: Homepage, https://github.com/xuwenyihust/BlazeInfer
+ Project-URL: Bug Tracker, https://github.com/xuwenyihust/BlazeInfer/issues
+ Classifier: Programming Language :: Python :: 3
+ Classifier: License :: OSI Approved :: MIT License
+ Requires-Python: >=3.10
+ Description-Content-Type: text/markdown
+ License-File: LICENSE
+ Requires-Dist: torch<2.10.0,>=2.9.0
+ Requires-Dist: numpy>=1.26.0
+ Requires-Dist: transformers>=4.40.0
+ Requires-Dist: accelerate>=0.32.0
+ Requires-Dist: tqdm
+ Provides-Extra: dev
+ Requires-Dist: pytest; extra == "dev"
+ Requires-Dist: ruff; extra == "dev"
+ Dynamic: license-file
+
+ <div align="center">
+
+ ![# BlazeInfer](assets/logo.png)
+ A high-performance, light-weight llm inference framework.
+
+ </div>
+
+ --------------------------------------------------------------------------------
+
+ ## About
+
+ ## Features
+ - Optimized Attention Kernels
+
+ ## Getting Started
+
+ ## Acknowledgement
+ - [lite_llama](https://github.com/harleyszhang/lite_llama/tree/main)
@@ -0,0 +1,17 @@
+ LICENSE
+ README.md
+ pyproject.toml
+ blazeinfer/__init__.py
+ blazeinfer/generate.py
+ blazeinfer/main.py
+ blazeinfer.egg-info/PKG-INFO
+ blazeinfer.egg-info/SOURCES.txt
+ blazeinfer.egg-info/dependency_links.txt
+ blazeinfer.egg-info/entry_points.txt
+ blazeinfer.egg-info/requires.txt
+ blazeinfer.egg-info/top_level.txt
+ blazeinfer/executor/__init__.py
+ blazeinfer/executor/model_executor.py
+ blazeinfer/kernels/__init__.py
+ blazeinfer/models/__init__.py
+ blazeinfer/utils/__init__.py
@@ -0,0 +1,2 @@
+ [console_scripts]
+ blazeinfer = blazeinfer.main:main
@@ -0,0 +1,9 @@
+ torch<2.10.0,>=2.9.0
+ numpy>=1.26.0
+ transformers>=4.40.0
+ accelerate>=0.32.0
+ tqdm
+
+ [dev]
+ pytest
+ ruff
@@ -0,0 +1,8 @@
+ assets
+ benchmark
+ blazeinfer
+ build
+ dist
+ docs
+ tests
+ venv
@@ -0,0 +1,47 @@
+ [build-system]
+ requires = ["setuptools>=61.0"]
+ build-backend = "setuptools.build_meta"
+
+ [project]
+ name = "blazeinfer"
+ version = "0.0.1"
+ authors = [
+     { name="Wenyi Xu", email="wenyixu101@email.com" },
+ ]
+ description = "A high-performance, light-weight llm inference framework."
+ readme = "README.md"
+ requires-python = ">=3.10"
+ classifiers = [
+     "Programming Language :: Python :: 3",
+     "License :: OSI Approved :: MIT License",
+ ]
+
+ # == Core Dependencies ==
+ # Pinned torch to version 2.9 as requested.
+ # Note: Installing PyTorch with specific CUDA versions may require using --index-url.
+ # e.g., pip install -r requirements.txt --index-url https://download.pytorch.org/whl/cu121
+ dependencies = [
+     "torch>=2.9.0,<2.10.0",
+     "numpy>=1.26.0",
+     "transformers>=4.40.0",  # For tokenizers and model configurations
+     "accelerate>=0.32.0",  # For device_map and advanced model loading
+     "tqdm",  # For progress bars
+ ]
+
+ [project.urls]
+ "Homepage" = "https://github.com/xuwenyihust/BlazeInfer"
+ "Bug Tracker" = "https://github.com/xuwenyihust/BlazeInfer/issues"
+
+ # == Optional Dependencies ==
+ # For developers of the framework
+ [project.optional-dependencies]
+ dev = [
+     "pytest",
+     "ruff",  # For linting and formatting
+ ]
+
+ [project.scripts]
+ blazeinfer = "blazeinfer.main:main"
+
+ [tool.setuptools]
+ packages = { find = {} }
@@ -0,0 +1,4 @@
+ [egg_info]
+ tag_build =
+ tag_date = 0
+