grillyoptimum-0.1.0.tar.gz

@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2026 Nicolas Cloutier / Grillcheese AI
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
@@ -0,0 +1,63 @@
+ Metadata-Version: 2.4
+ Name: grillyoptimum
+ Version: 0.1.0
+ Summary: HuggingFace Optimum-compatible Vulkan backend — optional grilly extension
+ Author-email: Nicolas Cloutier <ncloutier@grillcheeseai.com>
+ License: MIT
+ Requires-Python: >=3.12
+ Description-Content-Type: text/markdown
+ License-File: LICENSE
+ Requires-Dist: grilly>=0.4.0
+ Requires-Dist: grillyinference>=0.1.0
+ Requires-Dist: numpy
+ Requires-Dist: transformers
+ Provides-Extra: optimum
+ Requires-Dist: optimum; extra == "optimum"
+ Dynamic: license-file
+
+ # GrillyOptimum (Alpha, not production ready)
+
+ HuggingFace Optimum-compatible Vulkan backend — optional [grilly](https://github.com/grillcheese/grilly) extension.
+
+ ## Features
+
+ - **from_pretrained()** — load any HF Llama model directly
+ - **generate()** — HuggingFace-compatible generation interface
+ - **Pipeline integration** — HF-pipeline-style text generation via `create_text_generation_pipeline`
+ - **Vulkan acceleration** — native fp16 inference on RDNA2 GPUs
+
+ ## Quick Start
+
+ ```bash
+ pip install grillyoptimum
+ ```
+
+ ```python
+ from grillyoptimum import VulkanModelForCausalLM
+ from transformers import AutoTokenizer
+
+ model = VulkanModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-3B-Instruct")
+ tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B-Instruct")
+
+ # HF-style generation
+ input_ids = tokenizer.encode("Hello, world!", return_tensors="np")
+ output = model.generate(input_ids, max_new_tokens=100, temperature=0.7)
+ print(tokenizer.decode(output[0]))
+
+ # Pipeline
+ from grillyoptimum.pipeline import create_text_generation_pipeline
+ pipe = create_text_generation_pipeline("meta-llama/Llama-3.2-3B-Instruct")
+ result = pipe("Explain quantum computing")
+ print(result[0]["generated_text"])
+ ```
+
+ ## Requirements
+
+ - Python 3.12+
+ - grilly >= 0.4.0
+ - grillyinference >= 0.1.0
+ - transformers
+
+ ## License
+
+ MIT
@@ -0,0 +1,46 @@
+ # GrillyOptimum (Alpha, not production ready)
+
+ HuggingFace Optimum-compatible Vulkan backend — optional [grilly](https://github.com/grillcheese/grilly) extension.
+
+ ## Features
+
+ - **from_pretrained()** — load any HF Llama model directly
+ - **generate()** — HuggingFace-compatible generation interface
+ - **Pipeline integration** — HF-pipeline-style text generation via `create_text_generation_pipeline`
+ - **Vulkan acceleration** — native fp16 inference on RDNA2 GPUs
+
+ ## Quick Start
+
+ ```bash
+ pip install grillyoptimum
+ ```
+
+ ```python
+ from grillyoptimum import VulkanModelForCausalLM
+ from transformers import AutoTokenizer
+
+ model = VulkanModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-3B-Instruct")
+ tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B-Instruct")
+
+ # HF-style generation
+ input_ids = tokenizer.encode("Hello, world!", return_tensors="np")
+ output = model.generate(input_ids, max_new_tokens=100, temperature=0.7)
+ print(tokenizer.decode(output[0]))
+
+ # Pipeline
+ from grillyoptimum.pipeline import create_text_generation_pipeline
+ pipe = create_text_generation_pipeline("meta-llama/Llama-3.2-3B-Instruct")
+ result = pipe("Explain quantum computing")
+ print(result[0]["generated_text"])
+ ```
+
+ ## Requirements
+
+ - Python 3.12+
+ - grilly >= 0.4.0
+ - grillyinference >= 0.1.0
+ - transformers
+
+ ## License
+
+ MIT
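
The Quick Start loads with backend defaults. `from_pretrained` (defined in `grillyoptimum/modeling.py` below) also accepts an explicit `VulkanConfig`; here is a minimal sketch, assuming weights are available at the hypothetical local path used below:

```python
from grillyoptimum import VulkanConfig, VulkanModelForCausalLM

# Explicit backend settings instead of the defaults.
config = VulkanConfig(dtype="fp16", page_size=256, raw_window=2048)

# A local path works anywhere an HF model ID does;
# "/models/llama-3.2-3b" is a hypothetical path.
model = VulkanModelForCausalLM.from_pretrained(
    "/models/llama-3.2-3b",
    vulkan_config=config,
)
```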
@@ -0,0 +1,22 @@
+ """GrillyOptimum — HuggingFace Optimum-compatible Vulkan backend.
+
+ Optional grilly extension providing:
+ - VulkanModelForCausalLM: HF-compatible model with from_pretrained + generate
+ - VulkanConfig: Backend configuration
+ - Pipeline integration for HF transformers pipelines
+
+ Usage:
+     from grillyoptimum import VulkanModelForCausalLM
+     model = VulkanModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-3B-Instruct")
+     output = model.generate(input_ids, max_new_tokens=100)
+ """
+
+ from .modeling import VulkanModelForCausalLM
+ from .configuration import VulkanConfig
+
+ __version__ = "0.1.0"
+
+ __all__ = [
+     "VulkanModelForCausalLM",
+     "VulkanConfig",
+ ]
@@ -0,0 +1,44 @@
+ """Vulkan backend configuration for HF Optimum compatibility."""
+
+ from __future__ import annotations
+
+ from dataclasses import asdict, dataclass
+
+
+ @dataclass
+ class VulkanConfig:
+     """Configuration for the Vulkan inference backend.
+
+     Args:
+         dtype: Weight precision ("fp16" or "fp32").
+         use_vulkan: Whether to use Vulkan GPU acceleration.
+         page_size: KV-cache page size (default 256).
+         raw_window: Raw KV window (default 2048).
+         enable_h2o: Enable H2O eviction for extended context.
+         h2o_lambda: H2O decay rate.
+         enable_vsa: Enable VSA multi-scale summaries.
+         enable_quantization: Enable SmoothQuant INT8.
+         quantize_group_size: INT8 quantization group size.
+         max_batch_size: Maximum batch size (currently only 1 is supported).
+         device: Device name reported via the HF-style `device` property.
+     """
+
+     dtype: str = "fp16"
+     use_vulkan: bool = True
+     page_size: int = 256
+     raw_window: int = 2048
+     enable_h2o: bool = False
+     h2o_lambda: float = 0.0002
+     enable_vsa: bool = False
+     enable_quantization: bool = False
+     quantize_group_size: int = 64
+     max_batch_size: int = 1
+     device: str = "vulkan"
+
+     def to_dict(self) -> dict:
+         return asdict(self)
+
+     @classmethod
+     def from_dict(cls, d: dict) -> VulkanConfig:
+         # Ignore unknown keys so configs from other versions still load.
+         return cls(**{k: v for k, v in d.items() if k in cls.__dataclass_fields__})
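
Since `VulkanConfig` is a plain dataclass, `to_dict`/`from_dict` round-trip cleanly, and `from_dict` filters out unknown keys. A minimal sketch using only the fields defined above (the `"future_flag"` key is hypothetical):

```python
from grillyoptimum import VulkanConfig

cfg = VulkanConfig(dtype="fp32", enable_h2o=True, h2o_lambda=1e-4)

# Serialize, e.g. to stash alongside saved weights.
d = cfg.to_dict()
assert d["enable_h2o"] is True

# Unknown keys are dropped, so configs written by newer versions still load.
restored = VulkanConfig.from_dict({**d, "future_flag": 123})
assert restored == cfg
```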
@@ -0,0 +1,162 @@
+ """VulkanModelForCausalLM — HuggingFace-compatible model interface.
+
+ Wraps GrillyInference's LlamaForCausalLM with HF GenerationMixin patterns:
+ - from_pretrained() loads weights and creates inference engine
+ - generate() is compatible with HF generation kwargs
+ - Can be used with transformers.pipeline("text-generation")
+ """
+
+ from __future__ import annotations
+
+ import logging
+
+ import numpy as np
+
+ from .configuration import VulkanConfig
+
+ logger = logging.getLogger(__name__)
+
+
+ class VulkanModelForCausalLM:
+     """HuggingFace-compatible Vulkan model for causal language modeling.
+
+     Compatible with:
+         from grillyoptimum import VulkanModelForCausalLM
+         model = VulkanModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-3B-Instruct")
+         output = model.generate(input_ids, max_new_tokens=100)
+     """
+
+     def __init__(self, model, config, vulkan_config=None):
+         """Initialize with a GrillyInference model.
+
+         Args:
+             model: grillyinference.LlamaForCausalLM instance.
+             config: grillyinference.LlamaConfig instance.
+             vulkan_config: Optional VulkanConfig.
+         """
+         self._model = model
+         self.config = config
+         self.vulkan_config = vulkan_config or VulkanConfig()
+         self._kv_cache = None
+
+     @classmethod
+     def from_pretrained(
+         cls,
+         model_id_or_path: str,
+         dtype: str = "fp16",
+         vulkan_config: VulkanConfig | None = None,
+         **kwargs,
+     ) -> VulkanModelForCausalLM:
+         """Load a pretrained model, HuggingFace style.
+
+         Args:
+             model_id_or_path: HF model ID or local path.
+             dtype: Weight precision.
+             vulkan_config: Optional backend config.
+             **kwargs: Additional arguments (ignored for HF compat).
+
+         Returns:
+             VulkanModelForCausalLM instance.
+         """
+         from grillyinference import LlamaForCausalLM, LlamaConfig
+
+         if vulkan_config is None:
+             vulkan_config = VulkanConfig(dtype=dtype)
+
+         config = LlamaConfig.from_pretrained(model_id_or_path)
+         model = LlamaForCausalLM.from_pretrained(
+             model_id_or_path,
+             dtype=vulkan_config.dtype,
+         )
+
+         return cls(model, config, vulkan_config)
+
+     def generate(
+         self,
+         input_ids: np.ndarray | None = None,
+         max_new_tokens: int = 128,
+         max_length: int | None = None,
+         temperature: float = 1.0,
+         top_k: int = 50,
+         top_p: float = 1.0,
+         do_sample: bool = True,
+         num_return_sequences: int = 1,
+         **kwargs,
+     ) -> np.ndarray:
+         """Generate tokens, HuggingFace style.
+
+         Args:
+             input_ids: (batch, seq_len) int32 input token IDs.
+             max_new_tokens: Maximum tokens to generate.
+             max_length: Maximum total length (alternative to max_new_tokens).
+             temperature: Sampling temperature.
+             top_k: Top-k filtering.
+             top_p: Nucleus sampling.
+             do_sample: Whether to sample (True) or greedy decode (False).
+             num_return_sequences: Number of sequences to return (must be 1).
+             **kwargs: Additional HF generation kwargs (ignored).
+
+         Returns:
+             (batch, seq_len + generated) int32 array of token IDs.
+         """
+         if input_ids is None:
+             raise ValueError("input_ids is required")
+
+         if isinstance(input_ids, list):
+             input_ids = np.array(input_ids, dtype=np.int32)
+         if input_ids.ndim == 1:
+             input_ids = input_ids[np.newaxis, :]
+
+         batch_size = input_ids.shape[0]
+         if batch_size != 1:
+             raise ValueError("Only batch_size=1 supported currently")
+         if num_return_sequences != 1:
+             raise ValueError("Only num_return_sequences=1 supported currently")
+
+         if max_length is not None:
+             # max_length counts the prompt; clamp so the budget never goes negative.
+             max_new_tokens = max(0, max_length - input_ids.shape[1])
+
+         if not do_sample:
+             temperature = 0.0
+
+         # Import sampling function
+         from grillyinference.inference.generate import _sample_top_k_top_p, LLAMA3_STOP_TOKENS
+
+         # Prefill
+         from grillyinference import KVCache
+         kv_cache = KVCache(
+             self.config,
+             max_batch=1,
+             page_size=self.vulkan_config.page_size,
+             raw_window=self.vulkan_config.raw_window,
+             h2o_lambda=self.vulkan_config.h2o_lambda,
+             enable_vsa=self.vulkan_config.enable_vsa,
+         )
+
+         logits = self._model.forward(input_ids, kv_cache=kv_cache)
+         last_logits = logits[0, -1, :]
+
+         generated = list(input_ids[0])
+
+         for _ in range(max_new_tokens):
+             token_id = _sample_top_k_top_p(last_logits, temperature, top_k, top_p)
+             if token_id in LLAMA3_STOP_TOKENS:
+                 break
+             generated.append(token_id)
+             token_array = np.array([[token_id]], dtype=np.int32)
+             logits = self._model.decode_step(token_array, kv_cache)
+             last_logits = logits[0, -1, :]
+
+         return np.array([generated], dtype=np.int32)
+
+     def __call__(self, input_ids, **kwargs):
+         """Make model callable for pipeline compatibility."""
+         return self.generate(input_ids, **kwargs)
+
+     @property
+     def device(self):
+         return self.vulkan_config.device
+
+     def memory_footprint(self) -> dict:
+         return self._model.memory_footprint()
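
Two behaviors of `generate()` above are easy to miss: `do_sample=False` forces `temperature=0.0` (greedy decoding), and `max_length` counts the prompt, so it is converted into a `max_new_tokens` budget. A short sketch of both call patterns, assuming the `model` and `tokenizer` from the Quick Start:

```python
import numpy as np

prompt_ids = tokenizer.encode("Hello, world!", return_tensors="np")

# Greedy decoding: do_sample=False zeroes the temperature internally.
greedy = model.generate(prompt_ids, max_new_tokens=32, do_sample=False)

# max_length caps prompt + new tokens together:
# new tokens <= max_length - prompt length.
capped = model.generate(prompt_ids, max_length=64)

# The returned array always starts with the prompt tokens.
assert np.array_equal(greedy[0][: prompt_ids.shape[1]], prompt_ids[0])
```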
@@ -0,0 +1,76 @@
+ """Pipeline integration for HuggingFace transformers.
+
+ Provides an HF-pipeline-style text-generation interface backed by the Vulkan engine:
+     from grillyoptimum.pipeline import create_text_generation_pipeline
+     pipe = create_text_generation_pipeline("meta-llama/Llama-3.2-3B-Instruct")
+ """
+
+ from __future__ import annotations
+
+ import logging
+ from typing import Any
+
+ logger = logging.getLogger(__name__)
+
+
+ def create_text_generation_pipeline(
+     model_id: str,
+     tokenizer=None,
+     dtype: str = "fp16",
+     **kwargs,
+ ):
+     """Create a text-generation pipeline using the Vulkan backend.
+
+     Args:
+         model_id: HF model ID or local path.
+         tokenizer: Optional tokenizer (auto-loaded if None).
+         dtype: Weight precision.
+         **kwargs: Additional pipeline arguments.
+
+     Returns:
+         A callable pipeline object.
+     """
+     from grillyinference import TextGenerator, LlamaForCausalLM
+
+     if tokenizer is None:
+         from transformers import AutoTokenizer
+         tokenizer = AutoTokenizer.from_pretrained(model_id)
+
+     model = LlamaForCausalLM.from_pretrained(model_id, dtype=dtype)
+     generator = TextGenerator(model, tokenizer)
+
+     class VulkanTextGenerationPipeline:
+         """Simple pipeline wrapper for text generation."""
+
+         def __init__(self, generator, tokenizer):
+             self._generator = generator
+             self._tokenizer = tokenizer
+
+         def __call__(
+             self,
+             text: str | list[str],
+             max_new_tokens: int = 128,
+             temperature: float = 0.7,
+             top_k: int = 50,
+             top_p: float = 0.9,
+             **kwargs,
+         ) -> list[dict[str, Any]]:
+             if isinstance(text, str):
+                 text = [text]
+
+             results = []
+             for prompt in text:
+                 output = self._generator.generate(
+                     prompt,
+                     max_tokens=max_new_tokens,
+                     temperature=temperature,
+                     top_k=top_k,
+                     top_p=top_p,
+                 )
+                 results.append({
+                     "generated_text": output,
+                     "prompt": prompt,
+                 })
+             return results
+
+     return VulkanTextGenerationPipeline(generator, tokenizer)
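
`VulkanTextGenerationPipeline.__call__` accepts a single string or a list and always returns one dict per prompt, so batched use looks like this (a sketch reusing the factory above; the prompts are illustrative):

```python
from grillyoptimum.pipeline import create_text_generation_pipeline

pipe = create_text_generation_pipeline("meta-llama/Llama-3.2-3B-Instruct")

# A list of prompts yields one result dict per prompt, in order.
results = pipe(
    ["Explain quantum computing", "Write a haiku about GPUs"],
    max_new_tokens=64,
    temperature=0.7,
)
for r in results:
    print(r["prompt"], "->", r["generated_text"])
```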
@@ -0,0 +1,63 @@
+ Metadata-Version: 2.4
+ Name: grillyoptimum
+ Version: 0.1.0
+ Summary: HuggingFace Optimum-compatible Vulkan backend — optional grilly extension
+ Author-email: Nicolas Cloutier <ncloutier@grillcheeseai.com>
+ License: MIT
+ Requires-Python: >=3.12
+ Description-Content-Type: text/markdown
+ License-File: LICENSE
+ Requires-Dist: grilly>=0.4.0
+ Requires-Dist: grillyinference>=0.1.0
+ Requires-Dist: numpy
+ Requires-Dist: transformers
+ Provides-Extra: optimum
+ Requires-Dist: optimum; extra == "optimum"
+ Dynamic: license-file
+
+ # GrillyOptimum (Alpha, not production ready)
+
+ HuggingFace Optimum-compatible Vulkan backend — optional [grilly](https://github.com/grillcheese/grilly) extension.
+
+ ## Features
+
+ - **from_pretrained()** — load any HF Llama model directly
+ - **generate()** — HuggingFace-compatible generation interface
+ - **Pipeline integration** — HF-pipeline-style text generation via `create_text_generation_pipeline`
+ - **Vulkan acceleration** — native fp16 inference on RDNA2 GPUs
+
+ ## Quick Start
+
+ ```bash
+ pip install grillyoptimum
+ ```
+
+ ```python
+ from grillyoptimum import VulkanModelForCausalLM
+ from transformers import AutoTokenizer
+
+ model = VulkanModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-3B-Instruct")
+ tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B-Instruct")
+
+ # HF-style generation
+ input_ids = tokenizer.encode("Hello, world!", return_tensors="np")
+ output = model.generate(input_ids, max_new_tokens=100, temperature=0.7)
+ print(tokenizer.decode(output[0]))
+
+ # Pipeline
+ from grillyoptimum.pipeline import create_text_generation_pipeline
+ pipe = create_text_generation_pipeline("meta-llama/Llama-3.2-3B-Instruct")
+ result = pipe("Explain quantum computing")
+ print(result[0]["generated_text"])
+ ```
+
+ ## Requirements
+
+ - Python 3.12+
+ - grilly >= 0.4.0
+ - grillyinference >= 0.1.0
+ - transformers
+
+ ## License
+
+ MIT
@@ -0,0 +1,14 @@
+ LICENSE
+ README.md
+ pyproject.toml
+ grillyoptimum/__init__.py
+ grillyoptimum/configuration.py
+ grillyoptimum/modeling.py
+ grillyoptimum/pipeline.py
+ grillyoptimum.egg-info/PKG-INFO
+ grillyoptimum.egg-info/SOURCES.txt
+ grillyoptimum.egg-info/dependency_links.txt
+ grillyoptimum.egg-info/requires.txt
+ grillyoptimum.egg-info/top_level.txt
+ tests/test_optimum.py
+ tests/test_optimum_gpu.py
@@ -0,0 +1,7 @@
+ grilly>=0.4.0
+ grillyinference>=0.1.0
+ numpy
+ transformers
+
+ [optimum]
+ optimum
@@ -0,0 +1 @@
+ grillyoptimum
@@ -0,0 +1,21 @@
+ [build-system]
+ requires = ["setuptools>=61.0", "wheel"]
+ build-backend = "setuptools.build_meta"
+
+ [project]
+ name = "grillyoptimum"
+ version = "0.1.0"
+ description = "HuggingFace Optimum-compatible Vulkan backend — optional grilly extension"
+ readme = "README.md"
+ license = {text = "MIT"}
+ authors = [{name = "Nicolas Cloutier", email = "ncloutier@grillcheeseai.com"}]
+ requires-python = ">=3.12"
+ dependencies = [
+     "grilly>=0.4.0",
+     "grillyinference>=0.1.0",
+     "numpy",
+     "transformers",
+ ]
+
+ [project.optional-dependencies]
+ optimum = ["optimum"]
@@ -0,0 +1,4 @@
+ [egg_info]
+ tag_build =
+ tag_date = 0
+