npuserver 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,113 @@
1
+ Metadata-Version: 2.4
2
+ Name: npuserver
3
+ Version: 0.1.0
4
+ Summary:
5
+ Author: Durga Sai
6
+ Author-email: dsainvg.20.12.24@kgpian.iitkgp.ac.in
7
+ Requires-Python: >=3.13
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: Programming Language :: Python :: 3.13
10
+ Classifier: Programming Language :: Python :: 3.14
11
+ Requires-Dist: flask (>=3.0.0)
12
+ Requires-Dist: huggingface-hub (>=0.20.0)
13
+ Requires-Dist: openvino-genai (>=2024.5.0)
14
+ Description-Content-Type: text/markdown
15
+
16
+ # npuserver 🚀
17
+
18
+ A lightweight, efficient utility library for compiling and preparing Generative AI LLM models for the **Intel NPU (Neural Processing Unit)** using OpenVINO™ GenAI.
19
+
20
+ ---
21
+
22
+ ## Features
23
+
24
+ - **Intel NPU Optimization:** Fast, local LLM compilation designed for Intel NPU architectures.
25
+ - **Robust Model Fallbacks:** Automated properties configuration with retry logic for prompt lengths.
26
+ - **Hugging Face Hub Integration:** Seamless resolution, download, and local caching of models.
27
+ - **Clean API Design:** Import and use directly in any Python environment.
28
+
29
+ ---
30
+
31
+ ## Installation
32
+
33
+ ### From PyPI
34
+ ```bash
35
+ pip install npuserver
36
+ ```
37
+
38
+ ### From Source
39
+ 1. Clone the repository:
40
+ ```bash
41
+ git clone https://github.com/yourusername/npuserver.git
42
+ cd npuserver
43
+ ```
44
+ 2. Set up a virtual environment and install:
45
+ ```bash
46
+ python -m venv .venv
47
+ # On Windows:
48
+ .venv\Scripts\activate
49
+ # On macOS/Linux:
50
+ source .venv/bin/activate
51
+
52
+ pip install -e .
53
+ ```
54
+
55
+ ---
56
+
57
+ ## Quick Start
58
+
59
+ Compile your favorite Hugging Face LLM model for the Intel NPU:
60
+
61
+ ```python
62
+ from pathlib import Path
63
+ from npuserver import compile_model
64
+
65
+ # Path to store compiled model caches
66
+ cache_dir = Path("./npu_cache")
67
+ cache_dir.mkdir(exist_ok=True)
68
+
69
+ # Compile a Hugging Face LLM (e.g., Qwen or Phi)
70
+ compile_model(
71
+ repo_id="Qwen/Qwen2.5-0.5B-Instruct",
72
+ cache_dir=cache_dir,
73
+ prompt_len=8192
74
+ )
75
+ ```
76
+
77
+ ---
78
+
79
+ ## Development
80
+
81
+ ### Running with Poetry
82
+ This library uses **Poetry** as its package manager:
83
+ ```bash
84
+ poetry install
85
+ poetry run python -c "import npuserver; print(npuserver.__all__)"
86
+ ```
87
+
88
+ ### Directory Structure
89
+ ```text
90
+ npuserver/
91
+ ├── .github/workflows/ # CI/CD & Automated Publishing
92
+ ├── src/
93
+ │   └── npuserver/
94
+ │       ├── __init__.py # Package entry point
95
+ │       └── compile.py # Core compilation functions
96
+ ├── tests/ # Test suites
97
+ ├── pyproject.toml # Modern packaging configuration
98
+ └── requirements.txt # Standard pip requirements
99
+ ```
100
+
101
+ ---
102
+
103
+ ## PyPI Automatic Publishing
104
+
105
+ The project includes an automated GitHub Actions CI/CD pipeline (`.github/workflows/publish.yml`) that builds and publishes releases securely using **OIDC Trusted Publishing**:
106
+
107
+ 1. Tag your release:
108
+ ```bash
109
+ git tag v0.1.0
110
+ git push origin v0.1.0
111
+ ```
112
+ 2. The GitHub Action will trigger, build source/wheel distributions, and push to PyPI.
113
+
@@ -0,0 +1,97 @@
1
+ # npuserver 🚀
2
+
3
+ A lightweight, efficient utility library for compiling and preparing Generative AI LLM models for the **Intel NPU (Neural Processing Unit)** using OpenVINO™ GenAI.
4
+
5
+ ---
6
+
7
+ ## Features
8
+
9
+ - **Intel NPU Optimization:** Fast, local LLM compilation designed for Intel NPU architectures.
10
+ - **Robust Model Fallbacks:** Automated properties configuration with retry logic for prompt lengths.
11
+ - **Hugging Face Hub Integration:** Seamless resolution, download, and local caching of models.
12
+ - **Clean API Design:** Import and use directly in any Python environment.
13
+
14
+ ---
15
+
16
+ ## Installation
17
+
18
+ ### From PyPI
19
+ ```bash
20
+ pip install npuserver
21
+ ```
22
+
23
+ ### From Source
24
+ 1. Clone the repository:
25
+ ```bash
26
+ git clone https://github.com/yourusername/npuserver.git
27
+ cd npuserver
28
+ ```
29
+ 2. Set up a virtual environment and install:
30
+ ```bash
31
+ python -m venv .venv
32
+ # On Windows:
33
+ .venv\Scripts\activate
34
+ # On macOS/Linux:
35
+ source .venv/bin/activate
36
+
37
+ pip install -e .
38
+ ```
39
+
40
+ ---
41
+
42
+ ## Quick Start
43
+
44
+ Compile your favorite Hugging Face LLM model for the Intel NPU:
45
+
46
+ ```python
47
+ from pathlib import Path
48
+ from npuserver import compile_model
49
+
50
+ # Path to store compiled model caches
51
+ cache_dir = Path("./npu_cache")
52
+ cache_dir.mkdir(exist_ok=True)
53
+
54
+ # Compile a Hugging Face LLM (e.g., Qwen or Phi)
55
+ compile_model(
56
+ repo_id="Qwen/Qwen2.5-0.5B-Instruct",
57
+ cache_dir=cache_dir,
58
+ prompt_len=8192
59
+ )
60
+ ```
61
+
62
+ ---
63
+
64
+ ## Development
65
+
66
+ ### Running with Poetry
67
+ This library uses **Poetry** as its package manager:
68
+ ```bash
69
+ poetry install
70
+ poetry run python -c "import npuserver; print(npuserver.__all__)"
71
+ ```
72
+
73
+ ### Directory Structure
74
+ ```text
75
+ npuserver/
76
+ ├── .github/workflows/ # CI/CD & Automated Publishing
77
+ ├── src/
78
+ │   └── npuserver/
79
+ │       ├── __init__.py # Package entry point
80
+ │       └── compile.py # Core compilation functions
81
+ ├── tests/ # Test suites
82
+ ├── pyproject.toml # Modern packaging configuration
83
+ └── requirements.txt # Standard pip requirements
84
+ ```
85
+
86
+ ---
87
+
88
+ ## PyPI Automatic Publishing
89
+
90
+ The project includes an automated GitHub Actions CI/CD pipeline (`.github/workflows/publish.yml`) that builds and publishes releases securely using **OIDC Trusted Publishing**:
91
+
92
+ 1. Tag your release:
93
+ ```bash
94
+ git tag v0.1.0
95
+ git push origin v0.1.0
96
+ ```
97
+ 2. The GitHub Action will trigger, build source/wheel distributions, and push to PyPI.
@@ -0,0 +1,21 @@
1
+ [project]
2
+ name = "npuserver"
3
+ version = "0.1.0"
4
+ description = ""
5
+ authors = [
6
+ {name = "Durga Sai",email = "dsainvg.20.12.24@kgpian.iitkgp.ac.in"}
7
+ ]
8
+ readme = "README.md"
9
+ requires-python = ">=3.13"
10
+ dependencies = [
11
+ "openvino-genai>=2024.5.0",
12
+ "huggingface-hub>=0.20.0",
13
+ "flask>=3.0.0"
14
+ ]
15
+
16
+ [tool.poetry]
17
+ packages = [{include = "npuserver", from = "src"}]
18
+
19
+ [build-system]
20
+ requires = ["poetry-core>=2.0.0,<3.0.0"]
21
+ build-backend = "poetry.core.masonry.api"
@@ -0,0 +1,4 @@
1
+ from .compile import compile_model
2
+ from .server import run_server, download_and_compile
3
+
4
+ __all__ = ["compile_model", "run_server", "download_and_compile"]
@@ -0,0 +1,69 @@
1
+ import argparse
2
+ import sys
3
+ from pathlib import Path
4
+ from npuserver import run_server, download_and_compile
5
+
6
def main():
    """Run the ``npuserver`` command-line interface.

    Two sub-commands are defined: ``compile`` forwards its options to
    ``download_and_compile`` and ``serve`` forwards its options to
    ``run_server``. Cache locations left unset on the command line are passed
    on as ``None``. A compilation error is printed to stderr and ends the
    process with exit status 1.
    """
    cli = argparse.ArgumentParser(
        prog="npuserver",
        description="NPU Server CLI: Download, compile, and run OpenVINO GenAI models on NPU."
    )
    commands = cli.add_subparsers(dest="command", required=True)

    # Each sub-command is declared as an ordered list of (flag, add_argument kwargs);
    # the order is the order shown in --help.
    compile_options = [
        ("--model", {"required": True, "help": "Hugging Face model repository ID."}),
        ("--genai-cache", {"help": "Directory to store compiled NPU execution blobs. Defaults to ~/.cache/npuserver/compiled"}),
        ("--hf-cache", {"help": "Directory containing raw downloaded Hugging Face model files. Defaults to ~/.cache/npuserver/hf"}),
        ("--max-prompt-len", {"type": int, "default": 16384, "help": "Maximum prompt length for NPU compilation."}),
        ("--cache-mode", {"default": "OPTIMIZE_SPEED", "choices": ["OPTIMIZE_SPEED", "OPTIMIZE_SIZE"], "help": "Compilation cache mode."}),
        ("--disable-download", {"action": "store_true", "help": "Disable downloading model files if they are not already cached locally."}),
    ]
    serve_options = [
        ("--genai-cache", {"help": "Directory containing compiled NPU execution blobs. Defaults to ~/.cache/npuserver/compiled"}),
        ("--hf-cache", {"help": "Directory containing raw downloaded Hugging Face model files. Defaults to ~/.cache/npuserver/hf"}),
        ("--model", {"help": "Optional model ID to pre-load at startup."}),
        ("--port", {"type": int, "default": 8080, "help": "Port to bind the HTTP server to."}),
        ("--host", {"default": "0.0.0.0", "help": "Host interface to bind the HTTP server to."}),
        ("--disable-download", {"action": "store_true", "help": "Disable on-demand model downloading during server runtime."}),
        ("--log-file", {"help": "Path to write server execution logs."}),
        ("--prompt-log-file", {"help": "Path to write raw prompt execution logs."}),
    ]

    compile_cmd = commands.add_parser("compile", help="Download and compile a model for NPU.")
    for flag, spec in compile_options:
        compile_cmd.add_argument(flag, **spec)

    serve_cmd = commands.add_parser("serve", help="Run the dynamic HTTP model server.")
    for flag, spec in serve_options:
        serve_cmd.add_argument(flag, **spec)

    args = cli.parse_args()

    def to_path(value):
        # Unset options stay None so the callee applies its ~/.cache/npuserver/ default.
        return Path(value) if value else None

    genai_cache = to_path(args.genai_cache)
    hf_cache = to_path(args.hf_cache)
    downloads_allowed = not args.disable_download

    if args.command == "serve":
        print(f"Starting server on {args.host}:{args.port}")
        run_server(
            genai_cache_root=genai_cache,
            hf_hub_cache=hf_cache,
            model_name=args.model,
            allow_download=downloads_allowed,
            port=args.port,
            host=args.host,
            log_file=to_path(args.log_file),
            prompt_log_file=to_path(args.prompt_log_file)
        )
        return

    if args.command != "compile":
        return

    print(f"Starting compilation for model: '{args.model}'")
    try:
        download_and_compile(
            model_name=args.model,
            genai_cache_root=genai_cache,
            hf_hub_cache=hf_cache,
            allow_download=downloads_allowed,
            max_prompt_len=args.max_prompt_len,
            cache_mode=args.cache_mode
        )
        print("Compilation successful.")
    except Exception as error:
        print(f"Error during compilation: {error}", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()
@@ -0,0 +1,49 @@
1
+ import time
2
+ from pathlib import Path
3
+ import openvino_genai as ov_genai
4
+
5
+
6
def compile_model(repo_id, cache_dir, prompt_len=8192):
    """Compile a Hugging Face model for the Intel NPU through OpenVINO GenAI.

    The model snapshot is first looked up with ``local_files_only=True``; if
    that fails a regular ``snapshot_download`` is attempted. Constructing
    ``LLMPipeline`` on the ``"NPU"`` device is what triggers compilation.

    Args:
        repo_id: Hugging Face repository ID, e.g. ``"Qwen/Qwen2.5-0.5B-Instruct"``.
        cache_dir: Root directory (``str`` or ``Path``) for compiled caches.
            The per-model cache directory is ``cache_dir/<repo_id with "/"
            replaced by "--">``.
        prompt_len: Maximum prompt length handed to the pipeline.

    Returns:
        None. Progress and failures are printed; no exception is propagated
        for resolution or compilation failures.
    """
    # Import on its own so a missing dependency is reported as such instead of
    # surfacing later as a NameError on ``snapshot_download``.
    try:
        from huggingface_hub import snapshot_download
    except ImportError as import_error:
        print(f"Failed to resolve {repo_id}: {import_error}")
        return

    try:
        model_path = snapshot_download(repo_id=repo_id, local_files_only=True)
    except Exception:
        # Not available locally (or lookup failed): try a regular download once.
        try:
            model_path = snapshot_download(repo_id=repo_id)
        except Exception as download_error:
            print(f"Failed to resolve {repo_id}: {download_error}")
            return

    print(f"Model Path: {model_path}")

    # Path() makes plain-string cache directories work as well as Path objects.
    model_cache = Path(cache_dir) / repo_id.replace("/", "--")

    # Property names use the upper-case spelling that the server module of this
    # package passes to LLMPipeline ("CACHE_DIR", "MAX_PROMPT_LEN").
    # NOTE(review): confirm against the installed openvino-genai version.
    config = {
        "MAX_PROMPT_LEN": prompt_len,
        "CACHE_DIR": str(model_cache),
    }

    print(f"Using properties: {config}")

    start_time = time.time()
    try:
        # Initializing the pipeline with "NPU" triggers compilation
        ov_genai.LLMPipeline(model_path, "NPU", **config)
        print(f"Compilation successful for {repo_id}!")
    except Exception as first_error:
        print(f"Compilation failed for {repo_id} with MAX_PROMPT_LEN: {first_error}")
        print("Retrying with 'NPU_MAX_prompt_len'...")
        # Legacy fallback kept from the original implementation: same cache
        # directory, alternative property name for the prompt length.
        retry_config = {key: value for key, value in config.items() if key != "MAX_PROMPT_LEN"}
        retry_config["NPU_MAX_prompt_len"] = prompt_len
        try:
            ov_genai.LLMPipeline(model_path, "NPU", **retry_config)
            print(f"Compilation successful for {repo_id} with NPU_MAX_prompt_len!")
        except Exception as second_error:
            print(f"Compilation failed again for {repo_id}: {second_error}")

    end_time = time.time()
    print(f"Time taken: {end_time - start_time:.2f} seconds")
@@ -0,0 +1,582 @@
1
+ import gc
2
+ import json
3
+ import time
4
+ import uuid
5
+ import threading
6
+ import logging
7
+ from pathlib import Path
8
+
9
+
10
def get_default_paths():
    """
    Return the default ``(compiled, hf)`` directory pair, both located under
    ``~/.cache/npuserver/`` in the current user's home directory.
    """
    cache_root = Path.home().joinpath(".cache", "npuserver")
    compiled_dir = cache_root / "compiled"
    hf_dir = cache_root / "hf"
    return compiled_dir, hf_dir
16
+
17
+
18
+ def setup_logging(log_file: str | Path | None = None, prompt_log_file: str | Path | None = None):
19
+ """
20
+ Configures logging dynamically. If `log_file` or `prompt_log_file` is provided,
21
+ creates a FileHandler to write to the requested file path.
22
+ By default, logs to standard output only, with no file creation.
23
+ """
24
+ logger = logging.getLogger("npu_server")
25
+ prompt_logger = logging.getLogger("prompt_logger")
26
+
27
+ # Avoid adding duplicate handlers if setup_logging is called multiple times
28
+ logger.handlers.clear()
29
+ prompt_logger.handlers.clear()
30
+
31
+ logger.setLevel(logging.INFO)
32
+ prompt_logger.setLevel(logging.INFO)
33
+
34
+ formatter = logging.Formatter('%(asctime)s [%(levelname)s] %(message)s')
35
+
36
+ # Stream Handler for the console
37
+ sh = logging.StreamHandler()
38
+ sh.setFormatter(formatter)
39
+ logger.addHandler(sh)
40
+
41
+ if log_file:
42
+ fh = logging.FileHandler(log_file)
43
+ fh.setFormatter(formatter)
44
+ logger.addHandler(fh)
45
+
46
+ if prompt_log_file:
47
+ pfh = logging.FileHandler(prompt_log_file)
48
+ pfh.setFormatter(logging.Formatter('%(asctime)s\n%(message)s\n' + '-'*80 + '\n'))
49
+ prompt_logger.addHandler(pfh)
50
+ else:
51
+ prompt_logger.addHandler(logging.NullHandler())
52
+
53
+ return logger, prompt_logger
54
+
55
+
56
+ def find_slot(
57
+ model_name: str,
58
+ genai_cache_root: str | Path | None = None,
59
+ hf_hub_cache: str | Path | None = None
60
+ ) -> tuple[Path, dict] | None:
61
+ """
62
+ Locates a compiled model slot inside the GenAI cache root.
63
+ If manifest.json exists, reads metadata from it; otherwise dynamically resolves the
64
+ raw model directory inside hf_hub_cache.
65
+ """
66
+ default_genai, default_hf = get_default_paths()
67
+ g_root = Path(genai_cache_root) if genai_cache_root else default_genai
68
+ h_cache = Path(hf_hub_cache) if hf_hub_cache else default_hf
69
+
70
+ if g_root.exists():
71
+ for slot in g_root.iterdir():
72
+ if slot.is_dir() and (model_name in slot.name or model_name.replace("/", "--") in slot.name):
73
+ # A slot is valid if it contains *.blob files or the compilation completion flag
74
+ if any(slot.glob("*.blob")) or (slot / "compiled.ok").exists():
75
+ mp = slot / "manifest.json"
76
+ if mp.exists():
77
+ try:
78
+ return slot, json.loads(mp.read_text())
79
+ except Exception:
80
+ pass
81
+
82
+ try:
83
+ from huggingface_hub import snapshot_download
84
+ repo_id = slot.name.replace("--", "/")
85
+ model_dir = snapshot_download(repo_id=repo_id, local_files_only=True, cache_dir=str(h_cache))
86
+ return slot, {
87
+ "model_name": model_name,
88
+ "model_dir": model_dir,
89
+ "device": "NPU",
90
+ "max_prompt_len": 16384,
91
+ "cache_mode": "OPTIMIZE_SPEED"
92
+ }
93
+ except Exception:
94
+ pass
95
+ return None
96
+
97
+
98
def download_and_compile(
    model_name: str,
    genai_cache_root: str | Path | None = None,
    hf_hub_cache: str | Path | None = None,
    allow_download: bool = True,
    max_prompt_len: int = 16384,
    cache_mode: str = "OPTIMIZE_SPEED",
    logger=None
) -> Path:
    """
    Resolve a Hugging Face model and compile it for the NPU.

    The model files are looked up in `hf_hub_cache` first
    (``local_files_only=True``) and fetched from the Hub only when that fails
    and `allow_download` is true. Compilation is triggered by constructing an
    ``LLMPipeline`` on the ``"NPU"`` device with the slot directory as its
    cache directory. ``manifest.json`` and the ``compiled.ok`` marker are
    written only after compilation succeeded, so a failed run never leaves a
    manifest describing settings that were not compiled.

    Args:
        model_name: Hugging Face repository ID.
        genai_cache_root: Root of the compiled-slot cache; defaults to the
            first path of ``get_default_paths()``.
        hf_hub_cache: Hugging Face download cache; defaults to the second
            path of ``get_default_paths()``.
        allow_download: Whether fetching from the Hub is permitted.
        max_prompt_len: Value passed as ``MAX_PROMPT_LEN``.
        cache_mode: Value passed as ``CACHE_MODE``.
        logger: Logger to use; defaults to the ``npu_server`` logger.

    Returns:
        Path of the compiled slot directory.

    Raises:
        ImportError: ``openvino-genai`` is not installed.
        ValueError: The model is not available locally and downloading is
            disabled.
    """
    log = logger or logging.getLogger("npu_server")

    # Fail fast: check the compiler is importable before downloading anything.
    try:
        import openvino_genai as ov_genai
    except ImportError as e:
        raise ImportError("[ERROR] openvino-genai not found. Run: pip install openvino-genai") from e
    from huggingface_hub import snapshot_download

    g_root = Path(genai_cache_root) if genai_cache_root else get_default_paths()[0]
    h_cache = Path(hf_hub_cache) if hf_hub_cache else get_default_paths()[1]
    # Normalise once so the manifest and the pipeline see the same value.
    max_prompt_len = int(max_prompt_len)

    try:
        log.info(f"Checking for local Hugging Face files for '{model_name}' in '{h_cache}'...")
        model_dir = snapshot_download(
            repo_id=model_name,
            local_files_only=True,
            cache_dir=str(h_cache)
        )
    except Exception as e:
        if not allow_download:
            log.error(f"Model '{model_name}' was not found locally in hf_hub_cache and allow_download=False.")
            raise ValueError(
                f"Model '{model_name}' not found locally and downloading is disabled: {e}"
            ) from e

        log.info(f"Local files not found. Fetching online from Hugging Face Hub: {e}")
        model_dir = snapshot_download(
            repo_id=model_name,
            cache_dir=str(h_cache)
        )

    # Setup compiled slot directory (stores compiled NPU blob and manifest)
    slot_dir = g_root / model_name.replace("/", "--")
    slot_dir.mkdir(parents=True, exist_ok=True)

    log.info(f"Compiling model '{model_name}' on NPU...")
    pipeline_config = {
        "CACHE_DIR": str(slot_dir),
        "CACHE_MODE": cache_mode,
        "MAX_PROMPT_LEN": max_prompt_len,
    }

    # Instantiating the pipeline triggers NPU compilation, saving the compiled .blob to slot_dir
    ov_genai.LLMPipeline(str(model_dir), "NPU", **pipeline_config)

    # Record the settings only now that they are known to have compiled.
    manifest = {
        "model_name": model_name,
        "model_dir": str(model_dir),
        "device": "NPU",
        "max_prompt_len": max_prompt_len,
        "cache_mode": cache_mode,
        "compiled_at": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
    }
    (slot_dir / "manifest.json").write_text(json.dumps(manifest, indent=2))
    (slot_dir / "compiled.ok").touch()

    log.info(f"Successfully compiled '{model_name}' directly onto the NPU.")
    return slot_dir
169
+
170
+
171
class NPUPipeline:
    """
    Wrapper around OpenVINO GenAI LLMPipeline.

    An instance is built from a compiled slot directory plus its manifest and
    exposes text generation and chat-template rendering. After construction
    ``model_name``, ``device`` and ``load_ms`` (construction time of the
    underlying pipeline in milliseconds) are available as attributes.
    """
    def __init__(self, slot_dir: Path, manifest: dict, logger=None):
        """
        Create the underlying ``LLMPipeline``.

        Args:
            slot_dir: Compiled slot directory, passed as ``CACHE_DIR``.
            manifest: Must contain ``model_dir`` and ``model_name``;
                ``device``, ``cache_mode`` and ``max_prompt_len`` are optional
                and default to ``"NPU"``, ``"OPTIMIZE_SPEED"`` and ``16384``.
            logger: Logger to use; defaults to the ``npu_server`` logger.

        Raises:
            ImportError: ``openvino-genai`` is not installed.
            KeyError: A required manifest key is missing.
        """
        try:
            import openvino_genai as ov_genai
        except ImportError as e:
            raise ImportError("[ERROR] openvino-genai not found. Run: pip install openvino-genai") from e

        self.logger = logger or logging.getLogger("npu_server")
        model_dir = Path(manifest["model_dir"])
        device = manifest.get("device", "NPU")

        pipeline_config = {
            "CACHE_DIR": str(slot_dir),
            "CACHE_MODE": manifest.get("cache_mode", "OPTIMIZE_SPEED"),
            "MAX_PROMPT_LEN": int(manifest.get("max_prompt_len", 16384)),
        }

        self.logger.info("Loading model from compiled blob ...")
        self.logger.info(f"  device : {device}")
        self.logger.info(f"  blob   : {slot_dir}")
        t0 = time.time()

        self._pipe = ov_genai.LLMPipeline(str(model_dir), device, **pipeline_config)
        # Keep the module so generate() can build GenerationConfig objects.
        self._genai = ov_genai
        self.load_ms = round((time.time() - t0) * 1000)
        self.model_name = manifest["model_name"]
        self.device = device

        self.logger.info(f"Pipeline ready in {self.load_ms}ms")

    def generate(self, prompt: str, max_new_tokens: int = 512, temperature: float = 0.7,
                 stream_cb=None) -> str:
        """
        Generate a completion for `prompt`.

        Sampling is enabled only for ``temperature > 0``; in that case the
        temperature is applied together with ``top_p = 0.95``.

        Args:
            prompt: Fully rendered prompt text.
            max_new_tokens: Upper bound on generated tokens.
            temperature: Sampling temperature; ``0`` disables sampling.
            stream_cb: Optional callback passed to the pipeline as streamer.

        Returns:
            The generated text, or ``""`` when `stream_cb` is given (the text
            is then delivered through the callback only).
        """
        cfg = self._genai.GenerationConfig()
        cfg.max_new_tokens = max_new_tokens
        cfg.do_sample = temperature > 0.0
        if temperature > 0.0:
            cfg.temperature = temperature
            cfg.top_p = 0.95

        if stream_cb:
            self._pipe.generate(prompt, cfg, stream_cb)
            return ""
        # Coerce to str so the declared return type holds even if the pipeline
        # hands back a results object instead of plain text.
        # NOTE(review): relies on that object's __str__ yielding the text — confirm.
        return str(self._pipe.generate(prompt, cfg))

    def apply_chat_template(self, messages: list[dict]) -> str:
        """Render `messages` with the tokenizer's chat template, adding the generation prompt."""
        return self._pipe.get_tokenizer().apply_chat_template(messages, add_generation_prompt=True)
221
+
222
+ class NPUPipelineManager:
223
+ """
224
+ Manages active NPU pipeline state, lists models dynamically, and
225
+ lazy-loads or compiles models on request in a thread-safe, queued manner.
226
+ """
227
+ def __init__(
228
+ self,
229
+ genai_cache_root: str | Path | None = None,
230
+ hf_hub_cache: str | Path | None = None,
231
+ allow_download: bool = True,
232
+ logger=None
233
+ ):
234
+ default_genai, default_hf = get_default_paths()
235
+ self.genai_cache_root = Path(genai_cache_root) if genai_cache_root else default_genai
236
+ self.hf_hub_cache = Path(hf_hub_cache) if hf_hub_cache else default_hf
237
+ self.allow_download = allow_download
238
+ self.logger = logger or logging.getLogger("npu_server")
239
+
240
+ self.active_pipeline = None
241
+ self.active_model_name = None
242
+ self.is_loading = False
243
+
244
+ # Thread condition variable to queue and sequence loading/generation requests
245
+ self.lock = threading.Lock()
246
+ self.load_condition = threading.Condition(self.lock)
247
+
248
+ def list_all_models(self) -> list[dict]:
249
+ """
250
+ Lists all compiled models inside the GenAI cache and raw HF models
251
+ available in the Hugging Face hub cache.
252
+ """
253
+ models_dict = {}
254
+
255
+ # 1. Identify current active model
256
+ if self.active_model_name:
257
+ models_dict[self.active_model_name] = "active"
258
+
259
+ # 2. Check GenAI compiled cache directory
260
+ if self.genai_cache_root.exists():
261
+ for slot in self.genai_cache_root.iterdir():
262
+ if slot.is_dir():
263
+ mp = slot / "manifest.json"
264
+ ok = slot / "compiled.ok"
265
+ model_name = None
266
+ if mp.exists():
267
+ try:
268
+ m = json.loads(mp.read_text())
269
+ model_name = m.get("model_name")
270
+ except Exception:
271
+ pass
272
+
273
+ if not model_name:
274
+ model_name = slot.name.replace("--", "/")
275
+
276
+ if any(slot.glob("*.blob")) or ok.exists():
277
+ if model_name not in models_dict:
278
+ models_dict[model_name] = "compiled"
279
+
280
+ # 3. Check local raw HF hub cache
281
+ if self.hf_hub_cache.exists():
282
+ for entry in self.hf_hub_cache.iterdir():
283
+ if entry.is_dir() and entry.name.startswith("models--"):
284
+ parts = entry.name.split("--")[1:]
285
+ if parts:
286
+ repo_id = "/".join(parts)
287
+ if repo_id not in models_dict:
288
+ models_dict[repo_id] = "raw"
289
+
290
+ return [{"id": name, "status": status} for name, status in models_dict.items()]
291
+
292
+ def load_model(self, model_name: str) -> NPUPipeline:
293
+ """
294
+ Loads the requested model. If another model is currently in the process of loading,
295
+ blocks until loading is complete, then returns the active pipeline.
296
+ """
297
+ with self.load_condition:
298
+ # Queue request if server is currently compiling/loading
299
+ while self.is_loading:
300
+ self.logger.info(f"Model load in progress. Queuing request for model '{model_name}'...")
301
+ self.load_condition.wait()
302
+
303
+ # If active model matches requested one, serve immediately
304
+ if self.active_pipeline and self.active_model_name == model_name:
305
+ return self.active_pipeline
306
+
307
+ # Set load state to block any concurrent requests
308
+ self.is_loading = True
309
+
310
+ try:
311
+ self.logger.info(f"Initiating loading/compilation process for '{model_name}'...")
312
+
313
+ # 1. Search for compiled slot
314
+ result = find_slot(model_name, self.genai_cache_root, self.hf_hub_cache)
315
+
316
+ if result:
317
+ slot_dir, manifest = result
318
+ self.logger.info(f"Found compiled slot at {slot_dir}. Loading...")
319
+
320
+ # Unload old pipeline to free NPU memory
321
+ with self.load_condition:
322
+ if self.active_pipeline:
323
+ self.logger.info(f"Unloading active model '{self.active_model_name}' to free NPU resources...")
324
+ self.active_pipeline = None
325
+ gc.collect()
326
+ time.sleep(0.5)
327
+
328
+ pipe = NPUPipeline(slot_dir, manifest, logger=self.logger)
329
+ else:
330
+ self.logger.info(f"Model '{model_name}' is not compiled. Running download & NPU compile...")
331
+
332
+ # Unload active pipeline to free NPU resources for compilation
333
+ with self.load_condition:
334
+ if self.active_pipeline:
335
+ self.logger.info(f"Unloading active model '{self.active_model_name}' before compilation...")
336
+ self.active_pipeline = None
337
+ gc.collect()
338
+ time.sleep(0.5)
339
+
340
+ slot_dir = download_and_compile(
341
+ model_name=model_name,
342
+ genai_cache_root=self.genai_cache_root,
343
+ hf_hub_cache=self.hf_hub_cache,
344
+ allow_download=self.allow_download,
345
+ logger=self.logger
346
+ )
347
+ manifest = json.loads((slot_dir / "manifest.json").read_text())
348
+ pipe = NPUPipeline(slot_dir, manifest, logger=self.logger)
349
+
350
+ # Store the newly active pipeline
351
+ with self.load_condition:
352
+ self.active_pipeline = pipe
353
+ self.active_model_name = model_name
354
+ self.logger.info(f"Successfully activated and loaded '{model_name}'!")
355
+ return self.active_pipeline
356
+
357
+ finally:
358
+ with self.load_condition:
359
+ # Release loading lock state and wake up all waiting threads
360
+ self.is_loading = False
361
+ self.load_condition.notify_all()
362
+
363
+
364
def build_flask_app(manager: "NPUPipelineManager", logger=None, prompt_logger=None):
    """
    Builds and returns the Flask WSGI application instance.

    Routes:
        ``GET /``, ``GET /health``, ``GET /currentmodel``: status information.
        ``POST /load`` and ``POST /v1/models/load``: load the model named in
        the JSON body (``{"model": ...}``).
        ``GET /v1/models``: list models known to `manager`.
        ``POST /v1/chat/completions``: chat completion, optionally streamed
        as server-sent events.

    Malformed request bodies are answered with a JSON error and status 400;
    load and generation failures with a JSON error and status 500.

    Args:
        manager: Pipeline manager used to list, load and access models.
        logger: Server logger; defaults to the ``npu_server`` logger.
        prompt_logger: Logger receiving raw requests and rendered prompts;
            defaults to the ``prompt_logger`` logger.

    Raises:
        ImportError: Flask is not installed.
    """
    try:
        from flask import Flask, request, jsonify, Response
    except ImportError as e:
        raise ImportError("[ERROR] flask not found.") from e
    import queue

    app = Flask(__name__)
    log = logger or logging.getLogger("npu_server")
    plog = prompt_logger or logging.getLogger("prompt_logger")

    def json_object():
        """Return the request body as a dict, or None if it is not a JSON object."""
        payload = request.get_json(force=True, silent=True)
        return payload if isinstance(payload, dict) else None

    @app.route("/", methods=["GET"])
    def index():
        return jsonify({
            "status": "running",
            "active_model": manager.active_model_name or "none",
            "info": "NPU Dynamic Model Server"
        })

    @app.route("/health", methods=["GET"])
    def health():
        # Read once: another thread may swap the pipeline between two reads.
        pipe = manager.active_pipeline
        if pipe:
            return jsonify({
                "status": "ok",
                "model": manager.active_model_name,
                "device": pipe.device
            })
        return jsonify({"status": "idle", "info": "No active model loaded"})

    @app.route("/currentmodel", methods=["GET"])
    def currentmodel():
        """
        Returns info about the currently loaded active model.
        """
        pipe = manager.active_pipeline
        if pipe:
            return jsonify({
                "model": manager.active_model_name,
                "device": pipe.device,
                "status": "loaded"
            })
        return jsonify({
            "model": "none",
            "device": "NPU",
            "status": "idle"
        })

    @app.route("/v1/models/load", methods=["POST"])
    @app.route("/load", methods=["POST"])
    def load_model_endpoint():
        """
        Receives a POST request to load a model.
        While this compilation/load runs, other load requests wait in the manager.
        """
        if request.data:
            data = json_object()
            if data is None:
                return jsonify({"error": "Request body must be a JSON object"}), 400
        else:
            data = {}
        model_name = data.get("model")
        if not model_name:
            return jsonify({"error": "No 'model' parameter specified in JSON body"}), 400

        try:
            npu = manager.load_model(model_name)
            return jsonify({
                "status": "success",
                "model": npu.model_name,
                "device": npu.device,
                "message": f"Successfully loaded model '{model_name}'"
            })
        except Exception as e:
            log.error(f"Failed to load model '{model_name}': {e}")
            return jsonify({"error": f"Failed to load model '{model_name}': {str(e)}"}), 500

    @app.route("/v1/models", methods=["GET"])
    def v1_models():
        available = manager.list_all_models()
        return jsonify({
            "object": "list",
            "data": [
                {
                    "id": m["id"],
                    "object": "model",
                    "created": int(time.time()),
                    "owned_by": "npu-local",
                    "status": m["status"]
                }
                for m in available
            ]
        })

    @app.after_request
    def add_cors_headers(response):
        response.headers["Access-Control-Allow-Origin"] = "*"
        response.headers["Access-Control-Allow-Methods"] = "GET, POST, OPTIONS"
        response.headers["Access-Control-Allow-Headers"] = "Content-Type, Authorization"
        return response

    @app.route("/v1/chat/completions", methods=["POST", "OPTIONS"])
    def chat_completions():
        if request.method == "OPTIONS":
            return "", 200

        raw_data = request.get_data(as_text=True)
        plog.info(f"RAW REQUEST RECEIVED:\n{raw_data}")

        data = json_object()
        if data is None:
            return jsonify({"error": "Request body must be a JSON object."}), 400

        messages = data.get("messages", [])
        if not isinstance(messages, list):
            return jsonify({"error": "'messages' must be a list."}), 400

        def number(key, default, cast):
            # An explicit null means "use the default", like an absent key.
            value = data.get(key)
            return default if value is None else cast(value)

        try:
            max_tokens = number("max_tokens", 512, int)
            temperature = number("temperature", 0.7, float)
        except (TypeError, ValueError):
            return jsonify({"error": "'max_tokens' and 'temperature' must be numbers."}), 400

        stream = data.get("stream", False)
        req_id = f"chatcmpl-{uuid.uuid4().hex[:8]}"
        created = int(time.time())

        # Dynamic model selection
        model_name = data.get("model")
        if not model_name:
            if manager.active_model_name:
                model_name = manager.active_model_name
            else:
                return jsonify({"error": "No model is currently loaded and no 'model' parameter was specified in the request."}), 400

        try:
            npu = manager.load_model(model_name)
        except Exception as e:
            log.error(f"Failed to load or compile model '{model_name}': {e}")
            return jsonify({"error": f"Failed to load or compile model '{model_name}': {str(e)}"}), 500

        try:
            prompt = npu.apply_chat_template(messages)
            log.debug(f"Generated Prompt:\n{prompt}")
            plog.info(prompt)
        except Exception as e:
            log.error(f"Template failed: {e}")
            # Fall back to the text of the last message, if there is any.
            last = messages[-1] if messages else {}
            content = last.get("content", "") if isinstance(last, dict) else ""
            prompt = content if isinstance(content, str) else ""
            plog.info(f"FALLBACK PROMPT:\n{prompt}")

        if stream:
            def event_stream():
                tokens = queue.Queue()
                done = threading.Event()
                cancelled = threading.Event()

                def cb(token: str) -> bool:
                    tokens.put(token)
                    # False continues generation. True is meant to stop it once
                    # the consumer went away.
                    # NOTE(review): assumes the streamer treats a truthy return
                    # as "stop" — confirm for the installed openvino-genai.
                    return cancelled.is_set()

                def run_gen():
                    try:
                        npu.generate(prompt, max_new_tokens=max_tokens, temperature=temperature, stream_cb=cb)
                    except Exception as ex:
                        log.error(f"Stream generation error: {ex}")
                    finally:
                        done.set()

                threading.Thread(target=run_gen).start()
                try:
                    while not (done.is_set() and tokens.empty()):
                        try:
                            tok = tokens.get(timeout=0.1)
                        except queue.Empty:
                            # Nothing produced yet; re-check the exit condition.
                            continue
                        chunk = {
                            "id": req_id,
                            "object": "chat.completion.chunk",
                            "created": created,
                            "model": npu.model_name,
                            "choices": [{"index": 0, "delta": {"content": tok}}],
                        }
                        yield f"data: {json.dumps(chunk)}\n\n"
                    yield "data: [DONE]\n\n"
                finally:
                    # Runs on normal completion and when the generator is closed
                    # early (e.g. the response is abandoned).
                    cancelled.set()
            return Response(event_stream(), mimetype="text/event-stream")

        t0 = time.time()
        try:
            result = npu.generate(prompt, max_new_tokens=max_tokens, temperature=temperature)
        except Exception as e:
            log.error(f"Generation failed for model '{model_name}': {e}")
            return jsonify({"error": f"Generation failed for model '{model_name}': {str(e)}"}), 500
        log.debug(f"Generation Result: '{result}'")
        gen_ms = round((time.time() - t0) * 1000)

        return jsonify({
            "id": req_id,
            "object": "chat.completion",
            "created": created,
            "model": npu.model_name,
            "choices": [{
                "index": 0,
                "message": {"role": "assistant", "content": result},
                "finish_reason": "stop",
            }],
            "timings": {"generation_ms": gen_ms},
        })

    return app
543
+
544
+
545
def run_server(
    genai_cache_root: str | Path | None = None,
    hf_hub_cache: str | Path | None = None,
    model_name: str | None = None,
    allow_download: bool = True,
    port: int = 8080,
    host: str = "0.0.0.0",
    log_file: str | Path | None = None,
    prompt_log_file: str | Path | None = None,
    threaded: bool = True
):
    """
    Start the NPU model server and block while it runs.

    Logging is configured first, then a pipeline manager is created and the
    Flask application is built around it and run on `host`:`port`.

    - With `model_name`, that model is loaded before the server starts; a
      failure is logged and the server still starts without an active model.
    - Without `model_name`, no model is loaded up front and models are loaded
      when requests ask for them.

    Cache paths left as ``None`` are passed through unchanged to the manager,
    which applies its own defaults.
    """
    logger, prompt_logger = setup_logging(log_file=log_file, prompt_log_file=prompt_log_file)

    manager_options = {
        "genai_cache_root": genai_cache_root,
        "hf_hub_cache": hf_hub_cache,
        "allow_download": allow_download,
        "logger": logger,
    }
    manager = NPUPipelineManager(**manager_options)

    # Optional warm-up: a failed pre-load must not prevent the server from starting.
    if model_name:
        logger.info(f"Pre-loading model '{model_name}' during server startup...")
        try:
            manager.load_model(model_name)
        except Exception as preload_error:
            logger.error(f"Failed to pre-load model '{model_name}': {preload_error}. Server starting in idle mode.")

    app = build_flask_app(manager, logger=logger, prompt_logger=prompt_logger)

    logger.info(f"Running NPU model server on {host}:{port}")
    app.run(host=host, port=port, threaded=threaded)