clu-runtime 0.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- clu/__init__.py +354 -0
- clu/__version__.py +4 -0
- clu/benchmarks/__init__.py +17 -0
- clu/cli/__init__.py +1 -0
- clu/cli/main.py +607 -0
- clu/cli/update_command.py +294 -0
- clu/compiler/__init__.py +18 -0
- clu/compiler/advanced_graph_optimizer.py +512 -0
- clu/compiler/advanced_kernels.py +498 -0
- clu/compiler/attention_avx2.c +418 -0
- clu/compiler/attention_avx2.dll +0 -0
- clu/compiler/attention_c_wrapper.py +219 -0
- clu/compiler/autotuner_v2.py +284 -0
- clu/compiler/avx2_kernels.c +882 -0
- clu/compiler/avx2_kernels.dll +0 -0
- clu/compiler/avx2_kernels_mt.dll +0 -0
- clu/compiler/avx2_wrapper.py +283 -0
- clu/compiler/avx512_kernels.c +965 -0
- clu/compiler/avx512_kernels.dll +0 -0
- clu/compiler/build_kernels.py +277 -0
- clu/compiler/calibration_engine.py +629 -0
- clu/compiler/conv_avx2.c +248 -0
- clu/compiler/conv_avx2.dll +0 -0
- clu/compiler/cpu_kernel_library.py +1478 -0
- clu/compiler/fused_attention.py +647 -0
- clu/compiler/fusion_engine.py +587 -0
- clu/compiler/gemm_optimized.c +184 -0
- clu/compiler/gemm_optimized.dll +0 -0
- clu/compiler/gemm_strassen.c +386 -0
- clu/compiler/gemm_strassen.dll +0 -0
- clu/compiler/graph_compiler.py +2083 -0
- clu/compiler/graph_ir.py +940 -0
- clu/compiler/graph_optimizer.py +331 -0
- clu/compiler/graph_scheduler.py +639 -0
- clu/compiler/kernel_autotuner.py +878 -0
- clu/compiler/kernel_compiler.py +896 -0
- clu/compiler/kernel_dispatch.py +610 -0
- clu/compiler/kernel_registry.py +706 -0
- clu/compiler/memory_layout.py +589 -0
- clu/compiler/mixed_precision.py +481 -0
- clu/compiler/onnx_export.py +370 -0
- clu/compiler/onnx_quantizer.py +253 -0
- clu/compiler/onnx_surgery.py +181 -0
- clu/compiler/op_lowering.py +484 -0
- clu/compiler/quant_int4.py +377 -0
- clu/compiler/quantizer.py +104 -0
- clu/compiler/sparsity_engine.py +256 -0
- clu/compiler/static_scheduler.py +182 -0
- clu/compiler/tensor_fusion.py +134 -0
- clu/distributed/__init__.py +20 -0
- clu/distributed/cluster.py +577 -0
- clu/distributed/coordinator.py +538 -0
- clu/distributed/shared_memory.py +399 -0
- clu/distributed/transport.py +633 -0
- clu/edge/__init__.py +11 -0
- clu/edge/deploy_toolkit.py +460 -0
- clu/edge/model_compressor.py +533 -0
- clu/edge/offline_inference.py +173 -0
- clu/edge/power_profiles.py +147 -0
- clu/edge/thermal_manager.py +971 -0
- clu/engine/__init__.py +7 -0
- clu/engine/inference_engine.py +1291 -0
- clu/enterprise/__init__.py +37 -0
- clu/enterprise/cloud_deploy.py +604 -0
- clu/enterprise/model_isolation.py +473 -0
- clu/enterprise/signed_plugins.py +519 -0
- clu/exceptions.py +112 -0
- clu/hardware/__init__.py +17 -0
- clu/hardware/arm_backend.py +75 -0
- clu/hardware/cpu_backend.py +84 -0
- clu/hardware/cuda_backend.py +84 -0
- clu/hardware/gpu_kernel_dispatcher.py +479 -0
- clu/hardware/gpu_memory_manager.py +1163 -0
- clu/hardware/hardware_abstraction.py +144 -0
- clu/hardware/igpu_attention_kernel.py +266 -0
- clu/hardware/igpu_fp16_kernels.py +331 -0
- clu/hardware/igpu_kernels.py +784 -0
- clu/hardware/intel_igpu_backend.py +698 -0
- clu/hardware/level_zero_backend.py +685 -0
- clu/hardware/multi_gpu_executor.py +1199 -0
- clu/hardware/performance_counters.py +454 -0
- clu/hardware/universal_gpu_backend.py +1168 -0
- clu/integrations/__init__.py +15 -0
- clu/integrations/gguf_bridge.py +53 -0
- clu/integrations/jupyter_magic.py +216 -0
- clu/integrations/langchain_detect.py +260 -0
- clu/integrations/wasm_export.py +430 -0
- clu/integrations/web_frameworks.py +311 -0
- clu/learning/__init__.py +42 -0
- clu/learning/advanced_predictor.py +541 -0
- clu/learning/auto_tuner.py +472 -0
- clu/learning/clu_optimizer.py +185 -0
- clu/learning/continual_trainer.py +492 -0
- clu/learning/ewc_engine.py +716 -0
- clu/learning/execution_history.py +294 -0
- clu/learning/execution_history_db.py +594 -0
- clu/learning/forgetting_metrics.py +319 -0
- clu/learning/hardware_state.py +88 -0
- clu/learning/pattern_recognizer.py +205 -0
- clu/learning/performance_feedback.py +428 -0
- clu/learning/predictive_engine.py +477 -0
- clu/learning/predictive_engine_dl.py +899 -0
- clu/learning/qat.py +273 -0
- clu/learning/replay_memory.py +234 -0
- clu/learning/sensitivity_analyzer.py +469 -0
- clu/learning/task_boundary.py +236 -0
- clu/marketplace/__init__.py +22 -0
- clu/marketplace/registry.py +458 -0
- clu/memory/__init__.py +7 -0
- clu/memory/cache_manager.py +136 -0
- clu/memory/execution_cache.py +117 -0
- clu/memory/memory_compression.py +483 -0
- clu/memory/memory_intelligence.py +151 -0
- clu/memory/persistent_store.py +149 -0
- clu/memory/tensor_cache.py +359 -0
- clu/model_zoo/__init__.py +18 -0
- clu/model_zoo/model_registry.py +853 -0
- clu/models/__init__.py +31 -0
- clu/models/gguf_native.py +441 -0
- clu/models/model_loader.py +558 -0
- clu/models/model_registry.py +633 -0
- clu/models/model_specific_paths.py +277 -0
- clu/observability/__init__.py +7 -0
- clu/observability/benchmark_report.py +297 -0
- clu/observability/dashboard.py +652 -0
- clu/observability/metrics.py +521 -0
- clu/observability/tracer.py +367 -0
- clu/patch/__init__.py +15 -0
- clu/patch/monkey_patch.py +321 -0
- clu/plugins/__init__.py +45 -0
- clu/plugins/plugin_api.py +1105 -0
- clu/runtime/__init__.py +14 -0
- clu/runtime/adaptive_scheduler.py +210 -0
- clu/runtime/async_executor.py +1620 -0
- clu/runtime/clu_executor.py +818 -0
- clu/runtime/device_manager.py +598 -0
- clu/runtime/distributed_engine.py +448 -0
- clu/runtime/execution_engine.py +539 -0
- clu/runtime/hardware_detector.py +192 -0
- clu/runtime/hybrid_device_router.py +409 -0
- clu/runtime/hybrid_scheduler_v2.py +318 -0
- clu/runtime/latency_balancer.py +144 -0
- clu/runtime/memory_manager.py +164 -0
- clu/runtime/memory_pool.py +487 -0
- clu/runtime/model_profiler.py +221 -0
- clu/runtime/pipeline_executor.py +406 -0
- clu/runtime/production_runtime.py +1976 -0
- clu/runtime/thread_manager.py +118 -0
- clu/serving/__init__.py +10 -0
- clu/serving/api.py +260 -0
- clu/serving/batch_scheduler.py +943 -0
- clu/serving/continuous_batcher.py +288 -0
- clu/serving/health.py +97 -0
- clu/serving/kv_cache.py +873 -0
- clu/serving/llm_decode.py +262 -0
- clu/serving/llm_pipeline.py +383 -0
- clu/serving/lora_server.py +483 -0
- clu/serving/pipeline_orchestrator.py +248 -0
- clu/serving/prefix_cache.py +177 -0
- clu/serving/server.py +897 -0
- clu/serving/speculative.py +411 -0
- clu/serving/stream_handler.py +440 -0
- clu/serving/token_generator.py +564 -0
- clu/tools/__init__.py +1 -0
- clu/tools/cost_calculator.py +345 -0
- clu_runtime-0.7.0.dist-info/METADATA +158 -0
- clu_runtime-0.7.0.dist-info/RECORD +173 -0
- clu_runtime-0.7.0.dist-info/WHEEL +5 -0
- clu_runtime-0.7.0.dist-info/entry_points.txt +3 -0
- clu_runtime-0.7.0.dist-info/licenses/LICENSE +201 -0
- clu_runtime-0.7.0.dist-info/top_level.txt +2 -0
- sdk/__init__.py +1 -0
- sdk/clu_runtime.py +904 -0
clu/__init__.py
ADDED
|
@@ -0,0 +1,354 @@
|
|
|
1
|
+
"""
|
|
2
|
+
CLU — Continuous Learning Unit
|
|
3
|
+
===============================
|
|
4
|
+
|
|
5
|
+
Universal AI acceleration with ONE line of code.
|
|
6
|
+
|
|
7
|
+
Quick Start:
|
|
8
|
+
import clu
|
|
9
|
+
model = clu.optimize(your_model) # 5-30x faster, any hardware
|
|
10
|
+
|
|
11
|
+
Entry Points:
|
|
12
|
+
clu.optimize(model) — Optimize any PyTorch/HuggingFace/ONNX model
|
|
13
|
+
clu.accelerate — Decorator for functions
|
|
14
|
+
clu.turbo() — Context manager
|
|
15
|
+
clu.serve(model, port) — OpenAI-compatible serving
|
|
16
|
+
clu.bench(model, input) — Benchmark and show speedup
|
|
17
|
+
clu.devices() — List available hardware
|
|
18
|
+
|
|
19
|
+
"Install CLU and keep your existing system."
|
|
20
|
+
Copyright (c) 2025-2026 SATIN Technologies. All rights reserved.
|
|
21
|
+
"""
|
|
22
|
+
|
|
23
|
+
__version__ = "0.7.0"
|
|
24
|
+
__author__ = "SATIN Technologies"
|
|
25
|
+
|
|
26
|
+
import functools
|
|
27
|
+
import logging
|
|
28
|
+
import time
|
|
29
|
+
from contextlib import contextmanager
|
|
30
|
+
from typing import Any, Optional, Union
|
|
31
|
+
|
|
32
|
+
logger = logging.getLogger("clu")
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
# ══════════════════════════════════════════════════════════
|
|
36
|
+
# Core API: optimize()
|
|
37
|
+
# ══════════════════════════════════════════════════════════
|
|
38
|
+
|
|
39
|
+
def optimize(model: Any,
             quantization: str = "auto",
             hardware: str = "auto",
             verbose: bool = False,
             **kwargs) -> Any:
    """Optimize a model for inference and return the optimized object.

    The input is dispatched on its type:

    * ``str`` or ``os.PathLike`` -- treated as the path of an ONNX file.
    * ``bytes`` or ``bytearray`` -- treated as a serialized ONNX model.
    * anything else -- passed to the CLU engine (PyTorch / HuggingFace /
      generic callable models).

    Args:
        model: The model, an ONNX file path, or serialized ONNX bytes.
        quantization: "auto" | "int8" | "int4" | "none". Forwarded to the
            ``CLU`` engine constructor.
        hardware: "auto" | "cpu" | "gpu" | "all". Accepted for API
            compatibility; this function does not forward it anywhere.
        verbose: Forwarded to the ``CLU`` engine constructor.
        **kwargs: ``calibration_samples`` and ``enable_cache`` (defaults to
            ``True``) are removed and passed to the ``CLU`` constructor.
            Everything else (for example ``sample_input`` or ``silent``) is
            forwarded to the type-specific helper.

    Returns:
        Whatever the type-specific helper returns.

    Example:
        import clu
        model = clu.optimize(my_model)
        output = model(input)
    """
    import os

    from sdk.clu_runtime import CLU

    # Split engine-constructor options from the pass-through options.
    engine_options = {}
    if "calibration_samples" in kwargs:
        engine_options["calibration_samples"] = kwargs.pop("calibration_samples")
    engine_options["enable_cache"] = kwargs.pop("enable_cache", True)

    clu_engine = CLU(
        quantization=quantization,
        verbose=verbose,
        **engine_options,
    )

    # Path-like objects (e.g. pathlib.Path) are handled like plain strings so
    # they do not fall through to the PyTorch branch.
    if isinstance(model, (str, os.PathLike)):
        return _optimize_onnx_path(os.fspath(model), clu_engine, **kwargs)

    # bytearray is normalized to bytes so the ONNX helper sees one type.
    if isinstance(model, (bytes, bytearray)):
        return _optimize_onnx_bytes(bytes(model), clu_engine, **kwargs)

    return _optimize_pytorch(model, clu_engine, **kwargs)
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def _optimize_pytorch(model, clu_engine, **kwargs):
    """Run ``clu_engine.optimize`` on ``model`` and return its result.

    Recognized keyword options:
        sample_input: Forwarded to ``clu_engine.optimize`` (``None`` if absent).
        silent: When truthy, the banner is not emitted.
    """
    optimized = clu_engine.optimize(
        model,
        sample_input=kwargs.get("sample_input"),
    )

    if kwargs.get("silent", False):
        return optimized

    # Announce the optimization unless the caller asked for silence.
    _print_banner(model, optimized)
    return optimized
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def _optimize_onnx_path(path: str, clu_engine, **kwargs):
    """Load the ONNX file at ``path`` into a new ``CLUExecutor`` and return it.

    ``clu_engine`` and ``**kwargs`` are accepted so the signature matches the
    other ``_optimize_*`` helpers; this function does not use them.
    """
    from clu.runtime.clu_executor import CLUExecutor

    runner = CLUExecutor(use_avx2=True)

    # Read the whole file in binary mode, then hand the raw bytes over.
    with open(path, "rb") as model_file:
        payload = model_file.read()
    runner.load_onnx(payload)

    return runner
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def _optimize_onnx_bytes(onnx_bytes: bytes, clu_engine, **kwargs):
    """Load a serialized ONNX model into a new ``CLUExecutor`` and return it.

    ``clu_engine`` and ``**kwargs`` are accepted so the signature matches the
    other ``_optimize_*`` helpers; this function does not use them.
    """
    from clu.runtime.clu_executor import CLUExecutor

    runner = CLUExecutor(use_avx2=True)
    runner.load_onnx(onnx_bytes)
    return runner
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def _print_banner(original_model, optimized_model):
    """Log an INFO banner announcing that a model was optimized.

    The banner includes the parameter count, obtained by summing
    ``p.numel()`` over ``original_model.parameters()``. This is best-effort:
    if the count cannot be computed (for example the object has no
    ``parameters()`` method) the banner is skipped and the reason is logged
    at DEBUG level. The function never raises for that reason.

    Args:
        original_model: Object expected to expose ``parameters()``.
        optimized_model: Unused; kept so existing callers keep working.
    """
    try:
        params = sum(p.numel() for p in original_model.parameters())
    except Exception:
        # Deliberately broad: a cosmetic banner must never break optimize().
        logger.debug("CLU: parameter count unavailable; banner skipped",
                     exc_info=True)
        return

    logger.info(f"⚡ CLU: Model optimized ({params:,} params) — "
                f"Expected 5-30x speedup on inference")
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
# ══════════════════════════════════════════════════════════
|
|
141
|
+
# Decorator API: @clu.accelerate
|
|
142
|
+
# ══════════════════════════════════════════════════════════
|
|
143
|
+
|
|
144
|
+
def accelerate(func=None, *, quantization="auto", verbose=False):
    """Decorator that marks an inference function as CLU-accelerated.

    Usage:
        @clu.accelerate
        def predict(input):
            return model(input)

        # Or with options:
        @clu.accelerate(quantization="int8")
        def predict(input):
            return model(input)

    The returned wrapper calls the decorated function unchanged and carries
    the attribute ``_clu_accelerated = True``. ``functools.wraps`` keeps the
    original name and docstring.

    Args:
        func: The function to wrap when used as a bare ``@clu.accelerate``;
            ``None`` when the decorator is called with options.
        quantization: Accepted for API compatibility; not used by the wrapper.
        verbose: Accepted for API compatibility; not used by the wrapper.

    Returns:
        The wrapped function, or a decorator when ``func`` is ``None``.
    """
    def decorator(fn):
        @functools.wraps(fn)
        def wrapper(*args, **kwargs):
            # Transparent pass-through to the wrapped function.
            return fn(*args, **kwargs)

        wrapper._clu_accelerated = True
        return wrapper

    # Support both @accelerate and @accelerate(...).
    if func is not None:
        return decorator(func)
    return decorator
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
# ══════════════════════════════════════════════════════════
|
|
175
|
+
# Context Manager: with clu.turbo()
|
|
176
|
+
# ══════════════════════════════════════════════════════════
|
|
177
|
+
|
|
178
|
+
@contextmanager
def turbo(quantization: str = "auto", hardware: str = "auto"):
    """Context manager that enables CLU patching for the enclosed block.

    Usage:
        with clu.turbo():
            output = model(input)

    On entry ``clu_patch.enable(quantization=...)`` is called; on exit
    ``clu_patch.disable()`` is called, even if the body raised.

    Args:
        quantization: Forwarded to ``clu_patch.enable``.
        hardware: Accepted but not used by this function.
    """
    # NOTE(review): the package listing shows ``clu/patch/`` but no top-level
    # ``clu_patch`` module -- confirm where this import is expected to resolve.
    import clu_patch as patcher

    patcher.enable(quantization=quantization)
    try:
        yield
    finally:
        patcher.disable()
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
# ══════════════════════════════════════════════════════════
|
|
196
|
+
# Utility Functions
|
|
197
|
+
# ══════════════════════════════════════════════════════════
|
|
198
|
+
|
|
199
|
+
def devices() -> dict:
    """List the hardware devices CLU can detect.

    Returns:
        A dict with two keys:

        * ``"cpu"`` -- a dict with ``cores_physical``, ``cores_logical``,
          ``has_avx2``, ``has_avx512`` and ``ram_gb`` taken from
          ``HardwareAbstraction().profile``. If profiling fails for any
          reason (including the backend module failing to import) it falls
          back to ``{"cores": os.cpu_count()}``.
        * ``"gpu"`` -- a list of the entries returned by
          ``UniversalGPUManager().list_devices()``; empty if GPU detection
          fails.

    Example:
        >>> clu.devices()
        {'cpu': {'cores_physical': 6, 'cores_logical': 12, ...}, 'gpu': [...]}
    """
    import os

    result = {"cpu": {}, "gpu": []}

    # CPU info. The import lives inside the guard so that a missing or broken
    # backend degrades to the core-count fallback instead of raising.
    try:
        from clu.hardware.hardware_abstraction import HardwareAbstraction

        profile = HardwareAbstraction().profile
        result["cpu"] = {
            "cores_physical": profile.cpu_cores_physical,
            "cores_logical": profile.cpu_cores_logical,
            "has_avx2": profile.has_avx2,
            "has_avx512": profile.has_avx512,
            "ram_gb": profile.ram_total_gb,
        }
    except Exception:
        logger.debug("CLU: CPU profiling unavailable; reporting core count only",
                     exc_info=True)
        result["cpu"] = {"cores": os.cpu_count()}

    # GPU info. Best-effort: any failure leaves the list as collected so far.
    try:
        from clu.hardware.universal_gpu_backend import UniversalGPUManager

        result["gpu"].extend(UniversalGPUManager().list_devices())
    except Exception:
        logger.debug("CLU: GPU detection unavailable", exc_info=True)

    return result
|
|
239
|
+
|
|
240
|
+
|
|
241
|
+
def bench(model: Any, input_data=None, input_shape=None,
          runs: int = 30, warmup: int = 10) -> dict:
    """Benchmark a model before and after ``optimize()`` and print the result.

    The model is switched to eval mode with ``model.eval()``. The original
    model and the optimized one are each called ``warmup`` times untimed and
    then ``runs`` times timed, under ``torch.no_grad()``. The median latency
    of each is reported.

    Args:
        model: PyTorch model to benchmark.
        input_data: Sample input passed to the model on every call.
        input_shape: Shape used to build a random input with ``torch.randn``
            when ``input_data`` is not given.
        runs: Number of timed calls; must be at least 1.
        warmup: Number of untimed warm-up calls.

    Returns:
        Dict with ``baseline_ms``, ``optimized_ms``, ``speedup`` and ``runs``.

    Raises:
        ValueError: If neither ``input_data`` nor ``input_shape`` is given,
            or if ``runs`` is smaller than 1.
        RuntimeError: If PyTorch is not installed.
    """
    # Cheap argument checks first, before any heavy imports.
    if input_data is None and input_shape is None:
        raise ValueError("Provide either input_data or input_shape")
    if runs < 1:
        # With zero samples the median would be NaN.
        raise ValueError("runs must be at least 1")

    try:
        import torch
    except ImportError as err:
        raise RuntimeError("PyTorch required for benchmarking") from err

    if input_data is None:
        input_data = torch.randn(*input_shape)

    model.eval()

    # Baseline
    with torch.no_grad():
        base_median = _median_latency_ms(model, input_data, runs, warmup)

    # Optimized
    fast = optimize(model, sample_input=input_data, silent=True)
    with torch.no_grad():
        opt_median = _median_latency_ms(fast, input_data, runs, warmup)

    # Clamp the divisor so a near-zero optimized time cannot divide by zero.
    speedup = base_median / max(opt_median, 0.001)

    result = {
        "baseline_ms": base_median,
        "optimized_ms": opt_median,
        "speedup": speedup,
        "runs": runs,
    }

    print(f"\n{'='*50}")
    print(" CLU Benchmark Result")
    print(f"{'='*50}")
    print(f" Original: {base_median:.2f} ms")
    print(f" CLU: {opt_median:.2f} ms")
    print(f" Speedup: {speedup:.1f}x {'🚀' if speedup > 5 else '⚡' if speedup > 2 else ''}")
    print(f"{'='*50}\n")

    return result


def _median_latency_ms(fn, sample, runs, warmup):
    """Return the median wall-clock time of ``fn(sample)`` in milliseconds."""
    import numpy as np

    for _ in range(warmup):
        fn(sample)

    timings = []
    for _ in range(runs):
        started = time.perf_counter()
        fn(sample)
        timings.append((time.perf_counter() - started) * 1000)

    return float(np.median(timings))
|
|
310
|
+
|
|
311
|
+
|
|
312
|
+
def serve(model: Any = None, port: int = 8000, host: str = "0.0.0.0",
          model_path: str = None, **kwargs):
    """Start an OpenAI-compatible inference server.

    This is a thin wrapper: every argument is forwarded by keyword to
    ``clu.serving.openai_server.start_server`` and its return value is
    returned unchanged.

    Args:
        model: PyTorch/HuggingFace model (or None to use model_path)
        port: Server port (default 8000)
        host: Server host (default 0.0.0.0, i.e. all interfaces)
        model_path: HuggingFace model ID or local path
        **kwargs: Extra options passed through to ``start_server``.

    The server is documented to expose:
        POST /v1/chat/completions — Chat API
        POST /v1/completions — Text completion API
        GET /v1/models — List models
        GET /health — Health check
    """
    # NOTE(review): the package listing shows no ``clu/serving/openai_server.py``
    # -- confirm this module ships with the wheel.
    from clu.serving.openai_server import start_server as launch

    server_options = dict(kwargs)
    server_options.update(
        model=model,
        model_path=model_path,
        port=port,
        host=host,
    )
    return launch(**server_options)
|
|
336
|
+
|
|
337
|
+
|
|
338
|
+
# ══════════════════════════════════════════════════════════
|
|
339
|
+
# Graceful Fallback (if CLU not fully installed)
|
|
340
|
+
# ══════════════════════════════════════════════════════════
|
|
341
|
+
|
|
342
|
+
def _noop_optimize(model, *args, **kwargs):
    """Stand-in for ``optimize()`` used when the CLU SDK is unavailable.

    Logs a warning and returns ``model`` unchanged. Extra positional and
    keyword arguments are accepted and ignored, so any call that is valid
    for ``optimize()`` (for example ``optimize(model, "int8")``) also works
    against this fallback.

    Args:
        model: The model that would have been optimized.
        *args: Ignored.
        **kwargs: Ignored.

    Returns:
        ``model`` itself.
    """
    logger.warning("CLU optimization unavailable — returning original model")
    return model
|
|
346
|
+
|
|
347
|
+
|
|
348
|
+
# Make 'from clu import optimize' work even if sdk is missing
|
|
349
|
+
# Import probe: when the SDK cannot be imported, rebind the public name
# ``optimize`` to the no-op fallback so callers get the model back unchanged.
try:
    from sdk.clu_runtime import CLU as _CLU
except ImportError:
    optimize = _noop_optimize
    logger.debug("CLU SDK not available — optimize() is a no-op")
|
clu/benchmarks/__init__.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
"""
|
|
2
|
+
clu.benchmarks — Comprehensive Benchmark & Validation Suite
|
|
3
|
+
============================================================
|
|
4
|
+
|
|
5
|
+
Systematic benchmarking of CLU engines across all model zoo models.
|
|
6
|
+
Produces JSON, text, and HTML reports with regression detection.
|
|
7
|
+
|
|
8
|
+
Usage:
|
|
9
|
+
from clu.benchmarks.benchmark_suite import BenchmarkSuite
|
|
10
|
+
suite = BenchmarkSuite(category="small", engines=["pytorch", "clu_native"])
|
|
11
|
+
results = suite.run()
|
|
12
|
+
suite.generate_report(results, fmt="html")
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from clu.benchmarks.benchmark_suite import BenchmarkSuite
|
|
16
|
+
|
|
17
|
+
__all__ = ["BenchmarkSuite"]
|
clu/cli/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# CLU CLI commands
|