inferml 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- _win_compat.py +52 -0
- adapters/__init__.py +65 -0
- adapters/base.py +37 -0
- adapters/diffusers_pipeline.py +57 -0
- adapters/standard_pipeline.py +31 -0
- engine.py +294 -0
- inferml-1.0.1.dist-info/METADATA +137 -0
- inferml-1.0.1.dist-info/RECORD +222 -0
- inferml-1.0.1.dist-info/WHEEL +5 -0
- inferml-1.0.1.dist-info/entry_points.txt +2 -0
- inferml-1.0.1.dist-info/licenses/LICENSE +21 -0
- inferml-1.0.1.dist-info/top_level.txt +9 -0
- io_utils.py +68 -0
- models/__init__.py +167 -0
- models/_diffusion_helper.py +45 -0
- models/_pipeline_helper.py +50 -0
- models/aria/__init__.py +6 -0
- models/bamba/__init__.py +6 -0
- models/bark/__init__.py +6 -0
- models/bit/__init__.py +6 -0
- models/bitnet/__init__.py +6 -0
- models/blip/__init__.py +12 -0
- models/bloom/__init__.py +6 -0
- models/chameleon/__init__.py +6 -0
- models/clip/__init__.py +6 -0
- models/codegen/__init__.py +6 -0
- models/cohere/__init__.py +6 -0
- models/cohere2_vision/__init__.py +6 -0
- models/conditional_detr/__init__.py +6 -0
- models/convnext/__init__.py +6 -0
- models/csm/__init__.py +11 -0
- models/cvt/__init__.py +6 -0
- models/d_fine/__init__.py +6 -0
- models/data2vec_vision/__init__.py +6 -0
- models/dbrx/__init__.py +6 -0
- models/deepseek/__init__.py +6 -0
- models/deepseek_vl/__init__.py +5 -0
- models/deepseek_vl/adapter.py +93 -0
- models/deformable_detr/__init__.py +6 -0
- models/depth_anything/__init__.py +6 -0
- models/depth_pro/__init__.py +6 -0
- models/detr/__init__.py +7 -0
- models/dia/__init__.py +13 -0
- models/donut/__init__.py +11 -0
- models/dpt/__init__.py +7 -0
- models/edgetam/__init__.py +14 -0
- models/efficientnet/__init__.py +6 -0
- models/emu3/__init__.py +6 -0
- models/eomt/__init__.py +6 -0
- models/eomt_dinov3/__init__.py +11 -0
- models/exaone/__init__.py +6 -0
- models/falcon/__init__.py +6 -0
- models/fastspeech2/__init__.py +6 -0
- models/fastvlm/__init__.py +5 -0
- models/fastvlm/adapter.py +99 -0
- models/florence2/__init__.py +5 -0
- models/florence2/adapter.py +102 -0
- models/flux/__init__.py +22 -0
- models/focalnet/__init__.py +6 -0
- models/fuyu/__init__.py +6 -0
- models/gemma/__init__.py +10 -0
- models/gemma3_vlm/__init__.py +6 -0
- models/git/__init__.py +6 -0
- models/glm/__init__.py +6 -0
- models/glm4v/__init__.py +6 -0
- models/got_ocr2/__init__.py +20 -0
- models/gpt2/__init__.py +6 -0
- models/gpt_oss/__init__.py +6 -0
- models/granite/__init__.py +6 -0
- models/granite_speech/__init__.py +15 -0
- models/grounding_dino/__init__.py +6 -0
- models/hunyuan_vl/__init__.py +6 -0
- models/idefics/__init__.py +6 -0
- models/instructpix2pix/__init__.py +19 -0
- models/internvl/__init__.py +6 -0
- models/jamba/__init__.py +6 -0
- models/janus/__init__.py +5 -0
- models/janus/adapter.py +125 -0
- models/kandinsky/__init__.py +14 -0
- models/kimi_vl/__init__.py +6 -0
- models/kolors/__init__.py +15 -0
- models/kosmos/__init__.py +6 -0
- models/kyutai_stt/__init__.py +11 -0
- models/layoutlmv3/__init__.py +9 -0
- models/levit/__init__.py +6 -0
- models/lfm2_vl/__init__.py +6 -0
- models/llama/__init__.py +6 -0
- models/llava/__init__.py +5 -0
- models/llava/adapter.py +79 -0
- models/m2m_100/__init__.py +6 -0
- models/mamba/__init__.py +6 -0
- models/marian/__init__.py +6 -0
- models/mask2former/__init__.py +6 -0
- models/maskformer/__init__.py +6 -0
- models/mgp_str/__init__.py +12 -0
- models/minicpm_v/__init__.py +6 -0
- models/minimax/__init__.py +6 -0
- models/mistral/__init__.py +6 -0
- models/mllama/__init__.py +6 -0
- models/mm_grounding_dino/__init__.py +12 -0
- models/mobilenet/__init__.py +7 -0
- models/moondream/__init__.py +5 -0
- models/moondream/adapter.py +37 -0
- models/moonshine/__init__.py +6 -0
- models/mpt/__init__.py +6 -0
- models/musicgen/__init__.py +6 -0
- models/nemotron/__init__.py +6 -0
- models/olmo/__init__.py +6 -0
- models/omdet_turbo/__init__.py +11 -0
- models/oneformer/__init__.py +11 -0
- models/opt/__init__.py +6 -0
- models/ovis/__init__.py +6 -0
- models/owlvit/__init__.py +6 -0
- models/paligemma/__init__.py +6 -0
- models/parakeet/__init__.py +6 -0
- models/persimmon/__init__.py +6 -0
- models/phi/__init__.py +6 -0
- models/pix2struct/__init__.py +6 -0
- models/pixart/__init__.py +14 -0
- models/playground/__init__.py +14 -0
- models/poolformer/__init__.py +6 -0
- models/pop2piano/__init__.py +13 -0
- models/prophetnet/__init__.py +6 -0
- models/pvt/__init__.py +6 -0
- models/qwen/__init__.py +9 -0
- models/qwen_vl/__init__.py +5 -0
- models/qwen_vl/adapter.py +83 -0
- models/regnet/__init__.py +6 -0
- models/resnet/__init__.py +6 -0
- models/rt_detr/__init__.py +6 -0
- models/rwkv/__init__.py +6 -0
- models/sam/__init__.py +6 -0
- models/sam2/__init__.py +6 -0
- models/sam3/__init__.py +6 -0
- models/sam_hq/__init__.py +10 -0
- models/sana/__init__.py +16 -0
- models/sd_inpainting/__init__.py +23 -0
- models/sdxl/__init__.py +25 -0
- models/sdxl_refiner/__init__.py +18 -0
- models/sdxl_turbo/__init__.py +16 -0
- models/seamless_m4t/__init__.py +6 -0
- models/segformer/__init__.py +12 -0
- models/siglip/__init__.py +6 -0
- models/smollm/__init__.py +6 -0
- models/smolvlm/__init__.py +6 -0
- models/speecht5/__init__.py +6 -0
- models/stable_diffusion/__init__.py +22 -0
- models/stablelm/__init__.py +6 -0
- models/starcoder2/__init__.py +6 -0
- models/swiftformer/__init__.py +6 -0
- models/swin/__init__.py +6 -0
- models/table_transformer/__init__.py +6 -0
- models/timm/__init__.py +12 -0
- models/trocr/__init__.py +6 -0
- models/upernet/__init__.py +6 -0
- models/vision_encoder_decoder/__init__.py +7 -0
- models/vit/__init__.py +12 -0
- models/vits/__init__.py +6 -0
- models/voxtral/__init__.py +18 -0
- models/wav2vec2/__init__.py +21 -0
- models/whisper/__init__.py +11 -0
- models/xglm/__init__.py +6 -0
- models/xlnet/__init__.py +6 -0
- models/yolos/__init__.py +6 -0
- models/zamba/__init__.py +6 -0
- models/zoedepth/__init__.py +6 -0
- output_kinds.py +56 -0
- routing.py +202 -0
- server/__init__.py +13 -0
- server/_data/model_overrides.json +32 -0
- server/_data/supported_architectures.json +307 -0
- server/app.py +71 -0
- server/appdata.py +67 -0
- server/cli.py +68 -0
- server/deps.py +55 -0
- server/events.py +42 -0
- server/hf_service.py +319 -0
- server/hw_service.py +131 -0
- server/openai_api/__init__.py +9 -0
- server/openai_api/llm.py +236 -0
- server/openai_api/routes.py +236 -0
- server/openai_api/tools/__init__.py +55 -0
- server/openai_api/tools/base.py +80 -0
- server/openai_api/tools/hermes_qwen.py +28 -0
- server/openai_api/tools/llama.py +29 -0
- server/openai_api/tools/mistral.py +31 -0
- server/paths.py +24 -0
- server/routes/__init__.py +1 -0
- server/routes/hf.py +65 -0
- server/routes/inference.py +286 -0
- server/routes/store.py +42 -0
- server/routes/system.py +147 -0
- server/store_service.py +134 -0
- server/webui/components/app.js +718 -0
- server/webui/components/chat.js +288 -0
- server/webui/components/home.js +173 -0
- server/webui/components/icons.js +50 -0
- server/webui/components/model-browser.js +559 -0
- server/webui/components/onboarding.js +193 -0
- server/webui/components/settings.js +512 -0
- server/webui/components/task-workspace.js +1286 -0
- server/webui/components/welcome.js +4 -0
- server/webui/index.html +26 -0
- server/webui/styles.css +2109 -0
- server/webui/vendor/marked.umd.js +79 -0
- server/webui/vendor/purify.min.js +3 -0
- server/webui/vendor/react-dom.production.min.js +267 -0
- server/webui/vendor/react.production.min.js +31 -0
- server/webui/web-bridge.js +247 -0
- tasks/__init__.py +61 -0
- tasks/_render.py +120 -0
- tasks/asr.py +66 -0
- tasks/base.py +93 -0
- tasks/depth_estimation.py +88 -0
- tasks/document_qa.py +58 -0
- tasks/image_classification.py +48 -0
- tasks/image_segmentation.py +205 -0
- tasks/image_to_text.py +94 -0
- tasks/mask_generation.py +300 -0
- tasks/misc_tasks.py +122 -0
- tasks/object_detection.py +112 -0
- tasks/text_generation.py +162 -0
_win_compat.py
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
"""Windows compatibility patches applied process-wide at sidecar boot.
|
|
2
|
+
|
|
3
|
+
Currently:
|
|
4
|
+
- os.symlink → transparent copy fallback when the caller lacks
|
|
5
|
+
SeCreateSymbolicLinkPrivilege (the WinError 1314 case). HuggingFace's
|
|
6
|
+
cache layout uses symlinks to dedup blobs across snapshots, and a
|
|
7
|
+
standard non-admin user without Developer Mode hits this on every
|
|
8
|
+
download. POSIX users keep the real os.symlink (symlinks always work
|
|
9
|
+
there).
|
|
10
|
+
|
|
11
|
+
Import this module before any other library that may call os.symlink.
|
|
12
|
+
On non-Windows platforms the import is a no-op.
|
|
13
|
+
"""
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
import os
|
|
17
|
+
import shutil
|
|
18
|
+
import sys
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _is_privilege_error(exc: OSError) -> bool:
    """Return True when *exc* is the "required privilege not held" failure.

    WinError 1314 is the authoritative signal. As a fallback only the
    OS-supplied reason text (`strerror`) is inspected: `str(exc)` also embeds
    the file names, so matching on it would misclassify an unrelated error
    (e.g. FileExistsError) for any path that merely contains "privilege".
    """
    if getattr(exc, "winerror", None) == 1314:
        return True
    return "privilege" in str(exc.strerror or "").lower()


def _resolve_link_target(src, dst):
    """Return the path that a link created at *dst* with target *src* refers to.

    An absolute *src* is returned as-is; a relative one is interpreted
    relative to the directory containing *dst* and normalised.
    """
    src = os.fspath(src)
    if os.path.isabs(src):
        return src
    return os.path.normpath(os.path.join(os.path.dirname(os.fspath(dst)), src))


def _install_symlink_copy_fallback() -> None:
    """Replace `os.symlink` with a wrapper that copies when symlinks are denied.

    Does nothing on non-Windows platforms, and nothing if the wrapper is
    already installed (so a repeated call or module reload does not stack
    wrappers on top of each other).
    """
    if sys.platform != "win32":
        return
    if getattr(os.symlink, "_copy_fallback_installed", False):
        return

    _orig = os.symlink

    def _symlink(src, dst, target_is_directory=False, *, dir_fd=None):
        try:
            return _orig(src, dst, target_is_directory=target_is_directory, dir_fd=dir_fd)
        except OSError as e:
            # Only the missing-privilege case is papered over; every other
            # failure propagates unchanged.
            if not _is_privilege_error(e):
                raise
            real_src = _resolve_link_target(src, dst)
            if target_is_directory or os.path.isdir(real_src):
                shutil.copytree(real_src, dst, dirs_exist_ok=True)
            else:
                shutil.copyfile(real_src, dst)
            return None

    # Marker checked above to keep installation idempotent.
    _symlink._copy_fallback_installed = True
    os.symlink = _symlink
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
_install_symlink_copy_fallback()
|
adapters/__init__.py
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
"""Adapter base classes + shared catch-all adapters.
|
|
2
|
+
|
|
3
|
+
Per-family inference code lives in `python/models/<family>/`. This package
|
|
4
|
+
holds only the cross-cutting pieces:
|
|
5
|
+
|
|
6
|
+
- `Adapter` the base class every family inherits from
|
|
7
|
+
- `StandardPipelineAdapter` fallback for repos with no dedicated family
|
|
8
|
+
- `DiffusersAdapter` library passthrough for diffusers checkpoints
|
|
9
|
+
|
|
10
|
+
Routing strategy lives in `routing.py` and is:
|
|
11
|
+
|
|
12
|
+
1. Named override (via model_overrides.json `"adapter"` field)
|
|
13
|
+
2. Plugin adapters (python/plugins/*.py)
|
|
14
|
+
3. models/<family>/ registry (per-family folders)
|
|
15
|
+
4. DiffusersAdapter (library == "diffusers")
|
|
16
|
+
5. StandardPipelineAdapter (pipeline_tag in its task list)
|
|
17
|
+
"""
|
|
18
|
+
from __future__ import annotations
|
|
19
|
+
|
|
20
|
+
from .base import Adapter # noqa: F401
|
|
21
|
+
from .standard_pipeline import StandardPipelineAdapter
|
|
22
|
+
from .diffusers_pipeline import DiffusersAdapter
|
|
23
|
+
|
|
24
|
+
def _named_adapters() -> dict[str, type]:
    """Build the name→class map used by `model_overrides.json "adapter"` pins.

    Includes the cross-cutting fallbacks plus every family folder in
    `python/models/`. Built LAZILY (see `__getattr__` below) so we don't
    capture a partially-loaded `models.FAMILIES` if some caller imports
    `models` before `adapters` and the family-folder-discovery chain
    re-enters this module mid-load.

    Returns:
        Mapping of adapter name to adapter class. Always contains
        "standard" and "diffusers"; each family with a non-None "adapter"
        entry is added under its family name and, unless that key is
        already taken, under the lower-cased class name with "Adapter"
        removed.
    """
    out: dict[str, type] = {
        "standard": StandardPipelineAdapter,
        "diffusers": DiffusersAdapter,
    }
    try:
        from models import FAMILIES

        for fam_name, entry in FAMILIES.items():
            cls = entry.get("adapter")
            if cls is None:
                continue
            out[fam_name] = cls
            short = cls.__name__.replace("Adapter", "").lower()
            out.setdefault(short, cls)
    except Exception:
        # Still best-effort: whatever was collected so far is returned. The
        # failure is logged instead of silently dropped so a missing family
        # adapter can be diagnosed.
        import logging

        logging.getLogger(__name__).warning(
            "family adapter discovery failed; returning partial adapter map",
            exc_info=True,
        )
    return out
|
|
49
|
+
|
|
50
|
+
# Lazily-built result of `_named_adapters()`; None until first requested.
_NAMED_ADAPTERS_CACHE: "dict[str, type] | None" = None


def __getattr__(name: str):
    """Module-level attribute hook providing the lazy `NAMED_ADAPTERS` map.

    The map is built by `_named_adapters()` on first access and cached in
    `_NAMED_ADAPTERS_CACHE`; later accesses return the cached dict.

    Raises:
        AttributeError: for any other attribute name.
    """
    global _NAMED_ADAPTERS_CACHE
    if name == "NAMED_ADAPTERS":
        if _NAMED_ADAPTERS_CACHE is None:
            _NAMED_ADAPTERS_CACHE = _named_adapters()
        return _NAMED_ADAPTERS_CACHE
    # Report the module's real import name rather than a hard-coded one.
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
|
|
59
|
+
|
|
60
|
+
__all__ = [
|
|
61
|
+
"Adapter",
|
|
62
|
+
"StandardPipelineAdapter",
|
|
63
|
+
"DiffusersAdapter",
|
|
64
|
+
"NAMED_ADAPTERS",
|
|
65
|
+
]
|
adapters/base.py
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
"""Adapter base class.
|
|
2
|
+
|
|
3
|
+
One instance of an adapter = one loaded model. The router picks the adapter,
|
|
4
|
+
calls `load(info, device)` once, then `run(inputs, params)` per request.
|
|
5
|
+
Instances are cached by (adapter_class, model_id) in the engine.
|
|
6
|
+
"""
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from abc import ABC, abstractmethod
|
|
10
|
+
|
|
11
|
+
class Adapter(ABC):
    """Abstract base for model adapters: `load` once, then `run` per request."""

    # Class-level default shared by every subclass and instance. Treat it as
    # read-only: assign a new dict to `override` rather than mutating this
    # one in place, otherwise the change leaks to all adapters.
    override: dict = {}

    @classmethod
    def can_handle(cls, info: dict) -> bool:
        """Return True if this adapter can run the described model.

        `info` is the dict from routing.inspect_model. Implementations should
        inspect `model_id`, `model_type`, `architectures`, `tags`, etc. -
        *not* download any weights.

        The base implementation always returns False.
        """
        return False

    @abstractmethod
    def load(self, info: dict, device) -> None:
        """Instantiate the underlying model + any helpers (processor, tokenizer)."""

    @abstractmethod
    def run(self, inputs: dict, params: dict) -> dict:
        """Execute inference. Must return a dict matching one of the kinds in
        `output_kinds.py` (`boxes`, `masks`, `labels`, `text`, `image`,
        `audio`, `vector`)."""

    def unload(self) -> None:
        """Hook for freeing GPU memory - default: drop references.

        Every instance attribute except `override` is set to None.
        """
        # Snapshot the names first; the loop rebinds attributes while iterating.
        for attr in list(self.__dict__):
            if attr != "override":
                setattr(self, attr, None)
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
"""Diffusers fallback. text-to-image, img2img, inpainting."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from .base import Adapter
|
|
5
|
+
import output_kinds as ok
|
|
6
|
+
from io_utils import decode_image, resolve_device, torch_dtype_for_device
|
|
7
|
+
|
|
8
|
+
class DiffusersAdapter(Adapter):
    """Catch-all adapter that runs a checkpoint through a diffusers AutoPipeline."""

    @classmethod
    def can_handle(cls, info):
        """Accept diffusers-library repos (except text-to-video) and, for other
        libraries, the text-to-image / image-to-image / inpainting tags."""
        tag = info.get("pipeline_tag")
        if info.get("library") == "diffusers":
            return tag != "text-to-video"
        return tag in ("text-to-image", "image-to-image", "inpainting")

    def load(self, info, device):
        """Instantiate the AutoPipeline matching the model's pipeline tag.

        The task falls back to "text-to-image" when `info` carries no
        `pipeline_tag`. `device` is stored on the instance; the device the
        pipeline is moved to comes from `resolve_device()`.
        """
        self.info = info
        self.device = device
        self.task = info.get("pipeline_tag") or "text-to-image"
        dtype = torch_dtype_for_device()

        # Imported lazily so the module can be imported without diffusers.
        from diffusers import (
            AutoPipelineForText2Image,
            AutoPipelineForImage2Image,
            AutoPipelineForInpainting,
        )

        pipeline_cls = {
            "image-to-image": AutoPipelineForImage2Image,
            "inpainting": AutoPipelineForInpainting,
        }.get(self.task, AutoPipelineForText2Image)

        kwargs = {"torch_dtype": dtype}
        if self.override.get("trust_remote_code"):
            kwargs["trust_remote_code"] = True
        self.pipe = pipeline_cls.from_pretrained(info["model_id"], **kwargs)

        resolved = resolve_device()
        # NOTE(review): `False` appears to be resolve_device()'s "do not move"
        # sentinel - confirm against io_utils.
        if resolved is not False:
            self.pipe = self.pipe.to(resolved)

    def run(self, inputs, params):
        """Generate one image from `inputs["text"]` and return it as an image result.

        Only `num_inference_steps`, `guidance_scale`, `negative_prompt` and
        `strength` are forwarded from `params`; the first two default to 20
        and 7.5.

        Raises:
            ValueError: if the prompt is missing or blank.
        """
        prompt = (inputs.get("text") or "").strip()
        if not prompt:
            raise ValueError("Prompt required")

        forwarded = ("num_inference_steps", "guidance_scale", "negative_prompt", "strength")
        kwargs = {k: params[k] for k in forwarded if k in params}
        kwargs.setdefault("num_inference_steps", 20)
        kwargs.setdefault("guidance_scale", 7.5)

        # Both image-conditioned tasks take the source picture the same way.
        if self.task in ("image-to-image", "inpainting") and inputs.get("dataUrl"):
            kwargs["image"] = decode_image(inputs["dataUrl"])

        result = self.pipe(prompt, **kwargs)
        return ok.image(result.images[0])
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
"""Standard HF pipeline adapter. Fallback when no `models/<family>/` matches.
|
|
2
|
+
|
|
3
|
+
Thin dispatcher: the actual per-task logic lives in `python/tasks/`. When a
|
|
4
|
+
model breaks, add a folder under `python/models/` (preferred) or a Variant
|
|
5
|
+
in the relevant task file.
|
|
6
|
+
"""
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from .base import Adapter
|
|
10
|
+
from tasks import TASK_REGISTRY, get_task
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class StandardPipelineAdapter(Adapter):
    """Thin dispatcher that delegates loading and inference to a task handler."""

    # Pipeline tags that have a registered task handler (snapshot taken at
    # class-definition time).
    SUPPORTED_TASKS = set(TASK_REGISTRY)

    @classmethod
    def can_handle(cls, info):
        """Return True when the model's `pipeline_tag` has a registered task."""
        return info.get("pipeline_tag") in cls.SUPPORTED_TASKS

    def load(self, info, device):
        """Look up the task handler for `info["pipeline_tag"]` and load its pipeline.

        Raises:
            KeyError: if `info` has no `pipeline_tag` key.
            ValueError: if no handler is registered for the tag.
        """
        self.info = info
        self.device = device
        self.task_name = info["pipeline_tag"]
        self.handler = get_task(self.task_name)
        if self.handler is None:
            raise ValueError(f"No task handler registered for {self.task_name!r}")
        extra = {"trust_remote_code": True} if self.override.get("trust_remote_code") else {}
        self.state = self.handler.load_pipeline(info, device, extra_kwargs=extra)

    def run(self, inputs, params):
        """Run one request through the handler using the state built by `load`."""
        return self.handler.handle(self.state, inputs, params)
|
engine.py
ADDED
|
@@ -0,0 +1,294 @@
|
|
|
1
|
+
"""LocalML inference engine - the reusable core.
|
|
2
|
+
|
|
3
|
+
Holds all model-loading and inference logic, driven in-process by the FastAPI
|
|
4
|
+
web server (`server/`). Running in-process is what lets the OpenAI-compatible
|
|
5
|
+
endpoint hold a live handle to the currently-loaded LLM and stream tokens from
|
|
6
|
+
it.
|
|
7
|
+
|
|
8
|
+
Design invariants:
|
|
9
|
+
- One model = one loaded pipeline. Adapter instances are cached by
|
|
10
|
+
(adapter_class_name, model_id); a second request reuses the loaded model.
|
|
11
|
+
- Inference is NOT thread-safe against itself (torch). Callers serialize.
|
|
12
|
+
The server runs `run()`/`download()` in a threadpool behind a single lock.
|
|
13
|
+
"""
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
import fnmatch
|
|
17
|
+
import re
|
|
18
|
+
import sys
|
|
19
|
+
import threading
|
|
20
|
+
import time
|
|
21
|
+
from pathlib import Path
|
|
22
|
+
|
|
23
|
+
sys.path.insert(0, str(Path(__file__).parent.resolve()))
|
|
24
|
+
|
|
25
|
+
import _win_compat # noqa: F401, E402
|
|
26
|
+
|
|
27
|
+
from routing import inspect_model, pick_adapter, override_for # noqa: E402
|
|
28
|
+
from io_utils import resolve_device # noqa: E402
|
|
29
|
+
|
|
30
|
+
class DownloadCancelled(Exception):
    """Raised from the custom tqdm when the user dismisses a download."""
|
|
32
|
+
|
|
33
|
+
class Engine:
|
|
34
|
+
"""Holds the adapter cache and drives load / run / download / unload."""
|
|
35
|
+
|
|
36
|
+
def __init__(self):
|
|
37
|
+
self._adapter_cache: dict = {}
|
|
38
|
+
self._current_llm_id: str | None = None
|
|
39
|
+
self._log = _default_log
|
|
40
|
+
|
|
41
|
+
def _get_adapter(self, info: dict):
|
|
42
|
+
model_id = info["model_id"]
|
|
43
|
+
adapter = pick_adapter(info)
|
|
44
|
+
cache_key = (type(adapter).__name__, model_id)
|
|
45
|
+
cached = self._adapter_cache.get(cache_key)
|
|
46
|
+
if cached is not None:
|
|
47
|
+
return cached
|
|
48
|
+
self._log(f"loading {type(adapter).__name__} for {model_id}")
|
|
49
|
+
dev = resolve_device()
|
|
50
|
+
adapter.load(info, dev)
|
|
51
|
+
self._adapter_cache[cache_key] = adapter
|
|
52
|
+
return adapter
|
|
53
|
+
|
|
54
|
+
def _resolve_info(self, model_id: str, task: str | None) -> dict:
|
|
55
|
+
info = inspect_model(model_id)
|
|
56
|
+
if not info.get("pipeline_tag") and task:
|
|
57
|
+
info["pipeline_tag"] = task
|
|
58
|
+
return info
|
|
59
|
+
|
|
60
|
+
def run(self, model_id: str, task: str | None, inputs: dict, params: dict | None) -> dict:
|
|
61
|
+
"""Execute one inference. Returns an `output_kinds` dict.
|
|
62
|
+
|
|
63
|
+
Overrides merge under request params, the model is inspected + routed,
|
|
64
|
+
the adapter is loaded (or reused) and invoked. Records the model as the
|
|
65
|
+
current LLM when it's a text generator so the OpenAI endpoint can find
|
|
66
|
+
it.
|
|
67
|
+
"""
|
|
68
|
+
if not model_id:
|
|
69
|
+
raise ValueError("Missing 'modelId' - the session isn't bound to a model")
|
|
70
|
+
|
|
71
|
+
inputs = inputs or {}
|
|
72
|
+
|
|
73
|
+
override = override_for(model_id) or {}
|
|
74
|
+
ovr_params = override.get("params") or {}
|
|
75
|
+
req_params = params or {}
|
|
76
|
+
merged_params = {**ovr_params, **req_params}
|
|
77
|
+
|
|
78
|
+
info = self._resolve_info(model_id, task)
|
|
79
|
+
adapter = self._get_adapter(info)
|
|
80
|
+
|
|
81
|
+
out = adapter.run(inputs, merged_params)
|
|
82
|
+
|
|
83
|
+
if _is_text_generation(info, task, out):
|
|
84
|
+
self._current_llm_id = model_id
|
|
85
|
+
return out
|
|
86
|
+
|
|
87
|
+
def current_llm_id(self) -> str | None:
|
|
88
|
+
return self._current_llm_id
|
|
89
|
+
|
|
90
|
+
def loaded_model_ids(self) -> list[str]:
|
|
91
|
+
seen = []
|
|
92
|
+
for (_cls, model_id) in self._adapter_cache.keys():
|
|
93
|
+
if model_id not in seen:
|
|
94
|
+
seen.append(model_id)
|
|
95
|
+
return seen
|
|
96
|
+
|
|
97
|
+
def get_cached_adapter(self, model_id: str):
|
|
98
|
+
"""Return a loaded adapter instance for `model_id`, or None."""
|
|
99
|
+
for (_cls, mid), adapter in self._adapter_cache.items():
|
|
100
|
+
if mid == model_id:
|
|
101
|
+
return adapter
|
|
102
|
+
return None
|
|
103
|
+
|
|
104
|
+
def ensure_loaded(self, model_id: str, task: str | None = None):
|
|
105
|
+
"""Load a model without running inference and return its adapter.
|
|
106
|
+
|
|
107
|
+
Used by the OpenAI endpoint to lazy-load a model named in the request
|
|
108
|
+
body when it isn't resident yet.
|
|
109
|
+
"""
|
|
110
|
+
cached = self.get_cached_adapter(model_id)
|
|
111
|
+
if cached is not None:
|
|
112
|
+
return cached
|
|
113
|
+
info = self._resolve_info(model_id, task)
|
|
114
|
+
return self._get_adapter(info)
|
|
115
|
+
|
|
116
|
+
def unload(self, model_id: str | None = None) -> int:
|
|
117
|
+
"""Drop cached adapter(s), freeing references (GPU memory). Returns the
|
|
118
|
+
number of adapters unloaded. `None` unloads everything."""
|
|
119
|
+
keys = [k for k in self._adapter_cache
|
|
120
|
+
if model_id is None or k[1] == model_id]
|
|
121
|
+
for k in keys:
|
|
122
|
+
adapter = self._adapter_cache.pop(k, None)
|
|
123
|
+
if adapter is not None:
|
|
124
|
+
try:
|
|
125
|
+
adapter.unload()
|
|
126
|
+
except Exception:
|
|
127
|
+
pass
|
|
128
|
+
if self._current_llm_id == k[1]:
|
|
129
|
+
self._current_llm_id = None
|
|
130
|
+
_empty_torch_cache()
|
|
131
|
+
return len(keys)
|
|
132
|
+
|
|
133
|
+
def download(self, model_id: str, on_progress=None, cancel_event: "threading.Event | None" = None) -> dict:
|
|
134
|
+
"""Run `snapshot_download`, streaming byte-level progress via the
|
|
135
|
+
`on_progress(dict)` callback. Picks exactly one weight format so multi-
|
|
136
|
+
format repos don't download 4× the bytes. Raises DownloadCancelled if
|
|
137
|
+
`cancel_event` is set mid-flight.
|
|
138
|
+
|
|
139
|
+
Progress is delivered to the `on_progress` callback.
|
|
140
|
+
"""
|
|
141
|
+
from huggingface_hub import snapshot_download, HfApi
|
|
142
|
+
from tqdm.auto import tqdm as _BaseTqdm
|
|
143
|
+
|
|
144
|
+
if not model_id:
|
|
145
|
+
raise ValueError("Missing 'modelId'")
|
|
146
|
+
|
|
147
|
+
on_progress = on_progress or (lambda _evt: None)
|
|
148
|
+
|
|
149
|
+
WEIGHT_FORMAT_ORDER = ["safetensors", "bin", "pt", "ckpt", "msgpack", "h5", "onnx", "ot"]
|
|
150
|
+
WEIGHT_EXT_RX = re.compile(r"\.(safetensors|bin|pt|ckpt|msgpack|h5|onnx|ot)$", re.IGNORECASE)
|
|
151
|
+
|
|
152
|
+
siblings = []
|
|
153
|
+
try:
|
|
154
|
+
info = HfApi().model_info(model_id, files_metadata=True)
|
|
155
|
+
siblings = list(info.siblings or [])
|
|
156
|
+
except Exception:
|
|
157
|
+
siblings = []
|
|
158
|
+
|
|
159
|
+
chosen_ext = None
|
|
160
|
+
by_ext: dict = {}
|
|
161
|
+
for s in siblings:
|
|
162
|
+
m = WEIGHT_EXT_RX.search((getattr(s, "rfilename", "") or "").lower())
|
|
163
|
+
if not m:
|
|
164
|
+
continue
|
|
165
|
+
by_ext.setdefault(m.group(1), []).append(s)
|
|
166
|
+
for ext in WEIGHT_FORMAT_ORDER:
|
|
167
|
+
if by_ext.get(ext):
|
|
168
|
+
chosen_ext = ext
|
|
169
|
+
break
|
|
170
|
+
|
|
171
|
+
ignore_patterns: list = []
|
|
172
|
+
if chosen_ext:
|
|
173
|
+
for ext in WEIGHT_FORMAT_ORDER:
|
|
174
|
+
if ext != chosen_ext and by_ext.get(ext):
|
|
175
|
+
ignore_patterns.append(f"*.{ext}")
|
|
176
|
+
if chosen_ext != "onnx":
|
|
177
|
+
ignore_patterns.append("onnx/*")
|
|
178
|
+
|
|
179
|
+
total_bytes = 0
|
|
180
|
+
if siblings:
|
|
181
|
+
for s in siblings:
|
|
182
|
+
name = (getattr(s, "rfilename", "") or "").lower()
|
|
183
|
+
if any(fnmatch.fnmatch(name, p) for p in ignore_patterns):
|
|
184
|
+
continue
|
|
185
|
+
sz = getattr(s, "size", None) or 0
|
|
186
|
+
if sz:
|
|
187
|
+
total_bytes += sz
|
|
188
|
+
|
|
189
|
+
state = {"done": 0, "last_emit": 0.0}
|
|
190
|
+
emit_lock = threading.Lock()
|
|
191
|
+
|
|
192
|
+
def emit(final: bool = False) -> None:
|
|
193
|
+
with emit_lock:
|
|
194
|
+
done = state["done"]
|
|
195
|
+
pct = (done / total_bytes * 100.0) if total_bytes else 0.0
|
|
196
|
+
on_progress({
|
|
197
|
+
"done": int(done),
|
|
198
|
+
"total": int(total_bytes),
|
|
199
|
+
"pct": round(pct, 2),
|
|
200
|
+
"final": bool(final),
|
|
201
|
+
})
|
|
202
|
+
|
|
203
|
+
class ProgressTqdm(_BaseTqdm):
|
|
204
|
+
def __init__(self, *args, **kwargs):
|
|
205
|
+
self._is_bytes = kwargs.get("unit") == "B"
|
|
206
|
+
kwargs["disable"] = True
|
|
207
|
+
super().__init__(*args, **kwargs)
|
|
208
|
+
|
|
209
|
+
def update(self, n=1):
|
|
210
|
+
if cancel_event is not None and cancel_event.is_set():
|
|
211
|
+
raise DownloadCancelled()
|
|
212
|
+
super().update(n)
|
|
213
|
+
if self._is_bytes and n:
|
|
214
|
+
should_emit = False
|
|
215
|
+
with emit_lock:
|
|
216
|
+
state["done"] += n
|
|
217
|
+
now = time.time()
|
|
218
|
+
if (now - state["last_emit"]) >= 0.15:
|
|
219
|
+
state["last_emit"] = now
|
|
220
|
+
should_emit = True
|
|
221
|
+
if should_emit:
|
|
222
|
+
emit()
|
|
223
|
+
|
|
224
|
+
emit()
|
|
225
|
+
try:
|
|
226
|
+
kwargs: dict = {"repo_id": model_id, "tqdm_class": ProgressTqdm}
|
|
227
|
+
if ignore_patterns:
|
|
228
|
+
kwargs["ignore_patterns"] = ignore_patterns
|
|
229
|
+
path = snapshot_download(**kwargs)
|
|
230
|
+
finally:
|
|
231
|
+
emit(final=True)
|
|
232
|
+
return {"path": path, "bytes": state["done"], "total_bytes": total_bytes}
|
|
233
|
+
|
|
234
|
+
def _default_log(msg: str) -> None:
    """Write `msg` to stderr with an `[engine]` prefix, flushing immediately."""
    print(f"[engine] {msg}", file=sys.stderr, flush=True)
|
|
236
|
+
|
|
237
|
+
def _is_text_generation(info: dict, task: str | None, out: dict) -> bool:
|
|
238
|
+
"""Best-effort: did this run produce LLM text from a causal model?"""
|
|
239
|
+
pt = (info.get("pipeline_tag") or task or "").lower()
|
|
240
|
+
if pt in ("text-generation", "conversational"):
|
|
241
|
+
return (out or {}).get("kind") == "text"
|
|
242
|
+
return False
|
|
243
|
+
|
|
244
|
+
def _empty_torch_cache() -> None:
    """Release torch's cached CUDA memory when torch and CUDA are available.

    Purely best-effort: any failure (torch not installed, CUDA error, ...) is
    deliberately ignored so that unloading never fails because of it.
    """
    try:
        import torch

        if torch.cuda.is_available():
            torch.cuda.empty_cache()
    except Exception:
        # Intentional: cache cleanup is an optimisation, not a requirement.
        pass
|
|
251
|
+
|
|
252
|
+
def actionable_error(e: Exception) -> str:
    """Translate raw tracebacks into messages a user can act on.

    Maps common failure modes (OOM, gated repos, missing deps, ...) to guidance
    the UI and API can surface directly.

    Args:
        e: The exception to describe; only `str(e)` is inspected.

    Returns:
        A guidance message for the first matching failure mode, otherwise
        `str(e)` unchanged.
    """
    msg = str(e)
    lower = msg.lower()

    # Also covers "cuda out of memory", which contains the same phrase.
    if "out of memory" in lower:
        return "Out of memory - try a smaller model or switch to CPU (disable CUDA) in settings."

    if "cve-2025-32434" in lower or ("torch" in lower and "v2.6" in msg):
        return ("Your torch version is too old - transformers requires torch ≥ 2.6 to load this model's weights. "
                "Reinstall LocalML's inference extra with a torch ≥ 2.6 wheel.")

    if "not a valid" in lower and "trust_remote_code" in lower:
        return ("This model requires `trust_remote_code=True`. Add an entry for it in "
                "python/model_overrides.json: { \"trust_remote_code\": true }.")

    is_gated = (
        "gatedrepoerror" in lower
        or "gated repo" in lower
        or ("access to model" in lower and "restricted" in lower)
        or ("401" in msg and ("huggingface" in lower or "unauthorized" in lower))
        or ("403" in msg and ("huggingface" in lower or "forbidden" in lower))
        or "must be authenticated" in lower
        or "you need to be logged in" in lower
    )
    if is_gated:
        return ("This model is gated or private - it requires a Hugging Face access token. "
                "Open Settings → HF Token, paste a token from "
                "https://huggingface.co/settings/tokens (Read access is enough), then retry.")

    if "no module named" in lower:
        # Take the name that follows the phrase itself; splitting on the first
        # apostrophe picks up unrelated text when the message contains an
        # earlier quote (e.g. "Couldn't import ...: No module named 'x'").
        found = re.search(r"no module named\s+['\"]?([^'\"\s]+)", msg, re.IGNORECASE)
        mod = found.group(1) if found else "unknown"
        return f"Missing Python package: `{mod}`. Install it into the LocalML environment and retry."

    m = re.search(r"requires the (\S+) library", msg)
    if m:
        mod = m.group(1).strip("`'\".,")
        return f"Missing Python package: `{mod}`. Install it into the LocalML environment and retry."

    if "could not load model" in lower or "not a recognized model" in lower:
        return (f"{msg}\n\nThis model doesn't fit any registered family. Add a folder "
                "under python/models/ for it, pin it in python/model_overrides.json, "
                "or drop a plugin file in python/plugins/.")

    return msg
|
|
293
|
+
|
|
294
|
+
ENGINE = Engine()
|
|
@@ -0,0 +1,137 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: inferml
|
|
3
|
+
Version: 1.0.1
|
|
4
|
+
Summary: Any HuggingFace model. Local. Multi-modal. Served over an OpenAI-compatible API.
|
|
5
|
+
Author: LocalML
|
|
6
|
+
License: MIT License
|
|
7
|
+
|
|
8
|
+
Copyright (c) 2026 LocalML, Gitesh Chawda
|
|
9
|
+
|
|
10
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
11
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
12
|
+
in the Software without restriction, including without limitation the rights
|
|
13
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
14
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
15
|
+
furnished to do so, subject to the following conditions:
|
|
16
|
+
|
|
17
|
+
The above copyright notice and this permission notice shall be included in all
|
|
18
|
+
copies or substantial portions of the Software.
|
|
19
|
+
|
|
20
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
21
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
22
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
23
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
24
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
25
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
26
|
+
THE SOFTWARE.
|
|
27
|
+
|
|
28
|
+
Project-URL: Homepage, https://github.com/IMvision12/InferML
|
|
29
|
+
Keywords: huggingface,transformers,inference,openai,local,llm,diffusion
|
|
30
|
+
Requires-Python: >=3.10
|
|
31
|
+
Description-Content-Type: text/markdown
|
|
32
|
+
License-File: LICENSE
|
|
33
|
+
Requires-Dist: fastapi>=0.110
|
|
34
|
+
Requires-Dist: uvicorn[standard]>=0.29
|
|
35
|
+
Requires-Dist: huggingface_hub
|
|
36
|
+
Requires-Dist: platformdirs>=4
|
|
37
|
+
Requires-Dist: psutil>=5.9
|
|
38
|
+
Provides-Extra: inference
|
|
39
|
+
Requires-Dist: transformers>=5.7.0; extra == "inference"
|
|
40
|
+
Requires-Dist: torch>=2.6; extra == "inference"
|
|
41
|
+
Requires-Dist: torchvision; extra == "inference"
|
|
42
|
+
Requires-Dist: torchaudio>=2.6; extra == "inference"
|
|
43
|
+
Requires-Dist: diffusers; extra == "inference"
|
|
44
|
+
Requires-Dist: accelerate; extra == "inference"
|
|
45
|
+
Requires-Dist: timm; extra == "inference"
|
|
46
|
+
Requires-Dist: pillow; extra == "inference"
|
|
47
|
+
Requires-Dist: soundfile; extra == "inference"
|
|
48
|
+
Requires-Dist: librosa; extra == "inference"
|
|
49
|
+
Requires-Dist: numpy; extra == "inference"
|
|
50
|
+
Requires-Dist: scipy; extra == "inference"
|
|
51
|
+
Requires-Dist: sentencepiece; extra == "inference"
|
|
52
|
+
Requires-Dist: protobuf; extra == "inference"
|
|
53
|
+
Dynamic: license-file
|
|
54
|
+
|
|
55
|
+
<p align="center">
|
|
56
|
+
<img src="assets/logo.png" alt="LocalML logo" width="140" />
|
|
57
|
+
</p>
|
|
58
|
+
|
|
59
|
+
# LocalML
|
|
60
|
+
|
|
61
|
+
Any Hugging Face model. Local. Multi-modal. Now a **local web server** with an
|
|
62
|
+
**OpenAI-compatible API** - no Electron, no native binary.
|
|
63
|
+
|
|
64
|
+
Run 143+ model families fully on-device (LLMs, VLMs, diffusion, ASR, TTS,
|
|
65
|
+
segmentation, detection) behind a browser UI, and point agent frameworks
|
|
66
|
+
(LangChain, LangGraph, the OpenAI SDK) at it the way you point them at Ollama.
|
|
67
|
+
|
|
68
|
+
## Install
|
|
69
|
+
|
|
70
|
+
Requires **Python 3.10+** - the installer checks for it but won't install Python
|
|
71
|
+
for you. Install with one line in your terminal:
|
|
72
|
+
|
|
73
|
+
```bash
|
|
74
|
+
# macOS / Linux
|
|
75
|
+
curl -fsSL https://www.localml.tech/install.sh | sh
|
|
76
|
+
# Windows (PowerShell)
|
|
77
|
+
irm https://www.localml.tech/install.ps1 | iex
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
The script bootstraps pipx and installs the LocalML server. On first launch the
|
|
81
|
+
app walks you through installing the inference stack (PyTorch + transformers) for
|
|
82
|
+
your hardware - pick **CPU** or **GPU** and it fetches the matching build.
|
|
83
|
+
|
|
84
|
+
Prefer to do it by hand?
|
|
85
|
+
|
|
86
|
+
```bash
|
|
87
|
+
pipx install inferml # server only; the app installs torch on first run
|
|
88
|
+
pipx install "inferml[inference]" # or grab the whole stack up front (generic torch wheel)
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
## Run
|
|
92
|
+
|
|
93
|
+
```bash
|
|
94
|
+
localml # starts the server and opens http://localhost:11500
|
|
95
|
+
localml --port 8080 # custom port
|
|
96
|
+
localml --host 0.0.0.0 --no-browser # expose on the LAN, headless
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
Open the printed URL, download a model from the Hub tab, and run it.
|
|
100
|
+
|
|
101
|
+
## OpenAI-compatible API
|
|
102
|
+
|
|
103
|
+
Point any OpenAI client at `http://localhost:11500/v1` (any API key works). It routes
|
|
104
|
+
to whichever LLM is currently loaded in LocalML.
|
|
105
|
+
|
|
106
|
+
```python
|
|
107
|
+
from openai import OpenAI
|
|
108
|
+
client = OpenAI(base_url="http://localhost:11500/v1", api_key="not-needed")
|
|
109
|
+
client.chat.completions.create(
|
|
110
|
+
model="Qwen/Qwen2.5-0.5B-Instruct",
|
|
111
|
+
messages=[{"role": "user", "content": "Hello!"}],
|
|
112
|
+
)
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
Supports streaming (`stream=True`), `GET /v1/models`, and tool/function calling
|
|
116
|
+
for the Qwen/Hermes, Llama, and Mistral families.
|
|
117
|
+
|
|
118
|
+
## Docker
|
|
119
|
+
|
|
120
|
+
```bash
|
|
121
|
+
docker build -t localml .
|
|
122
|
+
docker run --rm -p 11500:11500 localml # CPU
|
|
123
|
+
docker run --rm --gpus all -p 11500:11500 localml # GPU
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
## Development
|
|
127
|
+
|
|
128
|
+
The React UI lives in `src/renderer/` (built with esbuild) and talks to the
|
|
129
|
+
server via `window.localml` (see `src/renderer/web-bridge.js`). The Python
|
|
130
|
+
server + inference engine live in `python/`.
|
|
131
|
+
|
|
132
|
+
```bash
|
|
133
|
+
npm install # build deps (esbuild + the vendored UMD libs)
|
|
134
|
+
npm run build # compile the renderer and bundle it into the package
|
|
135
|
+
pip install -e ".[inference]"
|
|
136
|
+
localml
|
|
137
|
+
```
|