inferml 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (222) hide show
  1. _win_compat.py +52 -0
  2. adapters/__init__.py +65 -0
  3. adapters/base.py +37 -0
  4. adapters/diffusers_pipeline.py +57 -0
  5. adapters/standard_pipeline.py +31 -0
  6. engine.py +294 -0
  7. inferml-1.0.1.dist-info/METADATA +137 -0
  8. inferml-1.0.1.dist-info/RECORD +222 -0
  9. inferml-1.0.1.dist-info/WHEEL +5 -0
  10. inferml-1.0.1.dist-info/entry_points.txt +2 -0
  11. inferml-1.0.1.dist-info/licenses/LICENSE +21 -0
  12. inferml-1.0.1.dist-info/top_level.txt +9 -0
  13. io_utils.py +68 -0
  14. models/__init__.py +167 -0
  15. models/_diffusion_helper.py +45 -0
  16. models/_pipeline_helper.py +50 -0
  17. models/aria/__init__.py +6 -0
  18. models/bamba/__init__.py +6 -0
  19. models/bark/__init__.py +6 -0
  20. models/bit/__init__.py +6 -0
  21. models/bitnet/__init__.py +6 -0
  22. models/blip/__init__.py +12 -0
  23. models/bloom/__init__.py +6 -0
  24. models/chameleon/__init__.py +6 -0
  25. models/clip/__init__.py +6 -0
  26. models/codegen/__init__.py +6 -0
  27. models/cohere/__init__.py +6 -0
  28. models/cohere2_vision/__init__.py +6 -0
  29. models/conditional_detr/__init__.py +6 -0
  30. models/convnext/__init__.py +6 -0
  31. models/csm/__init__.py +11 -0
  32. models/cvt/__init__.py +6 -0
  33. models/d_fine/__init__.py +6 -0
  34. models/data2vec_vision/__init__.py +6 -0
  35. models/dbrx/__init__.py +6 -0
  36. models/deepseek/__init__.py +6 -0
  37. models/deepseek_vl/__init__.py +5 -0
  38. models/deepseek_vl/adapter.py +93 -0
  39. models/deformable_detr/__init__.py +6 -0
  40. models/depth_anything/__init__.py +6 -0
  41. models/depth_pro/__init__.py +6 -0
  42. models/detr/__init__.py +7 -0
  43. models/dia/__init__.py +13 -0
  44. models/donut/__init__.py +11 -0
  45. models/dpt/__init__.py +7 -0
  46. models/edgetam/__init__.py +14 -0
  47. models/efficientnet/__init__.py +6 -0
  48. models/emu3/__init__.py +6 -0
  49. models/eomt/__init__.py +6 -0
  50. models/eomt_dinov3/__init__.py +11 -0
  51. models/exaone/__init__.py +6 -0
  52. models/falcon/__init__.py +6 -0
  53. models/fastspeech2/__init__.py +6 -0
  54. models/fastvlm/__init__.py +5 -0
  55. models/fastvlm/adapter.py +99 -0
  56. models/florence2/__init__.py +5 -0
  57. models/florence2/adapter.py +102 -0
  58. models/flux/__init__.py +22 -0
  59. models/focalnet/__init__.py +6 -0
  60. models/fuyu/__init__.py +6 -0
  61. models/gemma/__init__.py +10 -0
  62. models/gemma3_vlm/__init__.py +6 -0
  63. models/git/__init__.py +6 -0
  64. models/glm/__init__.py +6 -0
  65. models/glm4v/__init__.py +6 -0
  66. models/got_ocr2/__init__.py +20 -0
  67. models/gpt2/__init__.py +6 -0
  68. models/gpt_oss/__init__.py +6 -0
  69. models/granite/__init__.py +6 -0
  70. models/granite_speech/__init__.py +15 -0
  71. models/grounding_dino/__init__.py +6 -0
  72. models/hunyuan_vl/__init__.py +6 -0
  73. models/idefics/__init__.py +6 -0
  74. models/instructpix2pix/__init__.py +19 -0
  75. models/internvl/__init__.py +6 -0
  76. models/jamba/__init__.py +6 -0
  77. models/janus/__init__.py +5 -0
  78. models/janus/adapter.py +125 -0
  79. models/kandinsky/__init__.py +14 -0
  80. models/kimi_vl/__init__.py +6 -0
  81. models/kolors/__init__.py +15 -0
  82. models/kosmos/__init__.py +6 -0
  83. models/kyutai_stt/__init__.py +11 -0
  84. models/layoutlmv3/__init__.py +9 -0
  85. models/levit/__init__.py +6 -0
  86. models/lfm2_vl/__init__.py +6 -0
  87. models/llama/__init__.py +6 -0
  88. models/llava/__init__.py +5 -0
  89. models/llava/adapter.py +79 -0
  90. models/m2m_100/__init__.py +6 -0
  91. models/mamba/__init__.py +6 -0
  92. models/marian/__init__.py +6 -0
  93. models/mask2former/__init__.py +6 -0
  94. models/maskformer/__init__.py +6 -0
  95. models/mgp_str/__init__.py +12 -0
  96. models/minicpm_v/__init__.py +6 -0
  97. models/minimax/__init__.py +6 -0
  98. models/mistral/__init__.py +6 -0
  99. models/mllama/__init__.py +6 -0
  100. models/mm_grounding_dino/__init__.py +12 -0
  101. models/mobilenet/__init__.py +7 -0
  102. models/moondream/__init__.py +5 -0
  103. models/moondream/adapter.py +37 -0
  104. models/moonshine/__init__.py +6 -0
  105. models/mpt/__init__.py +6 -0
  106. models/musicgen/__init__.py +6 -0
  107. models/nemotron/__init__.py +6 -0
  108. models/olmo/__init__.py +6 -0
  109. models/omdet_turbo/__init__.py +11 -0
  110. models/oneformer/__init__.py +11 -0
  111. models/opt/__init__.py +6 -0
  112. models/ovis/__init__.py +6 -0
  113. models/owlvit/__init__.py +6 -0
  114. models/paligemma/__init__.py +6 -0
  115. models/parakeet/__init__.py +6 -0
  116. models/persimmon/__init__.py +6 -0
  117. models/phi/__init__.py +6 -0
  118. models/pix2struct/__init__.py +6 -0
  119. models/pixart/__init__.py +14 -0
  120. models/playground/__init__.py +14 -0
  121. models/poolformer/__init__.py +6 -0
  122. models/pop2piano/__init__.py +13 -0
  123. models/prophetnet/__init__.py +6 -0
  124. models/pvt/__init__.py +6 -0
  125. models/qwen/__init__.py +9 -0
  126. models/qwen_vl/__init__.py +5 -0
  127. models/qwen_vl/adapter.py +83 -0
  128. models/regnet/__init__.py +6 -0
  129. models/resnet/__init__.py +6 -0
  130. models/rt_detr/__init__.py +6 -0
  131. models/rwkv/__init__.py +6 -0
  132. models/sam/__init__.py +6 -0
  133. models/sam2/__init__.py +6 -0
  134. models/sam3/__init__.py +6 -0
  135. models/sam_hq/__init__.py +10 -0
  136. models/sana/__init__.py +16 -0
  137. models/sd_inpainting/__init__.py +23 -0
  138. models/sdxl/__init__.py +25 -0
  139. models/sdxl_refiner/__init__.py +18 -0
  140. models/sdxl_turbo/__init__.py +16 -0
  141. models/seamless_m4t/__init__.py +6 -0
  142. models/segformer/__init__.py +12 -0
  143. models/siglip/__init__.py +6 -0
  144. models/smollm/__init__.py +6 -0
  145. models/smolvlm/__init__.py +6 -0
  146. models/speecht5/__init__.py +6 -0
  147. models/stable_diffusion/__init__.py +22 -0
  148. models/stablelm/__init__.py +6 -0
  149. models/starcoder2/__init__.py +6 -0
  150. models/swiftformer/__init__.py +6 -0
  151. models/swin/__init__.py +6 -0
  152. models/table_transformer/__init__.py +6 -0
  153. models/timm/__init__.py +12 -0
  154. models/trocr/__init__.py +6 -0
  155. models/upernet/__init__.py +6 -0
  156. models/vision_encoder_decoder/__init__.py +7 -0
  157. models/vit/__init__.py +12 -0
  158. models/vits/__init__.py +6 -0
  159. models/voxtral/__init__.py +18 -0
  160. models/wav2vec2/__init__.py +21 -0
  161. models/whisper/__init__.py +11 -0
  162. models/xglm/__init__.py +6 -0
  163. models/xlnet/__init__.py +6 -0
  164. models/yolos/__init__.py +6 -0
  165. models/zamba/__init__.py +6 -0
  166. models/zoedepth/__init__.py +6 -0
  167. output_kinds.py +56 -0
  168. routing.py +202 -0
  169. server/__init__.py +13 -0
  170. server/_data/model_overrides.json +32 -0
  171. server/_data/supported_architectures.json +307 -0
  172. server/app.py +71 -0
  173. server/appdata.py +67 -0
  174. server/cli.py +68 -0
  175. server/deps.py +55 -0
  176. server/events.py +42 -0
  177. server/hf_service.py +319 -0
  178. server/hw_service.py +131 -0
  179. server/openai_api/__init__.py +9 -0
  180. server/openai_api/llm.py +236 -0
  181. server/openai_api/routes.py +236 -0
  182. server/openai_api/tools/__init__.py +55 -0
  183. server/openai_api/tools/base.py +80 -0
  184. server/openai_api/tools/hermes_qwen.py +28 -0
  185. server/openai_api/tools/llama.py +29 -0
  186. server/openai_api/tools/mistral.py +31 -0
  187. server/paths.py +24 -0
  188. server/routes/__init__.py +1 -0
  189. server/routes/hf.py +65 -0
  190. server/routes/inference.py +286 -0
  191. server/routes/store.py +42 -0
  192. server/routes/system.py +147 -0
  193. server/store_service.py +134 -0
  194. server/webui/components/app.js +718 -0
  195. server/webui/components/chat.js +288 -0
  196. server/webui/components/home.js +173 -0
  197. server/webui/components/icons.js +50 -0
  198. server/webui/components/model-browser.js +559 -0
  199. server/webui/components/onboarding.js +193 -0
  200. server/webui/components/settings.js +512 -0
  201. server/webui/components/task-workspace.js +1286 -0
  202. server/webui/components/welcome.js +4 -0
  203. server/webui/index.html +26 -0
  204. server/webui/styles.css +2109 -0
  205. server/webui/vendor/marked.umd.js +79 -0
  206. server/webui/vendor/purify.min.js +3 -0
  207. server/webui/vendor/react-dom.production.min.js +267 -0
  208. server/webui/vendor/react.production.min.js +31 -0
  209. server/webui/web-bridge.js +247 -0
  210. tasks/__init__.py +61 -0
  211. tasks/_render.py +120 -0
  212. tasks/asr.py +66 -0
  213. tasks/base.py +93 -0
  214. tasks/depth_estimation.py +88 -0
  215. tasks/document_qa.py +58 -0
  216. tasks/image_classification.py +48 -0
  217. tasks/image_segmentation.py +205 -0
  218. tasks/image_to_text.py +94 -0
  219. tasks/mask_generation.py +300 -0
  220. tasks/misc_tasks.py +122 -0
  221. tasks/object_detection.py +112 -0
  222. tasks/text_generation.py +162 -0
_win_compat.py ADDED
@@ -0,0 +1,52 @@
1
+ """Windows compatibility patches applied process-wide at sidecar boot.
2
+
3
+ Currently:
4
+ - os.symlink → transparent copy fallback when the caller lacks
5
+ SeCreateSymbolicLinkPrivilege (the WinError 1314 case). HuggingFace's
6
+ cache layout uses symlinks to dedup blobs across snapshots, and a
7
+ standard non-admin user without Developer Mode hits this on every
8
+ download. POSIX users keep the real os.symlink (symlinks always work
9
+ there).
10
+
11
+ Import this module before any other library that may call os.symlink.
12
+ On non-Windows platforms the import is a no-op.
13
+ """
14
+ from __future__ import annotations
15
+
16
+ import os
17
+ import shutil
18
+ import sys
19
+
20
+
21
def _install_symlink_copy_fallback() -> None:
    """Swap ``os.symlink`` for a copy-on-privilege-error wrapper (Windows only).

    On any platform other than ``win32`` this returns immediately and leaves
    ``os.symlink`` untouched. On Windows the wrapper first tries the real
    ``os.symlink``; only when that fails with a privilege error (WinError 1314,
    or an ``OSError`` whose text mentions "privilege") does it copy the link
    target to ``dst`` instead. Every other ``OSError`` is re-raised unchanged.
    """
    if sys.platform != "win32":
        return

    native_symlink = os.symlink

    def _symlink(src, dst, target_is_directory=False, *, dir_fd=None):
        try:
            return native_symlink(
                src, dst, target_is_directory=target_is_directory, dir_fd=dir_fd
            )
        except OSError as exc:
            winerror = getattr(exc, "winerror", None)
            if winerror != 1314 and "privilege" not in str(exc).lower():
                raise

            # A relative link target is interpreted relative to the directory
            # that would have contained the link, not the current directory.
            if os.path.isabs(src):
                source_path = src
            else:
                link_parent = os.path.dirname(dst)
                source_path = os.path.normpath(os.path.join(link_parent, src))

            wants_directory = target_is_directory or os.path.isdir(source_path)
            if wants_directory:
                shutil.copytree(source_path, dst, dirs_exist_ok=True)
            else:
                shutil.copyfile(source_path, dst)
            return None

    os.symlink = _symlink
50
+
51
+
52
+ _install_symlink_copy_fallback()
adapters/__init__.py ADDED
@@ -0,0 +1,65 @@
1
+ """Adapter base classes + shared catch-all adapters.
2
+
3
+ Per-family inference code lives in `python/models/<family>/`. This package
4
+ holds only the cross-cutting pieces:
5
+
6
+ - `Adapter` the base class every family inherits from
7
+ - `StandardPipelineAdapter` fallback for repos with no dedicated family
8
+ - `DiffusersAdapter` library passthrough for diffusers checkpoints
9
+
10
+ Routing strategy lives in `routing.py` and is:
11
+
12
+ 1. Named override (via model_overrides.json `"adapter"` field)
13
+ 2. Plugin adapters (python/plugins/*.py)
14
+ 3. models/<family>/ registry (per-family folders)
15
+ 4. DiffusersAdapter (library == "diffusers")
16
+ 5. StandardPipelineAdapter (pipeline_tag in its task list)
17
+ """
18
+ from __future__ import annotations
19
+
20
+ from .base import Adapter # noqa: F401
21
+ from .standard_pipeline import StandardPipelineAdapter
22
+ from .diffusers_pipeline import DiffusersAdapter
23
+
24
def _named_adapters() -> dict[str, type]:
    """Assemble the name -> adapter-class map behind ``NAMED_ADAPTERS``.

    The map always contains the two shared fallbacks (``"standard"`` and
    ``"diffusers"``). Each entry of ``models.FAMILIES`` that provides an
    ``"adapter"`` class is then added under its family name, plus under a short
    alias derived from the class name (``"Adapter"`` removed, lowercased) when
    that alias is not already taken.

    ``models`` is imported inside the function so the lookup happens at call
    time rather than at module import. Any exception raised while importing or
    walking ``FAMILIES`` is swallowed on purpose: the entries collected so far
    are returned.
    """
    registry: dict[str, type] = {
        "standard": StandardPipelineAdapter,
        "diffusers": DiffusersAdapter,
    }
    try:
        from models import FAMILIES

        for family, spec in FAMILIES.items():
            adapter_cls = spec.get("adapter")
            if adapter_cls is not None:
                registry[family] = adapter_cls
                alias = adapter_cls.__name__.replace("Adapter", "").lower()
                # Never let an alias displace a name that is already registered.
                registry.setdefault(alias, adapter_cls)
    except Exception:
        # Best-effort: fall back to whatever was registered before the failure.
        pass
    return registry
49
+
50
# Memoised result of `_named_adapters()`; stays None until first requested.
_NAMED_ADAPTERS_CACHE: "dict[str, type] | None" = None


def __getattr__(name: str):
    """Module-level attribute hook that builds ``NAMED_ADAPTERS`` on first access.

    Only ``NAMED_ADAPTERS`` is served here; the map is computed once and then
    returned from the cache. Any other missing attribute raises
    ``AttributeError``.
    """
    global _NAMED_ADAPTERS_CACHE
    if name != "NAMED_ADAPTERS":
        raise AttributeError(f"module 'adapters' has no attribute {name!r}")
    if _NAMED_ADAPTERS_CACHE is None:
        _NAMED_ADAPTERS_CACHE = _named_adapters()
    return _NAMED_ADAPTERS_CACHE
59
+
60
+ __all__ = [
61
+ "Adapter",
62
+ "StandardPipelineAdapter",
63
+ "DiffusersAdapter",
64
+ "NAMED_ADAPTERS",
65
+ ]
adapters/base.py ADDED
@@ -0,0 +1,37 @@
1
+ """Adapter base class.
2
+
3
+ One instance of an adapter = one loaded model. The router picks the adapter,
4
+ calls `load(info, device)` once, then `run(inputs, params)` per request.
5
+ Instances are cached by (adapter_class, model_id) in the engine.
6
+ """
7
+ from __future__ import annotations
8
+
9
+ from abc import ABC, abstractmethod
10
+
11
class Adapter(ABC):
    """Abstract base for everything that can load and run one model.

    Subclasses must implement `load` and `run`; `can_handle` and `unload`
    have default implementations.
    """

    # Per-model override settings. Subclasses in this package read it with
    # `self.override.get(...)`. NOTE(review): this class-level dict is shared
    # by every adapter that does not assign its own - confirm callers replace
    # it rather than mutate it in place.
    override: dict = {}

    @classmethod
    def can_handle(cls, info: dict) -> bool:
        """Report whether this adapter can run the model described by `info`.

        `info` is the dict from routing.inspect_model. Implementations should
        decide from metadata such as `model_id`, `model_type`,
        `architectures` or `tags` and must *not* download any weights.
        The base implementation declines every model.
        """
        return False

    @abstractmethod
    def load(self, info: dict, device) -> None:
        """Build the underlying model and its helpers (processor, tokenizer)."""

    @abstractmethod
    def run(self, inputs: dict, params: dict) -> dict:
        """Run one inference and return a dict shaped like one of the kinds in
        `output_kinds.py` (`boxes`, `masks`, `labels`, `text`, `image`,
        `audio`, `vector`)."""

    def unload(self) -> None:
        """Release held objects by setting every instance attribute to None.

        An instance-level `override` is the one attribute left untouched.
        """
        for attr_name in tuple(vars(self)):
            if attr_name != "override":
                setattr(self, attr_name, None)
@@ -0,0 +1,57 @@
1
+ """Diffusers fallback. text-to-image, img2img, inpainting."""
2
+ from __future__ import annotations
3
+
4
+ from .base import Adapter
5
+ import output_kinds as ok
6
+ from io_utils import decode_image, resolve_device, torch_dtype_for_device
7
+
8
class DiffusersAdapter(Adapter):
    """Fallback adapter that drives diffusers checkpoints via the Auto pipelines."""

    @classmethod
    def can_handle(cls, info):
        """Accept diffusers-library repos (except text-to-video) and image-generation tags."""
        pipeline_tag = info.get("pipeline_tag")
        if info.get("library") == "diffusers":
            return pipeline_tag != "text-to-video"
        return pipeline_tag in ("text-to-image", "image-to-image", "inpainting")

    def load(self, info, device):
        """Instantiate the pipeline class matching the task and move it to the device.

        `device` is stored on the instance; the actual placement uses the
        value returned by `resolve_device()`.
        """
        self.info = info
        self.device = device
        self.task = info.get("pipeline_tag") or "text-to-image"
        dtype = torch_dtype_for_device()

        # Imported here rather than at module top, so diffusers is only
        # required once a diffusers model is actually loaded.
        from diffusers import (
            AutoPipelineForText2Image,
            AutoPipelineForImage2Image,
            AutoPipelineForInpainting,
        )

        if self.task == "image-to-image":
            pipeline_cls = AutoPipelineForImage2Image
        elif self.task == "inpainting":
            pipeline_cls = AutoPipelineForInpainting
        else:
            # Any other tag falls back to plain text-to-image.
            pipeline_cls = AutoPipelineForText2Image

        load_kwargs = {"torch_dtype": dtype}
        if self.override.get("trust_remote_code"):
            load_kwargs["trust_remote_code"] = True
        self.pipe = pipeline_cls.from_pretrained(info["model_id"], **load_kwargs)

        target = resolve_device()
        # NOTE(review): `False` is treated as "do not move the pipeline" -
        # confirm against io_utils.resolve_device.
        if target is not False:
            self.pipe = self.pipe.to(target)

    def run(self, inputs, params):
        """Generate one image from `inputs["text"]` and return it as an image output.

        Raises:
            ValueError: when the prompt is missing or blank.
        """
        prompt = (inputs.get("text") or "").strip()
        if not prompt:
            raise ValueError("Prompt required")

        # Forward only the generation knobs this adapter knows about.
        call_kwargs = {}
        for option in ("num_inference_steps", "guidance_scale", "negative_prompt", "strength"):
            if option in params:
                call_kwargs[option] = params[option]
        call_kwargs.setdefault("num_inference_steps", 20)
        call_kwargs.setdefault("guidance_scale", 7.5)

        # NOTE(review): only the source image is passed for inpainting; no
        # mask is forwarded here - confirm that is intended.
        if self.task in ("image-to-image", "inpainting") and inputs.get("dataUrl"):
            call_kwargs["image"] = decode_image(inputs["dataUrl"])

        generated = self.pipe(prompt, **call_kwargs)
        return ok.image(generated.images[0])
@@ -0,0 +1,31 @@
1
+ """Standard HF pipeline adapter. Fallback when no `models/<family>/` matches.
2
+
3
+ Thin dispatcher: the actual per-task logic lives in `python/tasks/`. When a
4
+ model breaks, add a folder under `python/models/` (preferred) or a Variant
5
+ in the relevant task file.
6
+ """
7
+ from __future__ import annotations
8
+
9
+ from .base import Adapter
10
+ from tasks import TASK_REGISTRY, get_task
11
+
12
+
13
class StandardPipelineAdapter(Adapter):
    """Dispatch a model to the task handler registered for its `pipeline_tag`."""

    # Task names this adapter accepts, snapshotted from TASK_REGISTRY at class creation.
    SUPPORTED_TASKS = {*TASK_REGISTRY.keys()}

    @classmethod
    def can_handle(cls, info):
        """Accept any model whose `pipeline_tag` has a registered task handler."""
        return info.get("pipeline_tag") in cls.SUPPORTED_TASKS

    def load(self, info, device):
        """Look up the task handler and let it build the pipeline state.

        Raises:
            ValueError: when no handler is registered for the model's task.
        """
        self.info = info
        self.device = device
        self.task_name = info["pipeline_tag"]
        self.handler = get_task(self.task_name)
        if self.handler is None:
            raise ValueError(f"No task handler registered for {self.task_name!r}")

        # Only forward trust_remote_code when the override explicitly enables it.
        extra = {}
        if self.override.get("trust_remote_code"):
            extra = {"trust_remote_code": True}
        self.state = self.handler.load_pipeline(info, device, extra_kwargs=extra)

    def run(self, inputs, params):
        """Delegate the request to the task handler with the loaded state."""
        return self.handler.handle(self.state, inputs, params)
engine.py ADDED
@@ -0,0 +1,294 @@
1
+ """LocalML inference engine - the reusable core.
2
+
3
+ Holds all model-loading and inference logic, driven in-process by the FastAPI
4
+ web server (`server/`). Running in-process is what lets the OpenAI-compatible
5
+ endpoint hold a live handle to the currently-loaded LLM and stream tokens from
6
+ it.
7
+
8
+ Design invariants:
9
+ - One model = one loaded pipeline. Adapter instances are cached by
10
+ (adapter_class_name, model_id); a second request reuses the loaded model.
11
+ - Inference is NOT thread-safe against itself (torch). Callers serialize.
12
+ The server runs `run()`/`download()` in a threadpool behind a single lock.
13
+ """
14
+ from __future__ import annotations
15
+
16
+ import fnmatch
17
+ import re
18
+ import sys
19
+ import threading
20
+ import time
21
+ from pathlib import Path
22
+
23
+ sys.path.insert(0, str(Path(__file__).parent.resolve()))
24
+
25
+ import _win_compat # noqa: F401, E402
26
+
27
+ from routing import inspect_model, pick_adapter, override_for # noqa: E402
28
+ from io_utils import resolve_device # noqa: E402
29
+
30
class DownloadCancelled(Exception):
    """Signals that a download was aborted.

    Raised by the progress-reporting tqdm subclass once the caller's cancel
    event has been set (the user dismissed the download).
    """
32
+
33
+ class Engine:
34
+ """Holds the adapter cache and drives load / run / download / unload."""
35
+
36
+ def __init__(self):
37
+ self._adapter_cache: dict = {}
38
+ self._current_llm_id: str | None = None
39
+ self._log = _default_log
40
+
41
+ def _get_adapter(self, info: dict):
42
+ model_id = info["model_id"]
43
+ adapter = pick_adapter(info)
44
+ cache_key = (type(adapter).__name__, model_id)
45
+ cached = self._adapter_cache.get(cache_key)
46
+ if cached is not None:
47
+ return cached
48
+ self._log(f"loading {type(adapter).__name__} for {model_id}")
49
+ dev = resolve_device()
50
+ adapter.load(info, dev)
51
+ self._adapter_cache[cache_key] = adapter
52
+ return adapter
53
+
54
+ def _resolve_info(self, model_id: str, task: str | None) -> dict:
55
+ info = inspect_model(model_id)
56
+ if not info.get("pipeline_tag") and task:
57
+ info["pipeline_tag"] = task
58
+ return info
59
+
60
+ def run(self, model_id: str, task: str | None, inputs: dict, params: dict | None) -> dict:
61
+ """Execute one inference. Returns an `output_kinds` dict.
62
+
63
+ Overrides merge under request params, the model is inspected + routed,
64
+ the adapter is loaded (or reused) and invoked. Records the model as the
65
+ current LLM when it's a text generator so the OpenAI endpoint can find
66
+ it.
67
+ """
68
+ if not model_id:
69
+ raise ValueError("Missing 'modelId' - the session isn't bound to a model")
70
+
71
+ inputs = inputs or {}
72
+
73
+ override = override_for(model_id) or {}
74
+ ovr_params = override.get("params") or {}
75
+ req_params = params or {}
76
+ merged_params = {**ovr_params, **req_params}
77
+
78
+ info = self._resolve_info(model_id, task)
79
+ adapter = self._get_adapter(info)
80
+
81
+ out = adapter.run(inputs, merged_params)
82
+
83
+ if _is_text_generation(info, task, out):
84
+ self._current_llm_id = model_id
85
+ return out
86
+
87
+ def current_llm_id(self) -> str | None:
88
+ return self._current_llm_id
89
+
90
+ def loaded_model_ids(self) -> list[str]:
91
+ seen = []
92
+ for (_cls, model_id) in self._adapter_cache.keys():
93
+ if model_id not in seen:
94
+ seen.append(model_id)
95
+ return seen
96
+
97
+ def get_cached_adapter(self, model_id: str):
98
+ """Return a loaded adapter instance for `model_id`, or None."""
99
+ for (_cls, mid), adapter in self._adapter_cache.items():
100
+ if mid == model_id:
101
+ return adapter
102
+ return None
103
+
104
+ def ensure_loaded(self, model_id: str, task: str | None = None):
105
+ """Load a model without running inference and return its adapter.
106
+
107
+ Used by the OpenAI endpoint to lazy-load a model named in the request
108
+ body when it isn't resident yet.
109
+ """
110
+ cached = self.get_cached_adapter(model_id)
111
+ if cached is not None:
112
+ return cached
113
+ info = self._resolve_info(model_id, task)
114
+ return self._get_adapter(info)
115
+
116
+ def unload(self, model_id: str | None = None) -> int:
117
+ """Drop cached adapter(s), freeing references (GPU memory). Returns the
118
+ number of adapters unloaded. `None` unloads everything."""
119
+ keys = [k for k in self._adapter_cache
120
+ if model_id is None or k[1] == model_id]
121
+ for k in keys:
122
+ adapter = self._adapter_cache.pop(k, None)
123
+ if adapter is not None:
124
+ try:
125
+ adapter.unload()
126
+ except Exception:
127
+ pass
128
+ if self._current_llm_id == k[1]:
129
+ self._current_llm_id = None
130
+ _empty_torch_cache()
131
+ return len(keys)
132
+
133
+ def download(self, model_id: str, on_progress=None, cancel_event: "threading.Event | None" = None) -> dict:
134
+ """Run `snapshot_download`, streaming byte-level progress via the
135
+ `on_progress(dict)` callback. Picks exactly one weight format so multi-
136
+ format repos don't download 4× the bytes. Raises DownloadCancelled if
137
+ `cancel_event` is set mid-flight.
138
+
139
+ Progress is delivered to the `on_progress` callback.
140
+ """
141
+ from huggingface_hub import snapshot_download, HfApi
142
+ from tqdm.auto import tqdm as _BaseTqdm
143
+
144
+ if not model_id:
145
+ raise ValueError("Missing 'modelId'")
146
+
147
+ on_progress = on_progress or (lambda _evt: None)
148
+
149
+ WEIGHT_FORMAT_ORDER = ["safetensors", "bin", "pt", "ckpt", "msgpack", "h5", "onnx", "ot"]
150
+ WEIGHT_EXT_RX = re.compile(r"\.(safetensors|bin|pt|ckpt|msgpack|h5|onnx|ot)$", re.IGNORECASE)
151
+
152
+ siblings = []
153
+ try:
154
+ info = HfApi().model_info(model_id, files_metadata=True)
155
+ siblings = list(info.siblings or [])
156
+ except Exception:
157
+ siblings = []
158
+
159
+ chosen_ext = None
160
+ by_ext: dict = {}
161
+ for s in siblings:
162
+ m = WEIGHT_EXT_RX.search((getattr(s, "rfilename", "") or "").lower())
163
+ if not m:
164
+ continue
165
+ by_ext.setdefault(m.group(1), []).append(s)
166
+ for ext in WEIGHT_FORMAT_ORDER:
167
+ if by_ext.get(ext):
168
+ chosen_ext = ext
169
+ break
170
+
171
+ ignore_patterns: list = []
172
+ if chosen_ext:
173
+ for ext in WEIGHT_FORMAT_ORDER:
174
+ if ext != chosen_ext and by_ext.get(ext):
175
+ ignore_patterns.append(f"*.{ext}")
176
+ if chosen_ext != "onnx":
177
+ ignore_patterns.append("onnx/*")
178
+
179
+ total_bytes = 0
180
+ if siblings:
181
+ for s in siblings:
182
+ name = (getattr(s, "rfilename", "") or "").lower()
183
+ if any(fnmatch.fnmatch(name, p) for p in ignore_patterns):
184
+ continue
185
+ sz = getattr(s, "size", None) or 0
186
+ if sz:
187
+ total_bytes += sz
188
+
189
+ state = {"done": 0, "last_emit": 0.0}
190
+ emit_lock = threading.Lock()
191
+
192
+ def emit(final: bool = False) -> None:
193
+ with emit_lock:
194
+ done = state["done"]
195
+ pct = (done / total_bytes * 100.0) if total_bytes else 0.0
196
+ on_progress({
197
+ "done": int(done),
198
+ "total": int(total_bytes),
199
+ "pct": round(pct, 2),
200
+ "final": bool(final),
201
+ })
202
+
203
+ class ProgressTqdm(_BaseTqdm):
204
+ def __init__(self, *args, **kwargs):
205
+ self._is_bytes = kwargs.get("unit") == "B"
206
+ kwargs["disable"] = True
207
+ super().__init__(*args, **kwargs)
208
+
209
+ def update(self, n=1):
210
+ if cancel_event is not None and cancel_event.is_set():
211
+ raise DownloadCancelled()
212
+ super().update(n)
213
+ if self._is_bytes and n:
214
+ should_emit = False
215
+ with emit_lock:
216
+ state["done"] += n
217
+ now = time.time()
218
+ if (now - state["last_emit"]) >= 0.15:
219
+ state["last_emit"] = now
220
+ should_emit = True
221
+ if should_emit:
222
+ emit()
223
+
224
+ emit()
225
+ try:
226
+ kwargs: dict = {"repo_id": model_id, "tqdm_class": ProgressTqdm}
227
+ if ignore_patterns:
228
+ kwargs["ignore_patterns"] = ignore_patterns
229
+ path = snapshot_download(**kwargs)
230
+ finally:
231
+ emit(final=True)
232
+ return {"path": path, "bytes": state["done"], "total_bytes": total_bytes}
233
+
234
def _default_log(msg: str) -> None:
    """Write ``msg`` to stderr with an ``[engine]`` prefix, flushing immediately."""
    line = f"[engine] {msg}"
    print(line, file=sys.stderr, flush=True)
236
+
237
+ def _is_text_generation(info: dict, task: str | None, out: dict) -> bool:
238
+ """Best-effort: did this run produce LLM text from a causal model?"""
239
+ pt = (info.get("pipeline_tag") or task or "").lower()
240
+ if pt in ("text-generation", "conversational"):
241
+ return (out or {}).get("kind") == "text"
242
+ return False
243
+
244
def _empty_torch_cache() -> None:
    """Ask torch to release cached CUDA memory, ignoring every failure.

    Does nothing when torch cannot be imported or CUDA is unavailable.
    """
    try:
        import torch

        cuda = torch.cuda
        if cuda.is_available():
            cuda.empty_cache()
    except Exception:
        # Cleanup is optional; a missing or broken torch must not surface here.
        pass
251
+
252
def actionable_error(e: Exception) -> str:
    """Translate an exception into a message the user can act on.

    The exception text is matched against known failure signatures (out of
    memory, outdated torch, ``trust_remote_code``, gated repos, missing
    packages, unrecognised models), checked in that order. The first match
    decides the guidance returned.

    Args:
        e: The exception raised while loading, running or downloading a model.

    Returns:
        Guidance text for a recognised failure; otherwise the exception's own
        message, or its class name when the message is empty.
    """
    # Never hand an empty string back to the UI/API.
    msg = str(e) or type(e).__name__
    lower = msg.lower()

    if "out of memory" in lower:
        return "Out of memory - try a smaller model or switch to CPU (disable CUDA) in settings."

    if "cve-2025-32434" in lower or ("torch" in lower and "v2.6" in msg):
        return ("Your torch version is too old - transformers requires torch ≥ 2.6 to load this model's weights. "
                "Reinstall LocalML's inference extra with a torch ≥ 2.6 wheel.")

    if "not a valid" in lower and "trust_remote_code" in lower:
        return ("This model requires `trust_remote_code=True`. Add an entry for it in "
                "python/model_overrides.json: { \"trust_remote_code\": true }.")

    is_gated = (
        "gatedrepoerror" in lower
        or "gated repo" in lower
        or ("access to model" in lower and "restricted" in lower)
        or ("401" in msg and ("huggingface" in lower or "unauthorized" in lower))
        or ("403" in msg and ("huggingface" in lower or "forbidden" in lower))
        or "must be authenticated" in lower
        or "you need to be logged in" in lower
    )
    if is_gated:
        return ("This model is gated or private - it requires a Hugging Face access token. "
                "Open Settings → HF Token, paste a token from "
                "https://huggingface.co/settings/tokens (Read access is enough), then retry.")

    if "no module named" in lower:
        # Anchor on the phrase itself: splitting on the first apostrophe picks
        # up unrelated text when the message contains one earlier
        # (e.g. "Can't import ...: No module named 'x'").
        found = re.search(r"no module named\s+['\"]?([\w.]+)", msg, re.IGNORECASE)
        mod = found.group(1) if found else "unknown"
        return f"Missing Python package: `{mod}`. Install it into the LocalML environment and retry."

    m = re.search(r"requires the (\S+) library", msg)
    if m:
        mod = m.group(1).strip("`'\".,")
        return f"Missing Python package: `{mod}`. Install it into the LocalML environment and retry."

    if "could not load model" in lower or "not a recognized model" in lower:
        return (f"{msg}\n\nThis model doesn't fit any registered family. Add a folder "
                "under python/models/ for it, pin it in python/model_overrides.json, "
                "or drop a plugin file in python/plugins/.")

    return msg
293
+
294
+ ENGINE = Engine()
@@ -0,0 +1,137 @@
1
+ Metadata-Version: 2.4
2
+ Name: inferml
3
+ Version: 1.0.1
4
+ Summary: Any HuggingFace model. Local. Multi-modal. Served over an OpenAI-compatible API.
5
+ Author: LocalML
6
+ License: MIT License
7
+
8
+ Copyright (c) 2026 LocalML, Gitesh Chawda
9
+
10
+ Permission is hereby granted, free of charge, to any person obtaining a copy
11
+ of this software and associated documentation files (the "Software"), to deal
12
+ in the Software without restriction, including without limitation the rights
13
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14
+ copies of the Software, and to permit persons to whom the Software is
15
+ furnished to do so, subject to the following conditions:
16
+
17
+ The above copyright notice and this permission notice shall be included in all
18
+ copies or substantial portions of the Software.
19
+
20
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
23
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26
+ THE SOFTWARE.
27
+
28
+ Project-URL: Homepage, https://github.com/IMvision12/InferML
29
+ Keywords: huggingface,transformers,inference,openai,local,llm,diffusion
30
+ Requires-Python: >=3.10
31
+ Description-Content-Type: text/markdown
32
+ License-File: LICENSE
33
+ Requires-Dist: fastapi>=0.110
34
+ Requires-Dist: uvicorn[standard]>=0.29
35
+ Requires-Dist: huggingface_hub
36
+ Requires-Dist: platformdirs>=4
37
+ Requires-Dist: psutil>=5.9
38
+ Provides-Extra: inference
39
+ Requires-Dist: transformers>=5.7.0; extra == "inference"
40
+ Requires-Dist: torch>=2.6; extra == "inference"
41
+ Requires-Dist: torchvision; extra == "inference"
42
+ Requires-Dist: torchaudio>=2.6; extra == "inference"
43
+ Requires-Dist: diffusers; extra == "inference"
44
+ Requires-Dist: accelerate; extra == "inference"
45
+ Requires-Dist: timm; extra == "inference"
46
+ Requires-Dist: pillow; extra == "inference"
47
+ Requires-Dist: soundfile; extra == "inference"
48
+ Requires-Dist: librosa; extra == "inference"
49
+ Requires-Dist: numpy; extra == "inference"
50
+ Requires-Dist: scipy; extra == "inference"
51
+ Requires-Dist: sentencepiece; extra == "inference"
52
+ Requires-Dist: protobuf; extra == "inference"
53
+ Dynamic: license-file
54
+
55
+ <p align="center">
56
+ <img src="assets/logo.png" alt="LocalML logo" width="140" />
57
+ </p>
58
+
59
+ # LocalML
60
+
61
+ Any Hugging Face model. Local. Multi-modal. Now a **local web server** with an
62
+ **OpenAI-compatible API** - no Electron, no native binary.
63
+
64
+ Run 143+ model families fully on-device (LLMs, VLMs, diffusion, ASR, TTS,
65
+ segmentation, detection) behind a browser UI, and point agent frameworks
66
+ (LangChain, LangGraph, the OpenAI SDK) at it the way you point them at Ollama.
67
+
68
+ ## Install
69
+
70
+ Requires **Python 3.10+** - the installer checks for it but won't install Python
71
+ for you. One line in your terminal:
72
+
73
+ ```bash
74
+ # macOS / Linux
75
+ curl -fsSL https://www.localml.tech/install.sh | sh
76
+ # Windows (PowerShell)
77
+ irm https://www.localml.tech/install.ps1 | iex
78
+ ```
79
+
80
+ The script bootstraps pipx and installs the LocalML server. On first launch the
81
+ app walks you through installing the inference stack (PyTorch + transformers) for
82
+ your hardware - pick **CPU** or **GPU** and it fetches the matching build.
83
+
84
+ Prefer to do it by hand?
85
+
86
+ ```bash
87
+ pipx install inferml # server only; the app installs torch on first run
88
+ pipx install "inferml[inference]" # or grab the whole stack up front (generic torch wheel)
89
+ ```
90
+
91
+ ## Run
92
+
93
+ ```bash
94
+ localml # starts the server and opens http://localhost:11500
95
+ localml --port 8080 # custom port
96
+ localml --host 0.0.0.0 --no-browser # expose on the LAN, headless
97
+ ```
98
+
99
+ Open the printed URL, download a model from the Hub tab, and run it.
100
+
101
+ ## OpenAI-compatible API
102
+
103
+ Point any OpenAI client at `http://localhost:11500/v1` (any API key is accepted). It routes
104
+ to whichever LLM is currently loaded in LocalML.
105
+
106
+ ```python
107
+ from openai import OpenAI
108
+ client = OpenAI(base_url="http://localhost:11500/v1", api_key="not-needed")
109
+ client.chat.completions.create(
110
+ model="Qwen/Qwen2.5-0.5B-Instruct",
111
+ messages=[{"role": "user", "content": "Hello!"}],
112
+ )
113
+ ```
114
+
115
+ Supports streaming (`stream=True`), `GET /v1/models`, and tool/function calling
116
+ for the Qwen/Hermes, Llama, and Mistral families.
117
+
118
+ ## Docker
119
+
120
+ ```bash
121
+ docker build -t localml .
122
+ docker run --rm -p 11500:11500 localml # CPU
123
+ docker run --rm --gpus all -p 11500:11500 localml # GPU
124
+ ```
125
+
126
+ ## Development
127
+
128
+ The React UI lives in `src/renderer/` (built with esbuild) and talks to the
129
+ server via `window.localml` (see `src/renderer/web-bridge.js`). The Python
130
+ server + inference engine live in `python/`.
131
+
132
+ ```bash
133
+ npm install # build deps (esbuild + the vendored UMD libs)
134
+ npm run build # compile the renderer and bundle it into the package
135
+ pip install -e ".[inference]"
136
+ localml
137
+ ```