superqode-0.1.5-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- superqode/__init__.py +33 -0
- superqode/acp/__init__.py +23 -0
- superqode/acp/client.py +913 -0
- superqode/acp/permission_screen.py +457 -0
- superqode/acp/types.py +480 -0
- superqode/acp_discovery.py +856 -0
- superqode/agent/__init__.py +22 -0
- superqode/agent/edit_strategies.py +334 -0
- superqode/agent/loop.py +892 -0
- superqode/agent/qe_report_templates.py +39 -0
- superqode/agent/system_prompts.py +353 -0
- superqode/agent_output.py +721 -0
- superqode/agent_stream.py +953 -0
- superqode/agents/__init__.py +59 -0
- superqode/agents/acp_registry.py +305 -0
- superqode/agents/client.py +249 -0
- superqode/agents/data/augmentcode.com.toml +51 -0
- superqode/agents/data/cagent.dev.toml +51 -0
- superqode/agents/data/claude.com.toml +60 -0
- superqode/agents/data/codeassistant.dev.toml +51 -0
- superqode/agents/data/codex.openai.com.toml +57 -0
- superqode/agents/data/fastagent.ai.toml +66 -0
- superqode/agents/data/geminicli.com.toml +77 -0
- superqode/agents/data/goose.block.xyz.toml +54 -0
- superqode/agents/data/junie.jetbrains.com.toml +56 -0
- superqode/agents/data/kimi.moonshot.cn.toml +57 -0
- superqode/agents/data/llmlingagent.dev.toml +51 -0
- superqode/agents/data/molt.bot.toml +49 -0
- superqode/agents/data/opencode.ai.toml +60 -0
- superqode/agents/data/stakpak.dev.toml +51 -0
- superqode/agents/data/vtcode.dev.toml +51 -0
- superqode/agents/discovery.py +266 -0
- superqode/agents/messaging.py +160 -0
- superqode/agents/persona.py +166 -0
- superqode/agents/registry.py +421 -0
- superqode/agents/schema.py +72 -0
- superqode/agents/unified.py +367 -0
- superqode/app/__init__.py +111 -0
- superqode/app/constants.py +314 -0
- superqode/app/css.py +366 -0
- superqode/app/models.py +118 -0
- superqode/app/suggester.py +125 -0
- superqode/app/widgets.py +1591 -0
- superqode/app_enhanced.py +399 -0
- superqode/app_main.py +17187 -0
- superqode/approval.py +312 -0
- superqode/atomic.py +296 -0
- superqode/commands/__init__.py +1 -0
- superqode/commands/acp.py +965 -0
- superqode/commands/agents.py +180 -0
- superqode/commands/auth.py +278 -0
- superqode/commands/config.py +374 -0
- superqode/commands/init.py +826 -0
- superqode/commands/providers.py +819 -0
- superqode/commands/qe.py +1145 -0
- superqode/commands/roles.py +380 -0
- superqode/commands/serve.py +172 -0
- superqode/commands/suggestions.py +127 -0
- superqode/commands/superqe.py +460 -0
- superqode/config/__init__.py +51 -0
- superqode/config/loader.py +812 -0
- superqode/config/schema.py +498 -0
- superqode/core/__init__.py +111 -0
- superqode/core/roles.py +281 -0
- superqode/danger.py +386 -0
- superqode/data/superqode-template.yaml +1522 -0
- superqode/design_system.py +1080 -0
- superqode/dialogs/__init__.py +6 -0
- superqode/dialogs/base.py +39 -0
- superqode/dialogs/model.py +130 -0
- superqode/dialogs/provider.py +870 -0
- superqode/diff_view.py +919 -0
- superqode/enterprise.py +21 -0
- superqode/evaluation/__init__.py +25 -0
- superqode/evaluation/adapters.py +93 -0
- superqode/evaluation/behaviors.py +89 -0
- superqode/evaluation/engine.py +209 -0
- superqode/evaluation/scenarios.py +96 -0
- superqode/execution/__init__.py +36 -0
- superqode/execution/linter.py +538 -0
- superqode/execution/modes.py +347 -0
- superqode/execution/resolver.py +283 -0
- superqode/execution/runner.py +642 -0
- superqode/file_explorer.py +811 -0
- superqode/file_viewer.py +471 -0
- superqode/flash.py +183 -0
- superqode/guidance/__init__.py +58 -0
- superqode/guidance/config.py +203 -0
- superqode/guidance/prompts.py +71 -0
- superqode/harness/__init__.py +54 -0
- superqode/harness/accelerator.py +291 -0
- superqode/harness/config.py +319 -0
- superqode/harness/validator.py +147 -0
- superqode/history.py +279 -0
- superqode/integrations/superopt_runner.py +124 -0
- superqode/logging/__init__.py +49 -0
- superqode/logging/adapters.py +219 -0
- superqode/logging/formatter.py +923 -0
- superqode/logging/integration.py +341 -0
- superqode/logging/sinks.py +170 -0
- superqode/logging/unified_log.py +417 -0
- superqode/lsp/__init__.py +26 -0
- superqode/lsp/client.py +544 -0
- superqode/main.py +1069 -0
- superqode/mcp/__init__.py +89 -0
- superqode/mcp/auth_storage.py +380 -0
- superqode/mcp/client.py +1236 -0
- superqode/mcp/config.py +319 -0
- superqode/mcp/integration.py +337 -0
- superqode/mcp/oauth.py +436 -0
- superqode/mcp/oauth_callback.py +385 -0
- superqode/mcp/types.py +290 -0
- superqode/memory/__init__.py +31 -0
- superqode/memory/feedback.py +342 -0
- superqode/memory/store.py +522 -0
- superqode/notifications.py +369 -0
- superqode/optimization/__init__.py +5 -0
- superqode/optimization/config.py +33 -0
- superqode/permissions/__init__.py +25 -0
- superqode/permissions/rules.py +488 -0
- superqode/plan.py +323 -0
- superqode/providers/__init__.py +33 -0
- superqode/providers/gateway/__init__.py +165 -0
- superqode/providers/gateway/base.py +228 -0
- superqode/providers/gateway/litellm_gateway.py +1170 -0
- superqode/providers/gateway/openresponses_gateway.py +436 -0
- superqode/providers/health.py +297 -0
- superqode/providers/huggingface/__init__.py +74 -0
- superqode/providers/huggingface/downloader.py +472 -0
- superqode/providers/huggingface/endpoints.py +442 -0
- superqode/providers/huggingface/hub.py +531 -0
- superqode/providers/huggingface/inference.py +394 -0
- superqode/providers/huggingface/transformers_runner.py +516 -0
- superqode/providers/local/__init__.py +100 -0
- superqode/providers/local/base.py +438 -0
- superqode/providers/local/discovery.py +418 -0
- superqode/providers/local/lmstudio.py +256 -0
- superqode/providers/local/mlx.py +457 -0
- superqode/providers/local/ollama.py +486 -0
- superqode/providers/local/sglang.py +268 -0
- superqode/providers/local/tgi.py +260 -0
- superqode/providers/local/tool_support.py +477 -0
- superqode/providers/local/vllm.py +258 -0
- superqode/providers/manager.py +1338 -0
- superqode/providers/models.py +1016 -0
- superqode/providers/models_dev.py +578 -0
- superqode/providers/openresponses/__init__.py +87 -0
- superqode/providers/openresponses/converters/__init__.py +17 -0
- superqode/providers/openresponses/converters/messages.py +343 -0
- superqode/providers/openresponses/converters/tools.py +268 -0
- superqode/providers/openresponses/schema/__init__.py +56 -0
- superqode/providers/openresponses/schema/models.py +585 -0
- superqode/providers/openresponses/streaming/__init__.py +5 -0
- superqode/providers/openresponses/streaming/parser.py +338 -0
- superqode/providers/openresponses/tools/__init__.py +21 -0
- superqode/providers/openresponses/tools/apply_patch.py +352 -0
- superqode/providers/openresponses/tools/code_interpreter.py +290 -0
- superqode/providers/openresponses/tools/file_search.py +333 -0
- superqode/providers/openresponses/tools/mcp_adapter.py +252 -0
- superqode/providers/registry.py +716 -0
- superqode/providers/usage.py +332 -0
- superqode/pure_mode.py +384 -0
- superqode/qr/__init__.py +23 -0
- superqode/qr/dashboard.py +781 -0
- superqode/qr/generator.py +1018 -0
- superqode/qr/templates.py +135 -0
- superqode/safety/__init__.py +41 -0
- superqode/safety/sandbox.py +413 -0
- superqode/safety/warnings.py +256 -0
- superqode/server/__init__.py +33 -0
- superqode/server/lsp_server.py +775 -0
- superqode/server/web.py +250 -0
- superqode/session/__init__.py +25 -0
- superqode/session/persistence.py +580 -0
- superqode/session/sharing.py +477 -0
- superqode/session.py +475 -0
- superqode/sidebar.py +2991 -0
- superqode/stream_view.py +648 -0
- superqode/styles/__init__.py +3 -0
- superqode/superqe/__init__.py +184 -0
- superqode/superqe/acp_runner.py +1064 -0
- superqode/superqe/constitution/__init__.py +62 -0
- superqode/superqe/constitution/evaluator.py +308 -0
- superqode/superqe/constitution/loader.py +432 -0
- superqode/superqe/constitution/schema.py +250 -0
- superqode/superqe/events.py +591 -0
- superqode/superqe/frameworks/__init__.py +65 -0
- superqode/superqe/frameworks/base.py +234 -0
- superqode/superqe/frameworks/e2e.py +263 -0
- superqode/superqe/frameworks/executor.py +237 -0
- superqode/superqe/frameworks/javascript.py +409 -0
- superqode/superqe/frameworks/python.py +373 -0
- superqode/superqe/frameworks/registry.py +92 -0
- superqode/superqe/mcp_tools/__init__.py +47 -0
- superqode/superqe/mcp_tools/core_tools.py +418 -0
- superqode/superqe/mcp_tools/registry.py +230 -0
- superqode/superqe/mcp_tools/testing_tools.py +167 -0
- superqode/superqe/noise.py +89 -0
- superqode/superqe/orchestrator.py +778 -0
- superqode/superqe/roles.py +609 -0
- superqode/superqe/session.py +713 -0
- superqode/superqe/skills/__init__.py +57 -0
- superqode/superqe/skills/base.py +106 -0
- superqode/superqe/skills/core_skills.py +899 -0
- superqode/superqe/skills/registry.py +90 -0
- superqode/superqe/verifier.py +101 -0
- superqode/superqe_cli.py +76 -0
- superqode/tool_call.py +358 -0
- superqode/tools/__init__.py +93 -0
- superqode/tools/agent_tools.py +496 -0
- superqode/tools/base.py +324 -0
- superqode/tools/batch_tool.py +133 -0
- superqode/tools/diagnostics.py +311 -0
- superqode/tools/edit_tools.py +653 -0
- superqode/tools/enhanced_base.py +515 -0
- superqode/tools/file_tools.py +269 -0
- superqode/tools/file_tracking.py +45 -0
- superqode/tools/lsp_tools.py +610 -0
- superqode/tools/network_tools.py +350 -0
- superqode/tools/permissions.py +400 -0
- superqode/tools/question_tool.py +324 -0
- superqode/tools/search_tools.py +598 -0
- superqode/tools/shell_tools.py +259 -0
- superqode/tools/todo_tools.py +121 -0
- superqode/tools/validation.py +80 -0
- superqode/tools/web_tools.py +639 -0
- superqode/tui.py +1152 -0
- superqode/tui_integration.py +875 -0
- superqode/tui_widgets/__init__.py +27 -0
- superqode/tui_widgets/widgets/__init__.py +18 -0
- superqode/tui_widgets/widgets/progress.py +185 -0
- superqode/tui_widgets/widgets/tool_display.py +188 -0
- superqode/undo_manager.py +574 -0
- superqode/utils/__init__.py +5 -0
- superqode/utils/error_handling.py +323 -0
- superqode/utils/fuzzy.py +257 -0
- superqode/widgets/__init__.py +477 -0
- superqode/widgets/agent_collab.py +390 -0
- superqode/widgets/agent_store.py +936 -0
- superqode/widgets/agent_switcher.py +395 -0
- superqode/widgets/animation_manager.py +284 -0
- superqode/widgets/code_context.py +356 -0
- superqode/widgets/command_palette.py +412 -0
- superqode/widgets/connection_status.py +537 -0
- superqode/widgets/conversation_history.py +470 -0
- superqode/widgets/diff_indicator.py +155 -0
- superqode/widgets/enhanced_status_bar.py +385 -0
- superqode/widgets/enhanced_toast.py +476 -0
- superqode/widgets/file_browser.py +809 -0
- superqode/widgets/file_reference.py +585 -0
- superqode/widgets/issue_timeline.py +340 -0
- superqode/widgets/leader_key.py +264 -0
- superqode/widgets/mode_switcher.py +445 -0
- superqode/widgets/model_picker.py +234 -0
- superqode/widgets/permission_preview.py +1205 -0
- superqode/widgets/prompt.py +358 -0
- superqode/widgets/provider_connect.py +725 -0
- superqode/widgets/pty_shell.py +587 -0
- superqode/widgets/qe_dashboard.py +321 -0
- superqode/widgets/resizable_sidebar.py +377 -0
- superqode/widgets/response_changes.py +218 -0
- superqode/widgets/response_display.py +528 -0
- superqode/widgets/rich_tool_display.py +613 -0
- superqode/widgets/sidebar_panels.py +1180 -0
- superqode/widgets/slash_complete.py +356 -0
- superqode/widgets/split_view.py +612 -0
- superqode/widgets/status_bar.py +273 -0
- superqode/widgets/superqode_display.py +786 -0
- superqode/widgets/thinking_display.py +815 -0
- superqode/widgets/throbber.py +87 -0
- superqode/widgets/toast.py +206 -0
- superqode/widgets/unified_output.py +1073 -0
- superqode/workspace/__init__.py +75 -0
- superqode/workspace/artifacts.py +472 -0
- superqode/workspace/coordinator.py +353 -0
- superqode/workspace/diff_tracker.py +429 -0
- superqode/workspace/git_guard.py +373 -0
- superqode/workspace/git_snapshot.py +526 -0
- superqode/workspace/manager.py +750 -0
- superqode/workspace/snapshot.py +357 -0
- superqode/workspace/watcher.py +535 -0
- superqode/workspace/worktree.py +440 -0
- superqode-0.1.5.dist-info/METADATA +204 -0
- superqode-0.1.5.dist-info/RECORD +288 -0
- superqode-0.1.5.dist-info/WHEEL +5 -0
- superqode-0.1.5.dist-info/entry_points.txt +3 -0
- superqode-0.1.5.dist-info/licenses/LICENSE +648 -0
- superqode-0.1.5.dist-info/top_level.txt +1 -0

superqode/providers/huggingface/transformers_runner.py
@@ -0,0 +1,516 @@
+"""Local transformers runner for pure Python inference.
+
+This module provides the ability to run HuggingFace models locally
+using the transformers library without requiring Ollama or other
+external servers.
+
+Requires optional dependencies:
+    pip install superqode[transformers]
+
+Or manually:
+    pip install transformers accelerate torch
+"""
+
+import asyncio
+import gc
+import os
+from dataclasses import dataclass, field
+from typing import Any, Dict, List, Optional, Union
+
+
+@dataclass
+class TransformersConfig:
+    """Configuration for transformers model loading.
+
+    Attributes:
+        quantization: Quantization mode ("4bit", "8bit", None)
+        device_map: Device mapping strategy ("auto", "cpu", "cuda", etc.)
+        torch_dtype: Data type ("float16", "bfloat16", "float32", "auto")
+        max_memory: Max memory per device (e.g., {"cuda:0": "10GB"})
+        trust_remote_code: Allow executing model's custom code
+        use_flash_attention: Enable Flash Attention 2 if available
+        low_cpu_mem_usage: Reduce CPU memory during loading
+    """
+
+    quantization: Optional[str] = None
+    device_map: str = "auto"
+    torch_dtype: str = "auto"
+    max_memory: Optional[Dict[str, str]] = None
+    trust_remote_code: bool = False
+    use_flash_attention: bool = True
+    low_cpu_mem_usage: bool = True
+
+
+@dataclass
+class GenerationResult:
+    """Result from text generation.
+
+    Attributes:
+        content: Generated text
+        model_id: Model used
+        input_tokens: Number of input tokens
+        output_tokens: Number of generated tokens
+        time_seconds: Generation time
+        tokens_per_second: Generation speed
+        error: Error message if failed
+    """
+
+    content: str = ""
+    model_id: str = ""
+    input_tokens: int = 0
+    output_tokens: int = 0
+    time_seconds: float = 0.0
+    tokens_per_second: float = 0.0
+    error: str = ""
+
+
+@dataclass
+class LoadedModel:
+    """Information about a loaded model.
+
+    Attributes:
+        model_id: HuggingFace model ID
+        model: The loaded model object
+        tokenizer: The loaded tokenizer
+        config: Loading configuration used
+        memory_usage_gb: Estimated GPU memory usage
+    """
+
+    model_id: str
+    model: Any = None
+    tokenizer: Any = None
+    config: TransformersConfig = field(default_factory=TransformersConfig)
+    memory_usage_gb: float = 0.0
+
+
+class TransformersRunner:
+    """Run HuggingFace models locally using transformers.
+
+    This class provides a pure Python way to run models without
+    external servers like Ollama. It handles model loading, caching,
+    and generation with support for:
+
+    - 4-bit and 8-bit quantization via bitsandbytes
+    - Automatic device mapping (CPU/GPU)
+    - Flash Attention 2 when available
+    - Memory-efficient loading
+
+    Example:
+        runner = TransformersRunner()
+        await runner.load_model("microsoft/Phi-3.5-mini-instruct")
+
+        response = await runner.generate(
+            messages=[{"role": "user", "content": "Hello!"}]
+        )
+        print(response.content)
+
+        await runner.unload()
+    """
+
+    def __init__(self):
+        """Initialize the transformers runner."""
+        self._loaded: Optional[LoadedModel] = None
+        self._dependencies_checked = False
+        self._available_deps: Dict[str, bool] = {}
+
+    @property
+    def is_loaded(self) -> bool:
+        """Check if a model is currently loaded."""
+        return self._loaded is not None and self._loaded.model is not None
+
+    @property
+    def loaded_model_id(self) -> Optional[str]:
+        """Get the ID of the currently loaded model."""
+        return self._loaded.model_id if self._loaded else None
+
+    def check_dependencies(self) -> Dict[str, bool]:
+        """Check which transformers dependencies are available.
+
+        Returns:
+            Dict mapping dependency name to availability.
+        """
+        if self._dependencies_checked:
+            return self._available_deps
+
+        deps = {
+            "transformers": False,
+            "torch": False,
+            "accelerate": False,
+            "bitsandbytes": False,
+            "flash_attn": False,
+        }
+
+        try:
+            import transformers
+
+            deps["transformers"] = True
+        except ImportError:
+            pass
+
+        try:
+            import torch
+
+            deps["torch"] = True
+        except ImportError:
+            pass
+
+        try:
+            import accelerate
+
+            deps["accelerate"] = True
+        except ImportError:
+            pass
+
+        try:
+            import bitsandbytes
+
+            deps["bitsandbytes"] = True
+        except ImportError:
+            pass
+
+        try:
+            import flash_attn
+
+            deps["flash_attn"] = True
+        except ImportError:
+            pass
+
+        self._available_deps = deps
+        self._dependencies_checked = True
+        return deps
+
+    def is_available(self) -> bool:
+        """Check if transformers runner can be used.
+
+        Returns:
+            True if required dependencies are available.
+        """
+        deps = self.check_dependencies()
+        return deps["transformers"] and deps["torch"]
+
+    def get_device_info(self) -> Dict[str, Any]:
+        """Get information about available compute devices.
+
+        Returns:
+            Dict with device information.
+        """
+        deps = self.check_dependencies()
+
+        if not deps["torch"]:
+            return {
+                "available": False,
+                "error": "PyTorch not installed",
+            }
+
+        import torch
+
+        info = {
+            "available": True,
+            "cuda_available": torch.cuda.is_available(),
+            "mps_available": hasattr(torch.backends, "mps") and torch.backends.mps.is_available(),
+            "cpu_threads": torch.get_num_threads(),
+        }
+
+        if info["cuda_available"]:
+            info["cuda_device_count"] = torch.cuda.device_count()
+            info["cuda_device_name"] = torch.cuda.get_device_name(0)
+            info["cuda_memory_gb"] = torch.cuda.get_device_properties(0).total_memory / (1024**3)
+
+        return info
+
+    async def load_model(
+        self, model_id: str, config: Optional[TransformersConfig] = None, force: bool = False
+    ) -> bool:
+        """Load a model for inference.
+
+        Args:
+            model_id: HuggingFace model ID.
+            config: Loading configuration.
+            force: Force reload even if model is already loaded.
+
+        Returns:
+            True if loading succeeded.
+        """
+        # Check if already loaded
+        if self.is_loaded and self._loaded.model_id == model_id and not force:
+            return True
+
+        # Unload existing model
+        if self.is_loaded:
+            await self.unload()
+
+        deps = self.check_dependencies()
+        if not deps["transformers"] or not deps["torch"]:
+            return False
+
+        config = config or TransformersConfig()
+
+        # Run loading in executor to not block
+        loop = asyncio.get_event_loop()
+        loaded = await loop.run_in_executor(None, lambda: self._load_model_sync(model_id, config))
+
+        if loaded:
+            self._loaded = loaded
+            return True
+
+        return False
+
+    def _load_model_sync(self, model_id: str, config: TransformersConfig) -> Optional[LoadedModel]:
+        """Synchronous model loading."""
+        try:
+            import torch
+            from transformers import AutoModelForCausalLM, AutoTokenizer
+
+            # Get HF token
+            token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGING_FACE_HUB_TOKEN")
+
+            # Build loading kwargs
+            model_kwargs: Dict[str, Any] = {
+                "device_map": config.device_map,
+                "low_cpu_mem_usage": config.low_cpu_mem_usage,
+                "trust_remote_code": config.trust_remote_code,
+            }
+
+            if token:
+                model_kwargs["token"] = token
+
+            # Handle torch dtype
+            if config.torch_dtype == "auto":
+                model_kwargs["torch_dtype"] = "auto"
+            elif config.torch_dtype == "float16":
+                model_kwargs["torch_dtype"] = torch.float16
+            elif config.torch_dtype == "bfloat16":
+                model_kwargs["torch_dtype"] = torch.bfloat16
+            elif config.torch_dtype == "float32":
+                model_kwargs["torch_dtype"] = torch.float32
+
+            # Handle quantization
+            if config.quantization and self._available_deps.get("bitsandbytes"):
+                from transformers import BitsAndBytesConfig
+
+                if config.quantization == "4bit":
+                    model_kwargs["quantization_config"] = BitsAndBytesConfig(
+                        load_in_4bit=True,
+                        bnb_4bit_compute_dtype=torch.float16,
+                        bnb_4bit_quant_type="nf4",
+                        bnb_4bit_use_double_quant=True,
+                    )
+                elif config.quantization == "8bit":
+                    model_kwargs["quantization_config"] = BitsAndBytesConfig(
+                        load_in_8bit=True,
+                    )
+
+            # Handle max memory
+            if config.max_memory:
+                model_kwargs["max_memory"] = config.max_memory
+
+            # Handle flash attention
+            if config.use_flash_attention and self._available_deps.get("flash_attn"):
+                model_kwargs["attn_implementation"] = "flash_attention_2"
+
+            # Load tokenizer
+            tokenizer = AutoTokenizer.from_pretrained(
+                model_id,
+                token=token,
+                trust_remote_code=config.trust_remote_code,
+            )
+
+            # Load model
+            model = AutoModelForCausalLM.from_pretrained(model_id, **model_kwargs)
+
+            # Estimate memory usage
+            memory_gb = 0.0
+            if torch.cuda.is_available():
+                memory_gb = torch.cuda.memory_allocated() / (1024**3)
+
+            return LoadedModel(
+                model_id=model_id,
+                model=model,
+                tokenizer=tokenizer,
+                config=config,
+                memory_usage_gb=memory_gb,
+            )
+
+        except Exception as e:
+            print(f"Error loading model: {e}")
+            return None
+
+    async def generate(
+        self,
+        messages: List[Dict[str, str]],
+        max_tokens: int = 2048,
+        temperature: float = 0.7,
+        top_p: float = 0.9,
+        top_k: int = 50,
+        stop: Optional[List[str]] = None,
+    ) -> GenerationResult:
+        """Generate text from messages.
+
+        Args:
+            messages: Chat messages in OpenAI format.
+            max_tokens: Maximum tokens to generate.
+            temperature: Sampling temperature.
+            top_p: Nucleus sampling threshold.
+            top_k: Top-k sampling.
+            stop: Stop sequences.
+
+        Returns:
+            GenerationResult with generated text.
+        """
+        if not self.is_loaded:
+            return GenerationResult(error="No model loaded")
+
+        loop = asyncio.get_event_loop()
+        return await loop.run_in_executor(
+            None, lambda: self._generate_sync(messages, max_tokens, temperature, top_p, top_k, stop)
+        )
+
+    def _generate_sync(
+        self,
+        messages: List[Dict[str, str]],
+        max_tokens: int,
+        temperature: float,
+        top_p: float,
+        top_k: int,
+        stop: Optional[List[str]],
+    ) -> GenerationResult:
+        """Synchronous generation."""
+        import time
+        import torch
+
+        try:
+            model = self._loaded.model
+            tokenizer = self._loaded.tokenizer
+            model_id = self._loaded.model_id
+
+            # Apply chat template
+            if hasattr(tokenizer, "apply_chat_template"):
+                prompt = tokenizer.apply_chat_template(
+                    messages,
+                    tokenize=False,
+                    add_generation_prompt=True,
+                )
+            else:
+                # Fallback for models without chat template
+                prompt = (
+                    "\n".join(f"{m['role']}: {m['content']}" for m in messages) + "\nassistant:"
+                )
+
+            # Tokenize
+            inputs = tokenizer(prompt, return_tensors="pt")
+            input_length = inputs["input_ids"].shape[1]
+
+            # Move to model device
+            device = next(model.parameters()).device
+            inputs = {k: v.to(device) for k, v in inputs.items()}
+
+            # Build generation kwargs
+            gen_kwargs = {
+                "max_new_tokens": max_tokens,
+                "do_sample": temperature > 0,
+                "pad_token_id": tokenizer.pad_token_id or tokenizer.eos_token_id,
+            }
+
+            if temperature > 0:
+                gen_kwargs["temperature"] = temperature
+                gen_kwargs["top_p"] = top_p
+                gen_kwargs["top_k"] = top_k
+
+            # Handle stop sequences
+            if stop:
+                stop_ids = [tokenizer.encode(s, add_special_tokens=False) for s in stop]
+                # Flatten for stopping_criteria would be complex, skip for now
+
+            # Generate
+            start_time = time.time()
+
+            with torch.no_grad():
+                outputs = model.generate(**inputs, **gen_kwargs)
+
+            gen_time = time.time() - start_time
+
+            # Decode output
+            output_tokens = outputs[0][input_length:]
+            output_length = len(output_tokens)
+
+            generated_text = tokenizer.decode(output_tokens, skip_special_tokens=True)
+
+            # Calculate speed
+            tokens_per_sec = output_length / gen_time if gen_time > 0 else 0
+
+            return GenerationResult(
+                content=generated_text.strip(),
+                model_id=model_id,
+                input_tokens=input_length,
+                output_tokens=output_length,
+                time_seconds=gen_time,
+                tokens_per_second=tokens_per_sec,
+            )
+
+        except Exception as e:
+            return GenerationResult(
+                model_id=self._loaded.model_id if self._loaded else "", error=str(e)
+            )
+
+    async def unload(self) -> None:
+        """Unload the current model and free memory."""
+        if not self.is_loaded:
+            return
+
+        loop = asyncio.get_event_loop()
+        await loop.run_in_executor(None, self._unload_sync)
+
+    def _unload_sync(self) -> None:
+        """Synchronous unload."""
+        if self._loaded:
+            # Delete model references
+            if self._loaded.model is not None:
+                del self._loaded.model
+            if self._loaded.tokenizer is not None:
+                del self._loaded.tokenizer
+
+            self._loaded = None
+
+        # Force garbage collection
+        gc.collect()
+
+        # Clear CUDA cache if available
+        try:
+            import torch
+
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+        except Exception:
+            pass
+
+    def get_loaded_info(self) -> Optional[Dict[str, Any]]:
+        """Get information about the currently loaded model.
+
+        Returns:
+            Dict with model info, or None if no model loaded.
+        """
+        if not self.is_loaded:
+            return None
+
+        return {
+            "model_id": self._loaded.model_id,
+            "memory_usage_gb": self._loaded.memory_usage_gb,
+            "quantization": self._loaded.config.quantization,
+            "device_map": self._loaded.config.device_map,
+        }
+
+
+# Singleton instance
+_runner_instance: Optional[TransformersRunner] = None
+
+
+def get_transformers_runner() -> TransformersRunner:
+    """Get the global TransformersRunner instance.
+
+    Returns:
+        TransformersRunner instance.
+    """
+    global _runner_instance
+    if _runner_instance is None:
+        _runner_instance = TransformersRunner()
+    return _runner_instance
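
For orientation, the sketch below exercises the public surface of transformers_runner.py exactly as it appears in the hunk above (get_transformers_runner, TransformersConfig, load_model, generate, unload). It is illustrative only, not part of the packaged diff, and assumes the optional extras (pip install superqode[transformers]) are installed and that the example model fits on the local machine.

# Minimal usage sketch (not part of the package contents shown above); assumes
# superqode plus its transformers/torch extras are installed locally.
import asyncio

from superqode.providers.huggingface.transformers_runner import (
    TransformersConfig,
    get_transformers_runner,
)


async def main() -> None:
    runner = get_transformers_runner()
    if not runner.is_available():
        raise SystemExit("transformers/torch are not installed")

    # 4-bit quantization is only applied when bitsandbytes is importable;
    # otherwise the model loads at the requested dtype.
    config = TransformersConfig(quantization="4bit", torch_dtype="auto")
    if not await runner.load_model("microsoft/Phi-3.5-mini-instruct", config=config):
        raise SystemExit("model failed to load")

    result = await runner.generate(
        messages=[{"role": "user", "content": "Hello!"}],
        max_tokens=128,
    )
    print(result.content)
    print(f"{result.output_tokens} tokens in {result.time_seconds:.1f}s")

    await runner.unload()


if __name__ == "__main__":
    asyncio.run(main())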
superqode/providers/local/__init__.py
@@ -0,0 +1,100 @@
+"""Local LLM provider clients and utilities.
+
+This module provides clients for self-hosted LLM servers including:
+- Ollama
+- LM Studio
+- vLLM
+- SGLang
+- MLX-LM
+- TGI (Text Generation Inference)
+- llama.cpp server
+- Generic OpenAI-compatible servers
+"""
+
+from superqode.providers.local.base import (
+    LocalProviderType,
+    Quantization,
+    LocalModel,
+    ProviderStatus,
+    ToolTestResult,
+    GenerationConfig,
+    LocalProviderClient,
+    MODEL_FAMILIES,
+    TOOL_CAPABLE_FAMILIES,
+    detect_model_family,
+    detect_quantization,
+    likely_supports_tools,
+)
+from superqode.providers.local.ollama import OllamaClient, get_ollama_client
+from superqode.providers.local.vllm import VLLMClient, get_vllm_client
+from superqode.providers.local.sglang import SGLangClient, get_sglang_client
+from superqode.providers.local.mlx import MLXClient, get_mlx_client
+from superqode.providers.local.tgi import TGIClient, get_tgi_client
+from superqode.providers.local.lmstudio import LMStudioClient, get_lmstudio_client
+from superqode.providers.local.discovery import (
+    DiscoveredProvider,
+    LocalProviderDiscovery,
+    get_discovery_service,
+    quick_scan,
+    DEFAULT_PORTS,
+    ALL_PORTS,
+)
+from superqode.providers.local.tool_support import (
+    ToolCapabilityInfo,
+    TOOL_CAPABLE_MODELS,
+    TOOL_QUIRKS,
+    NO_TOOL_SUPPORT,
+    get_tool_capability_info,
+    test_tool_calling,
+    get_recommended_coding_models,
+    estimate_tool_support,
+)
+
+__all__ = [
+    # Enums
+    "LocalProviderType",
+    "Quantization",
+    # Data classes
+    "LocalModel",
+    "ProviderStatus",
+    "ToolTestResult",
+    "GenerationConfig",
+    "DiscoveredProvider",
+    # Base class
+    "LocalProviderClient",
+    # Clients
+    "OllamaClient",
+    "get_ollama_client",
+    "VLLMClient",
+    "get_vllm_client",
+    "SGLangClient",
+    "get_sglang_client",
+    "MLXClient",
+    "get_mlx_client",
+    "TGIClient",
+    "get_tgi_client",
+    "LMStudioClient",
+    "get_lmstudio_client",
+    # Discovery
+    "LocalProviderDiscovery",
+    "get_discovery_service",
+    "quick_scan",
+    "DEFAULT_PORTS",
+    "ALL_PORTS",
+    # Constants
+    "MODEL_FAMILIES",
+    "TOOL_CAPABLE_FAMILIES",
+    # Utilities
+    "detect_model_family",
+    "detect_quantization",
+    "likely_supports_tools",
+    # Tool support
+    "ToolCapabilityInfo",
+    "TOOL_CAPABLE_MODELS",
+    "TOOL_QUIRKS",
+    "NO_TOOL_SUPPORT",
+    "get_tool_capability_info",
+    "test_tool_calling",
+    "get_recommended_coding_models",
+    "estimate_tool_support",
+]
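
As a quick orientation (again illustrative, not part of the diff), this __init__.py flattens the per-backend modules into a single import surface, so downstream code can pull clients, discovery helpers, and capability tables from superqode.providers.local directly. The sketch below only uses names taken from the re-export list above and does not assume any call signatures from the backend modules that are not shown in this diff.

# Import sketch only: each name is re-exported by the __init__.py above, so the
# per-backend modules never need to be imported directly.
from superqode.providers.local import (
    ALL_PORTS,
    DEFAULT_PORTS,
    MODEL_FAMILIES,
    OllamaClient,
    get_ollama_client,
    quick_scan,
)

# The port and family tables are plain constants and safe to inspect; the client
# class and helpers are printed only to show the flattened import path resolves.
print("default scan ports:", DEFAULT_PORTS)
print("all known ports:", ALL_PORTS)
print("model family table:", MODEL_FAMILIES)
print("re-exported helpers:", OllamaClient, get_ollama_client, quick_scan)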