velune-cli 0.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- velune/__init__.py +5 -0
- velune/__main__.py +6 -0
- velune/cli/__init__.py +5 -0
- velune/cli/app.py +208 -0
- velune/cli/autocomplete.py +80 -0
- velune/cli/banner.py +60 -0
- velune/cli/commands/__init__.py +32 -0
- velune/cli/commands/ask.py +175 -0
- velune/cli/commands/base.py +16 -0
- velune/cli/commands/chat.py +228 -0
- velune/cli/commands/config.py +224 -0
- velune/cli/commands/daemon.py +88 -0
- velune/cli/commands/doctor.py +721 -0
- velune/cli/commands/init.py +170 -0
- velune/cli/commands/mcp.py +82 -0
- velune/cli/commands/memory.py +293 -0
- velune/cli/commands/models.py +683 -0
- velune/cli/commands/preflight.py +95 -0
- velune/cli/commands/run.py +270 -0
- velune/cli/commands/setup.py +184 -0
- velune/cli/commands/workspace.py +249 -0
- velune/cli/context.py +36 -0
- velune/cli/councilmodel_ui.py +199 -0
- velune/cli/display/council_view.py +254 -0
- velune/cli/display/memory_view.py +126 -0
- velune/cli/display/panels.py +35 -0
- velune/cli/display/progress.py +25 -0
- velune/cli/display/themes.py +25 -0
- velune/cli/main.py +15 -0
- velune/cli/model_selector.py +51 -0
- velune/cli/modes.py +86 -0
- velune/cli/pull_ui.py +123 -0
- velune/cli/registry.py +80 -0
- velune/cli/rendering/__init__.py +5 -0
- velune/cli/rendering/error_panel.py +79 -0
- velune/cli/rendering/markdown.py +63 -0
- velune/cli/repl.py +1855 -0
- velune/cli/session_manager.py +71 -0
- velune/cli/slash_commands.py +37 -0
- velune/cli/theme.py +8 -0
- velune/cognition/__init__.py +23 -0
- velune/cognition/agents/__init__.py +7 -0
- velune/cognition/agents/coder.py +209 -0
- velune/cognition/agents/planner.py +156 -0
- velune/cognition/agents/reviewer.py +195 -0
- velune/cognition/arbitrator.py +220 -0
- velune/cognition/architecture.py +415 -0
- velune/cognition/budget.py +65 -0
- velune/cognition/council/__init__.py +47 -0
- velune/cognition/council/base.py +217 -0
- velune/cognition/council/challenger.py +74 -0
- velune/cognition/council/coder.py +79 -0
- velune/cognition/council/critic_agent.py +43 -0
- velune/cognition/council/critic_configs.py +111 -0
- velune/cognition/council/critics.py +41 -0
- velune/cognition/council/debate.py +46 -0
- velune/cognition/council/factory.py +140 -0
- velune/cognition/council/messages.py +56 -0
- velune/cognition/council/planner.py +124 -0
- velune/cognition/council/reviewer.py +74 -0
- velune/cognition/council/synthesizer.py +67 -0
- velune/cognition/council/tiers.py +188 -0
- velune/cognition/council_orchestrator.py +282 -0
- velune/cognition/firewall.py +354 -0
- velune/cognition/module.py +46 -0
- velune/cognition/orchestrator.py +1205 -0
- velune/cognition/personality.py +238 -0
- velune/cognition/state.py +104 -0
- velune/cognition/style_resolver.py +64 -0
- velune/cognition/verification.py +205 -0
- velune/context/__init__.py +28 -0
- velune/context/assembler.py +240 -0
- velune/context/budget.py +97 -0
- velune/context/extractive.py +95 -0
- velune/context/prompt_adaptation.py +480 -0
- velune/context/sections.py +99 -0
- velune/context/token_counter.py +134 -0
- velune/context/utilization.py +33 -0
- velune/context/window.py +63 -0
- velune/core/__init__.py +89 -0
- velune/core/background.py +5 -0
- velune/core/config/__init__.py +37 -0
- velune/core/errors/__init__.py +90 -0
- velune/core/errors/catalog.py +188 -0
- velune/core/errors/execution.py +31 -0
- velune/core/errors/memory.py +25 -0
- velune/core/errors/orchestration.py +31 -0
- velune/core/errors/provider.py +37 -0
- velune/core/event_loop.py +35 -0
- velune/core/logging.py +83 -0
- velune/core/paths.py +165 -0
- velune/core/runtime.py +113 -0
- velune/core/startup_profiler.py +56 -0
- velune/core/task_registry.py +117 -0
- velune/core/trace.py +83 -0
- velune/core/types/__init__.py +48 -0
- velune/core/types/agent.py +53 -0
- velune/core/types/context.py +42 -0
- velune/core/types/inference.py +38 -0
- velune/core/types/memory.py +42 -0
- velune/core/types/model.py +70 -0
- velune/core/types/provider.py +62 -0
- velune/core/types/repository.py +38 -0
- velune/core/types/task.py +61 -0
- velune/core/types/workspace.py +28 -0
- velune/daemon/client.py +13 -0
- velune/daemon/server.py +127 -0
- velune/daemon/transport.py +179 -0
- velune/events.py +204 -0
- velune/execution/__init__.py +22 -0
- velune/execution/benchmarker.py +315 -0
- velune/execution/cancellation.py +53 -0
- velune/execution/checkpointer.py +130 -0
- velune/execution/command_spec.py +165 -0
- velune/execution/diff_preview.py +197 -0
- velune/execution/executor.py +181 -0
- velune/execution/module.py +18 -0
- velune/execution/multi_diff.py +67 -0
- velune/execution/path_guard.py +74 -0
- velune/execution/planner.py +91 -0
- velune/execution/rollback.py +89 -0
- velune/execution/sandbox.py +268 -0
- velune/execution/validator.py +115 -0
- velune/hardware/__init__.py +1 -0
- velune/hardware/detector.py +192 -0
- velune/kernel/__init__.py +55 -0
- velune/kernel/bootstrap.py +125 -0
- velune/kernel/config.py +426 -0
- velune/kernel/entrypoint.py +78 -0
- velune/kernel/health.py +54 -0
- velune/kernel/lifecycle.py +143 -0
- velune/kernel/module.py +17 -0
- velune/kernel/modules.py +23 -0
- velune/kernel/registry.py +96 -0
- velune/kernel/schemas.py +28 -0
- velune/main.py +9 -0
- velune/mcp/__init__.py +9 -0
- velune/mcp/client.py +115 -0
- velune/mcp/config.py +19 -0
- velune/mcp/server.py +624 -0
- velune/memory/__init__.py +32 -0
- velune/memory/compaction.py +506 -0
- velune/memory/embedding_pipeline.py +241 -0
- velune/memory/lifecycle.py +680 -0
- velune/memory/module.py +218 -0
- velune/memory/prioritizer.py +67 -0
- velune/memory/storage/episodic_schema.sql +53 -0
- velune/memory/storage/lancedb_store.py +282 -0
- velune/memory/storage/sqlite_manager.py +369 -0
- velune/memory/storage/sqlite_pool.py +149 -0
- velune/memory/tiers/episodic.py +588 -0
- velune/memory/tiers/graph.py +378 -0
- velune/memory/tiers/lineage.py +416 -0
- velune/memory/tiers/semantic.py +475 -0
- velune/memory/tiers/working.py +168 -0
- velune/memory/vitality.py +132 -0
- velune/models/__init__.py +15 -0
- velune/models/family.py +76 -0
- velune/models/module.py +20 -0
- velune/models/probes.py +192 -0
- velune/models/profile_cache.py +84 -0
- velune/models/profiler.py +108 -0
- velune/models/registry.py +251 -0
- velune/models/scorer.py +233 -0
- velune/models/specializations.py +205 -0
- velune/orchestration/__init__.py +19 -0
- velune/orchestration/engine.py +239 -0
- velune/orchestration/module.py +15 -0
- velune/orchestration/role_assignments.py +82 -0
- velune/orchestration/schemas.py +98 -0
- velune/plugins/__init__.py +20 -0
- velune/plugins/hooks.py +50 -0
- velune/plugins/loader.py +161 -0
- velune/plugins/registry.py +56 -0
- velune/plugins/schemas.py +21 -0
- velune/providers/__init__.py +23 -0
- velune/providers/adapters/anthropic.py +257 -0
- velune/providers/adapters/fireworks.py +115 -0
- velune/providers/adapters/google.py +234 -0
- velune/providers/adapters/groq.py +151 -0
- velune/providers/adapters/huggingface.py +210 -0
- velune/providers/adapters/llamacpp.py +208 -0
- velune/providers/adapters/lmstudio.py +175 -0
- velune/providers/adapters/ollama.py +233 -0
- velune/providers/adapters/openai.py +213 -0
- velune/providers/adapters/openrouter.py +81 -0
- velune/providers/adapters/together.py +134 -0
- velune/providers/adapters/xai.py +60 -0
- velune/providers/base.py +86 -0
- velune/providers/benchmarker.py +138 -0
- velune/providers/discovery/__init__.py +33 -0
- velune/providers/discovery/anthropic.py +79 -0
- velune/providers/discovery/benchmarks.py +44 -0
- velune/providers/discovery/classifier.py +69 -0
- velune/providers/discovery/fireworks.py +95 -0
- velune/providers/discovery/gguf.py +88 -0
- velune/providers/discovery/google.py +95 -0
- velune/providers/discovery/gpu.py +117 -0
- velune/providers/discovery/groq.py +21 -0
- velune/providers/discovery/huggingface.py +67 -0
- velune/providers/discovery/lmstudio.py +80 -0
- velune/providers/discovery/ollama.py +162 -0
- velune/providers/discovery/openai.py +96 -0
- velune/providers/discovery/openrouter.py +113 -0
- velune/providers/discovery/scanner.py +115 -0
- velune/providers/discovery/together.py +114 -0
- velune/providers/discovery/xai.py +57 -0
- velune/providers/health.py +67 -0
- velune/providers/health_monitor.py +169 -0
- velune/providers/keystore.py +142 -0
- velune/providers/local_paths.py +49 -0
- velune/providers/local_resolver.py +229 -0
- velune/providers/module.py +51 -0
- velune/providers/ollama_manager.py +193 -0
- velune/providers/registry.py +220 -0
- velune/providers/router.py +255 -0
- velune/providers/task_classifier.py +288 -0
- velune/py.typed +0 -0
- velune/repository/__init__.py +33 -0
- velune/repository/analyzer.py +127 -0
- velune/repository/ast_parser.py +822 -0
- velune/repository/blast_radius.py +298 -0
- velune/repository/boundary_classifier.py +295 -0
- velune/repository/cognition.py +316 -0
- velune/repository/grapher.py +179 -0
- velune/repository/import_graph.py +263 -0
- velune/repository/incremental_indexer.py +275 -0
- velune/repository/index_state.py +96 -0
- velune/repository/indexer.py +243 -0
- velune/repository/module.py +17 -0
- velune/repository/parser.py +474 -0
- velune/repository/project_type.py +300 -0
- velune/repository/rename_journal.py +287 -0
- velune/repository/scanner.py +193 -0
- velune/repository/schemas.py +102 -0
- velune/repository/symbol_registry.py +365 -0
- velune/repository/tracker.py +252 -0
- velune/retrieval/__init__.py +27 -0
- velune/retrieval/cache.py +110 -0
- velune/retrieval/fast_path.py +391 -0
- velune/retrieval/graph.py +124 -0
- velune/retrieval/hybrid.py +271 -0
- velune/retrieval/keyword.py +131 -0
- velune/retrieval/module.py +26 -0
- velune/retrieval/pipeline.py +303 -0
- velune/retrieval/reranker.py +102 -0
- velune/retrieval/schemas.py +59 -0
- velune/retrieval/slow_path.py +364 -0
- velune/retrieval/vector.py +203 -0
- velune/telemetry/__init__.py +59 -0
- velune/telemetry/cognition.py +267 -0
- velune/telemetry/cost_estimator.py +92 -0
- velune/telemetry/debug.py +304 -0
- velune/telemetry/doctor.py +244 -0
- velune/telemetry/logging.py +286 -0
- velune/telemetry/spans.py +277 -0
- velune/telemetry/token_tracker.py +140 -0
- velune/telemetry/usage_tracker.py +340 -0
- velune/tools/__init__.py +41 -0
- velune/tools/base/registry.py +87 -0
- velune/tools/base/tool.py +63 -0
- velune/tools/code/navigate.py +116 -0
- velune/tools/code/search.py +123 -0
- velune/tools/filesystem/read.py +75 -0
- velune/tools/filesystem/search.py +136 -0
- velune/tools/filesystem/write.py +163 -0
- velune/tools/git/history.py +177 -0
- velune/tools/git/operations.py +122 -0
- velune/tools/git/state.py +121 -0
- velune/tools/module.py +81 -0
- velune/tools/terminal/execute.py +72 -0
- velune/tools/terminal/history.py +47 -0
- velune/tools/web/fetch.py +55 -0
- velune/tools/web/validator.py +122 -0
- velune_cli-0.9.0.dist-info/METADATA +518 -0
- velune_cli-0.9.0.dist-info/RECORD +279 -0
- velune_cli-0.9.0.dist-info/WHEEL +4 -0
- velune_cli-0.9.0.dist-info/entry_points.txt +2 -0
- velune_cli-0.9.0.dist-info/licenses/LICENSE +201 -0
|
@@ -0,0 +1,208 @@
|
|
|
1
|
+
"""Llama.cpp local GGUF model provider adapter implementation."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import asyncio
|
|
6
|
+
import time
|
|
7
|
+
from collections.abc import AsyncIterator
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import Any
|
|
10
|
+
|
|
11
|
+
from velune.core.errors.provider import InferenceError, ProviderConnectionError
|
|
12
|
+
from velune.core.types.inference import InferenceRequest, InferenceResponse, StreamChunk
|
|
13
|
+
from velune.core.types.model import ModelDescriptor
|
|
14
|
+
from velune.core.types.provider import ProviderCapabilities, ProviderHealth
|
|
15
|
+
from velune.providers.base import ModelProvider
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class LlamaCppProvider(ModelProvider):
    """Llama.cpp provider for running in-process GGUF models.

    Models are loaded lazily through ``llama-cpp-python`` and cached per model
    ID until :meth:`shutdown` is called.  Every blocking llama.cpp call is
    pushed onto a worker thread with ``asyncio.to_thread`` so the event loop
    stays responsive.
    """

    # Smallest context window a model is ever loaded with.
    _DEFAULT_CONTEXT_WINDOW = 4096

    def __init__(self, models_dir: str | None = None) -> None:
        """Create the provider.

        Args:
            models_dir: Directory searched for ``<model_id>`` / ``<model_id>.gguf``
                when the other resolution steps find nothing.  Defaults to
                ``~/models``.
        """
        self._models_dir = Path(models_dir) if models_dir else Path.home() / "models"
        # Chat/completion instances, keyed by model ID.
        self._loaded_models: dict[str, Any] = {}
        # Instances created with ``embedding=True``; kept apart from the chat
        # instances because the flag is fixed when the model is constructed.
        self._loaded_embedding_models: dict[str, Any] = {}
        self._capabilities = ProviderCapabilities(
            supports_streaming=True,
            supports_function_calling=False,
            supports_embeddings=True,
            max_context_window=32768,
        )

    @property
    def provider_id(self) -> str:
        """Identifier of this provider."""
        return "llamacpp"

    async def initialize(self) -> None:
        """Verify llama-cpp-python library is available.

        Raises:
            ProviderConnectionError: If ``llama_cpp`` cannot be imported.
        """
        try:
            import llama_cpp  # noqa: F401
        except ImportError as exc:
            raise ProviderConnectionError(
                "llama-cpp-python dependency is missing. Install with: pip install llama-cpp-python"
            ) from exc

    def _find_in_models_dir(self, model_id: str) -> Path | None:
        """Return ``models_dir/<model_id>[.gguf]`` if such a file exists."""
        candidates = [self._models_dir / model_id]
        if not model_id.lower().endswith(".gguf"):
            candidates.append(self._models_dir / f"{model_id}.gguf")
        for candidate in candidates:
            if candidate.is_file():
                return candidate
        return None

    def _resolve_model_path(self, model_id: str) -> Path:
        """Resolve the GGUF model path from model ID.

        Resolution order: persistent path cache, ``LocalModelResolver``, the
        configured models directory, then the resolver's interactive prompt.
        Any path found after the cache step is saved back to the cache.

        Raises:
            FileNotFoundError: If no step yields a path.
        """
        from velune.providers.local_paths import get_model_path, save_model_path
        from velune.providers.local_resolver import LocalModelResolver

        # 1. Check persistent cache first
        cached = get_model_path(model_id)
        if cached:
            return cached

        # 2. Ask LocalModelResolver (absolute, relative, stem scan)
        resolver = LocalModelResolver()
        found = resolver.resolve_model_path(model_id)
        if found:
            save_model_path(model_id, found)
            return found

        # 3. Look in the directory handed to the constructor.  It is tried
        #    only after the resolver so existing resolutions are unaffected.
        local = self._find_in_models_dir(model_id)
        if local is not None:
            save_model_path(model_id, local)
            return local

        # 4. Interactive prompt — only in a real terminal
        prompted = resolver.prompt_for_path(model_id)
        if prompted:
            save_model_path(model_id, prompted)
            return prompted

        raise FileNotFoundError(f"GGUF model file not found for ID: {model_id}")

    def _get_model(
        self, model_id: str, context_window: int = 4096, *, embedding: bool = False
    ) -> Any:
        """Synchronously get or load the llama_cpp Llama instance.

        Args:
            model_id: Model identifier passed to :meth:`_resolve_model_path`.
            context_window: ``n_ctx`` used when the model is first loaded.  A
                cached instance is returned as-is, whatever value is passed.
            embedding: Load (and cache separately) an instance constructed
                with ``embedding=True``.

        Returns:
            The cached or newly created ``llama_cpp.Llama`` object.
        """
        cache = self._loaded_embedding_models if embedding else self._loaded_models
        if model_id in cache:
            return cache[model_id]

        from llama_cpp import Llama

        model_path = self._resolve_model_path(model_id)

        # Load the model in-memory.
        # Using typical defaults, letting it use GPU if compiled with CUDA/metal.
        llm = Llama(
            model_path=str(model_path),
            n_ctx=context_window,
            n_gpu_layers=-1,  # Load as many layers as possible to GPU if available
            embedding=embedding,
            verbose=False,
        )
        cache[model_id] = llm
        return llm

    def _context_window_for(self, request: InferenceRequest) -> int:
        """Pick ``n_ctx`` for a request.

        ``max_tokens`` limits the generated output, not the whole window, so it
        is only allowed to raise the window above the default, never to
        shrink it below.
        """
        return max(request.max_tokens or 0, self._DEFAULT_CONTEXT_WINDOW)

    @staticmethod
    def _chat_messages(request: InferenceRequest) -> list[dict[str, Any]]:
        """Reduce request messages to the ``role``/``content`` pairs llama_cpp expects."""
        return [
            {"role": msg.get("role"), "content": msg.get("content")} for msg in request.messages
        ]

    @staticmethod
    def _sampling_kwargs(request: InferenceRequest) -> dict[str, Any]:
        """Collect the sampling settings that are set, so unset ones use llama_cpp defaults."""
        candidates = (
            ("temperature", request.temperature),
            ("max_tokens", request.max_tokens),
            ("top_p", request.top_p),
        )
        return {name: value for name, value in candidates if value is not None}

    async def list_models(self) -> list[ModelDescriptor]:
        """List local GGUF models via filesystem discovery."""
        await self.initialize()
        from velune.providers.discovery.gguf import GGUFDiscovery

        return await GGUFDiscovery().discover()

    async def infer(self, request: InferenceRequest) -> InferenceResponse:
        """Non-blocking in-process inference using asyncio thread offloading.

        Raises:
            ProviderConnectionError: If llama-cpp-python is not installed.
            InferenceError: If loading the model or generating fails.
        """
        await self.initialize()
        start = time.perf_counter()

        try:
            llm = await asyncio.to_thread(
                self._get_model, request.model_id, self._context_window_for(request)
            )

            completion = await asyncio.to_thread(
                llm.create_chat_completion,
                messages=self._chat_messages(request),
                stream=False,
                **self._sampling_kwargs(request),
            )

            latency = (time.perf_counter() - start) * 1000.0
            choice = completion["choices"][0]

            return InferenceResponse(
                content=choice["message"]["content"] or "",
                model_id=request.model_id,
                finish_reason=choice.get("finish_reason") or "stop",
                tokens_used=(completion.get("usage") or {}).get("total_tokens", 0),
                latency_ms=latency,
            )
        except Exception as e:
            raise InferenceError(f"Local llama.cpp inference failed: {e}") from e

    async def stream(self, request: InferenceRequest) -> AsyncIterator[StreamChunk]:
        """Streaming chat completions in non-blocking fashion.

        Yields:
            One ``StreamChunk`` per chunk produced by llama_cpp.

        Raises:
            ProviderConnectionError: If llama-cpp-python is not installed.
            InferenceError: If loading the model or generating fails.
        """
        await self.initialize()

        try:
            llm = await asyncio.to_thread(
                self._get_model, request.model_id, self._context_window_for(request)
            )

            # Creating the generator and every step of it run in worker threads.
            chunks = await asyncio.to_thread(
                llm.create_chat_completion,
                messages=self._chat_messages(request),
                stream=True,
                **self._sampling_kwargs(request),
            )

            # ``next`` with a default avoids raising StopIteration across the
            # thread boundary; the sentinel marks exhaustion instead.
            exhausted = object()
            while True:
                chunk = await asyncio.to_thread(next, chunks, exhausted)
                if chunk is exhausted:
                    break

                choice = chunk["choices"][0]
                delta = choice.get("delta") or {}

                yield StreamChunk(
                    # The key may be absent or None (e.g. a role-only delta).
                    content=delta.get("content") or "",
                    finish_reason=choice.get("finish_reason"),
                )

        except Exception as e:
            raise InferenceError(f"Local llama.cpp streaming failed: {e}") from e

    async def embed(self, texts: list[str], model_id: str) -> list[list[float]]:
        """Generate batch embeddings in-process.

        Returns:
            One embedding per input text, in input order; ``[]`` for no input.

        Raises:
            ProviderConnectionError: If llama-cpp-python is not installed.
            InferenceError: If loading the model or embedding fails.
        """
        await self.initialize()
        if not texts:
            return []

        try:
            # create_embedding needs an instance constructed with embedding=True.
            llm = await asyncio.to_thread(self._get_model, model_id, embedding=True)
            embeddings = []
            for text in texts:
                res = await asyncio.to_thread(llm.create_embedding, input=text)
                embeddings.append(res["data"][0]["embedding"])
            return embeddings
        except Exception as e:
            raise InferenceError(f"Local llama.cpp embedding failed: {e}") from e

    async def health_check(self) -> ProviderHealth:
        """Report HEALTHY when :meth:`initialize` succeeds, UNAVAILABLE otherwise."""
        try:
            await self.initialize()
            return ProviderHealth.HEALTHY
        except Exception:
            return ProviderHealth.UNAVAILABLE

    def get_capabilities(self) -> ProviderCapabilities:
        """Return the capability description built in ``__init__``."""
        return self._capabilities

    async def shutdown(self) -> None:
        """Release loaded model states."""
        models = [*self._loaded_models.values(), *self._loaded_embedding_models.values()]
        # Empty the caches first so they are consistent even if a close() fails.
        self._loaded_models.clear()
        self._loaded_embedding_models.clear()
        for llm in models:
            # Older llama-cpp-python releases have no close(); those rely on GC.
            close = getattr(llm, "close", None)
            if callable(close):
                close()
|
|
@@ -0,0 +1,175 @@
|
|
|
1
|
+
"""LM Studio provider adapter implementation."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import time
|
|
7
|
+
from collections.abc import AsyncIterator
|
|
8
|
+
|
|
9
|
+
import httpx
|
|
10
|
+
|
|
11
|
+
from velune.core.errors.provider import InferenceError, ProviderConnectionError
|
|
12
|
+
from velune.core.types.inference import InferenceRequest, InferenceResponse, StreamChunk
|
|
13
|
+
from velune.core.types.model import CapabilityLevel, ModelDescriptor
|
|
14
|
+
from velune.core.types.provider import ProviderCapabilities, ProviderHealth
|
|
15
|
+
from velune.providers.base import ModelProvider
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class LMStudioProvider(ModelProvider):
    """LM Studio provider for local OpenAI-compatible endpoints.

    Talks to the server over HTTP with a lazily created ``httpx.AsyncClient``
    rooted at ``base_url``.
    """

    def __init__(self, base_url: str = "http://localhost:1234/v1") -> None:
        """Store the endpoint URL; the HTTP client is created on first use."""
        self._base_url = base_url
        self.client: httpx.AsyncClient | None = None
        self._capabilities = ProviderCapabilities(
            supports_streaming=True,
            supports_function_calling=True,
            supports_embeddings=True,
            max_context_window=32768,
        )

    @property
    def provider_id(self) -> str:
        """Identifier of this provider."""
        return "lmstudio"

    async def initialize(self) -> None:
        """Create the async HTTP client if there is none or it has been closed."""
        if not self.client or self.client.is_closed:
            self.client = httpx.AsyncClient(base_url=self._base_url, timeout=300.0)

    async def _ready_client(self) -> httpx.AsyncClient:
        """Initialize if needed and return the client.

        An explicit check replaces ``assert`` so the guard also holds under ``-O``.
        """
        await self.initialize()
        if self.client is None:
            raise ProviderConnectionError("LM Studio client is not initialized")
        return self.client

    @staticmethod
    def _chat_payload(request: InferenceRequest, *, stream: bool) -> dict:
        """Build the ``/chat/completions`` request body.

        Sampling fields that are ``None`` are left out instead of being sent as
        JSON null, so the server applies its own defaults.
        """
        payload: dict = {"model": request.model_id, "messages": request.messages}
        sampling = (
            ("temperature", request.temperature),
            ("max_tokens", request.max_tokens),
            ("top_p", request.top_p),
        )
        payload.update({key: value for key, value in sampling if value is not None})
        if stream:
            payload["stream"] = True
        if request.stop_sequences:
            payload["stop"] = request.stop_sequences
        return payload

    @staticmethod
    def _descriptor_for(model_id: str) -> ModelDescriptor:
        """Describe one LM Studio model with the fixed capability profile."""
        return ModelDescriptor(
            model_id=model_id,
            display_name=model_id,
            provider_id="lmstudio",
            context_length=32768,
            capabilities={
                "coding": CapabilityLevel.INTERMEDIATE,
                "reasoning": CapabilityLevel.INTERMEDIATE,
                "planning": CapabilityLevel.BASIC,
                "summarization": CapabilityLevel.INTERMEDIATE,
                "embedding": CapabilityLevel.INTERMEDIATE,
                "instruction_following": CapabilityLevel.INTERMEDIATE,
                "multimodal": CapabilityLevel.NONE,
                "tool_use": CapabilityLevel.INTERMEDIATE,
                "long_context": CapabilityLevel.BASIC,
            },
            is_local=True,
        )

    async def list_models(self) -> list[ModelDescriptor]:
        """Fetch the models reported by the server's ``/models`` endpoint.

        Raises:
            ProviderConnectionError: If the request fails or the body is not JSON.
        """
        client = await self._ready_client()
        try:
            response = await client.get("/models")
            response.raise_for_status()
            data = response.json()
        except httpx.HTTPError as e:
            raise ProviderConnectionError(f"LM Studio connection error: {e}") from e
        except ValueError as e:  # body was not valid JSON
            raise ProviderConnectionError(f"LM Studio returned an invalid model list: {e}") from e

        # Entries without an id cannot be addressed later, so they are skipped.
        return [
            self._descriptor_for(item["id"]) for item in data.get("data", []) if item.get("id")
        ]

    async def infer(self, request: InferenceRequest) -> InferenceResponse:
        """Run one non-streaming chat completion.

        Raises:
            InferenceError: If the HTTP call fails or the response lacks the
                expected ``choices[0].message`` structure.
        """
        client = await self._ready_client()
        start = time.perf_counter()
        try:
            response = await client.post(
                "/chat/completions", json=self._chat_payload(request, stream=False)
            )
            response.raise_for_status()
            data = response.json()
            latency = (time.perf_counter() - start) * 1000.0

            choice = data["choices"][0]
            # Content can be null; normalise it to "".
            content = choice["message"].get("content") or ""
            finish_reason = choice.get("finish_reason") or "stop"
            tokens_used = (data.get("usage") or {}).get("total_tokens", 0)
        except httpx.HTTPError as e:
            raise InferenceError(f"LM Studio completion failed: {e}") from e
        except (KeyError, IndexError, TypeError, AttributeError, ValueError) as e:
            raise InferenceError(f"LM Studio returned a malformed completion: {e!r}") from e

        return InferenceResponse(
            content=content,
            model_id=request.model_id,
            finish_reason=finish_reason,
            tokens_used=tokens_used,
            latency_ms=latency,
        )

    async def stream(self, request: InferenceRequest) -> AsyncIterator[StreamChunk]:
        """Stream a chat completion from the server's ``data:`` event lines.

        Lines that are not ``data:`` events, and events that cannot be parsed
        into ``choices[0].delta``, are skipped.  ``[DONE]`` ends the stream.

        Raises:
            InferenceError: If the HTTP request fails.
        """
        client = await self._ready_client()
        try:
            payload = self._chat_payload(request, stream=True)
            async with client.stream("POST", "/chat/completions", json=payload) as response:
                response.raise_for_status()
                async for line in response.aiter_lines():
                    # Accept both "data: {...}" and "data:{...}".
                    if not line.startswith("data:"):
                        continue
                    data_str = line[5:].strip()
                    if data_str == "[DONE]":
                        break
                    try:
                        choice = json.loads(data_str)["choices"][0]
                        content = choice["delta"].get("content") or ""
                        finish_reason = choice.get("finish_reason")
                    except (
                        json.JSONDecodeError,
                        KeyError,
                        IndexError,  # event with an empty "choices" list
                        TypeError,
                        AttributeError,
                    ):
                        continue
                    # Yield outside the try so consumer-side errors are not swallowed.
                    yield StreamChunk(content=content, finish_reason=finish_reason)
        except httpx.HTTPError as e:
            raise InferenceError(f"LM Studio stream failed: {e}") from e

    async def embed(self, texts: list[str], model_id: str) -> list[list[float]]:
        """Generate embeddings for *texts* in one ``/embeddings`` request.

        Returns:
            Embeddings ordered by the ``index`` field of each response item
            (falling back to response order); ``[]`` when *texts* is empty.

        Raises:
            InferenceError: If the HTTP call fails or the response is malformed.
        """
        client = await self._ready_client()
        if not texts:
            return []
        try:
            response = await client.post("/embeddings", json={"model": model_id, "input": texts})
            response.raise_for_status()
            items = response.json()["data"]
            ordered = sorted(
                enumerate(items), key=lambda pair: pair[1].get("index", pair[0])
            )
            return [item["embedding"] for _, item in ordered]
        except httpx.HTTPError as e:
            raise InferenceError(f"LM Studio embedding failed: {e}") from e
        except (KeyError, TypeError, AttributeError, ValueError) as e:
            raise InferenceError(f"LM Studio returned malformed embeddings: {e!r}") from e

    async def health_check(self) -> ProviderHealth:
        """Probe ``/models``: 200 is HEALTHY, another status DEGRADED, any error UNAVAILABLE."""
        try:
            client = await self._ready_client()
            resp = await client.get("/models")
            if resp.status_code == 200:
                return ProviderHealth.HEALTHY
            return ProviderHealth.DEGRADED
        except Exception:
            # Best-effort probe: every failure just means "not reachable".
            return ProviderHealth.UNAVAILABLE

    def get_capabilities(self) -> ProviderCapabilities:
        """Return the capability description built in ``__init__``."""
        return self._capabilities

    async def shutdown(self) -> None:
        """Close the HTTP client, if any, and forget it."""
        # Detach first so the provider never keeps a half-closed client.
        client, self.client = self.client, None
        if client:
            await client.aclose()
|
|
@@ -0,0 +1,233 @@
|
|
|
1
|
+
"""Ollama provider adapter implementation."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import logging
|
|
7
|
+
import time
|
|
8
|
+
from collections.abc import AsyncIterator
|
|
9
|
+
|
|
10
|
+
import httpx
|
|
11
|
+
|
|
12
|
+
from velune.core.errors.provider import InferenceError, ProviderConnectionError
|
|
13
|
+
from velune.core.types.inference import InferenceRequest, InferenceResponse, StreamChunk
|
|
14
|
+
from velune.core.types.model import CapabilityLevel, ModelDescriptor
|
|
15
|
+
from velune.core.types.provider import ProviderCapabilities, ProviderHealth
|
|
16
|
+
from velune.providers.base import ModelProvider
|
|
17
|
+
|
|
18
|
+
# Module-level logger, named explicitly with the adapter's dotted module path.
logger = logging.getLogger("velune.providers.adapters.ollama")
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class OllamaProvider(ModelProvider):
|
|
22
|
+
"""Ollama provider for local models."""
|
|
23
|
+
|
|
24
|
+
def __init__(self, base_url: str = "http://localhost:11434") -> None:
    """Record the endpoint URL and the provider's capability description.

    No HTTP client is created here; ``self.client`` starts out as ``None``.
    """
    self._base_url = base_url
    self.client: httpx.AsyncClient | None = None
    capability_flags = {
        "supports_streaming": True,
        "supports_function_calling": False,
        "supports_embeddings": True,
        "max_context_window": 8192,
    }
    self._capabilities = ProviderCapabilities(**capability_flags)
|
|
33
|
+
|
|
34
|
+
@property
def provider_id(self) -> str:
    """Identifier of this provider."""
    identifier = "ollama"
    return identifier
|
|
37
|
+
|
|
38
|
+
async def initialize(self) -> None:
    """Create the async HTTP client unless one is already set."""
    if self.client:
        return
    self.client = httpx.AsyncClient(base_url=self._base_url, timeout=300.0)
|
|
42
|
+
|
|
43
|
+
async def _get_model_context_length(self, model_name: str) -> int:
    """Query /api/show for the model's actual context window size.

    The response's ``parameters`` string (newline-delimited key/value pairs)
    is scanned for a ``num_ctx`` line first.  If none parses, ``model_info``
    is searched: a ``context_length`` / ``<prefix>.context_length`` key is
    preferred, and any other key containing ``context`` is used only as a
    last resort, so keys such as ``...original_context_length`` do not win.

    Args:
        model_name: Model name sent to ``/api/show``.

    Returns:
        The detected context length, or 8192 when the request fails, returns
        a non-200 status, or carries no usable value.
    """
    fallback = 8192
    assert self.client is not None
    try:
        # Send the name under both keys: "model" and the older "name".
        resp = await self.client.post(
            "/api/show", json={"model": model_name, "name": model_name}
        )
        if resp.status_code != 200:
            return fallback
        data = resp.json()

        # ``or ""`` keeps a null "parameters" from aborting the whole lookup.
        for line in (data.get("parameters") or "").splitlines():
            parts = line.split()
            if len(parts) >= 2 and parts[0].lower() == "num_ctx":
                try:
                    return int(parts[1])
                except ValueError:
                    pass  # unparsable value: keep scanning, then fall through

        model_info = data.get("model_info") or {}
        # bool is a subclass of int, so it is excluded explicitly.
        candidates = [
            (key.lower(), value)
            for key, value in model_info.items()
            if isinstance(value, int) and not isinstance(value, bool) and value > 0
        ]
        for key, value in candidates:
            if key == "context_length" or key.endswith(".context_length"):
                return value
        for key, value in candidates:
            if "context" in key:
                return value
    except Exception as exc:
        # Best effort: any failure here just means the default is used.
        logger.debug("Could not fetch context length for %s: %s", model_name, exc)
    return fallback
|
|
76
|
+
|
|
77
|
+
async def list_models(self) -> list[ModelDescriptor]:
    """Fetch models from active Ollama endpoint with accurate context lengths.

    Reads the model list from ``/api/tags`` and then asks
    ``_get_model_context_length`` for each model, so descriptors carry the
    detected context length rather than a fixed 8192.  The per-model lookups
    run concurrently; result order follows the ``/api/tags`` order.

    Raises:
        ProviderConnectionError: If the ``/api/tags`` request fails.
    """
    # Imported locally: the module's top-level imports do not include asyncio.
    import asyncio

    await self.initialize()
    assert self.client is not None
    try:
        response = await self.client.get("/api/tags")
        response.raise_for_status()
        data = response.json()
    except httpx.HTTPError as e:
        raise ProviderConnectionError(f"Failed to fetch models from Ollama: {e}") from e

    # Entries without a name cannot be addressed later, so they are skipped.
    model_names = [item["name"] for item in data.get("models", []) if item.get("name")]

    # The context-length helper handles its own failures by returning 8192,
    # so the gather call is not expected to surface lookup errors.
    ctx_lengths = await asyncio.gather(
        *(self._get_model_context_length(name) for name in model_names)
    )

    return [
        ModelDescriptor(
            model_id=model_name,
            display_name=model_name,
            provider_id="ollama",
            context_length=ctx_len,
            capabilities={
                "coding": CapabilityLevel.INTERMEDIATE,
                "reasoning": CapabilityLevel.INTERMEDIATE,
                "planning": CapabilityLevel.BASIC,
                "summarization": CapabilityLevel.INTERMEDIATE,
                "embedding": CapabilityLevel.INTERMEDIATE,
                "instruction_following": CapabilityLevel.INTERMEDIATE,
                "multimodal": CapabilityLevel.NONE,
                "tool_use": CapabilityLevel.NONE,
                # Only windows above 32768 tokens are rated for long context.
                "long_context": (
                    CapabilityLevel.INTERMEDIATE
                    if ctx_len > 32768
                    else CapabilityLevel.NONE
                ),
            },
            is_local=True,
        )
        for model_name, ctx_len in zip(model_names, ctx_lengths)
    ]
|
|
121
|
+
|
|
122
|
+
async def infer(self, request: InferenceRequest) -> InferenceResponse:
    """Run one non-streaming chat completion against ``/api/chat``.

    Args:
        request: Supplies ``model_id``, ``messages``, ``temperature``,
            ``max_tokens`` (sent as ``num_predict``), ``top_p`` and the
            optional ``stop_sequences``.

    Returns:
        An ``InferenceResponse`` holding the assistant message content, the
        ``done_reason`` (``"stop"`` when absent), the sum of ``eval_count``
        and ``prompt_eval_count`` (each treated as 0 when absent), and the
        measured wall-clock latency in milliseconds.

    Raises:
        InferenceError: If the request fails with an ``httpx.HTTPError``.
            The original error is attached as ``__cause__``.
    """
    await self.initialize()
    assert self.client is not None
    start = time.perf_counter()
    try:
        payload = {
            "model": request.model_id,
            "messages": request.messages,
            "stream": False,
            "options": {
                "temperature": request.temperature,
                "num_predict": request.max_tokens,
                "top_p": request.top_p,
            },
        }
        # Only send "stop" when the caller actually provided sequences.
        if request.stop_sequences:
            payload["options"]["stop"] = request.stop_sequences

        response = await self.client.post("/api/chat", json=payload)
        response.raise_for_status()
        data = response.json()
        latency = (time.perf_counter() - start) * 1000.0

        # Anything above 30 seconds is reported as a slow inference.
        if latency > 30000.0:
            logger.warning(
                "Slow inference on %s (%.1fs). Consider a smaller model for your hardware.",
                request.model_id,
                latency / 1000.0,
            )

        return InferenceResponse(
            content=data["message"]["content"],
            model_id=request.model_id,
            finish_reason=data.get("done_reason", "stop"),
            tokens_used=data.get("eval_count", 0) + data.get("prompt_eval_count", 0),
            latency_ms=latency,
        )
    except httpx.HTTPError as e:
        # Chain explicitly so the transport error is kept as the cause.
        raise InferenceError(f"Ollama inference failed: {e}") from e
|
|
162
|
+
|
|
163
|
+
async def stream(self, request: InferenceRequest) -> AsyncIterator[StreamChunk]:
    """Stream a chat completion from ``/api/chat`` chunk by chunk.

    The response body is read line by line; each non-empty line is parsed
    as a JSON object. Lines that are not valid JSON are skipped, and only
    objects containing a ``"message"`` key produce a chunk.

    Args:
        request: Supplies ``model_id``, ``messages``, ``temperature``,
            ``max_tokens`` (sent as ``num_predict``), ``top_p`` and the
            optional ``stop_sequences``.

    Yields:
        A ``StreamChunk`` per message line, carrying the message content
        (empty string when absent) and the line's ``done_reason`` if any.

    Raises:
        InferenceError: If the request or the stream fails with an
            ``httpx.HTTPError``. The original error is attached as
            ``__cause__``.
    """
    await self.initialize()
    assert self.client is not None
    try:
        payload = {
            "model": request.model_id,
            "messages": request.messages,
            "stream": True,
            "options": {
                "temperature": request.temperature,
                "num_predict": request.max_tokens,
                "top_p": request.top_p,
            },
        }
        # Only send "stop" when the caller actually provided sequences.
        if request.stop_sequences:
            payload["options"]["stop"] = request.stop_sequences

        async with self.client.stream("POST", "/api/chat", json=payload) as response:
            response.raise_for_status()
            async for line in response.aiter_lines():
                if not line:
                    continue
                # Guard only the parse: keeping the yield outside this try
                # means a JSONDecodeError raised by the consumer is not
                # mistaken for a malformed line and silently dropped.
                try:
                    data = json.loads(line)
                except json.JSONDecodeError:
                    continue
                if "message" in data:
                    yield StreamChunk(
                        content=data["message"].get("content", ""),
                        finish_reason=data.get("done_reason"),
                    )
    except httpx.HTTPError as e:
        # Chain explicitly so the transport error is kept as the cause.
        raise InferenceError(f"Ollama streaming failed: {e}") from e
|
|
197
|
+
|
|
198
|
+
async def embed(self, texts: list[str], model_id: str) -> list[list[float]]:
    """Generate one embedding vector per input text.

    Each text is sent in its own ``/api/embeddings`` request, one after the
    other, so the result order matches the order of ``texts``.

    Args:
        texts: Strings to embed. An empty list yields an empty result
            without issuing any request.
        model_id: Name of the model passed as ``"model"`` in each request.

    Returns:
        The ``"embedding"`` value of every response, in input order.

    Raises:
        InferenceError: If any request fails with an ``httpx.HTTPError``.
            The original error is attached as ``__cause__``.
    """
    await self.initialize()
    assert self.client is not None
    embeddings: list[list[float]] = []
    try:
        for text in texts:
            resp = await self.client.post(
                "/api/embeddings", json={"model": model_id, "prompt": text}
            )
            resp.raise_for_status()
            embeddings.append(resp.json()["embedding"])
        return embeddings
    except httpx.HTTPError as e:
        # Chain explicitly so the transport error is kept as the cause.
        raise InferenceError(f"Ollama embedding failed: {e}") from e
|
|
213
|
+
|
|
214
|
+
async def health_check(self) -> ProviderHealth:
    """Probe the Ollama root endpoint and translate the outcome to a health state.

    Returns:
        ``ProviderHealth.HEALTHY`` when ``GET /`` answers with status 200,
        ``ProviderHealth.DEGRADED`` for any other status code, and
        ``ProviderHealth.UNAVAILABLE`` when the probe raises.
    """
    await self.initialize()
    assert self.client is not None
    try:
        probe = await self.client.get("/")
        reachable_ok = probe.status_code == 200
        return ProviderHealth.HEALTHY if reachable_ok else ProviderHealth.DEGRADED
    except Exception:
        # Any failure during the probe is reported as unavailable
        # instead of being propagated to the caller.
        return ProviderHealth.UNAVAILABLE
|
|
225
|
+
|
|
226
|
+
def get_capabilities(self) -> ProviderCapabilities:
    """Return the capabilities object stored on this provider."""
    capabilities = self._capabilities
    return capabilities
|
|
228
|
+
|
|
229
|
+
async def shutdown(self) -> None:
    """Close the HTTP client, if one exists, and drop the reference to it."""
    if not self.client:
        # Nothing to close.
        return
    await self.client.aclose()
    self.client = None
|