parishad-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries, as they appear in those registries. It is provided for informational purposes only.
- parishad/__init__.py +70 -0
- parishad/__main__.py +10 -0
- parishad/checker/__init__.py +25 -0
- parishad/checker/deterministic.py +644 -0
- parishad/checker/ensemble.py +496 -0
- parishad/checker/retrieval.py +546 -0
- parishad/cli/__init__.py +6 -0
- parishad/cli/code.py +3254 -0
- parishad/cli/main.py +1158 -0
- parishad/cli/prarambh.py +99 -0
- parishad/cli/sthapana.py +368 -0
- parishad/config/modes.py +139 -0
- parishad/config/pipeline.core.yaml +128 -0
- parishad/config/pipeline.extended.yaml +172 -0
- parishad/config/pipeline.fast.yaml +89 -0
- parishad/config/user_config.py +115 -0
- parishad/data/catalog.py +118 -0
- parishad/data/models.json +108 -0
- parishad/memory/__init__.py +79 -0
- parishad/models/__init__.py +181 -0
- parishad/models/backends/__init__.py +247 -0
- parishad/models/backends/base.py +211 -0
- parishad/models/backends/huggingface.py +318 -0
- parishad/models/backends/llama_cpp.py +239 -0
- parishad/models/backends/mlx_lm.py +141 -0
- parishad/models/backends/ollama.py +253 -0
- parishad/models/backends/openai_api.py +193 -0
- parishad/models/backends/transformers_hf.py +198 -0
- parishad/models/costs.py +385 -0
- parishad/models/downloader.py +1557 -0
- parishad/models/optimizations.py +871 -0
- parishad/models/profiles.py +610 -0
- parishad/models/reliability.py +876 -0
- parishad/models/runner.py +651 -0
- parishad/models/tokenization.py +287 -0
- parishad/orchestrator/__init__.py +24 -0
- parishad/orchestrator/config_loader.py +210 -0
- parishad/orchestrator/engine.py +1113 -0
- parishad/orchestrator/exceptions.py +14 -0
- parishad/roles/__init__.py +71 -0
- parishad/roles/base.py +712 -0
- parishad/roles/dandadhyaksha.py +163 -0
- parishad/roles/darbari.py +246 -0
- parishad/roles/majumdar.py +274 -0
- parishad/roles/pantapradhan.py +150 -0
- parishad/roles/prerak.py +357 -0
- parishad/roles/raja.py +345 -0
- parishad/roles/sacheev.py +203 -0
- parishad/roles/sainik.py +427 -0
- parishad/roles/sar_senapati.py +164 -0
- parishad/roles/vidushak.py +69 -0
- parishad/tools/__init__.py +7 -0
- parishad/tools/base.py +57 -0
- parishad/tools/fs.py +110 -0
- parishad/tools/perception.py +96 -0
- parishad/tools/retrieval.py +74 -0
- parishad/tools/shell.py +103 -0
- parishad/utils/__init__.py +7 -0
- parishad/utils/hardware.py +122 -0
- parishad/utils/logging.py +79 -0
- parishad/utils/scanner.py +164 -0
- parishad/utils/text.py +61 -0
- parishad/utils/tracing.py +133 -0
- parishad-0.1.0.dist-info/METADATA +256 -0
- parishad-0.1.0.dist-info/RECORD +68 -0
- parishad-0.1.0.dist-info/WHEEL +4 -0
- parishad-0.1.0.dist-info/entry_points.txt +2 -0
- parishad-0.1.0.dist-info/licenses/LICENSE +21 -0
parishad/models/backends/base.py
@@ -0,0 +1,211 @@
"""
Base classes and types for Parishad backends.

This module contains:
- BackendError: Exception for backend failures
- BackendConfig: Configuration dataclass
- BackendResult: Result dataclass
- ModelBackend: Protocol for backend implementations
- BaseBackend: Abstract base class
"""

from __future__ import annotations

import logging
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from typing import Any, Protocol, runtime_checkable

logger = logging.getLogger(__name__)


class BackendError(Exception):
    """
    Raised when a backend operation fails.

    Attributes:
        backend_name: Name of the backend that failed
        model_id: Model identifier (if known)
        original_error: The underlying exception
    """

    def __init__(
        self,
        message: str,
        backend_name: str = "",
        model_id: str = "",
        original_error: Exception | None = None,
    ):
        super().__init__(message)
        self.backend_name = backend_name
        self.model_id = model_id
        self.original_error = original_error


@dataclass
class BackendConfig:
    """
    Configuration for a model backend.

    This is a unified config structure that backends can use.
    Backend-specific options go in `extra`.
    """

    model_id: str
    """Model identifier (path, HuggingFace ID, or API model name)."""

    context_length: int = 4096
    """Maximum context window size in tokens."""

    temperature: float = 0.5
    """Default sampling temperature."""

    top_p: float = 0.9
    """Default nucleus sampling parameter."""

    max_tokens: int = 1024
    """Default maximum tokens to generate."""

    stop: list[str] | None = None
    """Default stop sequences."""

    timeout: float = 120.0
    """Request timeout in seconds."""

    extra: dict[str, Any] = field(default_factory=dict)
    """Backend-specific options (e.g., n_gpu_layers for llama.cpp)."""


@dataclass
class BackendResult:
    """
    Result from a backend generation call.

    All backends must return this structure for consistent handling.
    """

    text: str
    """Generated text content."""

    tokens_in: int
    """Number of input/prompt tokens."""

    tokens_out: int
    """Number of output/generated tokens."""

    model_id: str = ""
    """Model identifier used for generation."""

    finish_reason: str = "stop"
    """Why generation stopped: 'stop', 'length', 'error'."""

    latency_ms: float = 0.0
    """Generation latency in milliseconds."""

    extra: dict[str, Any] = field(default_factory=dict)
    """Backend-specific metadata."""

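The two dataclasses above form the whole contract between callers and backends. A minimal sketch of how they fit together; the model path and the n_gpu_layers value are illustrative, not package defaults:

# Sketch (hypothetical values): assembling a BackendConfig and reading a BackendResult.
from parishad.models.backends.base import BackendConfig, BackendResult

config = BackendConfig(
    model_id="models/example-7b.Q4_K_M.gguf",  # hypothetical local path
    context_length=8192,
    temperature=0.2,
    extra={"n_gpu_layers": 32},  # backend-specific option (llama.cpp style)
)

result = BackendResult(text="hello", tokens_in=12, tokens_out=2)
print(result.finish_reason, result.latency_ms)  # defaults: "stop", 0.0
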
@runtime_checkable
class ModelBackend(Protocol):
    """
    Protocol for model backend implementations.

    All backends must implement these methods to be usable by ModelRunner.
    """

    @property
    def name(self) -> str:
        """Backend name (e.g., 'llama_cpp', 'openai', 'stub')."""
        ...

    @property
    def is_loaded(self) -> bool:
        """Whether the backend is ready to generate."""
        ...

    @property
    def model_id(self) -> str:
        """Current model identifier."""
        ...

    def load(self, config: BackendConfig) -> None:
        """Load/initialize the backend with the given config."""
        ...

    def generate(
        self,
        prompt: str,
        max_tokens: int,
        temperature: float,
        top_p: float,
        stop: list[str] | None = None,
    ) -> BackendResult:
        """Generate text completion."""
        ...

    def unload(self) -> None:
        """Unload the model to free resources."""
        ...

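Because the protocol is decorated with @runtime_checkable, conformance can be checked structurally at runtime. A small sketch; the _Dummy class is hypothetical, and isinstance only verifies that the members exist, not their signatures:

# Sketch: structural check against the ModelBackend protocol.
from parishad.models.backends.base import ModelBackend

class _Dummy:
    name = "dummy"
    is_loaded = False
    model_id = ""
    def load(self, config): ...
    def generate(self, prompt, max_tokens, temperature, top_p, stop=None): ...
    def unload(self): ...

print(isinstance(_Dummy(), ModelBackend))  # True: members are present, even without BaseBackend
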
class BaseBackend(ABC):
    """
    Abstract base class for backend implementations.
    """

    _name: str = "base"
    _model_id: str = ""
    _loaded: bool = False
    _config: BackendConfig | None = None

    @property
    def name(self) -> str:
        """Backend name."""
        return self._name

    @property
    def is_loaded(self) -> bool:
        """Whether backend is ready."""
        return self._loaded

    @property
    def model_id(self) -> str:
        """Current model ID."""
        return self._model_id

    @abstractmethod
    def load(self, config: BackendConfig) -> None:
        """Load the backend. Must be implemented by subclasses."""
        pass

    @abstractmethod
    def generate(
        self,
        prompt: str,
        max_tokens: int,
        temperature: float,
        top_p: float,
        stop: list[str] | None = None,
    ) -> BackendResult:
        """Generate text. Must be implemented by subclasses."""
        pass

    def unload(self) -> None:
        """Default unload implementation."""
        self._loaded = False
        self._model_id = ""
        self._config = None

    def _estimate_tokens(self, text: str) -> int:
        """
        Cheap token estimation heuristic.

        Uses word count * 1.3 as a rough approximation.
        Override in subclasses for more accurate counting.
        """
        if not text:
            return 0
        # Rough approximation: ~1.3 tokens per word for English
        words = len(text.split())
        return int(words * 1.3)

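A minimal sketch of a concrete backend built on BaseBackend; the EchoBackend class and its model id are hypothetical and exist only to illustrate the load/generate/unload contract and the _estimate_tokens fallback:

# Sketch: a toy backend that "generates" by echoing a prefix of the prompt.
import time

from parishad.models.backends.base import BackendConfig, BackendResult, BaseBackend

class EchoBackend(BaseBackend):
    _name = "echo"

    def load(self, config: BackendConfig) -> None:
        self._config = config
        self._model_id = config.model_id
        self._loaded = True

    def generate(self, prompt, max_tokens, temperature, top_p, stop=None) -> BackendResult:
        start = time.perf_counter()
        text = prompt[:max_tokens]  # "generation" is just a truncated echo
        return BackendResult(
            text=text,
            tokens_in=self._estimate_tokens(prompt),
            tokens_out=self._estimate_tokens(text),
            model_id=self._model_id,
            latency_ms=(time.perf_counter() - start) * 1000,
        )

backend = EchoBackend()
backend.load(BackendConfig(model_id="echo-test"))  # hypothetical model id
print(backend.generate("hello world", 8, 0.0, 1.0).text)
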
parishad/models/backends/huggingface.py
@@ -0,0 +1,318 @@
"""
HuggingFace backends for inference.

Provides:
- HuggingFaceBackend: Uses the HuggingFace Inference API (cloud) via text_generation
- HuggingFaceChatBackend: Uses the HuggingFace Inference API via the chat_completion endpoint

(Local inference with transformers is handled by transformers_hf.py, not this module.)
"""

from __future__ import annotations

import logging
import os
import time

from .base import BackendConfig, BackendError, BackendResult, BaseBackend

logger = logging.getLogger(__name__)

# Lazy imports
_huggingface_hub = None


def _get_huggingface_hub():
    """Lazy import of huggingface_hub."""
    global _huggingface_hub
    if _huggingface_hub is None:
        try:
            import huggingface_hub
            _huggingface_hub = huggingface_hub
        except ImportError:
            raise ImportError(
                "huggingface_hub is required for HuggingFaceBackend. "
                "Install with: pip install huggingface_hub"
            )
    return _huggingface_hub


class HuggingFaceBackend(BaseBackend):
    """
    Backend for HuggingFace Inference API (cloud-based).

    Uses HuggingFace's serverless Inference API or dedicated Inference Endpoints.
    Requires HF_TOKEN environment variable or token in config.
    """

    _name = "huggingface"

    def __init__(self):
        """Initialize HuggingFaceBackend."""
        super().__init__()
        self._client = None

    def load(self, config: BackendConfig) -> None:
        """Initialize HuggingFace Inference client."""
        hf = _get_huggingface_hub()

        extra = config.extra or {}

        # Get token
        token = extra.get("token") or os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_TOKEN")

        if not token:
            logger.warning(
                "No HuggingFace token found. Some models may not be accessible. "
                "Set HF_TOKEN environment variable."
            )

        try:
            # Check if it's an Inference Endpoint URL or model ID
            model_id = config.model_id

            if model_id.startswith("https://"):
                # Dedicated Inference Endpoint
                self._client = hf.InferenceClient(
                    model=model_id,
                    token=token,
                    timeout=config.timeout,
                )
                logger.info("✅ Connected to HuggingFace Inference Endpoint")
            else:
                # Serverless Inference API
                self._client = hf.InferenceClient(
                    model=model_id,
                    token=token,
                    timeout=config.timeout,
                )
                logger.info(f"✅ Using HuggingFace Serverless API for {model_id}")

            self._config = config
            self._model_id = model_id
            self._loaded = True

        except Exception as e:
            raise BackendError(
                f"Failed to initialize HuggingFace client: {e}",
                backend_name=self._name,
                model_id=config.model_id,
                original_error=e,
            )

    def generate(
        self,
        prompt: str,
        max_tokens: int,
        temperature: float,
        top_p: float,
        stop: list[str] | None = None,
    ) -> BackendResult:
        """Generate text using HuggingFace Inference API."""
        if not self._loaded or self._client is None:
            raise BackendError(
                "Client not initialized",
                backend_name=self._name,
                model_id=self._model_id,
            )

        start_time = time.perf_counter()

        try:
            # Use text_generation for LLMs
            response = self._client.text_generation(
                prompt=prompt,
                max_new_tokens=max_tokens,
                temperature=max(temperature, 0.01),
                top_p=top_p,
                stop_sequences=stop or [],
                return_full_text=False,  # Only return generated text
                details=True,  # Get token counts
            )

            # Extract text and details
            if hasattr(response, 'generated_text'):
                text = response.generated_text
                tokens_out = response.details.generated_tokens if hasattr(response, 'details') else self._estimate_tokens(text)
                finish_reason = response.details.finish_reason if hasattr(response, 'details') else "stop"
            else:
                # Simple string response
                text = str(response)
                tokens_out = self._estimate_tokens(text)
                finish_reason = "stop"

            tokens_in = self._estimate_tokens(prompt)
            latency_ms = (time.perf_counter() - start_time) * 1000

            return BackendResult(
                text=text,
                tokens_in=tokens_in,
                tokens_out=tokens_out,
                model_id=self._model_id,
                finish_reason=finish_reason,
                latency_ms=latency_ms,
            )

        except Exception as e:
            # Check for common HF API errors
            error_msg = str(e)
            if "429" in error_msg:
                error_msg = f"Rate limited by HuggingFace API. Try again later. {e}"
            elif "401" in error_msg or "403" in error_msg:
                error_msg = f"Authentication failed. Check your HF_TOKEN. {e}"
            elif "Model is currently loading" in error_msg:
                error_msg = f"Model is loading on HuggingFace servers. Retry in ~30s. {e}"

            raise BackendError(
                f"HuggingFace generation failed: {error_msg}",
                backend_name=self._name,
                model_id=self._model_id,
                original_error=e,
            )

    def unload(self) -> None:
        """Close the client."""
        self._client = None
        super().unload()

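A usage sketch for the backend above, assuming huggingface_hub is installed and an HF_TOKEN is available; the model id is only an example, not something the package requires:

# Sketch: driving HuggingFaceBackend directly (makes a network call when run).
from parishad.models.backends.base import BackendConfig
from parishad.models.backends.huggingface import HuggingFaceBackend

backend = HuggingFaceBackend()
backend.load(BackendConfig(
    model_id="mistralai/Mistral-7B-Instruct-v0.2",  # example hosted model
    timeout=60.0,                                   # or pass extra={"token": "..."} instead of HF_TOKEN
))

result = backend.generate("Say hi.", max_tokens=32, temperature=0.2, top_p=0.9)
print(result.text, result.finish_reason, int(result.latency_ms), "ms")
backend.unload()
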
class HuggingFaceChatBackend(BaseBackend):
    """
    Backend for HuggingFace Inference API with chat/conversation support.

    Uses the chat_completion endpoint for models that support it.
    """

    _name = "huggingface_chat"

    def __init__(self):
        """Initialize HuggingFaceChatBackend."""
        super().__init__()
        self._client = None

    def load(self, config: BackendConfig) -> None:
        """Initialize HuggingFace Inference client."""
        hf = _get_huggingface_hub()

        extra = config.extra or {}
        token = extra.get("token") or os.environ.get("HF_TOKEN")

        try:
            self._client = hf.InferenceClient(
                model=config.model_id,
                token=token,
                timeout=config.timeout,
            )

            self._config = config
            self._model_id = config.model_id
            self._loaded = True

        except Exception as e:
            raise BackendError(
                f"Failed to initialize HuggingFace chat client: {e}",
                backend_name=self._name,
                model_id=config.model_id,
                original_error=e,
            )

    def generate(
        self,
        prompt: str,
        max_tokens: int,
        temperature: float,
        top_p: float,
        stop: list[str] | None = None,
    ) -> BackendResult:
        """Generate text using HuggingFace chat completion."""
        if not self._loaded or self._client is None:
            raise BackendError(
                "Client not initialized",
                backend_name=self._name,
                model_id=self._model_id,
            )

        start_time = time.perf_counter()

        try:
            # Parse prompt into messages
            messages = self._parse_prompt_to_messages(prompt)

            response = self._client.chat_completion(
                messages=messages,
                max_tokens=max_tokens,
                temperature=max(temperature, 0.01),
                top_p=top_p,
                stop=stop,
            )

            # Extract response
            choice = response.choices[0]
            text = choice.message.content or ""
            finish_reason = choice.finish_reason or "stop"

            # Token counts
            if hasattr(response, 'usage') and response.usage:
                tokens_in = response.usage.prompt_tokens
                tokens_out = response.usage.completion_tokens
            else:
                tokens_in = self._estimate_tokens(prompt)
                tokens_out = self._estimate_tokens(text)

            latency_ms = (time.perf_counter() - start_time) * 1000

            return BackendResult(
                text=text,
                tokens_in=tokens_in,
                tokens_out=tokens_out,
                model_id=self._model_id,
                finish_reason=finish_reason,
                latency_ms=latency_ms,
            )

        except Exception as e:
            raise BackendError(
                f"HuggingFace chat generation failed: {e}",
                backend_name=self._name,
                model_id=self._model_id,
                original_error=e,
            )

    def _parse_prompt_to_messages(self, prompt: str) -> list[dict]:
        """Parse prompt string into message format."""
        messages = []

        # Try to parse Llama-3 format
        if "<|start_header_id|>" in prompt:
            parts = prompt.split("<|start_header_id|>")
            for part in parts:
                if part.startswith("system"):
                    content = part.split("<|end_header_id|>")[1].split("<|eot_id|>")[0].strip()
                    if content:
                        messages.append({"role": "system", "content": content})
                elif part.startswith("user"):
                    content = part.split("<|end_header_id|>")[1].split("<|eot_id|>")[0].strip()
                    if content:
                        messages.append({"role": "user", "content": content})
                elif part.startswith("assistant"):
                    content = part.split("<|end_header_id|>")[1].split("<|eot_id|>")[0].strip()
                    if content:
                        messages.append({"role": "assistant", "content": content})
        elif "System:" in prompt and "User:" in prompt:
            # Simple format
            parts = prompt.split("User:", 1)
            system = parts[0].replace("System:", "").strip()
            user = parts[1].strip() if len(parts) > 1 else ""

            if system:
                messages.append({"role": "system", "content": system})
            if user:
                messages.append({"role": "user", "content": user})
        else:
            # Single user message
            messages.append({"role": "user", "content": prompt})

        return messages

    def unload(self) -> None:
        """Close client."""
        self._client = None
        super().unload()
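A short sketch of what the prompt parsing above yields for the simple "System: ... User: ..." format (values are illustrative):

# Sketch: inspecting the message list produced by _parse_prompt_to_messages.
from parishad.models.backends.huggingface import HuggingFaceChatBackend

chat = HuggingFaceChatBackend()
messages = chat._parse_prompt_to_messages("System: Be terse. User: What is 2+2?")
# -> [{"role": "system", "content": "Be terse."},
#     {"role": "user", "content": "What is 2+2?"}]
print(messages)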