eval-ai-library 0.3.1__py3-none-any.whl → 0.3.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of eval-ai-library might be problematic. Click here for more details.
- {eval_ai_library-0.3.1.dist-info → eval_ai_library-0.3.3.dist-info}/METADATA +214 -1
- {eval_ai_library-0.3.1.dist-info → eval_ai_library-0.3.3.dist-info}/RECORD +7 -7
- eval_lib/__init__.py +10 -6
- eval_lib/llm_client.py +47 -1
- {eval_ai_library-0.3.1.dist-info → eval_ai_library-0.3.3.dist-info}/WHEEL +0 -0
- {eval_ai_library-0.3.1.dist-info → eval_ai_library-0.3.3.dist-info}/licenses/LICENSE +0 -0
- {eval_ai_library-0.3.1.dist-info → eval_ai_library-0.3.3.dist-info}/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: eval-ai-library
|
|
3
|
-
Version: 0.3.1
|
|
3
|
+
Version: 0.3.3
|
|
4
4
|
Summary: Comprehensive AI Model Evaluation Framework with support for multiple LLM providers
|
|
5
5
|
Author-email: Aleksandr Meshkov <alekslynx90@gmail.com>
|
|
6
6
|
License: MIT
|
|
@@ -807,6 +807,219 @@ response, cost = await chat_complete(
|
|
|
807
807
|
)
|
|
808
808
|
```
|
|
809
809
|
|
|
810
|
+
## Custom LLM Providers
|
|
811
|
+
|
|
812
|
+
The library supports custom LLM providers through the `CustomLLMClient` abstract base class. This allows you to integrate any LLM provider, including internal corporate models, locally-hosted models, or custom endpoints.
|
|
813
|
+
|
|
814
|
+
### Creating a Custom Provider
|
|
815
|
+
|
|
816
|
+
Implement the `CustomLLMClient` interface:
|
|
817
|
+
```python
|
|
818
|
+
from eval_lib import CustomLLMClient
|
|
819
|
+
from typing import Optional
|
|
820
|
+
from openai import AsyncOpenAI
|
|
821
|
+
|
|
822
|
+
class InternalLLMClient(CustomLLMClient):
|
|
823
|
+
"""Client for internal corporate LLM or custom endpoint"""
|
|
824
|
+
|
|
825
|
+
def __init__(
|
|
826
|
+
self,
|
|
827
|
+
endpoint: str,
|
|
828
|
+
model: str,
|
|
829
|
+
api_key: Optional[str] = None,
|
|
830
|
+
temperature: float = 0.0
|
|
831
|
+
):
|
|
832
|
+
"""
|
|
833
|
+
Args:
|
|
834
|
+
endpoint: Your internal LLM endpoint URL (e.g., "https://internal-llm.company.com/v1")
|
|
835
|
+
model: Model name to use
|
|
836
|
+
api_key: API key if required (optional for local models)
|
|
837
|
+
temperature: Default temperature
|
|
838
|
+
"""
|
|
839
|
+
self.endpoint = endpoint
|
|
840
|
+
self.model = model
|
|
841
|
+
self.api_key = api_key or "not-needed" # Some endpoints don't need auth
|
|
842
|
+
|
|
843
|
+
self.client = AsyncOpenAI(
|
|
844
|
+
api_key=self.api_key,
|
|
845
|
+
base_url=self.endpoint
|
|
846
|
+
)
|
|
847
|
+
|
|
848
|
+
async def chat_complete(
|
|
849
|
+
self,
|
|
850
|
+
messages: list[dict[str, str]],
|
|
851
|
+
temperature: float
|
|
852
|
+
) -> tuple[str, Optional[float]]:
|
|
853
|
+
"""Generate response from internal LLM"""
|
|
854
|
+
response = await self.client.chat.completions.create(
|
|
855
|
+
model=self.model,
|
|
856
|
+
messages=messages,
|
|
857
|
+
temperature=temperature,
|
|
858
|
+
)
|
|
859
|
+
text = response.choices[0].message.content.strip()
|
|
860
|
+
cost = None # Internal models typically don't have API costs
|
|
861
|
+
return text, cost
|
|
862
|
+
|
|
863
|
+
def get_model_name(self) -> str:
|
|
864
|
+
"""Return model name for logging"""
|
|
865
|
+
return f"internal:{self.model}"
|
|
866
|
+
```
|
|
867
|
+
|
|
868
|
+
### Using Custom Providers
|
|
869
|
+
|
|
870
|
+
Use your custom provider in any metric:
|
|
871
|
+
```python
|
|
872
|
+
import asyncio
|
|
873
|
+
from eval_lib import (
|
|
874
|
+
evaluate,
|
|
875
|
+
EvalTestCase,
|
|
876
|
+
AnswerRelevancyMetric,
|
|
877
|
+
FaithfulnessMetric
|
|
878
|
+
)
|
|
879
|
+
|
|
880
|
+
# Create custom internal LLM client
|
|
881
|
+
internal_llm = InternalLLMClient(
|
|
882
|
+
endpoint="https://internal-llm.company.com/v1",
|
|
883
|
+
model="company-gpt-v2",
|
|
884
|
+
api_key="your-internal-key" # Optional
|
|
885
|
+
)
|
|
886
|
+
|
|
887
|
+
# Use in metrics
|
|
888
|
+
test_cases = [
|
|
889
|
+
EvalTestCase(
|
|
890
|
+
input="What is the capital of France?",
|
|
891
|
+
actual_output="Paris is the capital.",
|
|
892
|
+
expected_output="Paris",
|
|
893
|
+
retrieval_context=["Paris is the capital of France."]
|
|
894
|
+
)
|
|
895
|
+
]
|
|
896
|
+
|
|
897
|
+
metrics = [
|
|
898
|
+
AnswerRelevancyMetric(
|
|
899
|
+
model=internal_llm, # ← Your custom LLM
|
|
900
|
+
threshold=0.7
|
|
901
|
+
),
|
|
902
|
+
FaithfulnessMetric(
|
|
903
|
+
model=internal_llm, # ← Same custom client
|
|
904
|
+
threshold=0.8
|
|
905
|
+
)
|
|
906
|
+
]
|
|
907
|
+
|
|
908
|
+
async def run_evaluation():
|
|
909
|
+
results = await evaluate(
|
|
910
|
+
test_cases=test_cases,
|
|
911
|
+
metrics=metrics,
|
|
912
|
+
verbose=True
|
|
913
|
+
)
|
|
914
|
+
return results
|
|
915
|
+
|
|
916
|
+
asyncio.run(run_evaluation())
|
|
917
|
+
```
|
|
918
|
+
|
|
919
|
+
### Mixing Standard and Custom Providers
|
|
920
|
+
|
|
921
|
+
You can mix standard and custom providers in the same evaluation:
|
|
922
|
+
```python
|
|
923
|
+
# Create custom provider
|
|
924
|
+
internal_llm = InternalLLMClient(
|
|
925
|
+
endpoint="https://internal-llm.company.com/v1",
|
|
926
|
+
model="company-model"
|
|
927
|
+
)
|
|
928
|
+
|
|
929
|
+
# Mix standard OpenAI and custom internal LLM
|
|
930
|
+
metrics = [
|
|
931
|
+
AnswerRelevancyMetric(
|
|
932
|
+
model="gpt-4o-mini", # ← Standard OpenAI
|
|
933
|
+
threshold=0.7
|
|
934
|
+
),
|
|
935
|
+
FaithfulnessMetric(
|
|
936
|
+
model=internal_llm, # ← Custom internal LLM
|
|
937
|
+
threshold=0.8
|
|
938
|
+
),
|
|
939
|
+
ContextualRelevancyMetric(
|
|
940
|
+
model="anthropic:claude-sonnet-4-0", # ← Standard Anthropic
|
|
941
|
+
threshold=0.7
|
|
942
|
+
)
|
|
943
|
+
]
|
|
944
|
+
|
|
945
|
+
results = await evaluate(test_cases=test_cases, metrics=metrics)
|
|
946
|
+
```
|
|
947
|
+
|
|
948
|
+
### Custom Provider Use Cases
|
|
949
|
+
|
|
950
|
+
**When to use custom providers:**
|
|
951
|
+
|
|
952
|
+
1. **Internal Corporate LLMs**: Connect to your company's proprietary models
|
|
953
|
+
2. **Local Models**: Integrate locally-hosted models (vLLM, TGI, LM Studio, Ollama with custom setup)
|
|
954
|
+
3. **Fine-tuned Models**: Use your own fine-tuned models hosted anywhere
|
|
955
|
+
4. **Research Models**: Connect to experimental or research models
|
|
956
|
+
5. **Custom Endpoints**: Any LLM accessible via HTTP endpoint
|
|
957
|
+
|
|
958
|
+
**Example: Local Model with vLLM**
|
|
959
|
+
```python
|
|
960
|
+
# vLLM server running on localhost:8000
|
|
961
|
+
local_model = InternalLLMClient(
|
|
962
|
+
endpoint="http://localhost:8000/v1",
|
|
963
|
+
model="meta-llama/Llama-2-7b-chat",
|
|
964
|
+
api_key=None # Local models don't need auth
|
|
965
|
+
)
|
|
966
|
+
|
|
967
|
+
# Use in evaluation
|
|
968
|
+
metric = AnswerRelevancyMetric(model=local_model, threshold=0.7)
|
|
969
|
+
```
|
|
970
|
+
|
|
971
|
+
**Example: Corporate Internal Model**
|
|
972
|
+
```python
|
|
973
|
+
# Company's internal LLM with authentication
|
|
974
|
+
company_model = InternalLLMClient(
|
|
975
|
+
endpoint="https://ai-platform.company.internal/api/v1",
|
|
976
|
+
model="company-gpt-enterprise",
|
|
977
|
+
api_key="internal-api-key-here"
|
|
978
|
+
)
|
|
979
|
+
|
|
980
|
+
# Use in evaluation
|
|
981
|
+
metrics = [
|
|
982
|
+
AnswerRelevancyMetric(model=company_model, threshold=0.7),
|
|
983
|
+
FaithfulnessMetric(model=company_model, threshold=0.8)
|
|
984
|
+
]
|
|
985
|
+
```
|
|
986
|
+
|
|
987
|
+
**Key Requirements:**
|
|
988
|
+
|
|
989
|
+
1. **`async def chat_complete()`** - Must be async and return `(str, Optional[float])`
|
|
990
|
+
2. **`def get_model_name()`** - Return string identifier for logging
|
|
991
|
+
3. **Error Handling** - Handle connection and API errors appropriately
|
|
992
|
+
4. **Cost** - Return `None` for cost if not applicable (e.g., internal/local models)
|
|
993
|
+
|
|
994
|
+
### Advanced: Custom Authentication
|
|
995
|
+
|
|
996
|
+
For custom authentication schemes:
|
|
997
|
+
```python
|
|
998
|
+
class CustomAuthLLMClient(CustomLLMClient):
|
|
999
|
+
"""Client with custom authentication"""
|
|
1000
|
+
|
|
1001
|
+
def __init__(self, endpoint: str, auth_token: str):
|
|
1002
|
+
self.endpoint = endpoint
|
|
1003
|
+
self.headers = {
|
|
1004
|
+
"Authorization": f"Bearer {auth_token}",
|
|
1005
|
+
"X-Custom-Header": "value"
|
|
1006
|
+
}
|
|
1007
|
+
# Use aiohttp or httpx for custom auth
|
|
1008
|
+
import aiohttp
|
|
1009
|
+
self.session = aiohttp.ClientSession(headers=self.headers)
|
|
1010
|
+
|
|
1011
|
+
async def chat_complete(self, messages, temperature):
|
|
1012
|
+
async with self.session.post(
|
|
1013
|
+
f"{self.endpoint}/chat",
|
|
1014
|
+
json={"messages": messages, "temperature": temperature}
|
|
1015
|
+
) as response:
|
|
1016
|
+
data = await response.json()
|
|
1017
|
+
return data["content"], None
|
|
1018
|
+
|
|
1019
|
+
def get_model_name(self):
|
|
1020
|
+
return "custom-auth-model"
|
|
1021
|
+
```
|
|
1022
|
+
|
|
810
1023
|
## Test Data Generation
|
|
811
1024
|
|
|
812
1025
|
The library includes a powerful test data generator that can create realistic test cases either from scratch or based on your documents.
|
|
@@ -1,8 +1,8 @@
|
|
|
1
|
-
eval_ai_library-0.3.1.dist-info/licenses/LICENSE,sha256=rK9uLDgWNrCHNdp-Zma_XghDE7Fs0u0kDi3WMcmYx6w,1074
|
|
2
|
-
eval_lib/__init__.py,sha256=
|
|
1
|
+
eval_ai_library-0.3.3.dist-info/licenses/LICENSE,sha256=rK9uLDgWNrCHNdp-Zma_XghDE7Fs0u0kDi3WMcmYx6w,1074
|
|
2
|
+
eval_lib/__init__.py,sha256=ySdAQb2DQma2y-ERuFv3VQEAq3S8d8G4vORfo__aqfk,3087
|
|
3
3
|
eval_lib/evaluate.py,sha256=GjlXZb5dnl44LCaJwdkyGCYcC50zoNZn3NrofzNAVJ0,11490
|
|
4
4
|
eval_lib/evaluation_schema.py,sha256=7IDd_uozqewhh7k0p1hKut_20udvRxxkV6thclxKUg0,1904
|
|
5
|
-
eval_lib/llm_client.py,sha256=
|
|
5
|
+
eval_lib/llm_client.py,sha256=eeTVhCLR1uYbhqOEOSBt3wWPKuzgzA9v8m0F9f-4Gqg,14910
|
|
6
6
|
eval_lib/metric_pattern.py,sha256=wULgMNDeAqJC_Qjglo7bYzY2eGhA_PmY_hA_qGfg0sI,11730
|
|
7
7
|
eval_lib/price.py,sha256=jbmkkUTxPuXrkSHuaJYPl7jSzfDIzQ9p_swWWs26UJ0,1986
|
|
8
8
|
eval_lib/py.typed,sha256=8PjyZ1aVoQpRVvt71muvuq5qE-jTFZkK-GLHkhdebmc,26
|
|
@@ -28,7 +28,7 @@ eval_lib/metrics/faithfulness_metric/faithfulness.py,sha256=OqamlhTOps7d-NOStSIK
|
|
|
28
28
|
eval_lib/metrics/geval/geval.py,sha256=mNciHXnqU2drOJsWlYmbwftGiKM89-Ykw2f6XneIGBM,10629
|
|
29
29
|
eval_lib/metrics/restricted_refusal_metric/restricted_refusal.py,sha256=4QqYgGMcp6W9Lw-v4s0AlUhMSOKvBOEgnLvhqVXaT9I,4286
|
|
30
30
|
eval_lib/metrics/toxicity_metric/toxicity.py,sha256=rBE1_fvpbCRdBpBep1y1LTIhofKR8GD4Eh76EOYzxL0,4076
|
|
31
|
-
eval_ai_library-0.3.
|
|
32
|
-
eval_ai_library-0.3.
|
|
33
|
-
eval_ai_library-0.3.
|
|
34
|
-
eval_ai_library-0.3.
|
|
31
|
+
eval_ai_library-0.3.3.dist-info/METADATA,sha256=S6nodzMnFB5T1Gvtsg19qi1TEwxGtwc9CqLaBWxgPnM,43879
|
|
32
|
+
eval_ai_library-0.3.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
33
|
+
eval_ai_library-0.3.3.dist-info/top_level.txt,sha256=uQHpEd2XI0oZgq1eCww9zMvVgDJgwXMWkCD45fYUzEg,9
|
|
34
|
+
eval_ai_library-0.3.3.dist-info/RECORD,,
|
eval_lib/__init__.py
CHANGED
|
@@ -7,7 +7,7 @@ A powerful library for evaluating AI models with support for multiple LLM provid
|
|
|
7
7
|
and a wide range of evaluation metrics for RAG systems and AI agents.
|
|
8
8
|
"""
|
|
9
9
|
|
|
10
|
-
__version__ = "0.3.1"
|
|
10
|
+
__version__ = "0.3.3"
|
|
11
11
|
__author__ = "Aleksandr Meshkov"
|
|
12
12
|
|
|
13
13
|
# Core evaluation functions
|
|
@@ -39,6 +39,7 @@ from eval_lib.llm_client import (
|
|
|
39
39
|
chat_complete,
|
|
40
40
|
get_embeddings,
|
|
41
41
|
LLMDescriptor,
|
|
42
|
+
CustomLLMClient,
|
|
42
43
|
Provider
|
|
43
44
|
)
|
|
44
45
|
|
|
@@ -68,12 +69,14 @@ from eval_lib.agent_metrics import (
|
|
|
68
69
|
|
|
69
70
|
def __getattr__(name):
|
|
70
71
|
"""
|
|
71
|
-
|
|
72
|
-
DataGenerator импортируется только когда реально используется.
|
|
72
|
+
Lazy loading for data generation components.
|
|
73
73
|
"""
|
|
74
|
-
if name == "
|
|
75
|
-
from eval_lib.datagenerator.datagenerator import
|
|
76
|
-
return
|
|
74
|
+
if name == "DatasetGenerator":
|
|
75
|
+
from eval_lib.datagenerator.datagenerator import DatasetGenerator
|
|
76
|
+
return DatasetGenerator
|
|
77
|
+
if name == "DataGenerator": # Alias for DatasetGenerator
|
|
78
|
+
from eval_lib.datagenerator.datagenerator import DatasetGenerator
|
|
79
|
+
return DatasetGenerator
|
|
77
80
|
if name == "DocumentLoader":
|
|
78
81
|
from eval_lib.datagenerator.document_loader import DocumentLoader
|
|
79
82
|
return DocumentLoader
|
|
@@ -104,6 +107,7 @@ __all__ = [
|
|
|
104
107
|
"chat_complete",
|
|
105
108
|
"get_embeddings",
|
|
106
109
|
"LLMDescriptor",
|
|
110
|
+
"CustomLLMClient",
|
|
107
111
|
"Provider",
|
|
108
112
|
|
|
109
113
|
# RAG Metrics
|
eval_lib/llm_client.py
CHANGED
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
import openai
|
|
3
3
|
import functools
|
|
4
4
|
import anthropic
|
|
5
|
+
from abc import ABC, abstractmethod
|
|
5
6
|
from openai import AsyncAzureOpenAI
|
|
6
7
|
from google import genai
|
|
7
8
|
from google.genai.types import GenerateContentConfig
|
|
@@ -13,6 +14,45 @@ from types import SimpleNamespace
|
|
|
13
14
|
from .price import model_pricing
|
|
14
15
|
|
|
15
16
|
|
|
17
|
+
class CustomLLMClient(ABC):
|
|
18
|
+
"""
|
|
19
|
+
Base class for custom LLM clients.
|
|
20
|
+
Inherit from this to create your own model implementations.
|
|
21
|
+
|
|
22
|
+
Example:
|
|
23
|
+
class MyCustomLLM(CustomLLMClient):
|
|
24
|
+
async def chat_complete(self, messages, temperature):
|
|
25
|
+
# Your implementation
|
|
26
|
+
return response_text, cost
|
|
27
|
+
|
|
28
|
+
def get_model_name(self):
|
|
29
|
+
return "my-custom-model"
|
|
30
|
+
"""
|
|
31
|
+
|
|
32
|
+
@abstractmethod
|
|
33
|
+
async def chat_complete(
|
|
34
|
+
self,
|
|
35
|
+
messages: list[dict[str, str]],
|
|
36
|
+
temperature: float
|
|
37
|
+
) -> tuple[str, Optional[float]]:
|
|
38
|
+
"""
|
|
39
|
+
Generate a response for the given messages.
|
|
40
|
+
|
|
41
|
+
Args:
|
|
42
|
+
messages: List of message dicts [{"role": "user", "content": "..."}]
|
|
43
|
+
temperature: Sampling temperature
|
|
44
|
+
|
|
45
|
+
Returns:
|
|
46
|
+
Tuple of (response_text, cost_in_usd)
|
|
47
|
+
"""
|
|
48
|
+
pass
|
|
49
|
+
|
|
50
|
+
@abstractmethod
|
|
51
|
+
def get_model_name(self) -> str:
|
|
52
|
+
"""Return the model name for logging/tracking purposes."""
|
|
53
|
+
pass
|
|
54
|
+
|
|
55
|
+
|
|
16
56
|
class LLMConfigurationError(Exception):
|
|
17
57
|
"""Raised when LLM client configuration is missing or invalid."""
|
|
18
58
|
pass
|
|
@@ -24,6 +64,7 @@ class Provider(str, Enum):
|
|
|
24
64
|
GOOGLE = "google"
|
|
25
65
|
OLLAMA = "ollama"
|
|
26
66
|
ANTHROPIC = "anthropic"
|
|
67
|
+
CUSTOM = "custom"
|
|
27
68
|
|
|
28
69
|
|
|
29
70
|
@dataclass(frozen=True, slots=True)
|
|
@@ -308,7 +349,7 @@ _HELPERS = {
|
|
|
308
349
|
|
|
309
350
|
|
|
310
351
|
async def chat_complete(
|
|
311
|
-
llm: str | tuple[str, str] | LLMDescriptor,
|
|
352
|
+
llm: str | tuple[str, str] | LLMDescriptor | CustomLLMClient,
|
|
312
353
|
messages: list[dict[str, str]],
|
|
313
354
|
temperature: float = 0.0,
|
|
314
355
|
):
|
|
@@ -327,6 +368,11 @@ async def chat_complete(
|
|
|
327
368
|
LLMConfigurationError: If required API keys or configuration are missing
|
|
328
369
|
ValueError: If provider is not supported
|
|
329
370
|
"""
|
|
371
|
+
# Handle custom LLM clients
|
|
372
|
+
if isinstance(llm, CustomLLMClient):
|
|
373
|
+
return await llm.chat_complete(messages, temperature)
|
|
374
|
+
|
|
375
|
+
# Standard providers
|
|
330
376
|
llm = LLMDescriptor.parse(llm)
|
|
331
377
|
helper = _HELPERS.get(llm.provider)
|
|
332
378
|
|
|
File without changes
|
|
File without changes
|
|
File without changes
|