eval-ai-library 0.3.1__py3-none-any.whl → 0.3.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of eval-ai-library might be problematic.

@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: eval-ai-library
- Version: 0.3.1
+ Version: 0.3.3
  Summary: Comprehensive AI Model Evaluation Framework with support for multiple LLM providers
  Author-email: Aleksandr Meshkov <alekslynx90@gmail.com>
  License: MIT
@@ -807,6 +807,219 @@ response, cost = await chat_complete(
  )
  ```
 
+ ## Custom LLM Providers
+
+ The library supports custom LLM providers through the `CustomLLMClient` abstract base class. This allows you to integrate any LLM provider, including internal corporate models, locally hosted models, and custom endpoints.
+
+ ### Creating a Custom Provider
+
+ Implement the `CustomLLMClient` interface:
+ ```python
+ from eval_lib import CustomLLMClient
+ from typing import Optional
+ from openai import AsyncOpenAI
+
+ class InternalLLMClient(CustomLLMClient):
+     """Client for an internal corporate LLM or custom endpoint"""
+
+     def __init__(
+         self,
+         endpoint: str,
+         model: str,
+         api_key: Optional[str] = None,
+         temperature: float = 0.0
+     ):
+         """
+         Args:
+             endpoint: Your internal LLM endpoint URL (e.g., "https://internal-llm.company.com/v1")
+             model: Model name to use
+             api_key: API key if required (optional for local models)
+             temperature: Default temperature
+         """
+         self.endpoint = endpoint
+         self.model = model
+         self.api_key = api_key or "not-needed"  # Some endpoints don't need auth
+         self.temperature = temperature  # Default; metrics pass a value per call
+
+         self.client = AsyncOpenAI(
+             api_key=self.api_key,
+             base_url=self.endpoint
+         )
+
+     async def chat_complete(
+         self,
+         messages: list[dict[str, str]],
+         temperature: float
+     ) -> tuple[str, Optional[float]]:
+         """Generate a response from the internal LLM"""
+         response = await self.client.chat.completions.create(
+             model=self.model,
+             messages=messages,
+             temperature=temperature,
+         )
+         text = response.choices[0].message.content.strip()
+         cost = None  # Internal models typically don't have API costs
+         return text, cost
+
+     def get_model_name(self) -> str:
+         """Return the model name for logging"""
+         return f"internal:{self.model}"
+ ```
+
+ ### Using Custom Providers
+
+ Use your custom provider in any metric:
+ ```python
+ import asyncio
+ from eval_lib import (
+     evaluate,
+     EvalTestCase,
+     AnswerRelevancyMetric,
+     FaithfulnessMetric
+ )
+
+ # Create the custom internal LLM client
+ internal_llm = InternalLLMClient(
+     endpoint="https://internal-llm.company.com/v1",
+     model="company-gpt-v2",
+     api_key="your-internal-key"  # Optional
+ )
+
+ # Use it in metrics
+ test_cases = [
+     EvalTestCase(
+         input="What is the capital of France?",
+         actual_output="Paris is the capital.",
+         expected_output="Paris",
+         retrieval_context=["Paris is the capital of France."]
+     )
+ ]
+
+ metrics = [
+     AnswerRelevancyMetric(
+         model=internal_llm,  # ← Your custom LLM
+         threshold=0.7
+     ),
+     FaithfulnessMetric(
+         model=internal_llm,  # ← Same custom client
+         threshold=0.8
+     )
+ ]
+
+ async def run_evaluation():
+     results = await evaluate(
+         test_cases=test_cases,
+         metrics=metrics,
+         verbose=True
+     )
+     return results
+
+ asyncio.run(run_evaluation())
+ ```
+
+ ### Mixing Standard and Custom Providers
+
+ You can mix standard and custom providers in the same evaluation:
+ ```python
+ # Create the custom provider
+ internal_llm = InternalLLMClient(
+     endpoint="https://internal-llm.company.com/v1",
+     model="company-model"
+ )
+
+ # Mix standard OpenAI, custom internal, and standard Anthropic models
+ metrics = [
+     AnswerRelevancyMetric(
+         model="gpt-4o-mini",  # ← Standard OpenAI
+         threshold=0.7
+     ),
+     FaithfulnessMetric(
+         model=internal_llm,  # ← Custom internal LLM
+         threshold=0.8
+     ),
+     ContextualRelevancyMetric(
+         model="anthropic:claude-sonnet-4-0",  # ← Standard Anthropic
+         threshold=0.7
+     )
+ ]
+
+ results = await evaluate(test_cases=test_cases, metrics=metrics)
+ ```
+
+ ### Custom Provider Use Cases
+
+ **When to use custom providers:**
+
+ 1. **Internal Corporate LLMs**: Connect to your company's proprietary models
+ 2. **Local Models**: Integrate locally hosted models (vLLM, TGI, LM Studio, Ollama with a custom setup)
+ 3. **Fine-tuned Models**: Use your own fine-tuned models hosted anywhere
+ 4. **Research Models**: Connect to experimental or research models
+ 5. **Custom Endpoints**: Any LLM accessible via an HTTP endpoint
+
+ **Example: Local Model with vLLM**
+ ```python
+ # vLLM server running on localhost:8000
+ local_model = InternalLLMClient(
+     endpoint="http://localhost:8000/v1",
+     model="meta-llama/Llama-2-7b-chat",
+     api_key=None  # Local models don't need auth
+ )
+
+ # Use it in an evaluation
+ metric = AnswerRelevancyMetric(model=local_model, threshold=0.7)
+ ```
+
+ **Example: Corporate Internal Model**
+ ```python
+ # Company's internal LLM with authentication
+ company_model = InternalLLMClient(
+     endpoint="https://ai-platform.company.internal/api/v1",
+     model="company-gpt-enterprise",
+     api_key="internal-api-key-here"
+ )
+
+ # Use it in an evaluation
+ metrics = [
+     AnswerRelevancyMetric(model=company_model, threshold=0.7),
+     FaithfulnessMetric(model=company_model, threshold=0.8)
+ ]
+ ```
+
+ **Key Requirements:**
+
+ 1. **`async def chat_complete()`** - Must be async and return `(str, Optional[float])`
+ 2. **`def get_model_name()`** - Must return a string identifier for logging
+ 3. **Error Handling** - Handle connection and API errors appropriately (see the sketch below)
+ 4. **Cost** - Return `None` for cost if not applicable (e.g., internal/local models)
+
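+ A minimal sketch of requirement 3, building on the `InternalLLMClient` defined above (the wrapper class and the exact error-handling policy here are illustrative, not part of the library):
+ ```python
+ import openai
+
+ class RobustLLMClient(InternalLLMClient):
+     """Adds basic error handling on top of the client sketched above."""
+
+     async def chat_complete(self, messages, temperature):
+         try:
+             return await super().chat_complete(messages, temperature)
+         except openai.APIConnectionError as e:
+             # Endpoint unreachable: fail with a clear message
+             raise RuntimeError(f"Cannot reach {self.endpoint}: {e}") from e
+         except openai.APIStatusError as e:
+             # HTTP-level errors (auth, rate limits, server errors)
+             raise RuntimeError(f"LLM endpoint returned HTTP {e.status_code}") from e
+ ```
+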
+ ### Advanced: Custom Authentication
+
+ For custom authentication schemes:
+ ```python
+ import aiohttp
+
+ class CustomAuthLLMClient(CustomLLMClient):
+     """Client with custom authentication"""
+
+     def __init__(self, endpoint: str, auth_token: str):
+         self.endpoint = endpoint
+         self.headers = {
+             "Authorization": f"Bearer {auth_token}",
+             "X-Custom-Header": "value"
+         }
+         # Use aiohttp (or httpx) for custom auth; create the session
+         # lazily so it is bound to the running event loop
+         self.session: aiohttp.ClientSession | None = None
+
+     async def chat_complete(self, messages, temperature):
+         if self.session is None or self.session.closed:
+             self.session = aiohttp.ClientSession(headers=self.headers)
+         async with self.session.post(
+             f"{self.endpoint}/chat",
+             json={"messages": messages, "temperature": temperature}
+         ) as response:
+             data = await response.json()
+             return data["content"], None
+
+     def get_model_name(self):
+         return "custom-auth-model"
+ ```
+
  ## Test Data Generation
 
  The library includes a powerful test data generator that can create realistic test cases either from scratch or based on your documents.
@@ -1,8 +1,8 @@
- eval_ai_library-0.3.1.dist-info/licenses/LICENSE,sha256=rK9uLDgWNrCHNdp-Zma_XghDE7Fs0u0kDi3WMcmYx6w,1074
- eval_lib/__init__.py,sha256=Jayvtz47_-0POIspT_LJKZ6jmWyf0fQc9fqQ5KvdPRI,3029
+ eval_ai_library-0.3.3.dist-info/licenses/LICENSE,sha256=rK9uLDgWNrCHNdp-Zma_XghDE7Fs0u0kDi3WMcmYx6w,1074
+ eval_lib/__init__.py,sha256=ySdAQb2DQma2y-ERuFv3VQEAq3S8d8G4vORfo__aqfk,3087
  eval_lib/evaluate.py,sha256=GjlXZb5dnl44LCaJwdkyGCYcC50zoNZn3NrofzNAVJ0,11490
  eval_lib/evaluation_schema.py,sha256=7IDd_uozqewhh7k0p1hKut_20udvRxxkV6thclxKUg0,1904
- eval_lib/llm_client.py,sha256=3eMcarKLkDLDVh4AOxgWbaIzXlzpqsmEfJXNTBonNic,13633
+ eval_lib/llm_client.py,sha256=eeTVhCLR1uYbhqOEOSBt3wWPKuzgzA9v8m0F9f-4Gqg,14910
  eval_lib/metric_pattern.py,sha256=wULgMNDeAqJC_Qjglo7bYzY2eGhA_PmY_hA_qGfg0sI,11730
  eval_lib/price.py,sha256=jbmkkUTxPuXrkSHuaJYPl7jSzfDIzQ9p_swWWs26UJ0,1986
  eval_lib/py.typed,sha256=8PjyZ1aVoQpRVvt71muvuq5qE-jTFZkK-GLHkhdebmc,26
@@ -28,7 +28,7 @@ eval_lib/metrics/faithfulness_metric/faithfulness.py,sha256=OqamlhTOps7d-NOStSIK
  eval_lib/metrics/geval/geval.py,sha256=mNciHXnqU2drOJsWlYmbwftGiKM89-Ykw2f6XneIGBM,10629
  eval_lib/metrics/restricted_refusal_metric/restricted_refusal.py,sha256=4QqYgGMcp6W9Lw-v4s0AlUhMSOKvBOEgnLvhqVXaT9I,4286
  eval_lib/metrics/toxicity_metric/toxicity.py,sha256=rBE1_fvpbCRdBpBep1y1LTIhofKR8GD4Eh76EOYzxL0,4076
- eval_ai_library-0.3.1.dist-info/METADATA,sha256=UytyyuWVrL3CuvK7hQC_y-AqoabHEPI0euolxhmfZrQ,37706
- eval_ai_library-0.3.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- eval_ai_library-0.3.1.dist-info/top_level.txt,sha256=uQHpEd2XI0oZgq1eCww9zMvVgDJgwXMWkCD45fYUzEg,9
- eval_ai_library-0.3.1.dist-info/RECORD,,
+ eval_ai_library-0.3.3.dist-info/METADATA,sha256=S6nodzMnFB5T1Gvtsg19qi1TEwxGtwc9CqLaBWxgPnM,43879
+ eval_ai_library-0.3.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ eval_ai_library-0.3.3.dist-info/top_level.txt,sha256=uQHpEd2XI0oZgq1eCww9zMvVgDJgwXMWkCD45fYUzEg,9
+ eval_ai_library-0.3.3.dist-info/RECORD,,
eval_lib/__init__.py CHANGED
@@ -7,7 +7,7 @@ A powerful library for evaluating AI models with support for multiple LLM provid
  and a wide range of evaluation metrics for RAG systems and AI agents.
  """
 
- __version__ = "0.3.1"
+ __version__ = "0.3.3"
  __author__ = "Aleksandr Meshkov"
 
  # Core evaluation functions
@@ -39,6 +39,7 @@ from eval_lib.llm_client import (
      chat_complete,
      get_embeddings,
      LLMDescriptor,
+     CustomLLMClient,
      Provider
  )
 
@@ -68,12 +69,14 @@ from eval_lib.agent_metrics import (
 
  def __getattr__(name):
      """
-     Lazy import for modules with heavy dependencies.
-     DataGenerator is imported only when it is actually used.
+     Lazy loading for data generation components.
      """
-     if name == "DataGenerator":
-         from eval_lib.datagenerator.datagenerator import DataGenerator
-         return DataGenerator
+     if name == "DatasetGenerator":
+         from eval_lib.datagenerator.datagenerator import DatasetGenerator
+         return DatasetGenerator
+     if name == "DataGenerator":  # Alias for DatasetGenerator
+         from eval_lib.datagenerator.datagenerator import DatasetGenerator
+         return DatasetGenerator
      if name == "DocumentLoader":
          from eval_lib.datagenerator.document_loader import DocumentLoader
          return DocumentLoader
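In effect, `DataGenerator` is kept as a backwards-compatible alias: both names now resolve to the same class through the module-level `__getattr__` (PEP 562). A minimal sketch of what this means for callers:

```python
# Both imports trigger eval_lib.__getattr__ and return the same class
from eval_lib import DataGenerator, DatasetGenerator

assert DataGenerator is DatasetGenerator
```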
@@ -104,6 +107,7 @@ __all__ = [
      "chat_complete",
      "get_embeddings",
      "LLMDescriptor",
+     "CustomLLMClient",
      "Provider",
 
      # RAG Metrics
eval_lib/llm_client.py CHANGED
@@ -2,6 +2,7 @@
  import openai
  import functools
  import anthropic
+ from abc import ABC, abstractmethod
  from openai import AsyncAzureOpenAI
  from google import genai
  from google.genai.types import GenerateContentConfig
@@ -13,6 +14,45 @@ from types import SimpleNamespace
  from .price import model_pricing
 
 
+ class CustomLLMClient(ABC):
+     """
+     Base class for custom LLM clients.
+     Inherit from this to create your own model implementations.
+
+     Example:
+         class MyCustomLLM(CustomLLMClient):
+             async def chat_complete(self, messages, temperature):
+                 # Your implementation
+                 return response_text, cost
+
+             def get_model_name(self):
+                 return "my-custom-model"
+     """
+
+     @abstractmethod
+     async def chat_complete(
+         self,
+         messages: list[dict[str, str]],
+         temperature: float
+     ) -> tuple[str, Optional[float]]:
+         """
+         Generate a response for the given messages.
+
+         Args:
+             messages: List of message dicts [{"role": "user", "content": "..."}]
+             temperature: Sampling temperature
+
+         Returns:
+             Tuple of (response_text, cost_in_usd)
+         """
+         pass
+
+     @abstractmethod
+     def get_model_name(self) -> str:
+         """Return the model name for logging/tracking purposes."""
+         pass
+
+
  class LLMConfigurationError(Exception):
      """Raised when LLM client configuration is missing or invalid."""
      pass
@@ -24,6 +64,7 @@ class Provider(str, Enum):
      GOOGLE = "google"
      OLLAMA = "ollama"
      ANTHROPIC = "anthropic"
+     CUSTOM = "custom"
 
 
  @dataclass(frozen=True, slots=True)
@@ -308,7 +349,7 @@ _HELPERS = {
 
 
  async def chat_complete(
-     llm: str | tuple[str, str] | LLMDescriptor,
+     llm: str | tuple[str, str] | LLMDescriptor | CustomLLMClient,
      messages: list[dict[str, str]],
      temperature: float = 0.0,
  ):
@@ -327,6 +368,11 @@ async def chat_complete(
          LLMConfigurationError: If required API keys or configuration are missing
          ValueError: If provider is not supported
      """
+     # Handle custom LLM clients
+     if isinstance(llm, CustomLLMClient):
+         return await llm.chat_complete(messages, temperature)
+
+     # Standard providers
      llm = LLMDescriptor.parse(llm)
      helper = _HELPERS.get(llm.provider)
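With this dispatch in place, `chat_complete` accepts a `CustomLLMClient` instance directly, while strings and descriptors still go through `LLMDescriptor.parse`. A minimal usage sketch, assuming the `InternalLLMClient` from the README section above:

```python
import asyncio
from eval_lib import chat_complete

async def main():
    llm = InternalLLMClient(
        endpoint="http://localhost:8000/v1",
        model="meta-llama/Llama-2-7b-chat",
    )
    # isinstance(llm, CustomLLMClient) → the call is delegated to llm.chat_complete
    text, cost = await chat_complete(llm, [{"role": "user", "content": "Hello"}])
    print(text, cost)  # cost is None unless the custom client reports one

asyncio.run(main())
```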