eval-ai-library 0.3.2__tar.gz → 0.3.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (41)
  1. {eval_ai_library-0.3.2/eval_ai_library.egg-info → eval_ai_library-0.3.3}/PKG-INFO +214 -1
  2. {eval_ai_library-0.3.2 → eval_ai_library-0.3.3}/README.md +213 -0
  3. {eval_ai_library-0.3.2 → eval_ai_library-0.3.3/eval_ai_library.egg-info}/PKG-INFO +214 -1
  4. {eval_ai_library-0.3.2 → eval_ai_library-0.3.3}/eval_lib/__init__.py +3 -1
  5. {eval_ai_library-0.3.2 → eval_ai_library-0.3.3}/eval_lib/llm_client.py +47 -1
  6. {eval_ai_library-0.3.2 → eval_ai_library-0.3.3}/pyproject.toml +1 -1
  7. {eval_ai_library-0.3.2 → eval_ai_library-0.3.3}/LICENSE +0 -0
  8. {eval_ai_library-0.3.2 → eval_ai_library-0.3.3}/MANIFEST.in +0 -0
  9. {eval_ai_library-0.3.2 → eval_ai_library-0.3.3}/eval_ai_library.egg-info/SOURCES.txt +0 -0
  10. {eval_ai_library-0.3.2 → eval_ai_library-0.3.3}/eval_ai_library.egg-info/dependency_links.txt +0 -0
  11. {eval_ai_library-0.3.2 → eval_ai_library-0.3.3}/eval_ai_library.egg-info/requires.txt +0 -0
  12. {eval_ai_library-0.3.2 → eval_ai_library-0.3.3}/eval_ai_library.egg-info/top_level.txt +0 -0
  13. {eval_ai_library-0.3.2 → eval_ai_library-0.3.3}/eval_lib/agent_metrics/__init__.py +0 -0
  14. {eval_ai_library-0.3.2 → eval_ai_library-0.3.3}/eval_lib/agent_metrics/knowledge_retention_metric/knowledge_retention.py +0 -0
  15. {eval_ai_library-0.3.2 → eval_ai_library-0.3.3}/eval_lib/agent_metrics/role_adherence_metric/role_adherence.py +0 -0
  16. {eval_ai_library-0.3.2 → eval_ai_library-0.3.3}/eval_lib/agent_metrics/task_success_metric/task_success_rate.py +0 -0
  17. {eval_ai_library-0.3.2 → eval_ai_library-0.3.3}/eval_lib/agent_metrics/tools_correctness_metric/tool_correctness.py +0 -0
  18. {eval_ai_library-0.3.2 → eval_ai_library-0.3.3}/eval_lib/datagenerator/datagenerator.py +0 -0
  19. {eval_ai_library-0.3.2 → eval_ai_library-0.3.3}/eval_lib/datagenerator/document_loader.py +0 -0
  20. {eval_ai_library-0.3.2 → eval_ai_library-0.3.3}/eval_lib/datagenerator/prompts.py +0 -0
  21. {eval_ai_library-0.3.2 → eval_ai_library-0.3.3}/eval_lib/evaluate.py +0 -0
  22. {eval_ai_library-0.3.2 → eval_ai_library-0.3.3}/eval_lib/evaluation_schema.py +0 -0
  23. {eval_ai_library-0.3.2 → eval_ai_library-0.3.3}/eval_lib/metric_pattern.py +0 -0
  24. {eval_ai_library-0.3.2 → eval_ai_library-0.3.3}/eval_lib/metrics/__init__.py +0 -0
  25. {eval_ai_library-0.3.2 → eval_ai_library-0.3.3}/eval_lib/metrics/answer_precision_metric/answer_precision.py +0 -0
  26. {eval_ai_library-0.3.2 → eval_ai_library-0.3.3}/eval_lib/metrics/answer_relevancy_metric/answer_relevancy.py +0 -0
  27. {eval_ai_library-0.3.2 → eval_ai_library-0.3.3}/eval_lib/metrics/bias_metric/bias.py +0 -0
  28. {eval_ai_library-0.3.2 → eval_ai_library-0.3.3}/eval_lib/metrics/contextual_precision_metric/contextual_precision.py +0 -0
  29. {eval_ai_library-0.3.2 → eval_ai_library-0.3.3}/eval_lib/metrics/contextual_recall_metric/contextual_recall.py +0 -0
  30. {eval_ai_library-0.3.2 → eval_ai_library-0.3.3}/eval_lib/metrics/contextual_relevancy_metric/contextual_relevancy.py +0 -0
  31. {eval_ai_library-0.3.2 → eval_ai_library-0.3.3}/eval_lib/metrics/custom_metric/custom_eval.py +0 -0
  32. {eval_ai_library-0.3.2 → eval_ai_library-0.3.3}/eval_lib/metrics/faithfulness_metric/faithfulness.py +0 -0
  33. {eval_ai_library-0.3.2 → eval_ai_library-0.3.3}/eval_lib/metrics/geval/geval.py +0 -0
  34. {eval_ai_library-0.3.2 → eval_ai_library-0.3.3}/eval_lib/metrics/restricted_refusal_metric/restricted_refusal.py +0 -0
  35. {eval_ai_library-0.3.2 → eval_ai_library-0.3.3}/eval_lib/metrics/toxicity_metric/toxicity.py +0 -0
  36. {eval_ai_library-0.3.2 → eval_ai_library-0.3.3}/eval_lib/price.py +0 -0
  37. {eval_ai_library-0.3.2 → eval_ai_library-0.3.3}/eval_lib/py.typed +0 -0
  38. {eval_ai_library-0.3.2 → eval_ai_library-0.3.3}/eval_lib/testcases_schema.py +0 -0
  39. {eval_ai_library-0.3.2 → eval_ai_library-0.3.3}/eval_lib/utils.py +0 -0
  40. {eval_ai_library-0.3.2 → eval_ai_library-0.3.3}/setup.cfg +0 -0
  41. {eval_ai_library-0.3.2 → eval_ai_library-0.3.3}/setup.py +0 -0
{eval_ai_library-0.3.2/eval_ai_library.egg-info → eval_ai_library-0.3.3}/PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: eval-ai-library
- Version: 0.3.2
+ Version: 0.3.3
  Summary: Comprehensive AI Model Evaluation Framework with support for multiple LLM providers
  Author-email: Aleksandr Meshkov <alekslynx90@gmail.com>
  License: MIT

@@ -807,6 +807,219 @@ response, cost = await chat_complete(
  )
  ```

+ ## Custom LLM Providers
+
+ The library supports custom LLM providers through the `CustomLLMClient` abstract base class. This allows you to integrate any LLM provider, including internal corporate models, locally-hosted models, or custom endpoints.
+
+ ### Creating a Custom Provider
+
+ Implement the `CustomLLMClient` interface:
+ ```python
+ from eval_lib import CustomLLMClient
+ from typing import Optional
+ from openai import AsyncOpenAI
+
+ class InternalLLMClient(CustomLLMClient):
+     """Client for internal corporate LLM or custom endpoint"""
+
+     def __init__(
+         self,
+         endpoint: str,
+         model: str,
+         api_key: Optional[str] = None,
+         temperature: float = 0.0
+     ):
+         """
+         Args:
+             endpoint: Your internal LLM endpoint URL (e.g., "https://internal-llm.company.com/v1")
+             model: Model name to use
+             api_key: API key if required (optional for local models)
+             temperature: Default temperature
+         """
+         self.endpoint = endpoint
+         self.model = model
+         self.api_key = api_key or "not-needed"  # Some endpoints don't need auth
+
+         self.client = AsyncOpenAI(
+             api_key=self.api_key,
+             base_url=self.endpoint
+         )
+
+     async def chat_complete(
+         self,
+         messages: list[dict[str, str]],
+         temperature: float
+     ) -> tuple[str, Optional[float]]:
+         """Generate response from internal LLM"""
+         response = await self.client.chat.completions.create(
+             model=self.model,
+             messages=messages,
+             temperature=temperature,
+         )
+         text = response.choices[0].message.content.strip()
+         cost = None  # Internal models typically don't have API costs
+         return text, cost
+
+     def get_model_name(self) -> str:
+         """Return model name for logging"""
+         return f"internal:{self.model}"
+ ```
+
+ ### Using Custom Providers
+
+ Use your custom provider in any metric:
+ ```python
+ import asyncio
+ from eval_lib import (
+     evaluate,
+     EvalTestCase,
+     AnswerRelevancyMetric,
+     FaithfulnessMetric
+ )
+
+ # Create custom internal LLM client
+ internal_llm = InternalLLMClient(
+     endpoint="https://internal-llm.company.com/v1",
+     model="company-gpt-v2",
+     api_key="your-internal-key"  # Optional
+ )
+
+ # Use in metrics
+ test_cases = [
+     EvalTestCase(
+         input="What is the capital of France?",
+         actual_output="Paris is the capital.",
+         expected_output="Paris",
+         retrieval_context=["Paris is the capital of France."]
+     )
+ ]
+
+ metrics = [
+     AnswerRelevancyMetric(
+         model=internal_llm,  # ← Your custom LLM
+         threshold=0.7
+     ),
+     FaithfulnessMetric(
+         model=internal_llm,  # ← Same custom client
+         threshold=0.8
+     )
+ ]
+
+ async def run_evaluation():
+     results = await evaluate(
+         test_cases=test_cases,
+         metrics=metrics,
+         verbose=True
+     )
+     return results
+
+ asyncio.run(run_evaluation())
+ ```
+
+ ### Mixing Standard and Custom Providers
+
+ You can mix standard and custom providers in the same evaluation:
+ ```python
+ # Create custom provider
+ internal_llm = InternalLLMClient(
+     endpoint="https://internal-llm.company.com/v1",
+     model="company-model"
+ )
+
+ # Mix standard OpenAI and custom internal LLM
+ metrics = [
+     AnswerRelevancyMetric(
+         model="gpt-4o-mini",  # ← Standard OpenAI
+         threshold=0.7
+     ),
+     FaithfulnessMetric(
+         model=internal_llm,  # ← Custom internal LLM
+         threshold=0.8
+     ),
+     ContextualRelevancyMetric(
+         model="anthropic:claude-sonnet-4-0",  # ← Standard Anthropic
+         threshold=0.7
+     )
+ ]
+
+ results = await evaluate(test_cases=test_cases, metrics=metrics)
+ ```
+
+ ### Custom Provider Use Cases
+
+ **When to use custom providers:**
+
+ 1. **Internal Corporate LLMs**: Connect to your company's proprietary models
+ 2. **Local Models**: Integrate locally-hosted models (vLLM, TGI, LM Studio, Ollama with custom setup)
+ 3. **Fine-tuned Models**: Use your own fine-tuned models hosted anywhere
+ 4. **Research Models**: Connect to experimental or research models
+ 5. **Custom Endpoints**: Any LLM accessible via HTTP endpoint
+
+ **Example: Local Model with vLLM**
+ ```python
+ # vLLM server running on localhost:8000
+ local_model = InternalLLMClient(
+     endpoint="http://localhost:8000/v1",
+     model="meta-llama/Llama-2-7b-chat",
+     api_key=None  # Local models don't need auth
+ )
+
+ # Use in evaluation
+ metric = AnswerRelevancyMetric(model=local_model, threshold=0.7)
+ ```
+
+ **Example: Corporate Internal Model**
+ ```python
+ # Company's internal LLM with authentication
+ company_model = InternalLLMClient(
+     endpoint="https://ai-platform.company.internal/api/v1",
+     model="company-gpt-enterprise",
+     api_key="internal-api-key-here"
+ )
+
+ # Use in evaluation
+ metrics = [
+     AnswerRelevancyMetric(model=company_model, threshold=0.7),
+     FaithfulnessMetric(model=company_model, threshold=0.8)
+ ]
+ ```
+
+ **Key Requirements:**
+
+ 1. **`async def chat_complete()`** - Must be async and return `(str, Optional[float])`
+ 2. **`def get_model_name()`** - Return string identifier for logging
+ 3. **Error Handling** - Handle connection and API errors appropriately
+ 4. **Cost** - Return `None` for cost if not applicable (e.g., internal/local models)
+
+ ### Advanced: Custom Authentication
+
+ For custom authentication schemes:
+ ```python
+ class CustomAuthLLMClient(CustomLLMClient):
+     """Client with custom authentication"""
+
+     def __init__(self, endpoint: str, auth_token: str):
+         self.endpoint = endpoint
+         self.headers = {
+             "Authorization": f"Bearer {auth_token}",
+             "X-Custom-Header": "value"
+         }
+         # Use aiohttp or httpx for custom auth
+         import aiohttp
+         self.session = aiohttp.ClientSession(headers=self.headers)
+
+     async def chat_complete(self, messages, temperature):
+         async with self.session.post(
+             f"{self.endpoint}/chat",
+             json={"messages": messages, "temperature": temperature}
+         ) as response:
+             data = await response.json()
+             return data["content"], None
+
+     def get_model_name(self):
+         return "custom-auth-model"
+ ```
+
  ## Test Data Generation

  The library includes a powerful test data generator that can create realistic test cases either from scratch or based on your documents.
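
The added README section lists error handling among its key requirements but leaves the implementation to the reader. Below is a minimal sketch of one way to meet that requirement, assuming the same `CustomLLMClient` contract and the openai SDK's exported `APIConnectionError`/`APIStatusError` exceptions; `RetryingLLMClient` and its retry policy are illustrative, not part of the package:

```python
# Hypothetical sketch (not from the package): a custom client that retries
# transient failures before giving up, per the "Error Handling" requirement.
import asyncio
from typing import Optional

from openai import APIConnectionError, APIStatusError, AsyncOpenAI

from eval_lib import CustomLLMClient


class RetryingLLMClient(CustomLLMClient):
    """Illustrative client that retries transient failures with backoff."""

    def __init__(self, endpoint: str, model: str, retries: int = 3):
        self.model = model
        self.retries = retries
        self.client = AsyncOpenAI(api_key="not-needed", base_url=endpoint)

    async def chat_complete(
        self,
        messages: list[dict[str, str]],
        temperature: float
    ) -> tuple[str, Optional[float]]:
        last_error: Optional[Exception] = None
        for attempt in range(self.retries):
            try:
                response = await self.client.chat.completions.create(
                    model=self.model,
                    messages=messages,
                    temperature=temperature,
                )
                return response.choices[0].message.content.strip(), None
            except (APIConnectionError, APIStatusError) as exc:
                last_error = exc
                await asyncio.sleep(2 ** attempt)  # backoff: 1s, 2s, 4s
        raise last_error  # all retries exhausted

    def get_model_name(self) -> str:
        return f"retrying:{self.model}"
```

Catching only connection and HTTP-status errors keeps genuine programming errors loud, while the exponential backoff avoids hammering an endpoint that is already struggling.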
{eval_ai_library-0.3.2 → eval_ai_library-0.3.3}/README.md

@@ -748,6 +748,219 @@ response, cost = await chat_complete(
(Adds the same "## Custom LLM Providers" section shown in the PKG-INFO diff above; PKG-INFO embeds the README, so the +213 lines are identical.)
{eval_ai_library-0.3.2 → eval_ai_library-0.3.3/eval_ai_library.egg-info}/PKG-INFO

@@ -1,6 +1,6 @@ and @@ -807,6 +807,219 @@: identical to the PKG-INFO diff above (version bump to 0.3.3 plus the "## Custom LLM Providers" section).
{eval_ai_library-0.3.2 → eval_ai_library-0.3.3}/eval_lib/__init__.py

@@ -7,7 +7,7 @@ A powerful library for evaluating AI models with support for multiple LLM providers
  and a wide range of evaluation metrics for RAG systems and AI agents.
  """

- __version__ = "0.3.2"
+ __version__ = "0.3.3"
  __author__ = "Aleksandr Meshkov"

  # Core evaluation functions

@@ -39,6 +39,7 @@ from eval_lib.llm_client import (
      chat_complete,
      get_embeddings,
      LLMDescriptor,
+     CustomLLMClient,
      Provider
  )

@@ -106,6 +107,7 @@ __all__ = [
      "chat_complete",
      "get_embeddings",
      "LLMDescriptor",
+     "CustomLLMClient",
      "Provider",

      # RAG Metrics
{eval_ai_library-0.3.2 → eval_ai_library-0.3.3}/eval_lib/llm_client.py

@@ -2,6 +2,7 @@
  import openai
  import functools
  import anthropic
+ from abc import ABC, abstractmethod
  from openai import AsyncAzureOpenAI
  from google import genai
  from google.genai.types import GenerateContentConfig

@@ -13,6 +14,45 @@ from types import SimpleNamespace
  from .price import model_pricing


+ class CustomLLMClient(ABC):
+     """
+     Base class for custom LLM clients.
+     Inherit from this to create your own model implementations.
+
+     Example:
+         class MyCustomLLM(CustomLLMClient):
+             async def chat_complete(self, messages, temperature):
+                 # Your implementation
+                 return response_text, cost
+
+             def get_model_name(self):
+                 return "my-custom-model"
+     """
+
+     @abstractmethod
+     async def chat_complete(
+         self,
+         messages: list[dict[str, str]],
+         temperature: float
+     ) -> tuple[str, Optional[float]]:
+         """
+         Generate a response for the given messages.
+
+         Args:
+             messages: List of message dicts [{"role": "user", "content": "..."}]
+             temperature: Sampling temperature
+
+         Returns:
+             Tuple of (response_text, cost_in_usd)
+         """
+         pass
+
+     @abstractmethod
+     def get_model_name(self) -> str:
+         """Return the model name for logging/tracking purposes."""
+         pass
+
+
  class LLMConfigurationError(Exception):
      """Raised when LLM client configuration is missing or invalid."""
      pass
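
The ABC above fixes the whole contract: two abstract methods, one of them async. For reference, a hypothetical minimal subclass, useful as an offline stub when wiring up metrics in tests (`EchoLLMClient` is not part of the package):

```python
# Hypothetical minimal conforming implementation of the new ABC.
from typing import Optional

from eval_lib import CustomLLMClient


class EchoLLMClient(CustomLLMClient):
    """Offline stub: echoes the last user message back."""

    def __init__(self) -> None:
        self.calls = 0  # handy for assertions in tests

    async def chat_complete(
        self,
        messages: list[dict[str, str]],
        temperature: float
    ) -> tuple[str, Optional[float]]:
        self.calls += 1
        return messages[-1]["content"], None  # no API cost, so None

    def get_model_name(self) -> str:
        return "echo-stub"
```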
@@ -24,6 +64,7 @@ class Provider(str, Enum):
      GOOGLE = "google"
      OLLAMA = "ollama"
      ANTHROPIC = "anthropic"
+     CUSTOM = "custom"


  @dataclass(frozen=True, slots=True)
@@ -308,7 +349,7 @@ _HELPERS = {


  async def chat_complete(
-     llm: str | tuple[str, str] | LLMDescriptor,
+     llm: str | tuple[str, str] | LLMDescriptor | CustomLLMClient,
      messages: list[dict[str, str]],
      temperature: float = 0.0,
  ):
@@ -327,6 +368,11 @@
          LLMConfigurationError: If required API keys or configuration are missing
          ValueError: If provider is not supported
      """
+     # Handle custom LLM clients
+     if isinstance(llm, CustomLLMClient):
+         return await llm.chat_complete(messages, temperature)
+
+     # Standard providers
      llm = LLMDescriptor.parse(llm)
      helper = _HELPERS.get(llm.provider)

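With this dispatch in place, `chat_complete` accepts a custom client anywhere a provider string worked before. A brief sketch of both call styles, reusing the hypothetical `EchoLLMClient` stub from above; the provider-string call assumes the usual API-key configuration:

```python
import asyncio

from eval_lib import chat_complete
# EchoLLMClient: the hypothetical stub sketched after the ABC above.


async def main() -> None:
    messages = [{"role": "user", "content": "ping"}]

    # Custom client: routed through the new isinstance branch; cost is
    # whatever the client returns (None for the echo stub).
    text, cost = await chat_complete(EchoLLMClient(), messages, temperature=0.0)
    print(text, cost)  # -> ping None

    # Provider string: parsed by LLMDescriptor.parse as before.
    text, cost = await chat_complete("gpt-4o-mini", messages)
    print(text, cost)


asyncio.run(main())
```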
{eval_ai_library-0.3.2 → eval_ai_library-0.3.3}/pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

  [project]
  name = "eval-ai-library"
- version = "0.3.2"
+ version = "0.3.3"
  description = "Comprehensive AI Model Evaluation Framework with support for multiple LLM providers"
  readme = "README.md"
  authors = [
Remaining files (7–41): no changes.