eval-ai-library 0.3.0__py3-none-any.whl → 0.3.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of eval-ai-library might be problematic; consult the registry's advisory page for this release for more details.

@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: eval-ai-library
3
- Version: 0.3.0
3
+ Version: 0.3.2
4
4
  Summary: Comprehensive AI Model Evaluation Framework with support for multiple LLM providers
5
5
  Author-email: Aleksandr Meshkov <alekslynx90@gmail.com>
6
6
  License: MIT
@@ -1,34 +1,34 @@
1
- eval_ai_library-0.3.0.dist-info/licenses/LICENSE,sha256=rK9uLDgWNrCHNdp-Zma_XghDE7Fs0u0kDi3WMcmYx6w,1074
2
- eval_lib/__init__.py,sha256=BA0vmdi5_3Zd1ib8nLyq6pESBVnFX7aKdvRAbR_I9bQ,3029
1
+ eval_ai_library-0.3.2.dist-info/licenses/LICENSE,sha256=rK9uLDgWNrCHNdp-Zma_XghDE7Fs0u0kDi3WMcmYx6w,1074
2
+ eval_lib/__init__.py,sha256=IeDW5pLarPmHCBJu-6vFX71g9VxZTS46UkHjDkrU_Gw,3043
3
3
  eval_lib/evaluate.py,sha256=GjlXZb5dnl44LCaJwdkyGCYcC50zoNZn3NrofzNAVJ0,11490
4
4
  eval_lib/evaluation_schema.py,sha256=7IDd_uozqewhh7k0p1hKut_20udvRxxkV6thclxKUg0,1904
5
- eval_lib/llm_client.py,sha256=6uNLIAfSdj4u_n4SAk3UkYmxcfO2Y2y-9Kp0T4bSjPM,13845
5
+ eval_lib/llm_client.py,sha256=3eMcarKLkDLDVh4AOxgWbaIzXlzpqsmEfJXNTBonNic,13633
6
6
  eval_lib/metric_pattern.py,sha256=wULgMNDeAqJC_Qjglo7bYzY2eGhA_PmY_hA_qGfg0sI,11730
7
7
  eval_lib/price.py,sha256=jbmkkUTxPuXrkSHuaJYPl7jSzfDIzQ9p_swWWs26UJ0,1986
8
8
  eval_lib/py.typed,sha256=8PjyZ1aVoQpRVvt71muvuq5qE-jTFZkK-GLHkhdebmc,26
9
9
  eval_lib/testcases_schema.py,sha256=qI4o6kX0jH1DR3sHGXUnu3Cyt2oq7rGlsMlOaXSt6F4,696
10
10
  eval_lib/utils.py,sha256=-hwagFFn3_QjgyLqF8Qx7JIkpgOEI8-F14eycog3bgc,3141
11
11
  eval_lib/agent_metrics/__init__.py,sha256=20Y4BsicD2s7OkOBQPBvB2JKStBDtplv52_q6q35Vgo,525
12
- eval_lib/agent_metrics/knowledge_retention_metric/knowledge_retention.py,sha256=_9MRWbv8oi6goY-ZXWu7FaNV0vQX1UD1w1Ar7CPVino,8244
13
- eval_lib/agent_metrics/role_adherence_metric/role_adherence.py,sha256=D9MuKUI8ujh766vF8VQO73m-fPSQxy5u2Tkcr3wHbVk,9180
14
- eval_lib/agent_metrics/task_success_metric/task_success_rate.py,sha256=mt1PRHadup2k64gF_OyL8d-eQ5zm2EeNwztcE-aGOe0,12452
12
+ eval_lib/agent_metrics/knowledge_retention_metric/knowledge_retention.py,sha256=7j89HOTsu0rMoFnznTjMl-tqQpnZlS6ZIdrHPueEbb8,8289
13
+ eval_lib/agent_metrics/role_adherence_metric/role_adherence.py,sha256=kJsYj9H3W3Mw2iBqj3Br_glP8gU6_diFPiJhRSnHGxg,9225
14
+ eval_lib/agent_metrics/task_success_metric/task_success_rate.py,sha256=v5cO07cymo9GWSZ34ryAx3ya4DDBiRWih9w0bm_j_R8,12497
15
15
  eval_lib/agent_metrics/tools_correctness_metric/tool_correctness.py,sha256=qbVMtD6EWKah27FogLEiEh6pBX-k2wwKbwM_kFkvYeQ,4220
16
16
  eval_lib/datagenerator/datagenerator.py,sha256=NQZIQuSCmryxIT3lTLS1PpJjENmGqARtR-zTiQ8OvRk,15513
17
17
  eval_lib/datagenerator/document_loader.py,sha256=vnQUz_Dxb3SxcVPUmMXZe-rgfPp6OfHb6D2Ie9iqPms,17025
18
18
  eval_lib/datagenerator/prompts.py,sha256=iQAYitAbLud3vWJnXGh_OCF4syWYS_S71zZEPI6qYAU,7213
19
19
  eval_lib/metrics/__init__.py,sha256=3qClCCjPXt5i0c38g5krfuQnqlAXEl-jhAHy1C_ICMY,1213
20
20
  eval_lib/metrics/answer_precision_metric/answer_precision.py,sha256=AxPmwzGFU7tnTrrZuQZ7ow4nNSD-blDHdAGwhMHMxjM,15040
21
- eval_lib/metrics/answer_relevancy_metric/answer_relevancy.py,sha256=LHf7FI9dYzZcKBdxUH9VpsOceRLh0NMb79qliZRE3Uo,8477
21
+ eval_lib/metrics/answer_relevancy_metric/answer_relevancy.py,sha256=-Xb9I-BVMDf5E55FbJzP6IyvD6IVTUPBI-uCrRnEboc,8522
22
22
  eval_lib/metrics/bias_metric/bias.py,sha256=BVH8xlTUTRfVG_F1kauwpGAkVKBkUWhM9rUsrrLhpRU,4020
23
23
  eval_lib/metrics/contextual_precision_metric/contextual_precision.py,sha256=CQOb6uR2KeffTkhPSqZae56sX5tXMr0pJVM5W_wU1fU,3993
24
24
  eval_lib/metrics/contextual_recall_metric/contextual_recall.py,sha256=iw73_hGLWklHZSBkCRkPDNUt1xD5dknA_7CZ6Efkf5w,3913
25
- eval_lib/metrics/contextual_relevancy_metric/contextual_relevancy.py,sha256=pjyQPVDzWfTnqTgy5D1OYq1KdsCP1bVeUg3rcV8El4I,6501
26
- eval_lib/metrics/custom_metric/custom_eval.py,sha256=GrOo39DxHzdMUGnBnCiHfT7Y8VabFFgmYQGcnkyGm1w,11340
25
+ eval_lib/metrics/contextual_relevancy_metric/contextual_relevancy.py,sha256=G1cYlA95YNcpEqQsALVi6ZbyNzWr9ccO2DATcsES5pk,6546
26
+ eval_lib/metrics/custom_metric/custom_eval.py,sha256=Ov3-i6IytaJXlzcMgp46SRSeb8scyhqyuR2BqUtDFoM,11385
27
27
  eval_lib/metrics/faithfulness_metric/faithfulness.py,sha256=OqamlhTOps7d-NOStSIK7Tq-UAJXNql8VKjgtuqlDhA,5930
28
28
  eval_lib/metrics/geval/geval.py,sha256=mNciHXnqU2drOJsWlYmbwftGiKM89-Ykw2f6XneIGBM,10629
29
29
  eval_lib/metrics/restricted_refusal_metric/restricted_refusal.py,sha256=4QqYgGMcp6W9Lw-v4s0AlUhMSOKvBOEgnLvhqVXaT9I,4286
30
30
  eval_lib/metrics/toxicity_metric/toxicity.py,sha256=rBE1_fvpbCRdBpBep1y1LTIhofKR8GD4Eh76EOYzxL0,4076
31
- eval_ai_library-0.3.0.dist-info/METADATA,sha256=s7bRUm49crFB9ICrQaPnJvWHVQ28YUBMlTtlL0L2dWQ,37706
32
- eval_ai_library-0.3.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
33
- eval_ai_library-0.3.0.dist-info/top_level.txt,sha256=uQHpEd2XI0oZgq1eCww9zMvVgDJgwXMWkCD45fYUzEg,9
34
- eval_ai_library-0.3.0.dist-info/RECORD,,
31
+ eval_ai_library-0.3.2.dist-info/METADATA,sha256=7kIgiDelzM1wbCdKdy4PXuFudIZ3UCUTPXhlVccFe9k,37706
32
+ eval_ai_library-0.3.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
33
+ eval_ai_library-0.3.2.dist-info/top_level.txt,sha256=uQHpEd2XI0oZgq1eCww9zMvVgDJgwXMWkCD45fYUzEg,9
34
+ eval_ai_library-0.3.2.dist-info/RECORD,,
eval_lib/__init__.py CHANGED
@@ -7,7 +7,7 @@ A powerful library for evaluating AI models with support for multiple LLM provid
7
7
  and a wide range of evaluation metrics for RAG systems and AI agents.
8
8
  """
9
9
 
10
- __version__ = "0.3.0"
10
+ __version__ = "0.3.2"
11
11
  __author__ = "Aleksandr Meshkov"
12
12
 
13
13
  # Core evaluation functions
@@ -68,12 +68,14 @@ from eval_lib.agent_metrics import (
68
68
 
69
69
  def __getattr__(name):
70
70
  """
71
- Ленивый импорт для модулей с тяжёлыми зависимостями.
72
- DataGenerator импортируется только когда реально используется.
71
+ Lazy loading for data generation components.
73
72
  """
74
- if name == "DataGenerator":
75
- from eval_lib.datagenerator.datagenerator import DataGenerator
76
- return DataGenerator
73
+ if name == "DatasetGenerator":
74
+ from eval_lib.datagenerator.datagenerator import DatasetGenerator
75
+ return DatasetGenerator
76
+ if name == "DataGenerator": # Alias for DatasetGenerator
77
+ from eval_lib.datagenerator.datagenerator import DatasetGenerator
78
+ return DatasetGenerator
77
79
  if name == "DocumentLoader":
78
80
  from eval_lib.datagenerator.document_loader import DocumentLoader
79
81
  return DocumentLoader
@@ -217,6 +217,7 @@ Verdicts:
217
217
  "final_score": final_score,
218
218
  "comment_final_score": f"Weighted average of verdict scores using softmax aggregation (temperature={self.temperature}).",
219
219
  "threshold": self.threshold,
220
+ "temperature": self.temperature,
220
221
  "success": success,
221
222
  "comment_success": "Whether the retention score meets the required threshold.",
222
223
  "final_reason": summary,
@@ -239,6 +239,7 @@ Return JSON array:
239
239
  "final_score": final_score,
240
240
  "comment_final_score": f"Weighted average of verdict scores using softmax aggregation (temperature={self.temperature}).",
241
241
  "threshold": self.threshold,
242
+ "temperature": self.temperature,
242
243
  "success": success,
243
244
  "comment_success": "Whether the role adherence score meets the required threshold.",
244
245
  "final_reason": summary,
@@ -328,6 +328,7 @@ Criteria: [
328
328
  "final_score": final_score,
329
329
  "comment_final_score": f"Weighted average of verdict scores using softmax aggregation (temperature={self.temperature}).",
330
330
  "threshold": self.threshold,
331
+ "temperature": self.temperature,
331
332
  "success": success,
332
333
  "comment_success": "Whether the task success score meets the required threshold.",
333
334
  "final_reason": summary,
eval_lib/llm_client.py CHANGED
@@ -101,7 +101,6 @@ def _get_client(provider: Provider):
101
101
  if provider == Provider.AZURE:
102
102
  _check_env_var("AZURE_OPENAI_API_KEY", "Azure OpenAI")
103
103
  _check_env_var("AZURE_OPENAI_ENDPOINT", "Azure OpenAI")
104
- # AZURE_OPENAI_DEPLOYMENT проверяется при вызове, не обязателен здесь
105
104
 
106
105
  return AsyncAzureOpenAI(
107
106
  api_key=os.getenv("AZURE_OPENAI_API_KEY"),
@@ -114,7 +113,6 @@ def _get_client(provider: Provider):
114
113
  return genai.Client(api_key=os.getenv("GOOGLE_API_KEY"))
115
114
 
116
115
  if provider == Provider.OLLAMA:
117
- # Ollama может работать без ключа (локальный сервер)
118
116
  api_key = _check_env_var(
119
117
  "OLLAMA_API_KEY", "Ollama", required=False) or "ollama"
120
118
  base_url = _check_env_var(
@@ -181,6 +181,7 @@ class AnswerRelevancyMetric(MetricPattern):
181
181
  "final_score": final_score,
182
182
  "comment_final_score": "Score based on the proportion of relevant statements.",
183
183
  "threshold": self.threshold,
184
+ "temperature": self.temperature,
184
185
  "success": success,
185
186
  "comment_success": "Whether the score exceeds the pass threshold.",
186
187
  "final_reason": summary_reason,
@@ -155,6 +155,7 @@ class ContextualRelevancyMetric(MetricPattern):
155
155
  "final_score": score,
156
156
  "comment_final_score": "Weighted support score from context.",
157
157
  "threshold": self.threshold,
158
+ "temperature": self.temperature,
158
159
  "success": success,
159
160
  "comment_success": "Whether the score exceeds the threshold.",
160
161
  "final_reason": summary,
@@ -312,6 +312,7 @@ JSON:"""
312
312
  "final_score": final_score,
313
313
  "comment_final_score": f"Weighted average of verdict scores calculated using softmax aggregation (temperature={self.temperature}).",
314
314
  "threshold": self.threshold,
315
+ "temperature": self.temperature,
315
316
  "success": success,
316
317
  "comment_success": "Whether the final score meets the required threshold.",
317
318
  "summary": summary,