eval-ai-library 0.3.0__py3-none-any.whl → 0.3.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of eval-ai-library might be problematic.
- {eval_ai_library-0.3.0.dist-info → eval_ai_library-0.3.2.dist-info}/METADATA +1 -1
- {eval_ai_library-0.3.0.dist-info → eval_ai_library-0.3.2.dist-info}/RECORD +13 -13
- eval_lib/__init__.py +8 -6
- eval_lib/agent_metrics/knowledge_retention_metric/knowledge_retention.py +1 -0
- eval_lib/agent_metrics/role_adherence_metric/role_adherence.py +1 -0
- eval_lib/agent_metrics/task_success_metric/task_success_rate.py +1 -0
- eval_lib/llm_client.py +0 -2
- eval_lib/metrics/answer_relevancy_metric/answer_relevancy.py +1 -0
- eval_lib/metrics/contextual_relevancy_metric/contextual_relevancy.py +1 -0
- eval_lib/metrics/custom_metric/custom_eval.py +1 -0
- {eval_ai_library-0.3.0.dist-info → eval_ai_library-0.3.2.dist-info}/WHEEL +0 -0
- {eval_ai_library-0.3.0.dist-info → eval_ai_library-0.3.2.dist-info}/licenses/LICENSE +0 -0
- {eval_ai_library-0.3.0.dist-info → eval_ai_library-0.3.2.dist-info}/top_level.txt +0 -0
{eval_ai_library-0.3.0.dist-info → eval_ai_library-0.3.2.dist-info}/RECORD
CHANGED
@@ -1,34 +1,34 @@
-eval_ai_library-0.3.
-eval_lib/__init__.py,sha256=
+eval_ai_library-0.3.2.dist-info/licenses/LICENSE,sha256=rK9uLDgWNrCHNdp-Zma_XghDE7Fs0u0kDi3WMcmYx6w,1074
+eval_lib/__init__.py,sha256=IeDW5pLarPmHCBJu-6vFX71g9VxZTS46UkHjDkrU_Gw,3043
 eval_lib/evaluate.py,sha256=GjlXZb5dnl44LCaJwdkyGCYcC50zoNZn3NrofzNAVJ0,11490
 eval_lib/evaluation_schema.py,sha256=7IDd_uozqewhh7k0p1hKut_20udvRxxkV6thclxKUg0,1904
-eval_lib/llm_client.py,sha256=
+eval_lib/llm_client.py,sha256=3eMcarKLkDLDVh4AOxgWbaIzXlzpqsmEfJXNTBonNic,13633
 eval_lib/metric_pattern.py,sha256=wULgMNDeAqJC_Qjglo7bYzY2eGhA_PmY_hA_qGfg0sI,11730
 eval_lib/price.py,sha256=jbmkkUTxPuXrkSHuaJYPl7jSzfDIzQ9p_swWWs26UJ0,1986
 eval_lib/py.typed,sha256=8PjyZ1aVoQpRVvt71muvuq5qE-jTFZkK-GLHkhdebmc,26
 eval_lib/testcases_schema.py,sha256=qI4o6kX0jH1DR3sHGXUnu3Cyt2oq7rGlsMlOaXSt6F4,696
 eval_lib/utils.py,sha256=-hwagFFn3_QjgyLqF8Qx7JIkpgOEI8-F14eycog3bgc,3141
 eval_lib/agent_metrics/__init__.py,sha256=20Y4BsicD2s7OkOBQPBvB2JKStBDtplv52_q6q35Vgo,525
-eval_lib/agent_metrics/knowledge_retention_metric/knowledge_retention.py,sha256=
-eval_lib/agent_metrics/role_adherence_metric/role_adherence.py,sha256=
-eval_lib/agent_metrics/task_success_metric/task_success_rate.py,sha256=
+eval_lib/agent_metrics/knowledge_retention_metric/knowledge_retention.py,sha256=7j89HOTsu0rMoFnznTjMl-tqQpnZlS6ZIdrHPueEbb8,8289
+eval_lib/agent_metrics/role_adherence_metric/role_adherence.py,sha256=kJsYj9H3W3Mw2iBqj3Br_glP8gU6_diFPiJhRSnHGxg,9225
+eval_lib/agent_metrics/task_success_metric/task_success_rate.py,sha256=v5cO07cymo9GWSZ34ryAx3ya4DDBiRWih9w0bm_j_R8,12497
 eval_lib/agent_metrics/tools_correctness_metric/tool_correctness.py,sha256=qbVMtD6EWKah27FogLEiEh6pBX-k2wwKbwM_kFkvYeQ,4220
 eval_lib/datagenerator/datagenerator.py,sha256=NQZIQuSCmryxIT3lTLS1PpJjENmGqARtR-zTiQ8OvRk,15513
 eval_lib/datagenerator/document_loader.py,sha256=vnQUz_Dxb3SxcVPUmMXZe-rgfPp6OfHb6D2Ie9iqPms,17025
 eval_lib/datagenerator/prompts.py,sha256=iQAYitAbLud3vWJnXGh_OCF4syWYS_S71zZEPI6qYAU,7213
 eval_lib/metrics/__init__.py,sha256=3qClCCjPXt5i0c38g5krfuQnqlAXEl-jhAHy1C_ICMY,1213
 eval_lib/metrics/answer_precision_metric/answer_precision.py,sha256=AxPmwzGFU7tnTrrZuQZ7ow4nNSD-blDHdAGwhMHMxjM,15040
-eval_lib/metrics/answer_relevancy_metric/answer_relevancy.py,sha256
+eval_lib/metrics/answer_relevancy_metric/answer_relevancy.py,sha256=-Xb9I-BVMDf5E55FbJzP6IyvD6IVTUPBI-uCrRnEboc,8522
 eval_lib/metrics/bias_metric/bias.py,sha256=BVH8xlTUTRfVG_F1kauwpGAkVKBkUWhM9rUsrrLhpRU,4020
 eval_lib/metrics/contextual_precision_metric/contextual_precision.py,sha256=CQOb6uR2KeffTkhPSqZae56sX5tXMr0pJVM5W_wU1fU,3993
 eval_lib/metrics/contextual_recall_metric/contextual_recall.py,sha256=iw73_hGLWklHZSBkCRkPDNUt1xD5dknA_7CZ6Efkf5w,3913
-eval_lib/metrics/contextual_relevancy_metric/contextual_relevancy.py,sha256=
-eval_lib/metrics/custom_metric/custom_eval.py,sha256=
+eval_lib/metrics/contextual_relevancy_metric/contextual_relevancy.py,sha256=G1cYlA95YNcpEqQsALVi6ZbyNzWr9ccO2DATcsES5pk,6546
+eval_lib/metrics/custom_metric/custom_eval.py,sha256=Ov3-i6IytaJXlzcMgp46SRSeb8scyhqyuR2BqUtDFoM,11385
 eval_lib/metrics/faithfulness_metric/faithfulness.py,sha256=OqamlhTOps7d-NOStSIK7Tq-UAJXNql8VKjgtuqlDhA,5930
 eval_lib/metrics/geval/geval.py,sha256=mNciHXnqU2drOJsWlYmbwftGiKM89-Ykw2f6XneIGBM,10629
 eval_lib/metrics/restricted_refusal_metric/restricted_refusal.py,sha256=4QqYgGMcp6W9Lw-v4s0AlUhMSOKvBOEgnLvhqVXaT9I,4286
 eval_lib/metrics/toxicity_metric/toxicity.py,sha256=rBE1_fvpbCRdBpBep1y1LTIhofKR8GD4Eh76EOYzxL0,4076
-eval_ai_library-0.3.
-eval_ai_library-0.3.
-eval_ai_library-0.3.
-eval_ai_library-0.3.
+eval_ai_library-0.3.2.dist-info/METADATA,sha256=7kIgiDelzM1wbCdKdy4PXuFudIZ3UCUTPXhlVccFe9k,37706
+eval_ai_library-0.3.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+eval_ai_library-0.3.2.dist-info/top_level.txt,sha256=uQHpEd2XI0oZgq1eCww9zMvVgDJgwXMWkCD45fYUzEg,9
+eval_ai_library-0.3.2.dist-info/RECORD,,
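Each row in RECORD has the form path,sha256=<digest>,<size-in-bytes>, where the digest is the unpadded URL-safe base64 encoding of the file's SHA-256 hash, so every content change above surfaces as a new digest and size. A minimal sketch of how such a row can be checked against an installed file (standard library only; the relative path assumes the script runs from the installed site-packages directory):

import base64
import hashlib

def record_digest(path: str) -> str:
    # RECORD-style digest: unpadded URL-safe base64 of the SHA-256 hash
    with open(path, "rb") as f:
        raw = hashlib.sha256(f.read()).digest()
    return base64.urlsafe_b64encode(raw).rstrip(b"=").decode("ascii")

# Compare against the py.typed row above, which is unchanged in 0.3.2
print(record_digest("eval_lib/py.typed") == "8PjyZ1aVoQpRVvt71muvuq5qE-jTFZkK-GLHkhdebmc")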
eval_lib/__init__.py
CHANGED
@@ -7,7 +7,7 @@ A powerful library for evaluating AI models with support for multiple LLM provid
 and a wide range of evaluation metrics for RAG systems and AI agents.
 """
 
-__version__ = "0.3.0"
+__version__ = "0.3.2"
 __author__ = "Aleksandr Meshkov"
 
 # Core evaluation functions
@@ -68,12 +68,14 @@ from eval_lib.agent_metrics import (
 
 def __getattr__(name):
     """
-
-    DataGenerator is imported only when it is actually used.
+    Lazy loading for data generation components.
     """
-    if name == "
-        from eval_lib.datagenerator.datagenerator import
-        return
+    if name == "DatasetGenerator":
+        from eval_lib.datagenerator.datagenerator import DatasetGenerator
+        return DatasetGenerator
+    if name == "DataGenerator":  # Alias for DatasetGenerator
+        from eval_lib.datagenerator.datagenerator import DatasetGenerator
+        return DatasetGenerator
     if name == "DocumentLoader":
         from eval_lib.datagenerator.document_loader import DocumentLoader
         return DocumentLoader
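The reworked __getattr__ hook is a PEP 562 module-level attribute hook: DatasetGenerator is imported only on first access, and the old DataGenerator name is kept as a backward-compatible alias. A minimal usage sketch, assuming only what the diff shows (the class's constructor is not part of this diff):

# Both names go through the module-level __getattr__ added in 0.3.2,
# so the datagenerator module stays unimported until first access.
from eval_lib import DataGenerator, DatasetGenerator

assert DataGenerator is DatasetGenerator  # alias resolves to the same class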
eval_lib/agent_metrics/knowledge_retention_metric/knowledge_retention.py
CHANGED
@@ -217,6 +217,7 @@ Verdicts:
     "final_score": final_score,
     "comment_final_score": f"Weighted average of verdict scores using softmax aggregation (temperature={self.temperature}).",
     "threshold": self.threshold,
+    "temperature": self.temperature,
     "success": success,
     "comment_success": "Whether the retention score meets the required threshold.",
     "final_reason": summary,
eval_lib/agent_metrics/role_adherence_metric/role_adherence.py
CHANGED
@@ -239,6 +239,7 @@ Return JSON array:
     "final_score": final_score,
     "comment_final_score": f"Weighted average of verdict scores using softmax aggregation (temperature={self.temperature}).",
     "threshold": self.threshold,
+    "temperature": self.temperature,
     "success": success,
     "comment_success": "Whether the role adherence score meets the required threshold.",
     "final_reason": summary,
eval_lib/agent_metrics/task_success_metric/task_success_rate.py
CHANGED
@@ -328,6 +328,7 @@ Criteria: [
     "final_score": final_score,
     "comment_final_score": f"Weighted average of verdict scores using softmax aggregation (temperature={self.temperature}).",
     "threshold": self.threshold,
+    "temperature": self.temperature,
     "success": success,
     "comment_success": "Whether the task success score meets the required threshold.",
     "final_reason": summary,
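All three agent metrics above now surface the temperature used for the softmax aggregation their comment strings describe. As a point of reference, a softmax-weighted average of verdict scores typically looks like the sketch below; this is an illustrative reconstruction, not eval_lib's actual code, and the scores/temperature inputs are assumed:

import math

def softmax_aggregate(scores: list[float], temperature: float = 1.0) -> float:
    # Lower temperature weights the aggregate toward the highest verdict
    # scores; a very high temperature approaches a plain arithmetic mean.
    weights = [math.exp(s / temperature) for s in scores]
    total = sum(weights)
    return sum(w * s for w, s in zip(weights, scores)) / total

# Example: three verdict scores aggregated at temperature 0.5
print(softmax_aggregate([0.9, 0.6, 0.8], temperature=0.5))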
eval_lib/llm_client.py
CHANGED
@@ -101,7 +101,6 @@ def _get_client(provider: Provider):
     if provider == Provider.AZURE:
         _check_env_var("AZURE_OPENAI_API_KEY", "Azure OpenAI")
         _check_env_var("AZURE_OPENAI_ENDPOINT", "Azure OpenAI")
-        # AZURE_OPENAI_DEPLOYMENT is checked at call time; it is not required here
 
         return AsyncAzureOpenAI(
             api_key=os.getenv("AZURE_OPENAI_API_KEY"),
@@ -114,7 +113,6 @@ def _get_client(provider: Provider):
         return genai.Client(api_key=os.getenv("GOOGLE_API_KEY"))
 
     if provider == Provider.OLLAMA:
-        # Ollama can work without a key (local server)
         api_key = _check_env_var(
             "OLLAMA_API_KEY", "Ollama", required=False) or "ollama"
         base_url = _check_env_var(
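The call sites above imply that _check_env_var raises for missing required settings and returns None for optional ones, which is why the Ollama branch can fall back to a placeholder key for a local server. A hypothetical reconstruction consistent with those call sites (the real helper in eval_lib/llm_client.py is not shown in this diff and may differ):

import os

def _check_env_var(name: str, provider: str, required: bool = True) -> str | None:
    # Raise early for required provider credentials; return None so
    # optional settings can use an `or` fallback at the call site.
    value = os.getenv(name)
    if value is None and required:
        raise EnvironmentError(f"{name} must be set to use the {provider} provider")
    return value

# Mirrors the Ollama branch: a local server accepts any placeholder key
api_key = _check_env_var("OLLAMA_API_KEY", "Ollama", required=False) or "ollama"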
eval_lib/metrics/answer_relevancy_metric/answer_relevancy.py
CHANGED
@@ -181,6 +181,7 @@ class AnswerRelevancyMetric(MetricPattern):
     "final_score": final_score,
     "comment_final_score": "Score based on the proportion of relevant statements.",
     "threshold": self.threshold,
+    "temperature": self.temperature,
     "success": success,
     "comment_success": "Whether the score exceeds the pass threshold.",
     "final_reason": summary_reason,
eval_lib/metrics/contextual_relevancy_metric/contextual_relevancy.py
CHANGED
@@ -155,6 +155,7 @@ class ContextualRelevancyMetric(MetricPattern):
     "final_score": score,
     "comment_final_score": "Weighted support score from context.",
     "threshold": self.threshold,
+    "temperature": self.temperature,
     "success": success,
     "comment_success": "Whether the score exceeds the threshold.",
     "final_reason": summary,
eval_lib/metrics/custom_metric/custom_eval.py
CHANGED
@@ -312,6 +312,7 @@ JSON:"""
     "final_score": final_score,
     "comment_final_score": f"Weighted average of verdict scores calculated using softmax aggregation (temperature={self.temperature}).",
     "threshold": self.threshold,
+    "temperature": self.temperature,
     "success": success,
     "comment_success": "Whether the final score meets the required threshold.",
     "summary": summary,
{eval_ai_library-0.3.0.dist-info → eval_ai_library-0.3.2.dist-info}/WHEEL
File without changes
{eval_ai_library-0.3.0.dist-info → eval_ai_library-0.3.2.dist-info}/licenses/LICENSE
File without changes
{eval_ai_library-0.3.0.dist-info → eval_ai_library-0.3.2.dist-info}/top_level.txt
File without changes