rakam-eval-sdk 0.1.15__py3-none-any.whl

File without changes
rakam_eval_sdk/client.py
@@ -0,0 +1,202 @@
+ import os
+ import random
+ from typing import Any, List, Optional, cast
+
+ import requests
+
+ from .schema import (
+     EvalConfig,
+     MetricConfig,
+     SchemaEvalConfig,
+     SchemaInputItem,
+     SchemaMetricConfig,
+     TextInputItem,
+ )
+
+
+ class DeepEvalClient:
+     """
+     Client for interacting with the DeepEval API.
+     Provides synchronous and background evaluation with optional probability-based execution.
+     """
+
+     def __init__(
+         self,
+         base_url: Optional[str] = None,
+         api_token: Optional[str] = None,
+         settings_module: Optional[Any] = None,  # optional external settings
+         timeout: int = 30,
+     ):
+         settings_url = getattr(settings_module, "EVALFRAMWORK_URL", None)
+         settings_token = getattr(settings_module, "EVALFRAMWORK_API_KEY", None)
+
+         raw_url = (
+             base_url
+             or settings_url
+             or os.getenv("EVALFRAMWORK_URL")
+             or "http://localhost:8080"
+         )
+         self.base_url = raw_url.rstrip("/")
+         self.api_token = (
+             api_token or settings_token or os.getenv("EVALFRAMWORK_API_KEY", "")
+         )
+         self.timeout = timeout
+
+     def _request(
+         self,
+         endpoint: str,
+         payload: dict,
+         raise_exception: bool = False,
+     ) -> Optional[dict]:
+         """Internal helper to send POST requests with standard headers and error handling."""
+         url = f"{self.base_url}{endpoint}"
+         headers = {
+             "accept": "application/json",
+             "Content-Type": "application/json",
+             "X-API-Token": self.api_token,
+         }
+
+         try:
+             resp = requests.post(
+                 url, headers=headers, json=payload, timeout=self.timeout
+             )
+             if raise_exception:
+                 resp.raise_for_status()
+         except requests.RequestException as e:
+             if raise_exception:
+                 raise
+             return {"error": str(e)}
+
+         try:
+             return cast(dict, resp.json())
+         except ValueError:
+             if raise_exception:
+                 raise
+             return {"error": "Invalid JSON response", "raw": resp.text}
+
+     def text_eval(
+         self,
+         data: List[TextInputItem],
+         metrics: List[MetricConfig],
+         raise_exception: bool = False,
+         component: str = "unknown",
+     ) -> Optional[dict]:
+         """Run synchronous text evaluation."""
+         payload = EvalConfig.model_construct(
+             data=data, metrics=metrics, component=component
+         ).model_dump()
+         return self._request("/deepeval/text-eval", payload, raise_exception)
+
+     def text_eval_background(
+         self,
+         data: List[TextInputItem],
+         metrics: List[MetricConfig],
+         raise_exception: bool = False,
+         component: str = "unknown",
+     ) -> Optional[dict]:
+         """Run background text evaluation (async job)."""
+         payload = EvalConfig.model_construct(
+             data=data, metrics=metrics, component=component
+         ).model_dump()
+         return self._request("/deepeval/text-eval/background", payload, raise_exception)
+
+     def schema_eval(
+         self,
+         data: List[SchemaInputItem],
+         metrics: List[SchemaMetricConfig],
+         raise_exception: bool = False,
+         component: str = "unknown",
+     ) -> Optional[dict]:
+         """Run synchronous schema evaluation."""
+         payload = SchemaEvalConfig.model_construct(
+             data=data, metrics=metrics, component=component
+         ).model_dump()
+         return self._request("/deepeval/schema-eval", payload, raise_exception)
+
+     def schema_eval_background(
+         self,
+         data: List[SchemaInputItem],
+         metrics: List[SchemaMetricConfig],
+         raise_exception: bool = False,
+         component: str = "unknown",
+     ) -> Optional[dict]:
+         """Run background schema evaluation (async job)."""
+         payload = SchemaEvalConfig.model_construct(
+             data=data, metrics=metrics, component=component
+         ).model_dump()
+         return self._request(
+             "/deepeval/schema-eval/background", payload, raise_exception
+         )
+
+     def maybe_text_eval(
+         self,
+         data: List[TextInputItem],
+         metrics: List[MetricConfig],
+         chance: float,
+         raise_exception: bool = False,
+         component: str = "unknown",
+     ) -> Optional[dict]:
+         """Randomly run text_eval based on a probability between 0 and 1."""
+         self._validate_chance(chance)
+         return (
+             self.text_eval(data, metrics, raise_exception, component=component)
+             if random.random() <= chance
+             else None
+         )
+
+     def maybe_text_eval_background(
+         self,
+         data: List[TextInputItem],
+         metrics: List[MetricConfig],
+         chance: float,
+         raise_exception: bool = False,
+         component: str = "unknown",
+     ) -> Optional[dict]:
+         """Randomly run text_eval_background based on a probability between 0 and 1."""
+         self._validate_chance(chance)
+         return (
+             self.text_eval_background(
+                 data, metrics, raise_exception, component=component
+             )
+             if random.random() <= chance
+             else None
+         )
+
+     def maybe_schema_eval(
+         self,
+         data: List[SchemaInputItem],
+         metrics: List[SchemaMetricConfig],
+         chance: float,
+         raise_exception: bool = False,
+         component: str = "unknown",
+     ) -> Optional[dict]:
+         """Randomly run schema_eval based on a probability between 0 and 1."""
+         self._validate_chance(chance)
+         return (
+             self.schema_eval(data, metrics, raise_exception, component=component)
+             if random.random() <= chance
+             else None
+         )
+
+     def maybe_schema_eval_background(
+         self,
+         data: List[SchemaInputItem],
+         metrics: List[SchemaMetricConfig],
+         chance: float,
+         raise_exception: bool = False,
+         component: str = "unknown",
+     ) -> Optional[dict]:
+         """Randomly run schema_eval_background based on a probability between 0 and 1."""
+         self._validate_chance(chance)
+         return (
+             self.schema_eval_background(
+                 data, metrics, raise_exception, component=component
+             )
+             if random.random() <= chance
+             else None
+         )
+
+     @staticmethod
+     def _validate_chance(chance: float) -> None:
+         """Ensure chance is a valid probability between 0 and 1."""
+         if not (0 <= chance <= 1):
+             raise ValueError("chance must be between 0 and 1.")
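
As `_request` above shows, failures come back as a dict of the form `{"error": ...}` when `raise_exception` is False, and the `maybe_*` wrappers return `None` whenever the random draw exceeds `chance`. A minimal calling sketch, assuming the module layout listed in the RECORD hunk below (`rakam_eval_sdk.client`, `rakam_eval_sdk.schema`) and a reachable server:

```python
# Sketch: distinguishing the three possible outcomes of a maybe_* call.
from rakam_eval_sdk.client import DeepEvalClient
from rakam_eval_sdk.schema import AnswerRelevancyConfig, TextInputItem

client = DeepEvalClient(base_url="http://localhost:8080", api_token="your-api-key")

result = client.maybe_text_eval(
    data=[
        TextInputItem(
            id="smoke-test",
            input="What is OCR?",
            output="Optical character recognition.",
        )
    ],
    metrics=[AnswerRelevancyConfig(threshold=0.7)],
    chance=0.1,  # only ~10% of calls actually hit the API
)

if result is None:
    pass  # skipped by the probability gate
elif "error" in result:
    print("evaluation failed:", result["error"])  # network/JSON error, surfaced instead of raised
else:
    print("evaluation response:", result)
```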
rakam_eval_sdk/schema.py
@@ -0,0 +1,128 @@
+ # Common base class for all metric configs
+ import sys
+ from typing import Any, Dict, List, Literal, Optional, Union
+
+ # Base class (you can keep this abstract)
+ from pydantic import BaseModel, Field
+
+ if sys.version_info < (3, 9):
+     from typing_extensions import Annotated
+ else:
+     from typing import Annotated
+
+
+ class MetricConfigBase(BaseModel):
+     type: str
+     name: Optional[str] = None
+
+
+ class ClientSideMetricConfig(BaseModel):
+     name: str
+     score: float
+     success: Optional[int] = 1
+     evaluation_cost: Optional[float] = 0
+     reason: Optional[str] = None
+     threshold: Optional[float] = 0
+
+
+ class OCRSimilarityConfig(MetricConfigBase):
+     type: Literal["ocr_similarity"] = "ocr_similarity"
+     threshold: float = 0.5
+
+
+ class CorrectnessConfig(MetricConfigBase):
+     type: Literal["correctness"] = "correctness"
+     model: str = "gpt-4.1"
+     steps: List[str] = Field(
+         default=[
+             "Check if the OCR model extracted the important information correctly. "
+             "Minor formatting differences like '$1,250.00' vs '$1250.00' are acceptable."
+         ]
+     )
+     criteria: Optional[str] = None
+     params: List[Literal["actual_output", "expected_output"]] = Field(
+         default=["actual_output", "expected_output"]
+     )
+
+
+ class AnswerRelevancyConfig(MetricConfigBase):
+     type: Literal["answer_relevancy"] = "answer_relevancy"
+     threshold: float = 0.7
+     model: str = "gpt-4.1"
+     include_reason: bool = True
+
+
+ class FaithfulnessConfig(MetricConfigBase):
+     type: Literal["faithfulness"] = "faithfulness"
+     threshold: float = 0.7
+     model: str = "gpt-4.1"
+     include_reason: bool = True
+
+
+ class ToxicityConfig(MetricConfigBase):
+     type: Literal["toxicity"] = "toxicity"
+     threshold: float = 0.5
+     model: str = "gpt-4.1"
+     include_reason: bool = True
+
+
+ class JsonCorrectnessConfig(MetricConfigBase):
+     type: Literal["json_correctness"] = "json_correctness"
+     threshold: float = 0.5
+     model: str = "gpt-4.1"
+     include_reason: bool = True
+     excpected_schema: Dict[str, Any]
+
+
+ class FieldsPresenceConfig(MetricConfigBase):
+     type: Literal["fields_presence"] = "fields_presence"
+     excpected_schema: Dict[str, Any]
+     threshold: float = 0.5
+     include_reason: bool = True
+     strict_mode: bool = True
+
+
+ MetricConfig = Annotated[
+     Union[
+         OCRSimilarityConfig,
+         CorrectnessConfig,
+         AnswerRelevancyConfig,
+         FaithfulnessConfig,
+         ToxicityConfig,
+     ],
+     Field(discriminator="type"),
+ ]
+
+ SchemaMetricConfig = Annotated[
+     Union[JsonCorrectnessConfig, FieldsPresenceConfig],
+     Field(discriminator="type"),
+ ]
+
+
+ class InputItem(BaseModel):
+     id: Optional[str] = None  # set to optional to keep backward compatibility
+     input: str
+     output: str
+     metrics: Optional[List[ClientSideMetricConfig]] = []
+
+
+ class TextInputItem(InputItem):
+     expected_output: Optional[str] = None
+     retrieval_context: Optional[List[str]] = None
+
+
+ class SchemaInputItem(InputItem):
+     expected_output: Optional[str] = None
+     # retrieval_context: list[Json[Any]] = None
+
+
+ class EvalConfig(BaseModel):
+     component: str = "unknown"
+     data: List[TextInputItem]
+     metrics: List[MetricConfig] = Field(default_factory=list)
+
+
+ class SchemaEvalConfig(BaseModel):
+     component: str = "unknown"
+     data: List[SchemaInputItem]
+     metrics: List[SchemaMetricConfig] = Field(default_factory=list)
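
`MetricConfig` and `SchemaMetricConfig` above are pydantic discriminated unions keyed on the `type` field, so metric configs arriving as plain dicts (for example, deserialized JSON) can be validated into the matching config class. A small sketch using pydantic v2's `TypeAdapter` (the package pins pydantic>=2.10.6):

```python
# Sketch: resolving raw dicts against the discriminated unions defined above.
from pydantic import TypeAdapter

from rakam_eval_sdk.schema import MetricConfig, SchemaMetricConfig, ToxicityConfig

metric = TypeAdapter(MetricConfig).validate_python(
    {"type": "toxicity", "threshold": 0.3, "include_reason": False}
)
assert isinstance(metric, ToxicityConfig)  # selected via the "type" discriminator

schema_metric = TypeAdapter(SchemaMetricConfig).validate_python(
    {"type": "fields_presence", "excpected_schema": {"name": "str"}}
)
print(type(schema_metric).__name__)  # FieldsPresenceConfig
```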
rakam_eval_sdk-0.1.15.dist-info/METADATA
@@ -0,0 +1,165 @@
+ Metadata-Version: 2.3
+ Name: rakam-eval-sdk
+ Version: 0.1.15
+ Summary: Evaluation Framework SDK
+ Author: Mohamed Bachar Touil
+ License: MIT
+ Requires-Dist: pydantic>=2.10.6
+ Requires-Dist: requests
+ Requires-Python: >=3.8
+ Description-Content-Type: text/markdown
+
+ # DeepEvalClient
+
+ A lightweight Python client for interacting with the **Evaluation API**.
+ It provides convenient wrappers for the text and schema evaluation endpoints, with support for background jobs and probabilistic execution.
+
+ ---
+
+ ## Features
+
+ - 🔹 **Text Evaluation** – Run evaluations on plain text inputs.
+ - 🔹 **Schema Evaluation** – Evaluate structured outputs against schema-based metrics.
+ - 🔹 **Background Jobs** – Submit evaluation jobs asynchronously and process the results later.
+ - 🔹 **Probabilistic Execution** – Run evaluations with a configurable chance (e.g., to sample production traffic or for A/B testing scenarios).
+ - 🔹 **Robust Error Handling** – Handles network errors and invalid JSON responses gracefully.
+ - 🔹 **Configurable** – Configure via constructor arguments, environment variables, or an external settings module.
+
+ ---
+
+ ## Installation
+
+ ```bash
+ pip install rakam-eval-sdk
+ ```
+
+ ## Usage
+
+ ### 1. Basic Setup
+
+ ```python
+ from rakam_eval_sdk.client import DeepEvalClient
+ from rakam_eval_sdk.schema import CorrectnessConfig, TextInputItem, ToxicityConfig
+
+ client = DeepEvalClient(
+     base_url="http://localhost:8080",
+     api_token="your-api-key",
+ )
+ ```
+
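The examples below use the background, probability-gated variants; for a quick check you can also call the synchronous `text_eval`, which blocks until the server responds. A minimal sketch with placeholder values, reusing the names imported above:

```python
result = client.text_eval(
    component="ocr",
    data=[
        TextInputItem(
            id="smoke-test",
            input="...",   # input given to the AI component
            output="...",  # output of the AI component
        )
    ],
    metrics=[ToxicityConfig(threshold=0.5)],
)
print(result)  # parsed JSON response, or {"error": ...} when raise_exception is False
```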
+ ### 2. Text Evaluation
+
+ ```python
+ client.maybe_text_eval_background(
+     component="ocr",
+     data=[
+         TextInputItem(
+             id="runtime evaluation",  # identifier (can be unique); reuse the same id to track performance over time
+             input="...",   # input given to the AI component
+             output="...",  # output of the AI component
+             # optional arguments, required conditionally depending on the metrics passed
+             expected_output="...",
+             retrieval_context=["..."],
+         )
+     ],
+     metrics=[
+         ToxicityConfig(
+             # model="gpt-4.1",
+             threshold=0.2,
+             include_reason=False,
+         ),
+         CorrectnessConfig(
+             steps=[
+                 "You are evaluating text extracted from resumes and job descriptions using OCR.",
+                 "1. Verify that the extracted text is coherent and free of major corruption (e.g., broken words, random characters).",
+                 "2. Check whether key resume/job-related fields are preserved correctly (e.g., name, job title, skills, education, experience, company name, job requirements).",
+                 "3. Ensure that important details are not missing or replaced with irrelevant content.",
+                 "4. Ignore minor formatting issues (line breaks, spacing) as long as the information is readable and accurate.",
+                 "5. Consider the output correct if it faithfully represents the resume or job description’s main information.",
+             ],
+             params=["actual_output"],
+         ),
+     ],
+     chance=0.3,
+ )
+ ```
+
+ ### 3. Schema Evaluation
+
+ ```python
+ from rakam_eval_sdk.schema import FieldsPresenceConfig, JsonCorrectnessConfig, SchemaInputItem
+
+ client.maybe_schema_eval_background(
+     component="ocr",
+     data=[
+         SchemaInputItem(
+             id="runtime evaluation",  # identifier (can be unique); reuse the same id to track performance over time
+             input="...",            # input given to the AI component
+             output="...",           # structured (JSON) output of the AI component
+             expected_output="...",  # optional, conditional on the metrics passed
+         )
+     ],
+     metrics=[
+         FieldsPresenceConfig(
+             excpected_schema={"...": "..."},  # schema the output is checked against
+             threshold=0.5,
+         ),
+         JsonCorrectnessConfig(
+             excpected_schema={"...": "..."},
+             # model="gpt-4.1",
+         ),
+     ],
+     chance=0.3,
+ )
+ ```
+
+ ## Configuration
+
+ The client can be configured in multiple ways. Values are resolved in order: constructor arguments, then the settings module, then environment variables, falling back to `http://localhost:8080` as the default base URL.
+
+ ### Directly via constructor arguments
+
+ ```python
+ DeepEvalClient(base_url="http://api", api_token="123")
+ ```
+
+ ### Environment variables
+
+ ```bash
+ export EVALFRAMWORK_URL=http://api
+ export EVALFRAMWORK_API_KEY=123
+ ```
+
+ ### Settings module
+
+ ```python
+ import settings  # can also be Django settings, e.g.: from django.conf import settings
+
+ client = DeepEvalClient(settings_module=settings)
+ ```
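The settings module only needs to expose the two attributes that `DeepEvalClient.__init__` reads via `getattr`; any object carrying them works. A hypothetical `settings.py` for illustration:

```python
# hypothetical settings.py; the attribute names mirror what DeepEvalClient.__init__ reads
EVALFRAMWORK_URL = "http://api"
EVALFRAMWORK_API_KEY = "123"
```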
+
+ <!-- uv publish --index testpypi
+ twine upload --repository testpypi dist/\*
+ uv add twine build --dev
+
+ uv build -->
rakam_eval_sdk-0.1.15.dist-info/RECORD
@@ -0,0 +1,6 @@
+ rakam_eval_sdk/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ rakam_eval_sdk/client.py,sha256=EdYA8SFoq6PhO6JNxu_j2eJSd3g4I0rtUtGJmGgvfzA,6583
+ rakam_eval_sdk/schema.py,sha256=FaY7nlcbzlFhH7lZl9iFfJ6T0wGVte7TYbt-w_wpFuI,3400
+ rakam_eval_sdk-0.1.15.dist-info/WHEEL,sha256=eh7sammvW2TypMMMGKgsM83HyA_3qQ5Lgg3ynoecH3M,79
+ rakam_eval_sdk-0.1.15.dist-info/METADATA,sha256=PhyFhXFiTeCt2KK_kBjGGXDI69q8qFmyg-aEiKh16OQ,5930
+ rakam_eval_sdk-0.1.15.dist-info/RECORD,,
rakam_eval_sdk-0.1.15.dist-info/WHEEL
@@ -0,0 +1,4 @@
+ Wheel-Version: 1.0
+ Generator: uv 0.8.24
+ Root-Is-Purelib: true
+ Tag: py3-none-any