judgeval 0.16.9__py3-none-any.whl → 0.18.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of judgeval might be problematic. Click here for more details.
- judgeval/__init__.py +29 -0
- judgeval/api/__init__.py +108 -0
- judgeval/api/api_types.py +56 -1
- judgeval/cli.py +7 -0
- judgeval/data/judgment_types.py +56 -1
- judgeval/prompts/prompt.py +320 -0
- judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +0 -12
- judgeval/tracer/__init__.py +71 -33
- judgeval/tracer/exporters/store.py +32 -16
- judgeval/tracer/keys.py +1 -0
- judgeval/tracer/llm/llm_anthropic/messages.py +4 -4
- judgeval/tracer/llm/llm_anthropic/messages_stream.py +2 -2
- judgeval/tracer/llm/llm_google/generate_content.py +1 -1
- judgeval/tracer/llm/llm_openai/beta_chat_completions.py +2 -2
- judgeval/tracer/llm/llm_openai/chat_completions.py +4 -4
- judgeval/tracer/llm/llm_openai/responses.py +4 -4
- judgeval/tracer/llm/llm_together/chat_completions.py +4 -4
- judgeval/trainer/__init__.py +10 -1
- judgeval/trainer/base_trainer.py +122 -0
- judgeval/trainer/config.py +1 -1
- judgeval/trainer/fireworks_trainer.py +396 -0
- judgeval/trainer/trainer.py +52 -387
- judgeval/utils/project.py +15 -0
- judgeval/version.py +1 -1
- {judgeval-0.16.9.dist-info → judgeval-0.18.0.dist-info}/METADATA +2 -3
- {judgeval-0.16.9.dist-info → judgeval-0.18.0.dist-info}/RECORD +29 -25
- {judgeval-0.16.9.dist-info → judgeval-0.18.0.dist-info}/WHEEL +0 -0
- {judgeval-0.16.9.dist-info → judgeval-0.18.0.dist-info}/entry_points.txt +0 -0
- {judgeval-0.16.9.dist-info → judgeval-0.18.0.dist-info}/licenses/LICENSE.md +0 -0
judgeval/trainer/trainer.py
CHANGED
|
@@ -1,405 +1,70 @@
|
|
|
1
|
-
import
|
|
2
|
-
import
|
|
3
|
-
import
|
|
4
|
-
from
|
|
5
|
-
from fireworks import Dataset # type: ignore[import-not-found]
|
|
6
|
-
from .config import TrainerConfig, ModelConfig
|
|
1
|
+
from typing import Optional
|
|
2
|
+
from .config import TrainerConfig
|
|
3
|
+
from .base_trainer import BaseTrainer
|
|
4
|
+
from .fireworks_trainer import FireworksTrainer
|
|
7
5
|
from .trainable_model import TrainableModel
|
|
8
6
|
from judgeval.tracer import Tracer
|
|
9
|
-
from judgeval.tracer.exporters.store import SpanStore
|
|
10
|
-
from judgeval.tracer.exporters import InMemorySpanExporter
|
|
11
|
-
from judgeval.tracer.keys import AttributeKeys
|
|
12
|
-
from judgeval import JudgmentClient
|
|
13
|
-
from judgeval.scorers import ExampleScorer, ExampleAPIScorerConfig
|
|
14
|
-
from judgeval.data import Example
|
|
15
|
-
from .console import _spinner_progress, _print_progress, _print_progress_update
|
|
16
7
|
from judgeval.exceptions import JudgmentRuntimeError
|
|
17
8
|
|
|
18
9
|
|
|
19
|
-
|
|
10
|
+
def JudgmentTrainer(
|
|
11
|
+
config: TrainerConfig,
|
|
12
|
+
trainable_model: TrainableModel,
|
|
13
|
+
tracer: Tracer,
|
|
14
|
+
project_name: Optional[str] = None,
|
|
15
|
+
) -> BaseTrainer:
|
|
20
16
|
"""
|
|
21
|
-
|
|
17
|
+
Factory function for creating reinforcement learning trainers.
|
|
22
18
|
|
|
23
|
-
This
|
|
24
|
-
|
|
25
|
-
"""
|
|
26
|
-
|
|
27
|
-
def __init__(
|
|
28
|
-
self,
|
|
29
|
-
config: TrainerConfig,
|
|
30
|
-
trainable_model: TrainableModel,
|
|
31
|
-
tracer: Tracer,
|
|
32
|
-
project_name: Optional[str] = None,
|
|
33
|
-
):
|
|
34
|
-
"""
|
|
35
|
-
Initialize the JudgmentTrainer.
|
|
36
|
-
|
|
37
|
-
Args:
|
|
38
|
-
config: TrainerConfig instance with training parameters. If None, uses default config.
|
|
39
|
-
tracer: Optional tracer for observability
|
|
40
|
-
trainable_model: Optional trainable model instance
|
|
41
|
-
project_name: Project name for organizing training runs and evaluations
|
|
42
|
-
"""
|
|
43
|
-
try:
|
|
44
|
-
self.config = config
|
|
45
|
-
self.tracer = tracer
|
|
46
|
-
self.project_name = project_name or "judgment_training"
|
|
47
|
-
self.trainable_model = trainable_model
|
|
48
|
-
|
|
49
|
-
self.judgment_client = JudgmentClient()
|
|
50
|
-
self.span_store = SpanStore()
|
|
51
|
-
self.span_exporter = InMemorySpanExporter(self.span_store)
|
|
52
|
-
except Exception as e:
|
|
53
|
-
raise JudgmentRuntimeError(
|
|
54
|
-
f"Failed to initialize JudgmentTrainer: {str(e)}"
|
|
55
|
-
) from e
|
|
56
|
-
|
|
57
|
-
def _extract_message_history_from_spans(self) -> List[Dict[str, str]]:
|
|
58
|
-
"""
|
|
59
|
-
Extract message history from spans in the span store for training purposes.
|
|
60
|
-
|
|
61
|
-
This method processes trace spans to reconstruct the conversation flow,
|
|
62
|
-
extracting messages in chronological order from LLM, user, and tool spans.
|
|
63
|
-
|
|
64
|
-
Returns:
|
|
65
|
-
List of message dictionaries with 'role' and 'content' keys
|
|
66
|
-
"""
|
|
67
|
-
spans = self.span_store.get_all()
|
|
68
|
-
if not spans:
|
|
69
|
-
return []
|
|
70
|
-
|
|
71
|
-
messages = []
|
|
72
|
-
first_found = False
|
|
73
|
-
|
|
74
|
-
for span in sorted(spans, key=lambda s: getattr(s, "start_time", 0)):
|
|
75
|
-
span_attributes = span.attributes or {}
|
|
76
|
-
span_type = span_attributes.get(AttributeKeys.JUDGMENT_SPAN_KIND, "span")
|
|
77
|
-
|
|
78
|
-
if (
|
|
79
|
-
not span_attributes.get(AttributeKeys.JUDGMENT_OUTPUT)
|
|
80
|
-
and span_type != "llm"
|
|
81
|
-
):
|
|
82
|
-
continue
|
|
83
|
-
|
|
84
|
-
if span_type == "llm":
|
|
85
|
-
if not first_found and span_attributes.get(
|
|
86
|
-
AttributeKeys.JUDGMENT_INPUT
|
|
87
|
-
):
|
|
88
|
-
input_data: Any = span_attributes.get(
|
|
89
|
-
AttributeKeys.JUDGMENT_INPUT, {}
|
|
90
|
-
)
|
|
91
|
-
if isinstance(input_data, dict) and "messages" in input_data:
|
|
92
|
-
input_messages = input_data["messages"]
|
|
93
|
-
if input_messages:
|
|
94
|
-
first_found = True
|
|
95
|
-
for msg in input_messages:
|
|
96
|
-
if (
|
|
97
|
-
isinstance(msg, dict)
|
|
98
|
-
and "role" in msg
|
|
99
|
-
and "content" in msg
|
|
100
|
-
):
|
|
101
|
-
messages.append(
|
|
102
|
-
{"role": msg["role"], "content": msg["content"]}
|
|
103
|
-
)
|
|
104
|
-
|
|
105
|
-
# Add assistant response from span output
|
|
106
|
-
output = span_attributes.get(AttributeKeys.JUDGMENT_OUTPUT)
|
|
107
|
-
if output is not None:
|
|
108
|
-
content = str(output)
|
|
109
|
-
try:
|
|
110
|
-
parsed = json.loads(content)
|
|
111
|
-
if isinstance(parsed, dict) and "messages" in parsed:
|
|
112
|
-
# Extract the actual assistant message content
|
|
113
|
-
for msg in parsed["messages"]:
|
|
114
|
-
if (
|
|
115
|
-
isinstance(msg, dict)
|
|
116
|
-
and msg.get("role") == "assistant"
|
|
117
|
-
):
|
|
118
|
-
content = msg.get("content", content)
|
|
119
|
-
break
|
|
120
|
-
except (json.JSONDecodeError, KeyError):
|
|
121
|
-
pass
|
|
122
|
-
messages.append({"role": "assistant", "content": content})
|
|
123
|
-
|
|
124
|
-
elif span_type == "user":
|
|
125
|
-
output = span_attributes.get(AttributeKeys.JUDGMENT_OUTPUT)
|
|
126
|
-
if output is not None:
|
|
127
|
-
content = str(output)
|
|
128
|
-
try:
|
|
129
|
-
parsed = json.loads(content)
|
|
130
|
-
if isinstance(parsed, dict) and "messages" in parsed:
|
|
131
|
-
for msg in parsed["messages"]:
|
|
132
|
-
if isinstance(msg, dict) and msg.get("role") == "user":
|
|
133
|
-
content = msg.get("content", content)
|
|
134
|
-
break
|
|
135
|
-
except (json.JSONDecodeError, KeyError):
|
|
136
|
-
pass
|
|
137
|
-
messages.append({"role": "user", "content": content})
|
|
19
|
+
This factory creates and returns provider-specific trainer implementations
|
|
20
|
+
(FireworksTrainer, VerifiersTrainer, etc.) based on the configured RFT provider.
|
|
138
21
|
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
if output is not None:
|
|
142
|
-
content = str(output)
|
|
143
|
-
try:
|
|
144
|
-
parsed = json.loads(content)
|
|
145
|
-
if isinstance(parsed, dict) and "messages" in parsed:
|
|
146
|
-
for msg in parsed["messages"]:
|
|
147
|
-
if isinstance(msg, dict) and msg.get("role") == "user":
|
|
148
|
-
content = msg.get("content", content)
|
|
149
|
-
break
|
|
150
|
-
except (json.JSONDecodeError, KeyError):
|
|
151
|
-
pass
|
|
152
|
-
messages.append({"role": "user", "content": content})
|
|
22
|
+
The factory pattern allows for easy extension to support multiple training
|
|
23
|
+
providers without changing the client-facing API.
|
|
153
24
|
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
prompts: List[Any],
|
|
161
|
-
num_prompts_per_step: Optional[int] = None,
|
|
162
|
-
num_generations_per_prompt: Optional[int] = None,
|
|
163
|
-
concurrency: Optional[int] = None,
|
|
164
|
-
):
|
|
165
|
-
"""
|
|
166
|
-
Generate rollouts and compute rewards using the current model snapshot.
|
|
167
|
-
Each sample contains multiple generations for reinforcement learning optimization.
|
|
168
|
-
|
|
169
|
-
Args:
|
|
170
|
-
agent_function: Function/agent to call for generating responses
|
|
171
|
-
scorers: List of scorer objects to evaluate responses
|
|
172
|
-
prompts: List of prompts to use for training
|
|
173
|
-
num_prompts_per_step: Number of prompts to use per step (defaults to config value, limited by prompts list length)
|
|
174
|
-
num_generations_per_prompt: Generations per prompt (defaults to config value)
|
|
175
|
-
concurrency: Concurrency limit (defaults to config value)
|
|
176
|
-
|
|
177
|
-
Returns:
|
|
178
|
-
List of dataset rows containing samples with messages and evaluations
|
|
179
|
-
"""
|
|
180
|
-
num_prompts_per_step = min(
|
|
181
|
-
num_prompts_per_step or self.config.num_prompts_per_step, len(prompts)
|
|
182
|
-
)
|
|
183
|
-
num_generations_per_prompt = (
|
|
184
|
-
num_generations_per_prompt or self.config.num_generations_per_prompt
|
|
25
|
+
Example:
|
|
26
|
+
config = TrainerConfig(
|
|
27
|
+
deployment_id="my-deployment",
|
|
28
|
+
user_id="my-user",
|
|
29
|
+
model_id="my-model",
|
|
30
|
+
rft_provider="fireworks" # or "verifiers" in the future
|
|
185
31
|
)
|
|
186
|
-
concurrency = concurrency or self.config.concurrency
|
|
187
|
-
|
|
188
|
-
semaphore = asyncio.Semaphore(concurrency)
|
|
189
|
-
|
|
190
|
-
@self.tracer.observe(span_type="function")
|
|
191
|
-
async def generate_single_response(prompt_id, generation_id):
|
|
192
|
-
async with semaphore:
|
|
193
|
-
prompt_input = prompts[prompt_id]
|
|
194
|
-
response_data = await agent_function(**prompt_input)
|
|
195
|
-
messages = response_data.get("messages", [])
|
|
196
|
-
|
|
197
|
-
try:
|
|
198
|
-
traced_messages = self._extract_message_history_from_spans()
|
|
199
|
-
if traced_messages:
|
|
200
|
-
messages = traced_messages
|
|
201
|
-
except Exception as e:
|
|
202
|
-
print(f"Warning: Failed to get message history from trace: {e}")
|
|
203
|
-
pass
|
|
204
|
-
|
|
205
|
-
finally:
|
|
206
|
-
self.span_store.spans = []
|
|
207
|
-
|
|
208
|
-
example = Example(
|
|
209
|
-
input=prompt_input,
|
|
210
|
-
messages=messages,
|
|
211
|
-
actual_output=response_data,
|
|
212
|
-
)
|
|
213
|
-
|
|
214
|
-
scoring_results = self.judgment_client.run_evaluation(
|
|
215
|
-
examples=[example],
|
|
216
|
-
scorers=scorers,
|
|
217
|
-
project_name=self.project_name,
|
|
218
|
-
eval_run_name=f"training_step_{self.trainable_model.current_step}_prompt_{prompt_id}_gen_{generation_id}",
|
|
219
|
-
)
|
|
220
|
-
|
|
221
|
-
if scoring_results and scoring_results[0].scorers_data:
|
|
222
|
-
scores = [
|
|
223
|
-
scorer_data.score
|
|
224
|
-
for scorer_data in scoring_results[0].scorers_data
|
|
225
|
-
if scorer_data.score is not None
|
|
226
|
-
]
|
|
227
|
-
reward = sum(scores) / len(scores) if scores else 0.0
|
|
228
|
-
else:
|
|
229
|
-
reward = 0.0
|
|
230
|
-
|
|
231
|
-
return {
|
|
232
|
-
"prompt_id": prompt_id,
|
|
233
|
-
"generation_id": generation_id,
|
|
234
|
-
"messages": messages,
|
|
235
|
-
"evals": {"score": reward},
|
|
236
|
-
}
|
|
237
|
-
|
|
238
|
-
coros = []
|
|
239
|
-
for prompt_id in range(num_prompts_per_step):
|
|
240
|
-
for generation_id in range(num_generations_per_prompt):
|
|
241
|
-
coro = generate_single_response(prompt_id, generation_id)
|
|
242
|
-
coros.append(coro)
|
|
243
32
|
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
33
|
+
# User creates and configures the trainable model
|
|
34
|
+
trainable_model = TrainableModel(config)
|
|
35
|
+
tracer = Tracer()
|
|
247
36
|
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
results.append(result)
|
|
251
|
-
num_completed += 1
|
|
37
|
+
# JudgmentTrainer automatically creates the appropriate provider-specific trainer
|
|
38
|
+
trainer = JudgmentTrainer(config, trainable_model, tracer)
|
|
252
39
|
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
dataset_rows = []
|
|
256
|
-
for prompt_id in range(num_prompts_per_step):
|
|
257
|
-
prompt_generations = [r for r in results if r["prompt_id"] == prompt_id]
|
|
258
|
-
sample_generations = [
|
|
259
|
-
{"messages": gen["messages"], "evals": gen["evals"]}
|
|
260
|
-
for gen in prompt_generations
|
|
261
|
-
]
|
|
262
|
-
dataset_rows.append({"samples": sample_generations})
|
|
263
|
-
|
|
264
|
-
return dataset_rows
|
|
265
|
-
|
|
266
|
-
async def run_reinforcement_learning(
|
|
267
|
-
self,
|
|
268
|
-
agent_function: Callable[[Any], Any],
|
|
269
|
-
scorers: List[Union[ExampleAPIScorerConfig, ExampleScorer]],
|
|
270
|
-
prompts: List[Any],
|
|
271
|
-
) -> ModelConfig:
|
|
272
|
-
"""
|
|
273
|
-
Run the iterative reinforcement learning fine-tuning loop.
|
|
274
|
-
|
|
275
|
-
This method performs multiple steps of reinforcement learning, where each step:
|
|
276
|
-
1. Advances to the appropriate model snapshot
|
|
277
|
-
2. Generates rollouts and computes rewards using scorers
|
|
278
|
-
3. Trains a new model using reinforcement learning
|
|
279
|
-
4. Waits for training completion
|
|
40
|
+
# The returned trainer implements the BaseTrainer interface
|
|
41
|
+
model_config = await trainer.train(agent_function, scorers, prompts)
|
|
280
42
|
|
|
281
43
|
Args:
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
44
|
+
config: TrainerConfig instance with training parameters including rft_provider
|
|
45
|
+
trainable_model: Provider-specific trainable model instance (e.g., TrainableModel for Fireworks)
|
|
46
|
+
tracer: Tracer for observability
|
|
47
|
+
project_name: Project name for organizing training runs and evaluations
|
|
285
48
|
|
|
286
49
|
Returns:
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
_print_progress("Starting reinforcement learning training")
|
|
291
|
-
|
|
292
|
-
training_params = {
|
|
293
|
-
"num_steps": self.config.num_steps,
|
|
294
|
-
"num_prompts_per_step": self.config.num_prompts_per_step,
|
|
295
|
-
"num_generations_per_prompt": self.config.num_generations_per_prompt,
|
|
296
|
-
"epochs": self.config.epochs,
|
|
297
|
-
"learning_rate": self.config.learning_rate,
|
|
298
|
-
"accelerator_count": self.config.accelerator_count,
|
|
299
|
-
"accelerator_type": self.config.accelerator_type,
|
|
300
|
-
"temperature": self.config.temperature,
|
|
301
|
-
"max_tokens": self.config.max_tokens,
|
|
302
|
-
}
|
|
303
|
-
|
|
304
|
-
start_step = self.trainable_model.current_step
|
|
305
|
-
|
|
306
|
-
for step in range(start_step, self.config.num_steps):
|
|
307
|
-
step_num = step + 1
|
|
308
|
-
_print_progress(
|
|
309
|
-
f"Starting training step {step_num}", step_num, self.config.num_steps
|
|
310
|
-
)
|
|
311
|
-
|
|
312
|
-
self.trainable_model.advance_to_next_step(step)
|
|
313
|
-
|
|
314
|
-
dataset_rows = await self.generate_rollouts_and_rewards(
|
|
315
|
-
agent_function, scorers, prompts
|
|
316
|
-
)
|
|
317
|
-
|
|
318
|
-
with _spinner_progress(
|
|
319
|
-
"Preparing training dataset", step_num, self.config.num_steps
|
|
320
|
-
):
|
|
321
|
-
dataset = Dataset.from_list(dataset_rows)
|
|
322
|
-
dataset.sync()
|
|
323
|
-
|
|
324
|
-
_print_progress(
|
|
325
|
-
"Starting reinforcement training", step_num, self.config.num_steps
|
|
326
|
-
)
|
|
327
|
-
job = self.trainable_model.perform_reinforcement_step(dataset, step)
|
|
328
|
-
|
|
329
|
-
last_state = None
|
|
330
|
-
with _spinner_progress(
|
|
331
|
-
"Training job in progress", step_num, self.config.num_steps
|
|
332
|
-
):
|
|
333
|
-
while not job.is_completed:
|
|
334
|
-
job.raise_if_bad_state()
|
|
335
|
-
current_state = job.state
|
|
336
|
-
|
|
337
|
-
if current_state != last_state:
|
|
338
|
-
if current_state in ["uploading", "validating"]:
|
|
339
|
-
_print_progress_update(
|
|
340
|
-
f"Training job: {current_state} data"
|
|
341
|
-
)
|
|
342
|
-
elif current_state == "training":
|
|
343
|
-
_print_progress_update(
|
|
344
|
-
"Training job: model training in progress"
|
|
345
|
-
)
|
|
346
|
-
else:
|
|
347
|
-
_print_progress_update(f"Training job: {current_state}")
|
|
348
|
-
last_state = current_state
|
|
349
|
-
|
|
350
|
-
time.sleep(10)
|
|
351
|
-
job = job.get()
|
|
352
|
-
if job is None:
|
|
353
|
-
raise JudgmentRuntimeError(
|
|
354
|
-
"Training job was deleted while waiting for completion"
|
|
355
|
-
)
|
|
50
|
+
Provider-specific trainer instance (FireworksTrainer, etc.) that implements
|
|
51
|
+
the BaseTrainer interface
|
|
356
52
|
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
scorers: List[Union[ExampleAPIScorerConfig, ExampleScorer]],
|
|
376
|
-
prompts: List[Any],
|
|
377
|
-
rft_provider: Optional[str] = None,
|
|
378
|
-
) -> ModelConfig:
|
|
379
|
-
"""
|
|
380
|
-
Start the reinforcement learning fine-tuning process.
|
|
381
|
-
|
|
382
|
-
This is the main entry point for running the reinforcement learning training.
|
|
383
|
-
|
|
384
|
-
Args:
|
|
385
|
-
agent_function: Function/agent to call for generating responses.
|
|
386
|
-
scorers: List of scorer objects to evaluate responses
|
|
387
|
-
prompts: List of prompts to use for training
|
|
388
|
-
rft_provider: RFT provider to use for training. Currently only "fireworks" is supported.
|
|
389
|
-
Support for other providers is planned for future releases.
|
|
390
|
-
|
|
391
|
-
Returns:
|
|
392
|
-
ModelConfig: Configuration of the trained model for future loading
|
|
393
|
-
"""
|
|
394
|
-
try:
|
|
395
|
-
if rft_provider is not None:
|
|
396
|
-
self.config.rft_provider = rft_provider
|
|
397
|
-
|
|
398
|
-
return await self.run_reinforcement_learning(
|
|
399
|
-
agent_function, scorers, prompts
|
|
400
|
-
)
|
|
401
|
-
except JudgmentRuntimeError:
|
|
402
|
-
# Re-raise JudgmentAPIError as-is
|
|
403
|
-
raise
|
|
404
|
-
except Exception as e:
|
|
405
|
-
raise JudgmentRuntimeError(f"Training process failed: {str(e)}") from e
|
|
53
|
+
Raises:
|
|
54
|
+
JudgmentRuntimeError: If the specified provider is not supported
|
|
55
|
+
"""
|
|
56
|
+
provider = config.rft_provider.lower()
|
|
57
|
+
|
|
58
|
+
if provider == "fireworks":
|
|
59
|
+
return FireworksTrainer(config, trainable_model, tracer, project_name)
|
|
60
|
+
elif provider == "verifiers":
|
|
61
|
+
# Placeholder for future implementation
|
|
62
|
+
raise JudgmentRuntimeError(
|
|
63
|
+
"Verifiers provider is not yet implemented. "
|
|
64
|
+
"Currently supported providers: 'fireworks'"
|
|
65
|
+
)
|
|
66
|
+
else:
|
|
67
|
+
raise JudgmentRuntimeError(
|
|
68
|
+
f"Unsupported RFT provider: '{config.rft_provider}'. "
|
|
69
|
+
f"Currently supported providers: 'fireworks'"
|
|
70
|
+
)
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
from judgeval.utils.decorators.dont_throw import dont_throw
|
|
2
|
+
import functools
|
|
3
|
+
from judgeval.api import JudgmentSyncClient
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
@dont_throw
|
|
7
|
+
@functools.lru_cache(maxsize=64)
|
|
8
|
+
def _resolve_project_id(project_name: str, api_key: str, organization_id: str) -> str:
|
|
9
|
+
"""Resolve project_id from project_name using the API."""
|
|
10
|
+
client = JudgmentSyncClient(
|
|
11
|
+
api_key=api_key,
|
|
12
|
+
organization_id=organization_id,
|
|
13
|
+
)
|
|
14
|
+
response = client.projects_resolve({"project_name": project_name})
|
|
15
|
+
return response["project_id"]
|
judgeval/version.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: judgeval
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.18.0
|
|
4
4
|
Summary: Judgeval Package
|
|
5
5
|
Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
|
|
6
6
|
Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
|
|
@@ -63,8 +63,7 @@ Judgeval's agent monitoring infra provides a simple harness for integrating GRPO
|
|
|
63
63
|
await trainer.train(
|
|
64
64
|
agent_function=your_agent_function, # entry point to your agent
|
|
65
65
|
scorers=[RewardScorer()], # Custom scorer you define based on task criteria, acts as reward
|
|
66
|
-
prompts=training_prompts
|
|
67
|
-
rft_provider="fireworks"
|
|
66
|
+
prompts=training_prompts # Tasks
|
|
68
67
|
)
|
|
69
68
|
```
|
|
70
69
|
|
|
@@ -1,17 +1,17 @@
|
|
|
1
|
-
judgeval/__init__.py,sha256=
|
|
2
|
-
judgeval/cli.py,sha256=
|
|
1
|
+
judgeval/__init__.py,sha256=74WyDtb9SisWwYZ8juQSUJpa6c0KTI6zzkiTX7Wvalc,6601
|
|
2
|
+
judgeval/cli.py,sha256=bkwsDqX0sdfChLxm9aTLAIw0sPYv-fUbjmaFeBgPgk8,1803
|
|
3
3
|
judgeval/constants.py,sha256=JZZJ1MqzZZDVk-5PRPRbmLnM8mXI-RDL5vxa1JFuscs,3408
|
|
4
4
|
judgeval/env.py,sha256=37Mn4g0OkpFxXCZGlO_CLqKJnyX-jx_R24tC28XJzig,2112
|
|
5
5
|
judgeval/exceptions.py,sha256=tTbfe4yoOtPXmn22UQz9-6a-5PT9uOko85xaRRwr0Sw,621
|
|
6
6
|
judgeval/logger.py,sha256=VP5blbsJ53mvJbNHfBf5p2KrARUrkrErpPkB-__Hh3U,1562
|
|
7
|
-
judgeval/version.py,sha256=
|
|
7
|
+
judgeval/version.py,sha256=CybtPmbwRv_x6bsmmn5cZhdYjBHKkklFsk3eOsP-fMs,74
|
|
8
8
|
judgeval/warnings.py,sha256=LbGte14ppiFjrkp-JJYueZ40NWFvMkWRvPXr6r-fUWw,73
|
|
9
|
-
judgeval/api/__init__.py,sha256=
|
|
10
|
-
judgeval/api/api_types.py,sha256=
|
|
9
|
+
judgeval/api/__init__.py,sha256=dGZm9KtgLMnmbiyDEJ_D7suuVqmsibR_Cd0YZRJ7qHI,15210
|
|
10
|
+
judgeval/api/api_types.py,sha256=PvwRVxP0_vCXg_ii7jo4SzbB_kbZcL8tiVnX7qotJA8,9878
|
|
11
11
|
judgeval/data/__init__.py,sha256=1tU0EN0ThIfQ1fad5I3dKxAfTcZ5U8cvTLcQ6qLVLU0,407
|
|
12
12
|
judgeval/data/evaluation_run.py,sha256=O41p99wNAuCAf6lsLNKzkZ6W-kL9LlzCYxVls7IcKkA,4727
|
|
13
13
|
judgeval/data/example.py,sha256=eGJpF-lyUH734Cg90B7WtU9f8iKoS3VFGeV6R-GVCCc,1039
|
|
14
|
-
judgeval/data/judgment_types.py,sha256=
|
|
14
|
+
judgeval/data/judgment_types.py,sha256=_LUqYW-fXQcEfa1RQzqTNETnqdNQQ3eH21qBcfJnObU,18542
|
|
15
15
|
judgeval/data/result.py,sha256=XufFGSAkBDfevPUmzSgsR9HEqytISkM0U5HkhJmsjpY,2102
|
|
16
16
|
judgeval/data/scorer_data.py,sha256=HeP15ZgftFTJCF8JmDJCLWXRnZJIaGDJCzl7Hg6gWwE,2006
|
|
17
17
|
judgeval/data/trace.py,sha256=zSiR3o6xt8Z46XA3M9fJBtViF0BsPO6yKp9jxdscOSc,3881
|
|
@@ -26,6 +26,7 @@ judgeval/judges/base_judge.py,sha256=_dz0qWsKRxzXxpRY9l6mrxTRYPSF2FE4ZXkrzhZ4gbY
|
|
|
26
26
|
judgeval/judges/litellm_judge.py,sha256=5vEF0IUo7HVWnOF2ww-DMke8Xkarnz32B_qbgKjc0-I,4182
|
|
27
27
|
judgeval/judges/together_judge.py,sha256=GzwlXZJzle8hT-vWKmq39JyIeanJqJfHDOkrksUbzk0,4398
|
|
28
28
|
judgeval/judges/utils.py,sha256=ITbYwvjU3o9-FIAReFvxh24yJrx9LV3l9BnSBgKUpxg,2068
|
|
29
|
+
judgeval/prompts/prompt.py,sha256=N6G7ncVsmeXgTXzYNDrMw2NESzBJjSKvp4h-BACpX_8,10220
|
|
29
30
|
judgeval/scorers/__init__.py,sha256=pomKzEy4YNFyygYp8vbS3co8iB5CMstRkQwdUgi1u4g,744
|
|
30
31
|
judgeval/scorers/agent_scorer.py,sha256=-qcNSkY6i7ur2LXkM7H1jTKuuFbDuXbjTq42o3vjeQ8,595
|
|
31
32
|
judgeval/scorers/api_scorer.py,sha256=jPBQUBs_T3Xq33QoIbIXDzUaXinz56qeDfo96dfdX0g,2036
|
|
@@ -40,15 +41,15 @@ judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py,sha256=WUeFy
|
|
|
40
41
|
judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py,sha256=ciiFBQQC4UDsk9qou9OiKbAR31s82eRUY1ZTt1gdM-0,407
|
|
41
42
|
judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py,sha256=ucYOI6ztAjfoYmcgTDzN8u5RrehlVqrkeLEfss9b1fk,441
|
|
42
43
|
judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py,sha256=V3RdrWhnR_vLBrtWw7QbgN9K_A-Och7-v9I2fN4z8gY,506
|
|
43
|
-
judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py,sha256=
|
|
44
|
-
judgeval/tracer/__init__.py,sha256=
|
|
44
|
+
judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py,sha256=YdGr2bO5miAtF7fDn2t488RFRi0oYmycqkvm69qCrWs,10754
|
|
45
|
+
judgeval/tracer/__init__.py,sha256=U5RRAYuL_vg1SlWuaaufnobdoLS-J8ovM3FedP_vny4,36398
|
|
45
46
|
judgeval/tracer/constants.py,sha256=ae8tivAW97awJQxdRB9OMqX50wOLX3zqChT_AGkPBu0,85
|
|
46
|
-
judgeval/tracer/keys.py,sha256=
|
|
47
|
+
judgeval/tracer/keys.py,sha256=G2Qgb5ZlFsZvXPMylh-OLhHSnWYQ23g0GdtY9n9XuoE,2280
|
|
47
48
|
judgeval/tracer/managers.py,sha256=NEkovnK8Qaod1U5msT0_hyHUqo9C2uFro2IzNlC8jCo,5071
|
|
48
49
|
judgeval/tracer/utils.py,sha256=xWha5iwC733wCf2HKbNqzxOPS1ovO1OymWIUFLz-UpQ,537
|
|
49
50
|
judgeval/tracer/exporters/__init__.py,sha256=3WDXC28iY5gYMM5s7ejmy7P-DVDQ_iIuzwovZxUKJXg,1295
|
|
50
51
|
judgeval/tracer/exporters/s3.py,sha256=N9gmw17cnR0VkfAQQkLsNj5BksgNRETThR5qYhWRjP4,4360
|
|
51
|
-
judgeval/tracer/exporters/store.py,sha256=
|
|
52
|
+
judgeval/tracer/exporters/store.py,sha256=pA_KINcm0amO0WEDYmMFU05SSsMOgJ5ogIRaevSX1sk,1885
|
|
52
53
|
judgeval/tracer/exporters/utils.py,sha256=JRcoSQuEHxMDJbXfyrUIfA2SHBVkZM82h4bTbYGxkNw,1154
|
|
53
54
|
judgeval/tracer/llm/__init__.py,sha256=ENxApieKSktYrIviofXWP9GU0WnhBm0Q9mlGe_m_gMY,139
|
|
54
55
|
judgeval/tracer/llm/config.py,sha256=J8-bTL82bgDqdTJPN-Px3Epvoa9FG7L-X329kitwBTc,2525
|
|
@@ -56,33 +57,36 @@ judgeval/tracer/llm/constants.py,sha256=IWa3CMes8wIt_UG7jrGEOztg2sHz54fdOMWIOOr-
|
|
|
56
57
|
judgeval/tracer/llm/providers.py,sha256=VAimkmChOOjhC1cUv-0iG8pa5PhOw1HIOyt3zrIrbcM,628
|
|
57
58
|
judgeval/tracer/llm/llm_anthropic/__init__.py,sha256=HG0gIlTgaRt-Y0u1ERPQ19pUgb4YHkTh7tZQPeyR4oM,80
|
|
58
59
|
judgeval/tracer/llm/llm_anthropic/config.py,sha256=ICfKODPQvZsRxpK4xWQ-YE79pmWJTmY2wryddxpNdpM,153
|
|
59
|
-
judgeval/tracer/llm/llm_anthropic/messages.py,sha256=
|
|
60
|
-
judgeval/tracer/llm/llm_anthropic/messages_stream.py,sha256=
|
|
60
|
+
judgeval/tracer/llm/llm_anthropic/messages.py,sha256=U11364nrTt6M58K218uj8AxGPrNwzJ4idhEmZQtFuik,15152
|
|
61
|
+
judgeval/tracer/llm/llm_anthropic/messages_stream.py,sha256=ZhHigQujU-zHhklgwSVoQYtSsL_7yC5Rwpq9vozekMc,12140
|
|
61
62
|
judgeval/tracer/llm/llm_anthropic/wrapper.py,sha256=JILcyC4NvjXZSqlFoZp-VB-JsCYZkQPMFEYaB4AysrA,1849
|
|
62
63
|
judgeval/tracer/llm/llm_google/__init__.py,sha256=otBZETsAfVZjtZaN5N36Ln0kw-I9jVB4tFGrV6novHo,74
|
|
63
64
|
judgeval/tracer/llm/llm_google/config.py,sha256=S3yCAE9oHbXjLVYiz5mGD16yIgXMBBUu5UN4lBjoCNQ,162
|
|
64
|
-
judgeval/tracer/llm/llm_google/generate_content.py,sha256=
|
|
65
|
+
judgeval/tracer/llm/llm_google/generate_content.py,sha256=meLWeoZ7J2JtSkpt2Lt8qapYi_mxv0204cXWaFZ0FKs,3973
|
|
65
66
|
judgeval/tracer/llm/llm_google/wrapper.py,sha256=jqaMXGoM9dlPBbCFadMI5EqFrNHzBt0h9VkNn7KPVLk,901
|
|
66
67
|
judgeval/tracer/llm/llm_openai/__init__.py,sha256=CyzwhY0-zmqWKlEno7JPBcvO7G_hI8dp6-_5_KEzFqg,74
|
|
67
|
-
judgeval/tracer/llm/llm_openai/beta_chat_completions.py,sha256=
|
|
68
|
-
judgeval/tracer/llm/llm_openai/chat_completions.py,sha256=
|
|
68
|
+
judgeval/tracer/llm/llm_openai/beta_chat_completions.py,sha256=KwetlVexleDSSyRBEezC7Fk5do1Vub3FwLbRhCPgktc,6490
|
|
69
|
+
judgeval/tracer/llm/llm_openai/chat_completions.py,sha256=NWPE_BQTGfTRfsqhzXtNlQAv1Cr2GymolrTGzIbr9Ok,15625
|
|
69
70
|
judgeval/tracer/llm/llm_openai/config.py,sha256=NE0ixKhd4WVeAVjY8jNTncuKYH6R4MQDLPmcCsd3zWY,144
|
|
70
|
-
judgeval/tracer/llm/llm_openai/responses.py,sha256=
|
|
71
|
+
judgeval/tracer/llm/llm_openai/responses.py,sha256=lhs4yS-rJU255vo5gsJkGFRloYurlfnXIkstjMwR3vA,15875
|
|
71
72
|
judgeval/tracer/llm/llm_openai/wrapper.py,sha256=Z5Ndib228yd1pXEQ4xIu7_CJHxpW_t0ofZAC6FLc5eU,2055
|
|
72
73
|
judgeval/tracer/llm/llm_together/__init__.py,sha256=MEnsF77IgFD4h73hNCMpo-9a1PHHdm-OxPlOalXOMac,78
|
|
73
|
-
judgeval/tracer/llm/llm_together/chat_completions.py,sha256=
|
|
74
|
+
judgeval/tracer/llm/llm_together/chat_completions.py,sha256=KC8sk40l1VDuFStuVGIV1GLLx3vrtPDk5Y2vJsnRe70,14156
|
|
74
75
|
judgeval/tracer/llm/llm_together/config.py,sha256=jCJY0KQcHJZZJk2vq038GKIDUMusqgvRjQ0B6OV5uEc,150
|
|
75
76
|
judgeval/tracer/llm/llm_together/wrapper.py,sha256=HFqy_MabQeSq8oj2diZhEuk1SDt_hDfk5MFdPn9MFhg,1733
|
|
76
77
|
judgeval/tracer/processors/__init__.py,sha256=BdOOPOD1RfMI5YHW76DNPKR07EAev-JxoolZ3KaXNNU,7100
|
|
77
|
-
judgeval/trainer/__init__.py,sha256=
|
|
78
|
-
judgeval/trainer/
|
|
78
|
+
judgeval/trainer/__init__.py,sha256=nJo913vFdss3E_PR-M1OUjznS0SYgNZ-MP-Y_6Mj5PA,437
|
|
79
|
+
judgeval/trainer/base_trainer.py,sha256=Lxm6OxJpifonLKofNIRG3TU7n_jZWQZ0I_f_jwtb_WU,4018
|
|
80
|
+
judgeval/trainer/config.py,sha256=7ZSwr6p7vq0MRadh9axm6XB-RAotdWqULZ5yDl0xGbQ,4340
|
|
79
81
|
judgeval/trainer/console.py,sha256=SvokkFEU-K1vLV4Rd1m6YJJ7HyYwTr4Azdzwx_JPZUY,4351
|
|
82
|
+
judgeval/trainer/fireworks_trainer.py,sha256=_B-fWovdhIpxh1RbXU0W5BlFGc9ZzuYtFw7CBtKTRO8,16074
|
|
80
83
|
judgeval/trainer/trainable_model.py,sha256=T-Sioi_sXtfYlcu3lE0cd60PHs8DrYaZ-Kxb4h1nU04,8993
|
|
81
|
-
judgeval/trainer/trainer.py,sha256=
|
|
84
|
+
judgeval/trainer/trainer.py,sha256=twLEHNaomelTg6ZYG6veI9OpB3wzhPCtPVQMTnDZWx4,2626
|
|
82
85
|
judgeval/utils/async_utils.py,sha256=AF1xdu8Ao5GyhFvfaLOaKJHn1RISyXZ4U70UZe9zfBA,1083
|
|
83
86
|
judgeval/utils/file_utils.py,sha256=vq-n5WZEZjVbZ5S9QTkW8nSH6Pvw-Jx0ttsQ1t0wnPQ,3140
|
|
84
87
|
judgeval/utils/guards.py,sha256=QBb6m6KElxdvt2bskLZCKh_zGHbBcqV-VfGzT63o3hY,807
|
|
85
88
|
judgeval/utils/meta.py,sha256=RAqZuvOlymqMwFoS0joBW_r65lcN9bY8BpNYHoytKps,773
|
|
89
|
+
judgeval/utils/project.py,sha256=kGpYmp6QGTD6h-GjQ-ovT7kBmGnyb99MWDJmRGFQHOg,527
|
|
86
90
|
judgeval/utils/serialize.py,sha256=QXR-8Nj5rqOrI9zLx0oRLdk6DW6Bc7j8eyF4zQ7PLxA,6256
|
|
87
91
|
judgeval/utils/testing.py,sha256=m5Nexv65tmfSj1XvAPK5Ear7aJ7w5xjDtZN0tLZ_RBk,2939
|
|
88
92
|
judgeval/utils/url.py,sha256=Shf0v3XcbaWpL0m1eGJEEO_z4TsQCnDB2Rl25OTUmiI,195
|
|
@@ -100,8 +104,8 @@ judgeval/utils/wrappers/mutable_wrap_async.py,sha256=stHISOUCGFUJXY8seXmxUo4ZpMF
|
|
|
100
104
|
judgeval/utils/wrappers/mutable_wrap_sync.py,sha256=t5jygAQ1vqhy8s1GfiLeYygYgaLTgfoYASN47U5JiPs,2888
|
|
101
105
|
judgeval/utils/wrappers/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
102
106
|
judgeval/utils/wrappers/utils.py,sha256=j18vaa6JWDw2s3nQy1z5PfV_9Xxio-bVARaHG_0XyL0,1228
|
|
103
|
-
judgeval-0.
|
|
104
|
-
judgeval-0.
|
|
105
|
-
judgeval-0.
|
|
106
|
-
judgeval-0.
|
|
107
|
-
judgeval-0.
|
|
107
|
+
judgeval-0.18.0.dist-info/METADATA,sha256=rkPsc8z-trMM27wunxLLI_3CGJNb1UXjuByMomklKIU,11483
|
|
108
|
+
judgeval-0.18.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
109
|
+
judgeval-0.18.0.dist-info/entry_points.txt,sha256=-eoeD-oDLn4A7MSgeBS9Akwanf3_0r0cgEleBcIOjg0,46
|
|
110
|
+
judgeval-0.18.0.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
|
|
111
|
+
judgeval-0.18.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|