speedy-utils 1.1.22.tar.gz → 1.1.23.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71)
  1. {speedy_utils-1.1.22 → speedy_utils-1.1.23}/PKG-INFO +1 -1
  2. speedy_utils-1.1.23/notebooks/llm_utils/llm_as_a_judge.ipynb +642 -0
  3. {speedy_utils-1.1.22 → speedy_utils-1.1.23}/pyproject.toml +1 -1
  4. {speedy_utils-1.1.22 → speedy_utils-1.1.23}/src/llm_utils/__init__.py +11 -3
  5. {speedy_utils-1.1.22 → speedy_utils-1.1.23}/src/llm_utils/lm/__init__.py +10 -0
  6. speedy_utils-1.1.23/src/llm_utils/lm/llm_as_a_judge.py +390 -0
  7. speedy_utils-1.1.23/src/llm_utils/lm/signature.py +282 -0
  8. speedy_utils-1.1.22/src/llm_utils/lm/lm.py +0 -207
  9. {speedy_utils-1.1.22 → speedy_utils-1.1.23}/.github/copilot-instructions.md +0 -0
  10. {speedy_utils-1.1.22 → speedy_utils-1.1.23}/.github/workflows/publish.yml +0 -0
  11. {speedy_utils-1.1.22 → speedy_utils-1.1.23}/.gitignore +0 -0
  12. {speedy_utils-1.1.22 → speedy_utils-1.1.23}/.pre-commit-config.yaml +0 -0
  13. {speedy_utils-1.1.22 → speedy_utils-1.1.23}/README.md +0 -0
  14. {speedy_utils-1.1.22 → speedy_utils-1.1.23}/bumpversion.sh +0 -0
  15. {speedy_utils-1.1.22 → speedy_utils-1.1.23}/notebooks/test_multi_thread.ipynb +0 -0
  16. {speedy_utils-1.1.22 → speedy_utils-1.1.23}/ruff.toml +0 -0
  17. {speedy_utils-1.1.22 → speedy_utils-1.1.23}/scripts/deploy.sh +0 -0
  18. {speedy_utils-1.1.22 → speedy_utils-1.1.23}/setup.cfg +0 -0
  19. {speedy_utils-1.1.22 → speedy_utils-1.1.23}/src/llm_utils/chat_format/__init__.py +0 -0
  20. {speedy_utils-1.1.22 → speedy_utils-1.1.23}/src/llm_utils/chat_format/display.py +0 -0
  21. {speedy_utils-1.1.22 → speedy_utils-1.1.23}/src/llm_utils/chat_format/transform.py +0 -0
  22. {speedy_utils-1.1.22 → speedy_utils-1.1.23}/src/llm_utils/chat_format/utils.py +0 -0
  23. {speedy_utils-1.1.22 → speedy_utils-1.1.23}/src/llm_utils/group_messages.py +0 -0
  24. {speedy_utils-1.1.22 → speedy_utils-1.1.23}/src/llm_utils/lm/async_lm/__init__.py +0 -0
  25. {speedy_utils-1.1.22 → speedy_utils-1.1.23}/src/llm_utils/lm/async_lm/_utils.py +0 -0
  26. {speedy_utils-1.1.22 → speedy_utils-1.1.23}/src/llm_utils/lm/async_lm/async_llm_task.py +0 -0
  27. {speedy_utils-1.1.22 → speedy_utils-1.1.23}/src/llm_utils/lm/async_lm/async_lm.py +0 -0
  28. {speedy_utils-1.1.22 → speedy_utils-1.1.23}/src/llm_utils/lm/async_lm/async_lm_base.py +0 -0
  29. {speedy_utils-1.1.22 → speedy_utils-1.1.23}/src/llm_utils/lm/async_lm/lm_specific.py +0 -0
  30. {speedy_utils-1.1.22 → speedy_utils-1.1.23}/src/llm_utils/lm/base_prompt_builder.py +0 -0
  31. {speedy_utils-1.1.22 → speedy_utils-1.1.23}/src/llm_utils/lm/llm_task.py +0 -0
  32. {speedy_utils-1.1.22 → speedy_utils-1.1.23}/src/llm_utils/lm/lm_base.py +0 -0
  33. {speedy_utils-1.1.22 → speedy_utils-1.1.23}/src/llm_utils/lm/openai_memoize.py +0 -0
  34. {speedy_utils-1.1.22 → speedy_utils-1.1.23}/src/llm_utils/lm/utils.py +0 -0
  35. {speedy_utils-1.1.22 → speedy_utils-1.1.23}/src/llm_utils/scripts/README.md +0 -0
  36. {speedy_utils-1.1.22 → speedy_utils-1.1.23}/src/llm_utils/scripts/vllm_load_balancer.py +0 -0
  37. {speedy_utils-1.1.22 → speedy_utils-1.1.23}/src/llm_utils/scripts/vllm_serve.py +0 -0
  38. {speedy_utils-1.1.22 → speedy_utils-1.1.23}/src/llm_utils/vector_cache/__init__.py +0 -0
  39. {speedy_utils-1.1.22 → speedy_utils-1.1.23}/src/llm_utils/vector_cache/cli.py +0 -0
  40. {speedy_utils-1.1.22 → speedy_utils-1.1.23}/src/llm_utils/vector_cache/core.py +0 -0
  41. {speedy_utils-1.1.22 → speedy_utils-1.1.23}/src/llm_utils/vector_cache/types.py +0 -0
  42. {speedy_utils-1.1.22 → speedy_utils-1.1.23}/src/llm_utils/vector_cache/utils.py +0 -0
  43. {speedy_utils-1.1.22 → speedy_utils-1.1.23}/src/speedy_utils/__init__.py +0 -0
  44. {speedy_utils-1.1.22 → speedy_utils-1.1.23}/src/speedy_utils/all.py +0 -0
  45. {speedy_utils-1.1.22 → speedy_utils-1.1.23}/src/speedy_utils/common/__init__.py +0 -0
  46. {speedy_utils-1.1.22 → speedy_utils-1.1.23}/src/speedy_utils/common/clock.py +0 -0
  47. {speedy_utils-1.1.22 → speedy_utils-1.1.23}/src/speedy_utils/common/function_decorator.py +0 -0
  48. {speedy_utils-1.1.22 → speedy_utils-1.1.23}/src/speedy_utils/common/logger.py +0 -0
  49. {speedy_utils-1.1.22 → speedy_utils-1.1.23}/src/speedy_utils/common/notebook_utils.py +0 -0
  50. {speedy_utils-1.1.22 → speedy_utils-1.1.23}/src/speedy_utils/common/patcher.py +0 -0
  51. {speedy_utils-1.1.22 → speedy_utils-1.1.23}/src/speedy_utils/common/report_manager.py +0 -0
  52. {speedy_utils-1.1.22 → speedy_utils-1.1.23}/src/speedy_utils/common/utils_cache.py +0 -0
  53. {speedy_utils-1.1.22 → speedy_utils-1.1.23}/src/speedy_utils/common/utils_io.py +0 -0
  54. {speedy_utils-1.1.22 → speedy_utils-1.1.23}/src/speedy_utils/common/utils_misc.py +0 -0
  55. {speedy_utils-1.1.22 → speedy_utils-1.1.23}/src/speedy_utils/common/utils_print.py +0 -0
  56. {speedy_utils-1.1.22 → speedy_utils-1.1.23}/src/speedy_utils/multi_worker/__init__.py +0 -0
  57. {speedy_utils-1.1.22 → speedy_utils-1.1.23}/src/speedy_utils/multi_worker/process.py +0 -0
  58. {speedy_utils-1.1.22 → speedy_utils-1.1.23}/src/speedy_utils/multi_worker/thread.py +0 -0
  59. {speedy_utils-1.1.22 → speedy_utils-1.1.23}/src/speedy_utils/scripts/__init__.py +0 -0
  60. {speedy_utils-1.1.22 → speedy_utils-1.1.23}/src/speedy_utils/scripts/mpython.py +0 -0
  61. {speedy_utils-1.1.22 → speedy_utils-1.1.23}/src/speedy_utils/scripts/openapi_client_codegen.py +0 -0
  62. {speedy_utils-1.1.22 → speedy_utils-1.1.23}/tests/sample_objects.py +0 -0
  63. {speedy_utils-1.1.22 → speedy_utils-1.1.23}/tests/test.py +0 -0
  64. {speedy_utils-1.1.22 → speedy_utils-1.1.23}/tests/test_logger.py +0 -0
  65. {speedy_utils-1.1.22 → speedy_utils-1.1.23}/tests/test_logger_format.py +0 -0
  66. {speedy_utils-1.1.22 → speedy_utils-1.1.23}/tests/test_memoize_typing.py +0 -0
  67. {speedy_utils-1.1.22 → speedy_utils-1.1.23}/tests/test_mpython.py +0 -0
  68. {speedy_utils-1.1.22 → speedy_utils-1.1.23}/tests/test_process.py +0 -0
  69. {speedy_utils-1.1.22 → speedy_utils-1.1.23}/tests/test_process_update.py +0 -0
  70. {speedy_utils-1.1.22 → speedy_utils-1.1.23}/tests/test_thread.py +0 -0
  71. {speedy_utils-1.1.22 → speedy_utils-1.1.23}/uv.lock +0 -0

{speedy_utils-1.1.22 → speedy_utils-1.1.23}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: speedy-utils
- Version: 1.1.22
+ Version: 1.1.23
  Summary: Fast and easy-to-use package for data science
  Project-URL: Homepage, https://github.com/anhvth/speedy
  Project-URL: Repository, https://github.com/anhvth/speedy

speedy_utils-1.1.23/notebooks/llm_utils/llm_as_a_judge.ipynb
@@ -0,0 +1,642 @@
+ {
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "136ff273",
+ "metadata": {},
+ "source": [
+ "# LLM-as-a-Judge Tutorial\n",
+ "\n",
+ "This notebook demonstrates how to use the LLM-as-a-Judge system with structured prompts, variable substitution, and SFT export capabilities."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "60c1bf59",
+ "metadata": {},
+ "source": [
+ "## Setup and Imports"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "bb8f8e2b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import sys\n",
+ "import os\n",
+ "# sys.path.append('../../src') # Add src to path for imports\n",
+ "\n",
+ "from llm_utils import (\n",
+ " LLMJudgeBase, \n",
+ " ChainOfThought, \n",
+ " TranslationEvaluatorJudge,\n",
+ " Signature, \n",
+ " InputField, \n",
+ " OutputField\n",
+ ")\n",
+ "from pydantic import BaseModel\n",
+ "import json"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "eaceb8bd",
+ "metadata": {},
+ "source": [
+ "## Example 1: DSPy-like Signature System\n",
+ "\n",
+ "First, let's create a simple factual accuracy judge using the Signature system:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "5b5e2123",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Generated Instruction:\n",
+ "Judge if the answer is factually correct based on the context.\n",
+ "\n",
+ "**Input Fields:**\n",
+ "- context (str): Context for the prediction\n",
+ "- question (str): Question to be answered\n",
+ "- answer (str): Answer for the question\n",
+ "\n",
+ "**Output Fields:**\n",
+ "- factually_correct (bool): Is the answer factually correct based on the context?\n",
+ "\n",
+ "\n",
+ "==================================================\n",
+ "\n",
+ "Input Schema:\n",
+ "{\n",
+ " \"properties\": {\n",
+ " \"context\": {\n",
+ " \"description\": \"Context for the prediction\",\n",
+ " \"title\": \"Context\",\n",
+ " \"type\": \"string\"\n",
+ " },\n",
+ " \"question\": {\n",
+ " \"description\": \"Question to be answered\",\n",
+ " \"title\": \"Question\",\n",
+ " \"type\": \"string\"\n",
+ " },\n",
+ " \"answer\": {\n",
+ " \"description\": \"Answer for the question\",\n",
+ " \"title\": \"Answer\",\n",
+ " \"type\": \"string\"\n",
+ " }\n",
+ " },\n",
+ " \"required\": [\n",
+ " \"context\",\n",
+ " \"question\",\n",
+ " \"answer\"\n",
+ " ],\n",
+ " \"title\": \"FactJudgeInput\",\n",
+ " \"type\": \"object\"\n",
+ "}\n",
+ "\n",
+ "Output Schema:\n",
+ "{\n",
+ " \"properties\": {\n",
+ " \"factually_correct\": {\n",
+ " \"description\": \"Is the answer factually correct based on the context?\",\n",
+ " \"title\": \"Factually Correct\",\n",
+ " \"type\": \"boolean\"\n",
+ " }\n",
+ " },\n",
+ " \"required\": [\n",
+ " \"factually_correct\"\n",
+ " ],\n",
+ " \"title\": \"FactJudgeOutput\",\n",
+ " \"type\": \"object\"\n",
+ "}\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Define a signature like DSPy (original syntax - shows type warnings)\n",
+ "class FactJudge(Signature):\n",
+ " \"\"\"Judge if the answer is factually correct based on the context.\"\"\"\n",
+ " \n",
+ " # Note: The assignments below will show type warnings, but work correctly\n",
+ " context: str = InputField(desc=\"Context for the prediction\") # type: ignore\n",
+ " question: str = InputField(desc=\"Question to be answered\") # type: ignore\n",
+ " answer: str = InputField(desc=\"Answer for the question\") \n",
+ " factually_correct: bool = OutputField(desc=\"Is the answer factually correct based on the context?\") # type: ignore\n",
+ "\n",
+ "# Show the generated instruction\n",
+ "print(\"Generated Instruction:\")\n",
+ "print(FactJudge.get_instruction())\n",
+ "print(\"\\n\" + \"=\"*50 + \"\\n\")\n",
+ "\n",
+ "# Show the input/output models\n",
+ "input_model = FactJudge.get_input_model()\n",
+ "output_model = FactJudge.get_output_model()\n",
+ "\n",
+ "if input_model is not str:\n",
+ " print(\"Input Schema:\")\n",
+ " print(json.dumps(input_model.model_json_schema(), indent=2))\n",
+ "\n",
+ "if output_model is not str:\n",
+ " print(\"\\nOutput Schema:\")\n",
+ " print(json.dumps(output_model.model_json_schema(), indent=2))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "13db83d4",
+ "metadata": {},
+ "source": [
+ "## Type-Safe Alternative Syntax\n",
+ "\n",
+ "The signature system now supports type-safe syntax using `typing.Annotated` to avoid type checker warnings:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "48d7aef5",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Import the new type-safe helper functions\n",
+ "from typing import Annotated\n",
+ "from llm_utils import Input, Output\n",
+ "\n",
+ "# Type-safe syntax - no warnings!\n",
+ "class FactJudgeTypeSafe(Signature):\n",
+ " \"\"\"Judge if the answer is factually correct based on the context.\"\"\"\n",
+ " \n",
+ " context: Annotated[str, Input(\"Context for the prediction\")]\n",
+ " question: Annotated[str, Input(\"Question to be answered\")]\n",
+ " answer: Annotated[str, Input(\"Answer for the question\")]\n",
+ " factually_correct: Annotated[bool, Output(\"Is the answer factually correct?\")]\n",
+ "\n",
+ "print(\"Type-Safe Signature Instruction:\")\n",
+ "print(FactJudgeTypeSafe.get_instruction())\n",
+ "print(\"\\n\" + \"=\"*50 + \"\\n\")\n",
+ "\n",
+ "# Both approaches generate the same schemas\n",
+ "type_safe_input = FactJudgeTypeSafe.get_input_model()\n",
+ "type_safe_output = FactJudgeTypeSafe.get_output_model()\n",
+ "\n",
+ "if type_safe_input is not str:\n",
+ " print(\"Type-Safe Input Schema:\")\n",
+ " print(json.dumps(type_safe_input.model_json_schema(), indent=2))\n",
+ "\n",
+ "if type_safe_output is not str:\n",
+ " print(\"\\nType-Safe Output Schema:\")\n",
+ " print(json.dumps(type_safe_output.model_json_schema(), indent=2))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "795b1e2c",
+ "metadata": {},
+ "source": [
+ "## Example 2: Using ChainOfThought with Mock Client\n",
+ "\n",
+ "Note: In a real scenario, you would provide an actual OpenAI client or VLLM endpoint."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "1a225fc4",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Chain of Thought Usage Pattern:\n",
+ "\n",
+ "# With actual LLM client:\n",
+ "judge = ChainOfThought(FactJudge, client='http://localhost:8000/v1')\n",
+ "\n",
+ "# Execute judgment\n",
+ "result = judge(\n",
+ " context=\"The sky is blue during daytime due to light scattering.\",\n",
+ " question=\"What color is the sky?\",\n",
+ " answer=\"Blue\"\n",
+ ")\n",
+ "\n",
+ "print(f\"Is factually correct: {result.factually_correct}\")\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "# For demonstration purposes, let's see how you would use ChainOfThought\n",
+ "# (This requires an actual LLM client to run)\n",
+ "\n",
+ "print(\"Chain of Thought Usage Pattern:\")\n",
+ "print(\"\"\"\n",
+ "# With actual LLM client:\n",
+ "judge = ChainOfThought(FactJudge, client='http://localhost:8000/v1')\n",
+ "\n",
+ "# Execute judgment\n",
+ "result = judge(\n",
+ " context=\"The sky is blue during daytime due to light scattering.\",\n",
+ " question=\"What color is the sky?\",\n",
+ " answer=\"Blue\"\n",
+ ")\n",
+ "\n",
+ "print(f\"Is factually correct: {result.factually_correct}\")\n",
+ "\"\"\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "8af947a4",
+ "metadata": {},
+ "source": [
+ "## Example 3: Custom Judge with Template Variables"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "0ba925e6",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Template Variables Required:\n",
+ " - criteria\n",
+ " - content\n",
+ " - judge_type\n",
+ " - categories\n",
+ " - content_type\n",
+ "\n",
+ "Template Preview:\n",
+ "You are a {judge_type} evaluating {content_type}.\n",
+ "\n",
+ "Evaluation Criteria:\n",
+ "- {criteria}\n",
+ "\n",
+ "Rate the following on a scale of 1-10 and provide reasoning.\n",
+ "Also identify relevant categories from: {categories}\n",
+ "...\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Define a custom output model\n",
+ "class QualityScore(BaseModel):\n",
+ " score: int # 1-10 rating\n",
+ " reasoning: str\n",
+ " categories: list[str]\n",
+ "\n",
+ "# Create a judge with template variables\n",
+ "quality_prompt = \"\"\"\n",
+ "You are a {judge_type} evaluating {content_type}.\n",
+ "\n",
+ "Evaluation Criteria:\n",
+ "- {criteria}\n",
+ "\n",
+ "Rate the following on a scale of 1-10 and provide reasoning.\n",
+ "Also identify relevant categories from: {categories}\n",
+ "\n",
+ "Content to evaluate:\n",
+ "{content}\n",
+ "\"\"\".strip()\n",
+ "\n",
+ "# Show the template structure\n",
+ "print(\"Template Variables Required:\")\n",
+ "import re\n",
+ "variables = re.findall(r'\\{([^}]+)\\}', quality_prompt)\n",
+ "for var in set(variables):\n",
+ " print(f\" - {var}\")\n",
+ "\n",
+ "print(\"\\nTemplate Preview:\")\n",
+ "print(quality_prompt[:200] + \"...\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "7f3518cf",
+ "metadata": {},
+ "source": [
+ "## Example 4: Translation Evaluator from Raw Code\n",
+ "\n",
+ "This demonstrates the TranslationEvaluatorJudge based on your raw code example:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "de4ef463",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Create translation evaluator\n",
+ "evaluator = TranslationEvaluatorJudge()\n",
+ "\n",
+ "print(\"Translation Evaluator System Prompt:\")\n",
+ "print(evaluator.system_prompt_template[:300] + \"...\")\n",
+ "\n",
+ "print(\"\\nOutput Schema:\")\n",
+ "from llm_utils.lm.llm_as_a_judge import TranslationOutput\n",
+ "print(json.dumps(TranslationOutput.model_json_schema(), indent=2))\n",
+ "\n",
+ "print(\"\\nUsage Pattern:\")\n",
+ "print(\"\"\"\n",
+ "# With actual LLM client:\n",
+ "result = evaluator.evaluate_translation(\n",
+ " source_prompt=\"Translate this to French: Hello world\",\n",
+ " ai_translation=\"Bonjour le monde\",\n",
+ " human_reference=\"Bonjour tout le monde\",\n",
+ " system_message=\"NONE\",\n",
+ " glossaries=\"\"\n",
+ ")\n",
+ "\n",
+ "print(f\"Structure Score: {result.structure_score}\")\n",
+ "print(f\"Translation Score: {result.translation_score}\")\n",
+ "print(f\"Term Score: {result.term_score}\")\n",
+ "\"\"\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "168a06d3",
+ "metadata": {},
+ "source": [
+ "## Example 5: SFT Data Collection and Export"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "8fae83e4",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Create a mock judge for demonstration\n",
+ "mock_judge = LLMJudgeBase(\n",
+ " system_prompt_template=\"Rate the sentiment of this text: {text}. Scale: {scale}\",\n",
+ " output_model=str # Simple string output for demo\n",
+ ")\n",
+ "\n",
+ "# Simulate some SFT data\n",
+ "mock_judge.sft_data = [\n",
+ " {\n",
+ " 'messages': [\n",
+ " {\n",
+ " 'role': 'system',\n",
+ " 'content': 'Rate the sentiment of this text: I love sunny days! Scale: 1-10'\n",
+ " },\n",
+ " {\n",
+ " 'role': 'user',\n",
+ " 'content': 'Please rate the sentiment'\n",
+ " },\n",
+ " {\n",
+ " 'role': 'assistant',\n",
+ " 'content': '9 - Very positive sentiment'\n",
+ " }\n",
+ " ],\n",
+ " 'variables': {'text': 'I love sunny days!', 'scale': '1-10'},\n",
+ " 'input_data': 'Please rate the sentiment',\n",
+ " 'output': '9 - Very positive sentiment'\n",
+ " },\n",
+ " {\n",
+ " 'messages': [\n",
+ " {\n",
+ " 'role': 'system', \n",
+ " 'content': 'Rate the sentiment of this text: This is terrible. Scale: 1-10'\n",
+ " },\n",
+ " {\n",
+ " 'role': 'user',\n",
+ " 'content': 'Please rate the sentiment'\n",
+ " },\n",
+ " {\n",
+ " 'role': 'assistant',\n",
+ " 'content': '2 - Very negative sentiment'\n",
+ " }\n",
+ " ],\n",
+ " 'variables': {'text': 'This is terrible', 'scale': '1-10'},\n",
+ " 'input_data': 'Please rate the sentiment',\n",
+ " 'output': '2 - Very negative sentiment'\n",
+ " }\n",
+ "]\n",
+ "\n",
+ "print(f\"Collected {len(mock_judge.sft_data)} training examples\")\n",
+ "\n",
+ "# Test different export formats\n",
+ "formats = ['messages', 'sharegpt', 'full']\n",
+ "\n",
+ "for format_name in formats:\n",
+ " exported = mock_judge.export_sft_data(format_name)\n",
+ " print(f\"\\n=== {format_name.upper()} Format ===\")\n",
+ " print(f\"Exported {len(exported)} examples\")\n",
+ " print(\"Sample structure:\", list(exported[0].keys()))\n",
+ " \n",
+ " if format_name == 'sharegpt':\n",
+ " print(\"ShareGPT sample:\")\n",
+ " print(json.dumps(exported[0], indent=2)[:300] + \"...\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "87fcd4d4",
+ "metadata": {},
+ "source": [
+ "## Example 6: Batch Processing Pattern"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "8afba99b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Demonstrate how to use the judge system for batch processing\n",
+ "print(\"Batch Processing Pattern:\")\n",
+ "print(\"\"\"\n",
+ "# Example: Process multiple translations\n",
+ "evaluator = TranslationEvaluatorJudge(client='your-llm-endpoint')\n",
+ "\n",
+ "# Sample data\n",
+ "translations = [\n",
+ " {\n",
+ " 'source': 'Hello world',\n",
+ " 'ai_translation': 'Bonjour le monde', \n",
+ " 'human_reference': 'Bonjour tout le monde',\n",
+ " 'system_message': 'NONE',\n",
+ " 'glossaries': ''\n",
+ " },\n",
+ " # ... more translations\n",
+ "]\n",
+ "\n",
+ "# Process in batch\n",
+ "results = []\n",
+ "for item in translations:\n",
+ " result = evaluator.evaluate_translation(**item)\n",
+ " results.append(result)\n",
+ " \n",
+ "# Export all collected SFT data\n",
+ "evaluator.save_sft_data('translation_judge_training_data.json')\n",
+ "\n",
+ "# Analyze results\n",
+ "avg_structure = sum(r.structure_score for r in results) / len(results)\n",
+ "avg_translation = sum(r.translation_score for r in results) / len(results)\n",
+ "avg_term = sum(r.term_score for r in results) / len(results)\n",
+ "\n",
+ "print(f\"Average Scores:\")\n",
+ "print(f\" Structure: {avg_structure:.2f}\")\n",
+ "print(f\" Translation: {avg_translation:.2f}\")\n",
+ "print(f\" Terms: {avg_term:.2f}\")\n",
+ "\"\"\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "9ba497a2",
+ "metadata": {},
+ "source": [
+ "## Example 7: Creating Custom Judge Classes"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "1d2ba4f9",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Example of creating a custom judge class\n",
+ "class CodeQualityJudge(LLMJudgeBase):\n",
+ " \"\"\"Judge code quality with multiple criteria.\"\"\"\n",
+ " \n",
+ " def __init__(self, **kwargs):\n",
+ " system_prompt = \"\"\"\n",
+ "You are an expert code reviewer evaluating {language} code.\n",
+ "\n",
+ "Criteria:\n",
+ "- Readability: How easy is it to understand?\n",
+ "- Performance: Are there obvious performance issues?\n",
+ "- Best Practices: Does it follow {language} best practices?\n",
+ "- Security: Are there security concerns?\n",
+ "\n",
+ "Code to review:\n",
+ "```{language}\n",
+ "{code}\n",
+ "```\n",
+ "\n",
+ "Additional context: {context}\n",
+ "\"\"\".strip()\n",
+ " \n",
+ " # Define output model\n",
+ " class CodeReview(BaseModel):\n",
+ " readability_score: int # 1-10\n",
+ " performance_score: int # 1-10\n",
+ " best_practices_score: int # 1-10\n",
+ " security_score: int # 1-10\n",
+ " overall_rating: str # \"excellent\", \"good\", \"fair\", \"poor\"\n",
+ " recommendations: list[str]\n",
+ " \n",
+ " super().__init__(\n",
+ " system_prompt_template=system_prompt,\n",
+ " output_model=CodeReview,\n",
+ " **kwargs\n",
+ " )\n",
+ " \n",
+ " def review_code(self, code: str, language: str = 'python', context: str = '') -> dict:\n",
+ " \"\"\"Review code with structured output.\"\"\"\n",
+ " variables = {\n",
+ " 'code': code,\n",
+ " 'language': language,\n",
+ " 'context': context\n",
+ " }\n",
+ " \n",
+ " results = self.judge(f\"Please review this {language} code\", variables=variables)\n",
+ " return results[0]['parsed']\n",
+ "\n",
+ "# Show the system prompt template\n",
+ "judge = CodeQualityJudge()\n",
+ "print(\"Code Quality Judge System Prompt Template:\")\n",
+ "print(judge.system_prompt_template)\n",
+ "\n",
+ "print(\"\\nUsage Example:\")\n",
+ "print(\"\"\"\n",
+ "# With actual LLM client:\n",
+ "code_judge = CodeQualityJudge(client='your-endpoint')\n",
+ "\n",
+ "result = code_judge.review_code(\n",
+ " code=\"def add(a, b): return a + b\",\n",
+ " language=\"python\",\n",
+ " context=\"Simple utility function\"\n",
+ ")\n",
+ "\n",
+ "print(f\"Overall Rating: {result.overall_rating}\")\n",
+ "print(f\"Readability: {result.readability_score}/10\")\n",
+ "\"\"\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "3cea50a9",
+ "metadata": {},
+ "source": [
+ "## Summary\n",
+ "\n",
+ "This notebook demonstrated the key features of the LLM-as-a-Judge system:\n",
+ "\n",
+ "1. **Signature System**: DSPy-like declarative interface for defining input/output schemas\n",
+ "2. **Template Variables**: System prompts with variable substitution\n",
+ "3. **SFT Export**: Automatic collection and export of training data\n",
+ "4. **Chain of Thought**: Built-in reasoning support\n",
+ "5. **Custom Judges**: Easy creation of domain-specific evaluation classes\n",
+ "6. **Multiple Formats**: Support for various export formats (messages, ShareGPT, etc.)\n",
+ "\n",
+ "### Next Steps:\n",
+ "\n",
+ "1. Set up your LLM endpoint (OpenAI API or VLLM server)\n",
+ "2. Create your own Signature classes for your specific use cases\n",
+ "3. Collect evaluation data and export for fine-tuning smaller models\n",
+ "4. Experiment with different prompt templates and evaluation criteria\n",
+ "\n",
+ "### Key Classes:\n",
+ "\n",
+ "- `Signature`: Define structured input/output schemas\n",
+ "- `LLMJudgeBase`: Core judge class with template support\n",
+ "- `ChainOfThought`: DSPy-like reasoning wrapper\n",
+ "- `TranslationEvaluatorJudge`: Ready-to-use translation evaluator\n",
+ "\n",
+ "The system is designed to be flexible and extensible for various evaluation tasks!"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "speedy_utils",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.13.7"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+ }

{speedy_utils-1.1.22 → speedy_utils-1.1.23}/pyproject.toml
@@ -1,6 +1,6 @@
  [project]
  name = "speedy-utils"
- version = "1.1.22"
+ version = "1.1.23"
  description = "Fast and easy-to-use package for data science"
  authors = [{ name = "AnhVTH", email = "anhvth.226@gmail.com" }]
  readme = "README.md"

{speedy_utils-1.1.22 → speedy_utils-1.1.23}/src/llm_utils/__init__.py
@@ -1,5 +1,5 @@
  from llm_utils.lm.openai_memoize import MOpenAI
- from llm_utils.lm import LLMTask, AsyncLM, AsyncLLMTask
+ from llm_utils.lm import LLMTask, AsyncLM, AsyncLLMTask, LLMJudgeBase, ChainOfThought, TranslationEvaluatorJudge, Signature, InputField, OutputField, Input, Output
  from llm_utils.vector_cache import VectorCache
  from llm_utils.lm.lm_base import get_model_name
  from llm_utils.lm.base_prompt_builder import BasePromptBuilder
@@ -15,7 +15,7 @@ def kill_vllm_on_port(port: int) -> bool:
  """Kill VLLM server on specific port. Returns True if server was killed."""
  return LLMTask.kill_vllm_on_port(port)

- from .chat_format import (
+ from llm_utils.chat_format import (
  build_chatml_input,
  display_chat_messages_as_html,
  display_conversations,
@@ -46,5 +46,13 @@ __all__ = [
  "BasePromptBuilder",
  "LLM",
  "kill_all_vllm",
- "kill_vllm_on_port"
+ "kill_vllm_on_port",
+ "LLMJudgeBase",
+ "ChainOfThought",
+ "TranslationEvaluatorJudge",
+ "Signature",
+ "InputField",
+ "OutputField",
+ "Input",
+ "Output",
  ]