speedy-utils 1.1.22__tar.gz → 1.1.24__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- speedy_utils-1.1.24/IMPROVEMENTS.md +141 -0
- {speedy_utils-1.1.22 → speedy_utils-1.1.24}/PKG-INFO +1 -1
- speedy_utils-1.1.24/examples/temperature_range_example.py +119 -0
- speedy_utils-1.1.24/examples_improved_error_tracing.py +85 -0
- speedy_utils-1.1.24/notebooks/llm_utils/llm_as_a_judge.ipynb +300 -0
- {speedy_utils-1.1.22 → speedy_utils-1.1.24}/pyproject.toml +1 -1
- {speedy_utils-1.1.22 → speedy_utils-1.1.24}/src/llm_utils/__init__.py +19 -7
- {speedy_utils-1.1.22 → speedy_utils-1.1.24}/src/llm_utils/chat_format/__init__.py +2 -0
- {speedy_utils-1.1.22 → speedy_utils-1.1.24}/src/llm_utils/chat_format/display.py +115 -44
- speedy_utils-1.1.24/src/llm_utils/lm/__init__.py +31 -0
- speedy_utils-1.1.24/src/llm_utils/lm/llm.py +413 -0
- speedy_utils-1.1.24/src/llm_utils/lm/llm_signature.py +35 -0
- speedy_utils-1.1.24/src/llm_utils/lm/mixins.py +379 -0
- {speedy_utils-1.1.22 → speedy_utils-1.1.24}/src/llm_utils/lm/openai_memoize.py +18 -7
- speedy_utils-1.1.24/src/llm_utils/lm/signature.py +271 -0
- {speedy_utils-1.1.22 → speedy_utils-1.1.24}/src/llm_utils/lm/utils.py +61 -76
- {speedy_utils-1.1.22 → speedy_utils-1.1.24}/src/speedy_utils/__init__.py +28 -1
- {speedy_utils-1.1.22 → speedy_utils-1.1.24}/src/speedy_utils/all.py +30 -1
- {speedy_utils-1.1.22 → speedy_utils-1.1.24}/src/speedy_utils/common/utils_io.py +36 -26
- {speedy_utils-1.1.22 → speedy_utils-1.1.24}/src/speedy_utils/common/utils_misc.py +25 -1
- {speedy_utils-1.1.22 → speedy_utils-1.1.24}/src/speedy_utils/multi_worker/thread.py +145 -58
- speedy_utils-1.1.24/tests/llm_utils/test_llm_mixins.py +153 -0
- speedy_utils-1.1.24/tests/test_multithread_error_trace.py +117 -0
- {speedy_utils-1.1.22 → speedy_utils-1.1.24}/uv.lock +8 -8
- speedy_utils-1.1.22/src/llm_utils/lm/__init__.py +0 -13
- speedy_utils-1.1.22/src/llm_utils/lm/llm_task.py +0 -614
- speedy_utils-1.1.22/src/llm_utils/lm/lm.py +0 -207
- {speedy_utils-1.1.22 → speedy_utils-1.1.24}/.github/copilot-instructions.md +0 -0
- {speedy_utils-1.1.22 → speedy_utils-1.1.24}/.github/workflows/publish.yml +0 -0
- {speedy_utils-1.1.22 → speedy_utils-1.1.24}/.gitignore +0 -0
- {speedy_utils-1.1.22 → speedy_utils-1.1.24}/.pre-commit-config.yaml +0 -0
- {speedy_utils-1.1.22 → speedy_utils-1.1.24}/README.md +0 -0
- {speedy_utils-1.1.22 → speedy_utils-1.1.24}/bumpversion.sh +0 -0
- {speedy_utils-1.1.22 → speedy_utils-1.1.24}/notebooks/test_multi_thread.ipynb +0 -0
- {speedy_utils-1.1.22 → speedy_utils-1.1.24}/ruff.toml +0 -0
- {speedy_utils-1.1.22 → speedy_utils-1.1.24}/scripts/deploy.sh +0 -0
- {speedy_utils-1.1.22 → speedy_utils-1.1.24}/setup.cfg +0 -0
- {speedy_utils-1.1.22 → speedy_utils-1.1.24}/src/llm_utils/chat_format/transform.py +0 -0
- {speedy_utils-1.1.22 → speedy_utils-1.1.24}/src/llm_utils/chat_format/utils.py +0 -0
- {speedy_utils-1.1.22 → speedy_utils-1.1.24}/src/llm_utils/group_messages.py +0 -0
- {speedy_utils-1.1.22 → speedy_utils-1.1.24}/src/llm_utils/lm/async_lm/__init__.py +0 -0
- {speedy_utils-1.1.22 → speedy_utils-1.1.24}/src/llm_utils/lm/async_lm/_utils.py +0 -0
- {speedy_utils-1.1.22 → speedy_utils-1.1.24}/src/llm_utils/lm/async_lm/async_llm_task.py +0 -0
- {speedy_utils-1.1.22 → speedy_utils-1.1.24}/src/llm_utils/lm/async_lm/async_lm.py +0 -0
- {speedy_utils-1.1.22 → speedy_utils-1.1.24}/src/llm_utils/lm/async_lm/async_lm_base.py +0 -0
- {speedy_utils-1.1.22 → speedy_utils-1.1.24}/src/llm_utils/lm/async_lm/lm_specific.py +0 -0
- {speedy_utils-1.1.22 → speedy_utils-1.1.24}/src/llm_utils/lm/base_prompt_builder.py +0 -0
- {speedy_utils-1.1.22 → speedy_utils-1.1.24}/src/llm_utils/lm/lm_base.py +0 -0
- {speedy_utils-1.1.22 → speedy_utils-1.1.24}/src/llm_utils/scripts/README.md +0 -0
- {speedy_utils-1.1.22 → speedy_utils-1.1.24}/src/llm_utils/scripts/vllm_load_balancer.py +0 -0
- {speedy_utils-1.1.22 → speedy_utils-1.1.24}/src/llm_utils/scripts/vllm_serve.py +0 -0
- {speedy_utils-1.1.22 → speedy_utils-1.1.24}/src/llm_utils/vector_cache/__init__.py +0 -0
- {speedy_utils-1.1.22 → speedy_utils-1.1.24}/src/llm_utils/vector_cache/cli.py +0 -0
- {speedy_utils-1.1.22 → speedy_utils-1.1.24}/src/llm_utils/vector_cache/core.py +0 -0
- {speedy_utils-1.1.22 → speedy_utils-1.1.24}/src/llm_utils/vector_cache/types.py +0 -0
- {speedy_utils-1.1.22 → speedy_utils-1.1.24}/src/llm_utils/vector_cache/utils.py +0 -0
- {speedy_utils-1.1.22 → speedy_utils-1.1.24}/src/speedy_utils/common/__init__.py +0 -0
- {speedy_utils-1.1.22 → speedy_utils-1.1.24}/src/speedy_utils/common/clock.py +0 -0
- {speedy_utils-1.1.22 → speedy_utils-1.1.24}/src/speedy_utils/common/function_decorator.py +0 -0
- {speedy_utils-1.1.22 → speedy_utils-1.1.24}/src/speedy_utils/common/logger.py +0 -0
- {speedy_utils-1.1.22 → speedy_utils-1.1.24}/src/speedy_utils/common/notebook_utils.py +0 -0
- {speedy_utils-1.1.22 → speedy_utils-1.1.24}/src/speedy_utils/common/patcher.py +0 -0
- {speedy_utils-1.1.22 → speedy_utils-1.1.24}/src/speedy_utils/common/report_manager.py +0 -0
- {speedy_utils-1.1.22 → speedy_utils-1.1.24}/src/speedy_utils/common/utils_cache.py +0 -0
- {speedy_utils-1.1.22 → speedy_utils-1.1.24}/src/speedy_utils/common/utils_print.py +0 -0
- {speedy_utils-1.1.22 → speedy_utils-1.1.24}/src/speedy_utils/multi_worker/__init__.py +0 -0
- {speedy_utils-1.1.22 → speedy_utils-1.1.24}/src/speedy_utils/multi_worker/process.py +0 -0
- {speedy_utils-1.1.22 → speedy_utils-1.1.24}/src/speedy_utils/scripts/__init__.py +0 -0
- {speedy_utils-1.1.22 → speedy_utils-1.1.24}/src/speedy_utils/scripts/mpython.py +0 -0
- {speedy_utils-1.1.22 → speedy_utils-1.1.24}/src/speedy_utils/scripts/openapi_client_codegen.py +0 -0
- {speedy_utils-1.1.22 → speedy_utils-1.1.24}/tests/sample_objects.py +0 -0
- {speedy_utils-1.1.22 → speedy_utils-1.1.24}/tests/test.py +0 -0
- {speedy_utils-1.1.22 → speedy_utils-1.1.24}/tests/test_logger.py +0 -0
- {speedy_utils-1.1.22 → speedy_utils-1.1.24}/tests/test_logger_format.py +0 -0
- {speedy_utils-1.1.22 → speedy_utils-1.1.24}/tests/test_memoize_typing.py +0 -0
- {speedy_utils-1.1.22 → speedy_utils-1.1.24}/tests/test_mpython.py +0 -0
- {speedy_utils-1.1.22 → speedy_utils-1.1.24}/tests/test_process.py +0 -0
- {speedy_utils-1.1.22 → speedy_utils-1.1.24}/tests/test_process_update.py +0 -0
- {speedy_utils-1.1.22 → speedy_utils-1.1.24}/tests/test_thread.py +0 -0
**speedy_utils-1.1.24/IMPROVEMENTS.md**
@@ -0,0 +1,141 @@

# Multi-thread Error Tracing Improvements

## Summary

Significantly improved error tracing in `multi_thread` to focus on user code rather than infrastructure frames, making debugging much faster and easier.

## Problem

Previously, when errors occurred in functions executed by `multi_thread`, the traceback was cluttered with infrastructure frames:

- `concurrent.futures` internals
- `threading.py` frames
- `multi_worker/thread.py` infrastructure code

This made it difficult to quickly identify the actual problem in user code.
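
The failure mode is easy to trigger: pass a container of callables where `multi_thread` expects a single callable (a condensed, hypothetical repro of the case shown below):

```python
from speedy_utils import multi_thread

fns = [lambda x: x + 1, lambda x: x * 2]  # a list of callables...
choices = multi_thread(fns, range(2))     # ...passed where a single callable belongs
```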

### Example of OLD behavior:

```
TypeError                                 Traceback (most recent call last)
Cell In[810], line 35
     33 choices = multi_thread(fns, range(n))
     34 return choices
---> 35 choices = translate()

File ~/projects/speedy_utils/src/speedy_utils/multi_worker/thread.py:474, in multi_thread(...)
    472 idx, logical_size = _future_meta(fut)
    473 try:
--> 474     result = fut.result()
    475 except Exception as exc:
    476     if stop_on_error:

File ~/.local/share/uv/python/.../concurrent/futures/_base.py:449, in Future.result(...)
    447     raise CancelledError()
    448 elif self._state == FINISHED:
--> 449     return self.__get_result()

... (many more infrastructure frames) ...

TypeError: 'list' object is not callable
```

## Solution

### 1. Added `UserFunctionError` Exception Class

A custom exception wrapper that:

- Captures the original exception
- Stores the function name and input that caused the error
- Filters traceback to include only user code frames
- Provides clear, focused error messages
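
A minimal sketch of what such a wrapper can look like (illustrative only; the shipped class in `src/speedy_utils/multi_worker/thread.py` is the authoritative version, and the `_SKIP_MARKERS` heuristic here is an assumption):

```python
import traceback

class UserFunctionError(Exception):
    """Illustrative wrapper: keeps only user-code frames from the original error."""

    # Assumed heuristic for "infrastructure" frames; the real filter may differ.
    _SKIP_MARKERS = ("concurrent/futures", "threading.py", "multi_worker/thread.py")

    def __init__(self, original: BaseException, func_name: str, item):
        self.original = original
        self.func_name = func_name
        self.item = item
        frames = traceback.extract_tb(original.__traceback__)
        user_frames = [f for f in frames
                       if not any(m in f.filename for m in self._SKIP_MARKERS)]
        trace = "".join(traceback.format_list(user_frames))
        super().__init__(
            f'Error in function "{func_name}" with input: {item!r}\n\n'
            f"User code traceback:\n{trace}"
            f"{type(original).__name__}: {original}"
        )
```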

### 2. Enhanced `_worker` Function

- Added validation to detect common mistakes (e.g., passing a list instead of a function)
- Filters tracebacks to remove infrastructure frames
- Wraps user function errors in `UserFunctionError` with clean context
- Provides helpful hints for common mistakes
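
Building on the `UserFunctionError` sketch above, the validation and wrapping could look roughly like this (the error text mirrors the documented messages; the body is a sketch, not the shipped code):

```python
def _worker(func, item):
    # Fail fast on a common mistake: a container of callables passed as `func`.
    if not callable(func):
        raise TypeError(
            f"multi_thread: func parameter must be callable, "
            f"got {type(func).__name__}: {func!r}\n"
            "Hint: Did you accidentally pass a list instead of a function?"
        )
    try:
        return func(item)
    except Exception as exc:
        # Re-wrap with user-focused context (see UserFunctionError above).
        name = getattr(func, "__name__", repr(func))
        raise UserFunctionError(exc, name, item) from exc
```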

### 3. Improved Error Reporting in `multi_thread`

- Logs clear error messages showing function name and input
- Displays only user code in tracebacks
- Re-raises exceptions with cleaned messages
- Maintains proper exception chaining while hiding infrastructure noise
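
Schematically, the result loop then only has to surface the wrapper (again a sketch continuing the ones above; the real loop also handles `stop_on_error`, progress, and batching):

```python
from concurrent.futures import ThreadPoolExecutor

def gather_results(func, items):
    """Sketch of the reporting loop: log the focused message, then re-raise."""
    with ThreadPoolExecutor() as pool:
        futures = [pool.submit(_worker, func, item) for item in items]
        results = []
        for fut in futures:
            try:
                results.append(fut.result())
            except UserFunctionError as exc:
                print(exc)  # function name, offending input, user-code frames only
                raise       # chaining to exc.original stays intact
        return results
```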

## Benefits

### Clear Error Messages

```
Error in function "process_item" with input: 0

User code traceback:
  File "your_script.py", line 20, in process_item
    return my_list(x)
           ^^^^^^^^^^
TypeError: 'list' object is not callable
```

### Helpful Hints

```
TypeError:
multi_thread: func parameter must be callable, got list: [...]
Hint: Did you accidentally pass a list instead of a function?
```

### Nested Function Support

Shows complete call chain through user code:

```
Error in function "process_data" with input: 0

User code traceback:
  File "your_script.py", line 44, in process_data
    return validate_and_calc(val)
           ^^^^^^^^^^^^^^^^^^^^^^
  File "your_script.py", line 42, in validate_and_calc
    return 100 / x
           ~~~~^~~
ZeroDivisionError: division by zero
```

## Key Improvements

✅ **Errors show function name and problematic input**
✅ **Tracebacks filtered to show only user code**
✅ **No concurrent.futures/threading clutter**
✅ **Helpful hints for common mistakes**
✅ **Clear, actionable error messages**
✅ **Maintains backward compatibility - all existing tests pass**

## Testing

Run the comprehensive demo to see all improvements:

```bash
python tests/test_multithread_error_trace.py
```

This demonstrates:

1. Simple function errors
2. Nested function call traces
3. Common parameter type mistakes
4. Various exception types (TypeError, ValueError, AttributeError, etc.)
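
An illustrative spot check in the same spirit (not taken from the package's test suite; it assumes the documented message text above):

```python
import pytest
from speedy_utils import multi_thread

def test_list_as_func_raises_with_hint():
    # A list of callables passed as `func` should fail fast with the hint.
    with pytest.raises(TypeError, match="must be callable"):
        multi_thread([lambda x: x], range(3))
```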

## Code Changes

Main files modified:

- `src/speedy_utils/multi_worker/thread.py`:
  - Added `UserFunctionError` exception class
  - Enhanced `_worker` function with validation and error filtering
  - Improved error handling in `multi_thread` main loop
  - Added imports for `sys` and `traceback`

All changes maintain backward compatibility - existing code continues to work unchanged.

**speedy_utils-1.1.24/examples/temperature_range_example.py**
@@ -0,0 +1,119 @@

```python
"""Example demonstrating temperature range sampling with LLM."""

from llm_utils import LLM
from pydantic import BaseModel


class CreativeStory(BaseModel):
    """A creative story output."""

    title: str
    story: str
    moral: str


def example_temperature_range_text():
    """Example: Sample text responses with different temperatures."""
    print("=" * 60)
    print("Example 1: Temperature Range Sampling (Text Completion)")
    print("=" * 60)

    llm = LLM(
        instruction="You are a creative writer. Write a very short story.",
        output_model=str,
    )

    prompt = "Write a one-sentence story about a brave mouse."

    # Sample with 5 different temperatures from 0.1 to 1.0
    responses = llm(
        prompt,
        temperature_ranges=(0.1, 1.0),
        n=5,
    )

    print(f"\nGenerated {len(responses)} responses with varying temperatures:\n")
    for i, resp in enumerate(responses):
        temp = 0.1 + i * ((1.0 - 0.1) / (5 - 1))
        print(f"Temperature ~{temp:.2f}:")
        print(f"  {resp['parsed']}\n")


def example_temperature_range_pydantic():
    """Example: Sample structured responses with different temperatures."""
    print("=" * 60)
    print("Example 2: Temperature Range with Pydantic Models")
    print("=" * 60)

    llm = LLM(
        instruction="Create a creative short story with a moral lesson.",
        output_model=CreativeStory,
    )

    prompt = "Topic: A robot learning to feel emotions"

    # Sample with 3 different temperatures from 0.5 to 1.5
    responses = llm(
        prompt,
        temperature_ranges=(0.5, 1.5),
        n=3,
    )

    print(f"\nGenerated {len(responses)} stories with varying creativity:\n")
    for i, resp in enumerate(responses):
        temp = 0.5 + i * ((1.5 - 0.5) / (3 - 1))
        story = resp["parsed"]
        print(f"Temperature ~{temp:.2f}:")
        print(f"  Title: {story.title}")
        print(f"  Story: {story.story[:80]}...")
        print(f"  Moral: {story.moral}\n")


def example_two_step_parsing():
    """Example: Two-step Pydantic parsing for models with reasoning."""
    print("=" * 60)
    print("Example 3: Two-Step Pydantic Parsing")
    print("=" * 60)

    llm = LLM(
        instruction=("Analyze the given text and extract structured information. Think through your analysis first."),
        output_model=CreativeStory,
    )

    prompt = "Analyze the story: 'The tortoise won the race by persistence.'"

    # Use two-step parsing (useful for reasoning models)
    response = llm(
        prompt,
        two_step_parse_pydantic=True,
    )[0]

    story = response["parsed"]
    print("\nExtracted structure:")
    print(f"  Title: {story.title}")
    print(f"  Story: {story.story}")
    print(f"  Moral: {story.moral}")


if __name__ == "__main__":
    # Run examples
    # Note: These require a working OpenAI API key or local LLM server

    try:
        example_temperature_range_text()
    except Exception as e:
        print(f"Example 1 failed: {e}\n")

    try:
        example_temperature_range_pydantic()
    except Exception as e:
        print(f"Example 2 failed: {e}\n")

    try:
        example_two_step_parsing()
    except Exception as e:
        print(f"Example 3 failed: {e}\n")

    print("\n" + "=" * 60)
    print("Examples complete!")
    print("=" * 60)
```
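
The per-response temperatures printed in the example are reconstructed with linear spacing. Assuming `temperature_ranges=(t_min, t_max)` with `n` samples interpolates the endpoints evenly (which is what the example's arithmetic implies, not something the API docs here confirm), the schedule is:

```python
def temperature_schedule(t_min: float, t_max: float, n: int) -> list[float]:
    """Evenly spaced temperatures over [t_min, t_max], endpoints included."""
    if n == 1:
        return [t_min]
    step = (t_max - t_min) / (n - 1)
    return [t_min + i * step for i in range(n)]

print(temperature_schedule(0.1, 1.0, 5))  # ~[0.1, 0.325, 0.55, 0.775, 1.0]
```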

**speedy_utils-1.1.24/examples_improved_error_tracing.py**
@@ -0,0 +1,85 @@

```python
"""
Direct comparison: Before and After error tracing improvements.

This demonstrates the exact improvement for the user's original error case.
"""

from speedy_utils import multi_thread


def simulate_original_error():
    """
    Simulates the exact error from the user's example:
    - User has a function that creates lambda functions
    - Accidentally passes list of functions as 'func' parameter
    - Gets TypeError: 'list' object is not callable
    """

    def lm_translate(msgs, temperature, max_tokens):
        """Mock language model translate function."""
        return [{'parsed': f'translation at temp={temperature:.2f}'}]

    def translate(n=5, max_temperature=1.0):
        """Function that generates choices with different temperatures."""
        step = max_temperature / n
        fns = []
        target_text = 'Some text to translate'
        msgs = [{'role': 'user', 'content': 'Translate this'}]

        for i in range(n):
            fn = lambda x: lm_translate(
                msgs,
                temperature=0.1 + 0.1 * i * step,
                max_tokens=len(target_text) + 32,
            )[0]
            fns.append(fn)

        # THE BUG: User passed fns (a list) as the func parameter
        # Should be: multi_thread(some_function, fns)
        # Instead did: multi_thread(fns, range(n))
        choices = multi_thread(fns, range(n), progress=False)
        return choices

    return translate()


def main():
    print('='*70)
    print('BEFORE vs AFTER: Error Tracing Improvements')
    print('='*70)

    print('\nBEFORE (old behavior):')
    print('-' * 70)
    print('''
The error traceback showed:
- Line in multi_thread.py:474
- concurrent.futures/_base.py:449
- concurrent.futures/thread.py:59
- multi_worker/thread.py:155
- ... many infrastructure frames ...
- Finally: TypeError: 'list' object is not callable

User had to dig through 10+ lines of infrastructure code
to find the actual problem.
''')

    print('\nAFTER (new behavior):')
    print('-' * 70)

    try:
        simulate_original_error()
    except TypeError as e:
        print(f'\n{type(e).__name__}: {e}\n')

    print('-' * 70)
    print('\nKey differences:')
    print('  ✓ Immediate identification of the problem')
    print('  ✓ Clear hint about what went wrong')
    print('  ✓ Shows exactly what was passed (list of functions)')
    print('  ✓ No infrastructure clutter')
    print('  ✓ Debugging time: < 5 seconds vs > 1 minute')
    print('='*70)


if __name__ == '__main__':
    main()
```
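
One incidental subtlety in the demo above: every `fn` closes over the loop variable `i`, so if the lambdas were ever called they would all see the final value of `i` (Python closures capture variables, not values). Binding `i` at definition time avoids that, e.g. replacing the loop body with:

```python
for i in range(n):
    fn = lambda x, i=i: lm_translate(  # default arg freezes the current i
        msgs,
        temperature=0.1 + 0.1 * i * step,
        max_tokens=len(target_text) + 32,
    )[0]
    fns.append(fn)
```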

**speedy_utils-1.1.24/notebooks/llm_utils/llm_as_a_judge.ipynb**
@@ -0,0 +1,300 @@

```json
{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "136ff273",
   "metadata": {},
   "source": [
    "# LLM-as-a-Judge Tutorial\n",
    "\n",
    "This notebook demonstrates how to use the LLM-as-a-Judge system with structured prompts, variable substitution, and SFT export capabilities."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "60c1bf59",
   "metadata": {},
   "source": [
    "## Setup and Imports"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "bb8f8e2b",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "from llm_utils import (\n",
    "    LLMJudgeBase, \n",
    "    Signature, \n",
    "    InputField, \n",
    "    OutputField\n",
    ")\n",
    "from pydantic import BaseModel\n",
    "import json"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "eaceb8bd",
   "metadata": {},
   "source": [
    "## Example 1: DSPy-like Signature System\n",
    "\n",
    "First, let's create a simple factual accuracy judge using the Signature system:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "5b5e2123",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Generated Instruction:\n",
      "Judge if the answer is factually correct based on the context.\n",
      "\n",
      "**Input Fields:**\n",
      "- context (str): Context for the prediction\n",
      "- question (str): Question to be answered\n",
      "- answer (str): Answer for the question\n",
      "\n",
      "**Output Fields:**\n",
      "- factually_correct (bool): Is the answer factually correct based on the context?\n",
      "\n",
      "\n",
      "==================================================\n",
      "\n",
      "Input Schema:\n",
      "{\n",
      "  \"properties\": {\n",
      "    \"context\": {\n",
      "      \"description\": \"Context for the prediction\",\n",
      "      \"title\": \"Context\",\n",
      "      \"type\": \"string\"\n",
      "    },\n",
      "    \"question\": {\n",
      "      \"description\": \"Question to be answered\",\n",
      "      \"title\": \"Question\",\n",
      "      \"type\": \"string\"\n",
      "    },\n",
      "    \"answer\": {\n",
      "      \"description\": \"Answer for the question\",\n",
      "      \"title\": \"Answer\",\n",
      "      \"type\": \"string\"\n",
      "    }\n",
      "  },\n",
      "  \"required\": [\n",
      "    \"context\",\n",
      "    \"question\",\n",
      "    \"answer\"\n",
      "  ],\n",
      "  \"title\": \"FactJudgeInput\",\n",
      "  \"type\": \"object\"\n",
      "}\n",
      "\n",
      "Output Schema:\n",
      "{\n",
      "  \"properties\": {\n",
      "    \"factually_correct\": {\n",
      "      \"description\": \"Is the answer factually correct based on the context?\",\n",
      "      \"title\": \"Factually Correct\",\n",
      "      \"type\": \"boolean\"\n",
      "    }\n",
      "  },\n",
      "  \"required\": [\n",
      "    \"factually_correct\"\n",
      "  ],\n",
      "  \"title\": \"FactJudgeOutput\",\n",
      "  \"type\": \"object\"\n",
      "}\n"
     ]
    }
   ],
   "source": [
    "# Define a signature like DSPy (original syntax - no more type warnings!)\n",
    "class FactJudge(Signature):\n",
    "    \"\"\"Judge if the answer is factually correct based on the context.\"\"\"\n",
    "    \n",
    "    # No more type warnings with the updated InputField/OutputField!\n",
    "    context: str = InputField(desc=\"Context for the prediction\")\n",
    "    question: str = InputField(desc=\"Question to be answered\")\n",
    "    answer: str = InputField(desc=\"Answer for the question\")\n",
    "    factually_correct: bool = OutputField(desc=\"Is the answer factually correct based on the context?\")\n",
    "\n",
    "# Show the generated instruction\n",
    "print(\"Generated Instruction:\")\n",
    "print(FactJudge.get_instruction())\n",
    "print(\"\\n\" + \"=\"*50 + \"\\n\")\n",
    "\n",
    "# Show the input/output models (now always Pydantic models)\n",
    "input_model = FactJudge.get_input_model()\n",
    "output_model = FactJudge.get_output_model()\n",
    "\n",
    "print(\"Input Schema:\")\n",
    "print(json.dumps(input_model.model_json_schema(), indent=2))\n",
    "\n",
    "print(\"\\nOutput Schema:\")\n",
    "print(json.dumps(output_model.model_json_schema(), indent=2))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "380542eb",
   "metadata": {},
   "outputs": [],
   "source": [
    "class Sig(Signature):\n",
    "    \"\"\"You are a careful **translation evaluator**.\n",
    "\n",
    "You are given five inputs:\n",
    "\n",
    "* **Source Prompt** (the original text & any constraints)\n",
    "* **AI Translation** (the machine translation to evaluate)\n",
    "* **Human Reference** (a reference rendering; use only for guidance, not as ground truth)\n",
    "* **System Message** (an automated hint about a possible structural error)\n",
    "* **Glossaries** (optional terminology constraints; may be empty)\n",
    "\n",
    "## Your tasks\n",
    "\n",
    "1. **Check structure correctness**:\n",
    "   - Use the System Message as a hint.\n",
    "   - Assign a `structure_score`:\n",
    "     * `0` = structure is clearly wrong or the error flagged is correct.\n",
    "     * `1` = partially correct but flawed.\n",
    "     * `2` = structure is correct; the system error is invalid.\n",
    "\n",
    "2. **Check translation quality**:\n",
    "   - Compare AI Translation with Source Prompt and Human Reference.\n",
    "   - Assign a `translation_score`:\n",
    "     * `0` = unfaithful (major omissions/additions/distortions/repetitions).\n",
    "     * `1` = somewhat faithful (mostly correct but noticeable issues).\n",
    "     * `2` = faithful (preserves meaning, scope, nuance; only minor style differences).\n",
    "\n",
    "3. **Check glossary/terminology adherence**:\n",
    "   - If no glossary is provided → `term_score = 2`.\n",
    "   - If glossary exists but only partially followed → `term_score = 1`.\n",
    "   - If glossary exists but not followed at all → `term_score = 0`.\n",
    "\n",
    "## Output format (JSON only; no commentary)\n",
    "\n",
    "{{\"structure_score\": <0|1|2>, \"translation_score\": <0|1|2>, \"term_score\": <0|1|2>}}\n",
    "\n",
    "* Return exactly one JSON object.\n",
    "* Do not output any explanations.\n",
    "\"\"\"\n",
    "    SOURCE_PROMPT: str = InputField(desc=\"The original text to be translated, along with any constraints.\")\n",
    "    AI_TRANSLATION: str = InputField(desc=\"The machine translation output to be evaluated.\")\n",
    "    HUMAN_REFERENCE: str = InputField(desc=\"A reference human translation, to be used for guidance but not as ground truth.\")\n",
    "    SYSTEM_MESSAGE: str = InputField(desc=\"An automated hint about a possible structural error in the AI translation.\")\n",
    "    GLOSSARIES: str = InputField(desc=\"Optional terminology constraints; may be empty.\")\n",
    "    \n",
    "    structure_score: int = OutputField(desc=\"Score for structural correctness: 0 (wrong), 1 (partially correct), 2 (correct)\")\n",
    "    glossary_score: int = OutputField(desc=\"Score for glossary adherence: 0 (not followed), 1 (partially followed), 2 (fully followed or no glossary)\")\n",
    "    translation_score: int = OutputField(desc=\"Score for translation quality: 0 (unfaithful), 1 (somewhat faithful), 2 (faithful)\")\n",
    "    \n",
    "# --- Updated evaluation prompt ---\n",
    "\n",
    "import os\n",
    "judge = LLMJudgeBase(signature=Sig, client=8000) # vllm is hosted at port 8000\n",
    "judge = LLMJudgeBase(signature=Sig, model='gpt-4.1-mini', client=None) # use openai's gpt-4.1 model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "288ebc9b",
   "metadata": {},
   "outputs": [],
   "source": [
    "input_data = Sig.get_input_model()(\n",
    "    SOURCE_PROMPT=\"Translate the following English text to French, ensuring that the structure is preserved and the terminology is accurate. The text is: 'The quick brown fox jumps over the lazy dog.'\",\n",
    "    AI_TRANSLATION=\"Le renard brun rapide saute par-dessus le chien paresseux.\",\n",
    "    HUMAN_REFERENCE=\"Le vif renard brun bondit par-dessus le chien paresseux.\",\n",
    "    SYSTEM_MESSAGE=\"The AI translation has a structural error: it uses 'rapide' instead of 'vif' to describe the fox, which affects the nuance of the sentence.\",\n",
    "    GLOSSARIES=\"vif: quick, lively; paresseux: lazy\",\n",
    ")\n",
    "output = judge(input_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "70d221c1",
   "metadata": {},
   "outputs": [
    {
     "ename": "AttributeError",
     "evalue": "'LLMJudgeBase' object has no attribute 'inspect_history'",
     "output_type": "error",
     "traceback": [
      "\u001b[31m---------------------------------------------------------------------------\u001b[39m",
      "\u001b[31mAttributeError\u001b[39m Traceback (most recent call last)",
      "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[28]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m \u001b[43mjudge\u001b[49m\u001b[43m.\u001b[49m\u001b[43minspect_history\u001b[49m()\n",
      "\u001b[31mAttributeError\u001b[39m: 'LLMJudgeBase' object has no attribute 'inspect_history'"
     ]
    }
   ],
   "source": [
    "judge.inspect_history()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "59a2c995",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "Okay, let's start by looking at the structure. The system message says the AI translation used 'rapide' instead of 'vif', which affects the nuance. The original prompt specified using accurate terminology. The glossary provides 'vif' for 'quick', so the AI should have used 'vif' instead of 'rapide'. That's a structural issue because the term choice is specified. So structure_score is 0.\n",
      "\n",
      "Next, translation quality. The AI's translation is mostly correct but uses the wrong term. The human reference uses 'vif', which is the correct term according to the glossary. The meaning is preserved, but the nuance is off. So translation_score is 1 because it's somewhat faithful but has a noticeable issue.\n",
      "\n",
      "Glossary adherence: The glossary specifies 'vif' for 'quick', but the AI used 'rapide'. So the term wasn't followed. Term_score is 0.\n",
      "\n"
     ]
    }
   ],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2b16f277",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "speedy-utils",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
```