speedy-utils 1.1.23__tar.gz → 1.1.24__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79)
  1. speedy_utils-1.1.24/IMPROVEMENTS.md +141 -0
  2. {speedy_utils-1.1.23 → speedy_utils-1.1.24}/PKG-INFO +1 -1
  3. speedy_utils-1.1.24/examples/temperature_range_example.py +119 -0
  4. speedy_utils-1.1.24/examples_improved_error_tracing.py +85 -0
  5. speedy_utils-1.1.24/notebooks/llm_utils/llm_as_a_judge.ipynb +300 -0
  6. {speedy_utils-1.1.23 → speedy_utils-1.1.24}/pyproject.toml +1 -1
  7. {speedy_utils-1.1.23 → speedy_utils-1.1.24}/src/llm_utils/__init__.py +12 -8
  8. {speedy_utils-1.1.23 → speedy_utils-1.1.24}/src/llm_utils/chat_format/__init__.py +2 -0
  9. {speedy_utils-1.1.23 → speedy_utils-1.1.24}/src/llm_utils/chat_format/display.py +115 -44
  10. {speedy_utils-1.1.23 → speedy_utils-1.1.24}/src/llm_utils/lm/__init__.py +14 -6
  11. speedy_utils-1.1.24/src/llm_utils/lm/llm.py +413 -0
  12. speedy_utils-1.1.24/src/llm_utils/lm/llm_signature.py +35 -0
  13. speedy_utils-1.1.24/src/llm_utils/lm/mixins.py +379 -0
  14. {speedy_utils-1.1.23 → speedy_utils-1.1.24}/src/llm_utils/lm/openai_memoize.py +18 -7
  15. {speedy_utils-1.1.23 → speedy_utils-1.1.24}/src/llm_utils/lm/signature.py +26 -37
  16. {speedy_utils-1.1.23 → speedy_utils-1.1.24}/src/llm_utils/lm/utils.py +61 -76
  17. {speedy_utils-1.1.23 → speedy_utils-1.1.24}/src/speedy_utils/__init__.py +28 -1
  18. {speedy_utils-1.1.23 → speedy_utils-1.1.24}/src/speedy_utils/all.py +30 -1
  19. {speedy_utils-1.1.23 → speedy_utils-1.1.24}/src/speedy_utils/common/utils_io.py +36 -26
  20. {speedy_utils-1.1.23 → speedy_utils-1.1.24}/src/speedy_utils/common/utils_misc.py +25 -1
  21. {speedy_utils-1.1.23 → speedy_utils-1.1.24}/src/speedy_utils/multi_worker/thread.py +145 -58
  22. speedy_utils-1.1.24/tests/llm_utils/test_llm_mixins.py +153 -0
  23. speedy_utils-1.1.24/tests/test_multithread_error_trace.py +117 -0
  24. {speedy_utils-1.1.23 → speedy_utils-1.1.24}/uv.lock +8 -8
  25. speedy_utils-1.1.23/notebooks/llm_utils/llm_as_a_judge.ipynb +0 -642
  26. speedy_utils-1.1.23/src/llm_utils/lm/llm_as_a_judge.py +0 -390
  27. speedy_utils-1.1.23/src/llm_utils/lm/llm_task.py +0 -614
  28. {speedy_utils-1.1.23 → speedy_utils-1.1.24}/.github/copilot-instructions.md +0 -0
  29. {speedy_utils-1.1.23 → speedy_utils-1.1.24}/.github/workflows/publish.yml +0 -0
  30. {speedy_utils-1.1.23 → speedy_utils-1.1.24}/.gitignore +0 -0
  31. {speedy_utils-1.1.23 → speedy_utils-1.1.24}/.pre-commit-config.yaml +0 -0
  32. {speedy_utils-1.1.23 → speedy_utils-1.1.24}/README.md +0 -0
  33. {speedy_utils-1.1.23 → speedy_utils-1.1.24}/bumpversion.sh +0 -0
  34. {speedy_utils-1.1.23 → speedy_utils-1.1.24}/notebooks/test_multi_thread.ipynb +0 -0
  35. {speedy_utils-1.1.23 → speedy_utils-1.1.24}/ruff.toml +0 -0
  36. {speedy_utils-1.1.23 → speedy_utils-1.1.24}/scripts/deploy.sh +0 -0
  37. {speedy_utils-1.1.23 → speedy_utils-1.1.24}/setup.cfg +0 -0
  38. {speedy_utils-1.1.23 → speedy_utils-1.1.24}/src/llm_utils/chat_format/transform.py +0 -0
  39. {speedy_utils-1.1.23 → speedy_utils-1.1.24}/src/llm_utils/chat_format/utils.py +0 -0
  40. {speedy_utils-1.1.23 → speedy_utils-1.1.24}/src/llm_utils/group_messages.py +0 -0
  41. {speedy_utils-1.1.23 → speedy_utils-1.1.24}/src/llm_utils/lm/async_lm/__init__.py +0 -0
  42. {speedy_utils-1.1.23 → speedy_utils-1.1.24}/src/llm_utils/lm/async_lm/_utils.py +0 -0
  43. {speedy_utils-1.1.23 → speedy_utils-1.1.24}/src/llm_utils/lm/async_lm/async_llm_task.py +0 -0
  44. {speedy_utils-1.1.23 → speedy_utils-1.1.24}/src/llm_utils/lm/async_lm/async_lm.py +0 -0
  45. {speedy_utils-1.1.23 → speedy_utils-1.1.24}/src/llm_utils/lm/async_lm/async_lm_base.py +0 -0
  46. {speedy_utils-1.1.23 → speedy_utils-1.1.24}/src/llm_utils/lm/async_lm/lm_specific.py +0 -0
  47. {speedy_utils-1.1.23 → speedy_utils-1.1.24}/src/llm_utils/lm/base_prompt_builder.py +0 -0
  48. {speedy_utils-1.1.23 → speedy_utils-1.1.24}/src/llm_utils/lm/lm_base.py +0 -0
  49. {speedy_utils-1.1.23 → speedy_utils-1.1.24}/src/llm_utils/scripts/README.md +0 -0
  50. {speedy_utils-1.1.23 → speedy_utils-1.1.24}/src/llm_utils/scripts/vllm_load_balancer.py +0 -0
  51. {speedy_utils-1.1.23 → speedy_utils-1.1.24}/src/llm_utils/scripts/vllm_serve.py +0 -0
  52. {speedy_utils-1.1.23 → speedy_utils-1.1.24}/src/llm_utils/vector_cache/__init__.py +0 -0
  53. {speedy_utils-1.1.23 → speedy_utils-1.1.24}/src/llm_utils/vector_cache/cli.py +0 -0
  54. {speedy_utils-1.1.23 → speedy_utils-1.1.24}/src/llm_utils/vector_cache/core.py +0 -0
  55. {speedy_utils-1.1.23 → speedy_utils-1.1.24}/src/llm_utils/vector_cache/types.py +0 -0
  56. {speedy_utils-1.1.23 → speedy_utils-1.1.24}/src/llm_utils/vector_cache/utils.py +0 -0
  57. {speedy_utils-1.1.23 → speedy_utils-1.1.24}/src/speedy_utils/common/__init__.py +0 -0
  58. {speedy_utils-1.1.23 → speedy_utils-1.1.24}/src/speedy_utils/common/clock.py +0 -0
  59. {speedy_utils-1.1.23 → speedy_utils-1.1.24}/src/speedy_utils/common/function_decorator.py +0 -0
  60. {speedy_utils-1.1.23 → speedy_utils-1.1.24}/src/speedy_utils/common/logger.py +0 -0
  61. {speedy_utils-1.1.23 → speedy_utils-1.1.24}/src/speedy_utils/common/notebook_utils.py +0 -0
  62. {speedy_utils-1.1.23 → speedy_utils-1.1.24}/src/speedy_utils/common/patcher.py +0 -0
  63. {speedy_utils-1.1.23 → speedy_utils-1.1.24}/src/speedy_utils/common/report_manager.py +0 -0
  64. {speedy_utils-1.1.23 → speedy_utils-1.1.24}/src/speedy_utils/common/utils_cache.py +0 -0
  65. {speedy_utils-1.1.23 → speedy_utils-1.1.24}/src/speedy_utils/common/utils_print.py +0 -0
  66. {speedy_utils-1.1.23 → speedy_utils-1.1.24}/src/speedy_utils/multi_worker/__init__.py +0 -0
  67. {speedy_utils-1.1.23 → speedy_utils-1.1.24}/src/speedy_utils/multi_worker/process.py +0 -0
  68. {speedy_utils-1.1.23 → speedy_utils-1.1.24}/src/speedy_utils/scripts/__init__.py +0 -0
  69. {speedy_utils-1.1.23 → speedy_utils-1.1.24}/src/speedy_utils/scripts/mpython.py +0 -0
  70. {speedy_utils-1.1.23 → speedy_utils-1.1.24}/src/speedy_utils/scripts/openapi_client_codegen.py +0 -0
  71. {speedy_utils-1.1.23 → speedy_utils-1.1.24}/tests/sample_objects.py +0 -0
  72. {speedy_utils-1.1.23 → speedy_utils-1.1.24}/tests/test.py +0 -0
  73. {speedy_utils-1.1.23 → speedy_utils-1.1.24}/tests/test_logger.py +0 -0
  74. {speedy_utils-1.1.23 → speedy_utils-1.1.24}/tests/test_logger_format.py +0 -0
  75. {speedy_utils-1.1.23 → speedy_utils-1.1.24}/tests/test_memoize_typing.py +0 -0
  76. {speedy_utils-1.1.23 → speedy_utils-1.1.24}/tests/test_mpython.py +0 -0
  77. {speedy_utils-1.1.23 → speedy_utils-1.1.24}/tests/test_process.py +0 -0
  78. {speedy_utils-1.1.23 → speedy_utils-1.1.24}/tests/test_process_update.py +0 -0
  79. {speedy_utils-1.1.23 → speedy_utils-1.1.24}/tests/test_thread.py +0 -0
@@ -0,0 +1,141 @@
+ # Multi-thread Error Tracing Improvements
+
+ ## Summary
+
+ Significantly improved error tracing in `multi_thread` to focus on user code rather than infrastructure frames, making debugging much faster and easier.
+
+ ## Problem
+
+ Previously, when errors occurred in functions executed by `multi_thread`, the traceback was cluttered with infrastructure frames:
+
+ - `concurrent.futures` internals
+ - `threading.py` frames
+ - `multi_worker/thread.py` infrastructure code
+
+ This made it difficult to quickly identify the actual problem in user code.
+
+ ### Example of OLD behavior:
+
+ ```
+ TypeError Traceback (most recent call last)
+ Cell In[810], line 35
+ 33 choices = multi_thread(fns, range(n))
+ 34 return choices
+ ---> 35 choices = translate()
+
+ File ~/projects/speedy_utils/src/speedy_utils/multi_worker/thread.py:474, in multi_thread(...)
+ 472 idx, logical_size = _future_meta(fut)
+ 473 try:
+ --> 474 result = fut.result()
+ 475 except Exception as exc:
+ 476 if stop_on_error:
+
+ File ~/.local/share/uv/python/.../concurrent/futures/_base.py:449, in Future.result(...)
+ 447 raise CancelledError()
+ 448 elif self._state == FINISHED:
+ --> 449 return self.__get_result()
+
+ ... (many more infrastructure frames) ...
+
+ TypeError: 'list' object is not callable
+ ```
+
+ ## Solution
+
+ ### 1. Added `UserFunctionError` Exception Class
+
+ A custom exception wrapper that:
+
+ - Captures the original exception
+ - Stores the function name and input that caused the error
+ - Filters traceback to include only user code frames
+ - Provides clear, focused error messages
+
+ ### 2. Enhanced `_worker` Function
+
+ - Added validation to detect common mistakes (e.g., passing a list instead of a function)
+ - Filters tracebacks to remove infrastructure frames
+ - Wraps user function errors in `UserFunctionError` with clean context
+ - Provides helpful hints for common mistakes
+
+ ### 3. Improved Error Reporting in `multi_thread`
+
+ - Logs clear error messages showing function name and input
+ - Displays only user code in tracebacks
+ - Re-raises exceptions with cleaned messages
+ - Maintains proper exception chaining while hiding infrastructure noise
+
+ ## Benefits
+
+ ### Clear Error Messages
+
+ ```
+ Error in function "process_item" with input: 0
+
+ User code traceback:
+ File "your_script.py", line 20, in process_item
+ return my_list(x)
+ ^^^^^^^^^^
+ TypeError: 'list' object is not callable
+ ```
+
+ ### Helpful Hints
+
+ ```
+ TypeError:
+ multi_thread: func parameter must be callable, got list: [...]
+ Hint: Did you accidentally pass a list instead of a function?
+ ```
+
+ ### Nested Function Support
+
+ Shows complete call chain through user code:
+
+ ```
+ Error in function "process_data" with input: 0
+
+ User code traceback:
+ File "your_script.py", line 44, in process_data
+ return validate_and_calc(val)
+ ^^^^^^^^^^^^^^^^^^^^^^
+ File "your_script.py", line 42, in validate_and_calc
+ return 100 / x
+ ~~~~^~~
+ ZeroDivisionError: division by zero
+ ```
+
+ ## Key Improvements
+
+ ✅ **Errors show function name and problematic input**
+ ✅ **Tracebacks filtered to show only user code**
+ ✅ **No concurrent.futures/threading clutter**
+ ✅ **Helpful hints for common mistakes**
+ ✅ **Clear, actionable error messages**
+ ✅ **Maintains backward compatibility - all existing tests pass**
+
+ ## Testing
+
+ Run the comprehensive demo to see all improvements:
+
+ ```bash
+ python tests/test_multithread_error_trace.py
+ ```
+
+ This demonstrates:
+
+ 1. Simple function errors
+ 2. Nested function call traces
+ 3. Common parameter type mistakes
+ 4. Various exception types (TypeError, ValueError, AttributeError, etc.)
+
+ ## Code Changes
+
+ Main files modified:
+
+ - `src/speedy_utils/multi_worker/thread.py`:
+   - Added `UserFunctionError` exception class
+   - Enhanced `_worker` function with validation and error filtering
+   - Improved error handling in `multi_thread` main loop
+   - Added imports for `sys` and `traceback`
+
+ All changes maintain backward compatibility - existing code continues to work unchanged.
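
To make the mechanism described in IMPROVEMENTS.md concrete, here is a minimal illustrative sketch of the validate-wrap-filter idea. It is not the code shipped in `speedy_utils`; the class and helper names come from the notes above, and every implementation detail here is an assumption.

```python
# Illustrative sketch only -- not the implementation shipped in speedy_utils.
# It mirrors the idea in IMPROVEMENTS.md: validate the `func` argument, wrap
# user-function errors, and keep only traceback chunks from user code.
import traceback

_INFRA_MARKERS = ("concurrent/futures", "threading.py", "multi_worker/thread.py")


class UserFunctionError(Exception):
    """Wraps an exception raised by a user-supplied function (assumed shape)."""

    def __init__(self, func_name: str, item, original: BaseException) -> None:
        self.original = original
        formatted = traceback.format_exception(
            type(original), original, original.__traceback__
        )
        user_frames = [
            chunk
            for chunk in formatted
            if not any(marker in chunk for marker in _INFRA_MARKERS)
        ]
        message = (
            f'Error in function "{func_name}" with input: {item!r}\n\n'
            "User code traceback:\n" + "".join(user_frames)
        )
        super().__init__(message)


def _worker_sketch(func, item):
    """Hypothetical per-item worker showing the validation and wrapping steps."""
    if not callable(func):
        raise TypeError(
            f"multi_thread: func parameter must be callable, "
            f"got {type(func).__name__}: {func!r}\n"
            "Hint: Did you accidentally pass a list instead of a function?"
        )
    try:
        return func(item)
    except Exception as exc:
        name = getattr(func, "__name__", repr(func))
        raise UserFunctionError(name, item, exc) from exc
```

The real implementation may differ; the sketch only shows why the resulting message matches the "Clear Error Messages" and "Helpful Hints" examples quoted above.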
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: speedy-utils
- Version: 1.1.23
+ Version: 1.1.24
  Summary: Fast and easy-to-use package for data science
  Project-URL: Homepage, https://github.com/anhvth/speedy
  Project-URL: Repository, https://github.com/anhvth/speedy
@@ -0,0 +1,119 @@
+ """Example demonstrating temperature range sampling with LLM."""
+
+ from llm_utils import LLM
+ from pydantic import BaseModel
+
+
+ class CreativeStory(BaseModel):
+     """A creative story output."""
+
+     title: str
+     story: str
+     moral: str
+
+
+ def example_temperature_range_text():
+     """Example: Sample text responses with different temperatures."""
+     print("=" * 60)
+     print("Example 1: Temperature Range Sampling (Text Completion)")
+     print("=" * 60)
+
+     llm = LLM(
+         instruction="You are a creative writer. Write a very short story.",
+         output_model=str,
+     )
+
+     prompt = "Write a one-sentence story about a brave mouse."
+
+     # Sample with 5 different temperatures from 0.1 to 1.0
+     responses = llm(
+         prompt,
+         temperature_ranges=(0.1, 1.0),
+         n=5,
+     )
+
+     print(f"\nGenerated {len(responses)} responses with varying temperatures:\n")
+     for i, resp in enumerate(responses):
+         temp = 0.1 + i * ((1.0 - 0.1) / (5 - 1))
+         print(f"Temperature ~{temp:.2f}:")
+         print(f" {resp['parsed']}\n")
+
+
+ def example_temperature_range_pydantic():
+     """Example: Sample structured responses with different temperatures."""
+     print("=" * 60)
+     print("Example 2: Temperature Range with Pydantic Models")
+     print("=" * 60)
+
+     llm = LLM(
+         instruction="Create a creative short story with a moral lesson.",
+         output_model=CreativeStory,
+     )
+
+     prompt = "Topic: A robot learning to feel emotions"
+
+     # Sample with 3 different temperatures from 0.5 to 1.5
+     responses = llm(
+         prompt,
+         temperature_ranges=(0.5, 1.5),
+         n=3,
+     )
+
+     print(f"\nGenerated {len(responses)} stories with varying creativity:\n")
+     for i, resp in enumerate(responses):
+         temp = 0.5 + i * ((1.5 - 0.5) / (3 - 1))
+         story = resp["parsed"]
+         print(f"Temperature ~{temp:.2f}:")
+         print(f" Title: {story.title}")
+         print(f" Story: {story.story[:80]}...")
+         print(f" Moral: {story.moral}\n")
+
+
+ def example_two_step_parsing():
+     """Example: Two-step Pydantic parsing for models with reasoning."""
+     print("=" * 60)
+     print("Example 3: Two-Step Pydantic Parsing")
+     print("=" * 60)
+
+     llm = LLM(
+         instruction=("Analyze the given text and extract structured information. Think through your analysis first."),
+         output_model=CreativeStory,
+     )
+
+     prompt = "Analyze the story: 'The tortoise won the race by persistence.'"
+
+     # Use two-step parsing (useful for reasoning models)
+     response = llm(
+         prompt,
+         two_step_parse_pydantic=True,
+     )[0]
+
+     story = response["parsed"]
+     print("\nExtracted structure:")
+     print(f" Title: {story.title}")
+     print(f" Story: {story.story}")
+     print(f" Moral: {story.moral}")
+
+
+ if __name__ == "__main__":
+     # Run examples
+     # Note: These require a working OpenAI API key or local LLM server
+
+     try:
+         example_temperature_range_text()
+     except Exception as e:
+         print(f"Example 1 failed: {e}\n")
+
+     try:
+         example_temperature_range_pydantic()
+     except Exception as e:
+         print(f"Example 2 failed: {e}\n")
+
+     try:
+         example_two_step_parsing()
+     except Exception as e:
+         print(f"Example 3 failed: {e}\n")
+
+     print("\n" + "=" * 60)
+     print("Examples complete!")
+     print("=" * 60)
@@ -0,0 +1,85 @@
+ """
+ Direct comparison: Before and After error tracing improvements.
+
+ This demonstrates the exact improvement for the user's original error case.
+ """
+
+ from speedy_utils import multi_thread
+
+
+ def simulate_original_error():
+     """
+     Simulates the exact error from the user's example:
+     - User has a function that creates lambda functions
+     - Accidentally passes list of functions as 'func' parameter
+     - Gets TypeError: 'list' object is not callable
+     """
+
+     def lm_translate(msgs, temperature, max_tokens):
+         """Mock language model translate function."""
+         return [{'parsed': f'translation at temp={temperature:.2f}'}]
+
+     def translate(n=5, max_temperature=1.0):
+         """Function that generates choices with different temperatures."""
+         step = max_temperature / n
+         fns = []
+         target_text = 'Some text to translate'
+         msgs = [{'role': 'user', 'content': 'Translate this'}]
+
+         for i in range(n):
+             fn = lambda x: lm_translate(
+                 msgs,
+                 temperature=0.1 + 0.1 * i * step,
+                 max_tokens=len(target_text) + 32,
+             )[0]
+             fns.append(fn)
+
+         # THE BUG: User passed fns (a list) as the func parameter
+         # Should be: multi_thread(some_function, fns)
+         # Instead did: multi_thread(fns, range(n))
+         choices = multi_thread(fns, range(n), progress=False)
+         return choices
+
+     return translate()
+
+
+ def main():
+     print('='*70)
+     print('BEFORE vs AFTER: Error Tracing Improvements')
+     print('='*70)
+
+     print('\nBEFORE (old behavior):')
+     print('-' * 70)
+     print('''
+ The error traceback showed:
+ - Line in multi_thread.py:474
+ - concurrent.futures/_base.py:449
+ - concurrent.futures/thread.py:59
+ - multi_worker/thread.py:155
+ - ... many infrastructure frames ...
+ - Finally: TypeError: 'list' object is not callable
+
+ User had to dig through 10+ lines of infrastructure code
+ to find the actual problem.
+ ''')
+
+     print('\nAFTER (new behavior):')
+     print('-' * 70)
+
+     try:
+         simulate_original_error()
+     except TypeError as e:
+         print(f'\n{type(e).__name__}: {e}\n')
+
+     print('-' * 70)
+     print('\nKey differences:')
+     print(' ✓ Immediate identification of the problem')
+     print(' ✓ Clear hint about what went wrong')
+     print(' ✓ Shows exactly what was passed (list of functions)')
+     print(' ✓ No infrastructure clutter')
+     print(' ✓ Debugging time: < 5 seconds vs > 1 minute')
+     print('='*70)
+
+
+ if __name__ == '__main__':
+     main()
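
For contrast with the bug the script above reproduces, the intended call shape passes one callable plus the iterable of inputs. A hedged sketch follows; the worker function here is a stand-in, not part of the package.

```python
# Sketch of the intended multi_thread call pattern, contrasted with the bug above.
from speedy_utils import multi_thread

temperatures = [0.1, 0.3, 0.5, 0.7, 0.9]


def translate_at(temperature: float) -> str:
    # Stand-in for a real lm_translate call.
    return f"translation at temp={temperature:.2f}"


# Correct: one callable, many inputs.
choices = multi_thread(translate_at, temperatures, progress=False)

# Buggy (what simulate_original_error does): a list passed as `func`.
# multi_thread([translate_at] * 5, range(5))  # -> TypeError with the new hint
```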
@@ -0,0 +1,300 @@
+ {
+  "cells": [
+   {
+    "cell_type": "markdown",
+    "id": "136ff273",
+    "metadata": {},
+    "source": [
+     "# LLM-as-a-Judge Tutorial\n",
+     "\n",
+     "This notebook demonstrates how to use the LLM-as-a-Judge system with structured prompts, variable substitution, and SFT export capabilities."
+    ]
+   },
+   {
+    "cell_type": "markdown",
+    "id": "60c1bf59",
+    "metadata": {},
+    "source": [
+     "## Setup and Imports"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 2,
+    "id": "bb8f8e2b",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "\n",
+     "from llm_utils import (\n",
+     " LLMJudgeBase, \n",
+     " Signature, \n",
+     " InputField, \n",
+     " OutputField\n",
+     ")\n",
+     "from pydantic import BaseModel\n",
+     "import json"
+    ]
+   },
+   {
+    "cell_type": "markdown",
+    "id": "eaceb8bd",
+    "metadata": {},
+    "source": [
+     "## Example 1: DSPy-like Signature System\n",
+     "\n",
+     "First, let's create a simple factual accuracy judge using the Signature system:"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 3,
+    "id": "5b5e2123",
+    "metadata": {},
+    "outputs": [
+     {
+      "name": "stdout",
+      "output_type": "stream",
+      "text": [
+       "Generated Instruction:\n",
+       "Judge if the answer is factually correct based on the context.\n",
+       "\n",
+       "**Input Fields:**\n",
+       "- context (str): Context for the prediction\n",
+       "- question (str): Question to be answered\n",
+       "- answer (str): Answer for the question\n",
+       "\n",
+       "**Output Fields:**\n",
+       "- factually_correct (bool): Is the answer factually correct based on the context?\n",
+       "\n",
+       "\n",
+       "==================================================\n",
+       "\n",
+       "Input Schema:\n",
+       "{\n",
+       " \"properties\": {\n",
+       " \"context\": {\n",
+       " \"description\": \"Context for the prediction\",\n",
+       " \"title\": \"Context\",\n",
+       " \"type\": \"string\"\n",
+       " },\n",
+       " \"question\": {\n",
+       " \"description\": \"Question to be answered\",\n",
+       " \"title\": \"Question\",\n",
+       " \"type\": \"string\"\n",
+       " },\n",
+       " \"answer\": {\n",
+       " \"description\": \"Answer for the question\",\n",
+       " \"title\": \"Answer\",\n",
+       " \"type\": \"string\"\n",
+       " }\n",
+       " },\n",
+       " \"required\": [\n",
+       " \"context\",\n",
+       " \"question\",\n",
+       " \"answer\"\n",
+       " ],\n",
+       " \"title\": \"FactJudgeInput\",\n",
+       " \"type\": \"object\"\n",
+       "}\n",
+       "\n",
+       "Output Schema:\n",
+       "{\n",
+       " \"properties\": {\n",
+       " \"factually_correct\": {\n",
+       " \"description\": \"Is the answer factually correct based on the context?\",\n",
+       " \"title\": \"Factually Correct\",\n",
+       " \"type\": \"boolean\"\n",
+       " }\n",
+       " },\n",
+       " \"required\": [\n",
+       " \"factually_correct\"\n",
+       " ],\n",
+       " \"title\": \"FactJudgeOutput\",\n",
+       " \"type\": \"object\"\n",
+       "}\n"
+      ]
+     }
+    ],
+    "source": [
+     "# Define a signature like DSPy (original syntax - no more type warnings!)\n",
+     "class FactJudge(Signature):\n",
+     " \"\"\"Judge if the answer is factually correct based on the context.\"\"\"\n",
+     " \n",
+     " # No more type warnings with the updated InputField/OutputField!\n",
+     " context: str = InputField(desc=\"Context for the prediction\")\n",
+     " question: str = InputField(desc=\"Question to be answered\")\n",
+     " answer: str = InputField(desc=\"Answer for the question\")\n",
+     " factually_correct: bool = OutputField(desc=\"Is the answer factually correct based on the context?\")\n",
+     "\n",
+     "# Show the generated instruction\n",
+     "print(\"Generated Instruction:\")\n",
+     "print(FactJudge.get_instruction())\n",
+     "print(\"\\n\" + \"=\"*50 + \"\\n\")\n",
+     "\n",
+     "# Show the input/output models (now always Pydantic models)\n",
+     "input_model = FactJudge.get_input_model()\n",
+     "output_model = FactJudge.get_output_model()\n",
+     "\n",
+     "print(\"Input Schema:\")\n",
+     "print(json.dumps(input_model.model_json_schema(), indent=2))\n",
+     "\n",
+     "print(\"\\nOutput Schema:\")\n",
+     "print(json.dumps(output_model.model_json_schema(), indent=2))"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 23,
+    "id": "380542eb",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "class Sig(Signature):\n",
+     " \"\"\"You are a careful **translation evaluator**.\n",
+     "\n",
+     "You are given five inputs:\n",
+     "\n",
+     "* **Source Prompt** (the original text & any constraints)\n",
+     "* **AI Translation** (the machine translation to evaluate)\n",
+     "* **Human Reference** (a reference rendering; use only for guidance, not as ground truth)\n",
+     "* **System Message** (an automated hint about a possible structural error)\n",
+     "* **Glossaries** (optional terminology constraints; may be empty)\n",
+     "\n",
+     "## Your tasks\n",
+     "\n",
+     "1. **Check structure correctness**:\n",
+     " - Use the System Message as a hint.\n",
+     " - Assign a `structure_score`:\n",
+     " * `0` = structure is clearly wrong or the error flagged is correct.\n",
+     " * `1` = partially correct but flawed.\n",
+     " * `2` = structure is correct; the system error is invalid.\n",
+     "\n",
+     "2. **Check translation quality**:\n",
+     " - Compare AI Translation with Source Prompt and Human Reference.\n",
+     " - Assign a `translation_score`:\n",
+     " * `0` = unfaithful (major omissions/additions/distortions/repetitions).\n",
+     " * `1` = somewhat faithful (mostly correct but noticeable issues).\n",
+     " * `2` = faithful (preserves meaning, scope, nuance; only minor style differences).\n",
+     "\n",
+     "3. **Check glossary/terminology adherence**:\n",
+     " - If no glossary is provided → `term_score = 2`.\n",
+     " - If glossary exists but only partially followed → `term_score = 1`.\n",
+     " - If glossary exists but not followed at all → `term_score = 0`.\n",
+     "\n",
+     "## Output format (JSON only; no commentary)\n",
+     "\n",
+     "{{\"structure_score\": <0|1|2>, \"translation_score\": <0|1|2>, \"term_score\": <0|1|2>}}\n",
+     "\n",
+     "* Return exactly one JSON object.\n",
+     "* Do not output any explanations.\n",
+     "\"\"\"\n",
+     " SOURCE_PROMPT: str = InputField(desc=\"The original text to be translated, along with any constraints.\")\n",
+     " AI_TRANSLATION: str = InputField(desc=\"The machine translation output to be evaluated.\")\n",
+     " HUMAN_REFERENCE: str = InputField(desc=\"A reference human translation, to be used for guidance but not as ground truth.\")\n",
+     " SYSTEM_MESSAGE: str = InputField(desc=\"An automated hint about a possible structural error in the AI translation.\")\n",
+     " GLOSSARIES: str = InputField(desc=\"Optional terminology constraints; may be empty.\")\n",
+     " \n",
+     " structure_score: int = OutputField(desc=\"Score for structural correctness: 0 (wrong), 1 (partially correct), 2 (correct)\")\n",
+     " glossary_score: int = OutputField(desc=\"Score for glossary adherence: 0 (not followed), 1 (partially followed), 2 (fully followed or no glossary)\")\n",
+     " translation_score: int = OutputField(desc=\"Score for translation quality: 0 (unfaithful), 1 (somewhat faithful), 2 (faithful)\")\n",
+     " \n",
+     "# --- Updated evaluation prompt ---\n",
+     "\n",
+     "import os\n",
+     "judge = LLMJudgeBase(signature=Sig, client=8000) # vllm is hosted at port 8000\n",
+     "judge = LLMJudgeBase(signature=Sig, model='gpt-4.1-mini', client=None) # use openai's gpt-4.1 model"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 24,
+    "id": "288ebc9b",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "input_data = Sig.get_input_model()(\n",
+     " SOURCE_PROMPT=\"Translate the following English text to French, ensuring that the structure is preserved and the terminology is accurate. The text is: 'The quick brown fox jumps over the lazy dog.'\",\n",
+     " AI_TRANSLATION=\"Le renard brun rapide saute par-dessus le chien paresseux.\",\n",
+     " HUMAN_REFERENCE=\"Le vif renard brun bondit par-dessus le chien paresseux.\",\n",
+     " SYSTEM_MESSAGE=\"The AI translation has a structural error: it uses 'rapide' instead of 'vif' to describe the fox, which affects the nuance of the sentence.\",\n",
+     " GLOSSARIES=\"vif: quick, lively; paresseux: lazy\",\n",
+     ")\n",
+     "output = judge(input_data)"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 28,
+    "id": "70d221c1",
+    "metadata": {},
+    "outputs": [
+     {
+      "ename": "AttributeError",
+      "evalue": "'LLMJudgeBase' object has no attribute 'inspect_history'",
+      "output_type": "error",
+      "traceback": [
+       "\u001b[31m---------------------------------------------------------------------------\u001b[39m",
+       "\u001b[31mAttributeError\u001b[39m Traceback (most recent call last)",
+       "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[28]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m \u001b[43mjudge\u001b[49m\u001b[43m.\u001b[49m\u001b[43minspect_history\u001b[49m()\n",
+       "\u001b[31mAttributeError\u001b[39m: 'LLMJudgeBase' object has no attribute 'inspect_history'"
+      ]
+     }
+    ],
+    "source": [
+     "judge.inspect_history()"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "59a2c995",
+    "metadata": {},
+    "outputs": [
+     {
+      "name": "stdout",
+      "output_type": "stream",
+      "text": [
+       "\n",
+       "Okay, let's start by looking at the structure. The system message says the AI translation used 'rapide' instead of 'vif', which affects the nuance. The original prompt specified using accurate terminology. The glossary provides 'vif' for 'quick', so the AI should have used 'vif' instead of 'rapide'. That's a structural issue because the term choice is specified. So structure_score is 0.\n",
+       "\n",
+       "Next, translation quality. The AI's translation is mostly correct but uses the wrong term. The human reference uses 'vif', which is the correct term according to the glossary. The meaning is preserved, but the nuance is off. So translation_score is 1 because it's somewhat faithful but has a noticeable issue.\n",
+       "\n",
+       "Glossary adherence: The glossary specifies 'vif' for 'quick', but the AI used 'rapide'. So the term wasn't followed. Term_score is 0.\n",
+       "\n"
+      ]
+     }
+    ],
+    "source": []
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "2b16f277",
+    "metadata": {},
+    "outputs": [],
+    "source": []
+   }
+  ],
+  "metadata": {
+   "kernelspec": {
+    "display_name": "speedy-utils",
+    "language": "python",
+    "name": "python3"
+   },
+   "language_info": {
+    "codemirror_mode": {
+     "name": "ipython",
+     "version": 3
+    },
+    "file_extension": ".py",
+    "mimetype": "text/x-python",
+    "name": "python",
+    "nbconvert_exporter": "python",
+    "pygments_lexer": "ipython3",
+    "version": "3.13.7"
+   }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 5
+ }
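
Building on the single-example call in the notebook (`output = judge(input_data)`), the judge can plausibly be combined with `multi_thread` for batch scoring. This is a sketch under the assumption that `judge` and `Sig` behave exactly as in the cells above; the row contents are placeholders.

```python
# Sketch: batch-scoring several translation records with the judge defined above.
# Assumes `judge` and `Sig` are the objects constructed in the notebook cells.
from speedy_utils import multi_thread

rows = [
    {
        "SOURCE_PROMPT": "...",
        "AI_TRANSLATION": "...",
        "HUMAN_REFERENCE": "...",
        "SYSTEM_MESSAGE": "",
        "GLOSSARIES": "",
    },
]

InputModel = Sig.get_input_model()


def score_row(row: dict):
    # Build the pydantic input model from a plain dict, then call the judge.
    return judge(InputModel(**row))


scores = multi_thread(score_row, rows, progress=False)
```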
@@ -1,6 +1,6 @@
  [project]
  name = "speedy-utils"
- version = "1.1.23"
+ version = "1.1.24"
  description = "Fast and easy-to-use package for data science"
  authors = [{ name = "AnhVTH", email = "anhvth.226@gmail.com" }]
  readme = "README.md"