edsl 0.1.50__py3-none-any.whl → 0.1.52__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- edsl/__init__.py +45 -34
- edsl/__version__.py +1 -1
- edsl/base/base_exception.py +2 -2
- edsl/buckets/bucket_collection.py +1 -1
- edsl/buckets/exceptions.py +32 -0
- edsl/buckets/token_bucket_api.py +26 -10
- edsl/caching/cache.py +5 -2
- edsl/caching/remote_cache_sync.py +5 -5
- edsl/caching/sql_dict.py +12 -11
- edsl/config/__init__.py +1 -1
- edsl/config/config_class.py +4 -2
- edsl/conversation/Conversation.py +9 -5
- edsl/conversation/car_buying.py +1 -3
- edsl/conversation/mug_negotiation.py +2 -6
- edsl/coop/__init__.py +11 -8
- edsl/coop/coop.py +15 -13
- edsl/coop/coop_functions.py +1 -1
- edsl/coop/ep_key_handling.py +1 -1
- edsl/coop/price_fetcher.py +2 -2
- edsl/coop/utils.py +2 -2
- edsl/dataset/dataset.py +144 -63
- edsl/dataset/dataset_operations_mixin.py +14 -6
- edsl/dataset/dataset_tree.py +3 -3
- edsl/dataset/display/table_renderers.py +6 -3
- edsl/dataset/file_exports.py +4 -4
- edsl/dataset/r/ggplot.py +3 -3
- edsl/inference_services/available_model_fetcher.py +2 -2
- edsl/inference_services/data_structures.py +5 -5
- edsl/inference_services/inference_service_abc.py +1 -1
- edsl/inference_services/inference_services_collection.py +1 -1
- edsl/inference_services/service_availability.py +3 -3
- edsl/inference_services/services/azure_ai.py +3 -3
- edsl/inference_services/services/google_service.py +1 -1
- edsl/inference_services/services/test_service.py +1 -1
- edsl/instructions/change_instruction.py +5 -4
- edsl/instructions/instruction.py +1 -0
- edsl/instructions/instruction_collection.py +5 -4
- edsl/instructions/instruction_handler.py +10 -8
- edsl/interviews/answering_function.py +20 -21
- edsl/interviews/exception_tracking.py +3 -2
- edsl/interviews/interview.py +1 -1
- edsl/interviews/interview_status_dictionary.py +1 -1
- edsl/interviews/interview_task_manager.py +7 -4
- edsl/interviews/request_token_estimator.py +3 -2
- edsl/interviews/statistics.py +2 -2
- edsl/invigilators/invigilators.py +34 -6
- edsl/jobs/__init__.py +39 -2
- edsl/jobs/async_interview_runner.py +1 -1
- edsl/jobs/check_survey_scenario_compatibility.py +5 -5
- edsl/jobs/data_structures.py +2 -2
- edsl/jobs/html_table_job_logger.py +494 -257
- edsl/jobs/jobs.py +2 -2
- edsl/jobs/jobs_checks.py +5 -5
- edsl/jobs/jobs_component_constructor.py +2 -2
- edsl/jobs/jobs_pricing_estimation.py +1 -1
- edsl/jobs/jobs_runner_asyncio.py +2 -2
- edsl/jobs/jobs_status_enums.py +1 -0
- edsl/jobs/remote_inference.py +47 -13
- edsl/jobs/results_exceptions_handler.py +2 -2
- edsl/language_models/language_model.py +151 -145
- edsl/notebooks/__init__.py +24 -1
- edsl/notebooks/exceptions.py +82 -0
- edsl/notebooks/notebook.py +7 -3
- edsl/notebooks/notebook_to_latex.py +1 -1
- edsl/prompts/__init__.py +23 -2
- edsl/prompts/prompt.py +1 -1
- edsl/questions/__init__.py +4 -4
- edsl/questions/answer_validator_mixin.py +0 -5
- edsl/questions/compose_questions.py +2 -2
- edsl/questions/descriptors.py +1 -1
- edsl/questions/question_base.py +32 -3
- edsl/questions/question_base_prompts_mixin.py +4 -4
- edsl/questions/question_budget.py +503 -102
- edsl/questions/question_check_box.py +658 -156
- edsl/questions/question_dict.py +176 -2
- edsl/questions/question_extract.py +401 -61
- edsl/questions/question_free_text.py +77 -9
- edsl/questions/question_functional.py +118 -9
- edsl/questions/{derived/question_likert_five.py → question_likert_five.py} +2 -2
- edsl/questions/{derived/question_linear_scale.py → question_linear_scale.py} +3 -4
- edsl/questions/question_list.py +246 -26
- edsl/questions/question_matrix.py +586 -73
- edsl/questions/question_multiple_choice.py +213 -47
- edsl/questions/question_numerical.py +360 -29
- edsl/questions/question_rank.py +401 -124
- edsl/questions/question_registry.py +3 -3
- edsl/questions/{derived/question_top_k.py → question_top_k.py} +3 -3
- edsl/questions/{derived/question_yes_no.py → question_yes_no.py} +3 -4
- edsl/questions/register_questions_meta.py +2 -1
- edsl/questions/response_validator_abc.py +6 -2
- edsl/questions/response_validator_factory.py +10 -12
- edsl/results/report.py +1 -1
- edsl/results/result.py +7 -4
- edsl/results/results.py +500 -271
- edsl/results/results_selector.py +2 -2
- edsl/scenarios/construct_download_link.py +3 -3
- edsl/scenarios/scenario.py +1 -2
- edsl/scenarios/scenario_list.py +41 -23
- edsl/surveys/survey_css.py +3 -3
- edsl/surveys/survey_simulator.py +2 -1
- edsl/tasks/__init__.py +22 -2
- edsl/tasks/exceptions.py +72 -0
- edsl/tasks/task_history.py +48 -11
- edsl/templates/error_reporting/base.html +37 -4
- edsl/templates/error_reporting/exceptions_table.html +105 -33
- edsl/templates/error_reporting/interview_details.html +130 -126
- edsl/templates/error_reporting/overview.html +21 -25
- edsl/templates/error_reporting/report.css +215 -46
- edsl/templates/error_reporting/report.js +122 -20
- edsl/tokens/__init__.py +27 -1
- edsl/tokens/exceptions.py +37 -0
- edsl/tokens/interview_token_usage.py +3 -2
- edsl/tokens/token_usage.py +4 -3
- {edsl-0.1.50.dist-info → edsl-0.1.52.dist-info}/METADATA +1 -1
- {edsl-0.1.50.dist-info → edsl-0.1.52.dist-info}/RECORD +118 -116
- edsl/questions/derived/__init__.py +0 -0
- {edsl-0.1.50.dist-info → edsl-0.1.52.dist-info}/LICENSE +0 -0
- {edsl-0.1.50.dist-info → edsl-0.1.52.dist-info}/WHEEL +0 -0
- {edsl-0.1.50.dist-info → edsl-0.1.52.dist-info}/entry_points.txt +0 -0
edsl/results/results.py
CHANGED
@@ -1,5 +1,4 @@
|
|
1
|
-
"""
|
2
|
-
The Results module provides tools for working with collections of Result objects.
|
1
|
+
"""The Results module provides tools for working with collections of Result objects.
|
3
2
|
|
4
3
|
The Results class is the primary container for analyzing and manipulating data obtained
|
5
4
|
from running surveys with language models. It implements a powerful data analysis interface
|
@@ -45,10 +44,10 @@ from typing import Optional, Callable, Any, Union, List, TYPE_CHECKING
|
|
45
44
|
from bisect import bisect_left
|
46
45
|
|
47
46
|
from ..base import Base
|
47
|
+
from ..caching import Cache, CacheEntry
|
48
48
|
|
49
49
|
if TYPE_CHECKING:
|
50
50
|
from ..surveys import Survey
|
51
|
-
from ..data import Cache
|
52
51
|
from ..agents import AgentList
|
53
52
|
from ..scenarios import ScenarioList
|
54
53
|
from ..results import Result
|
@@ -70,23 +69,43 @@ from .exceptions import (
|
|
70
69
|
ResultsDeserializationError,
|
71
70
|
)
|
72
71
|
|
72
|
+
|
73
73
|
def ensure_fetched(method):
|
74
|
-
"""A decorator that checks if remote data is loaded, and if not, attempts to fetch it.
|
74
|
+
"""A decorator that checks if remote data is loaded, and if not, attempts to fetch it.
|
75
|
+
|
76
|
+
Args:
|
77
|
+
method: The method to decorate.
|
78
|
+
|
79
|
+
Returns:
|
80
|
+
The wrapped method that will ensure data is fetched before execution.
|
81
|
+
"""
|
82
|
+
|
75
83
|
def wrapper(self, *args, **kwargs):
|
76
84
|
if not self._fetched:
|
77
85
|
# If not fetched, try fetching now.
|
78
86
|
# (If you know you have job info stored in self.job_info)
|
79
87
|
self.fetch_remote(self.job_info)
|
80
88
|
return method(self, *args, **kwargs)
|
89
|
+
|
81
90
|
return wrapper
|
82
91
|
|
92
|
+
|
83
93
|
def ensure_ready(method):
|
84
|
-
"""
|
85
|
-
|
86
|
-
|
94
|
+
"""Decorator for Results methods to handle not-ready state.
|
95
|
+
|
87
96
|
If the Results object is not ready, for most methods we return a NotReadyObject.
|
88
97
|
However, for __repr__ (and other methods that need to return a string), we return
|
89
98
|
the string representation of NotReadyObject.
|
99
|
+
|
100
|
+
Args:
|
101
|
+
method: The method to decorate.
|
102
|
+
|
103
|
+
Returns:
|
104
|
+
The wrapped method that will handle not-ready Results objects appropriately.
|
105
|
+
|
106
|
+
Raises:
|
107
|
+
Exception: Any exception from fetch_remote will be caught and printed.
|
108
|
+
|
90
109
|
"""
|
91
110
|
from functools import wraps
|
92
111
|
|
@@ -101,7 +120,7 @@ def ensure_ready(method):
|
|
101
120
|
except Exception as e:
|
102
121
|
print(f"Error during fetch_remote in {method.__name__}: {e}")
|
103
122
|
if not self.completed:
|
104
|
-
not_ready = NotReadyObject(name
|
123
|
+
not_ready = NotReadyObject(name=method.__name__, job_info=self.job_info)
|
105
124
|
# For __repr__, ensure we return a string
|
106
125
|
if method.__name__ == "__repr__" or method.__name__ == "__str__":
|
107
126
|
return not_ready.__repr__()
|
@@ -110,59 +129,115 @@ def ensure_ready(method):
|
|
110
129
|
|
111
130
|
return wrapper
|
112
131
|
|
132
|
+
|
113
133
|
class NotReadyObject:
|
114
|
-
"""A placeholder object that
|
115
|
-
|
134
|
+
"""A placeholder object that indicates results are not ready yet.
|
135
|
+
|
136
|
+
This class returns itself for all attribute accesses and method calls,
|
137
|
+
displaying a message about the job's running status when represented as a string.
|
138
|
+
|
139
|
+
Attributes:
|
140
|
+
name: The name of the method that was originally called.
|
141
|
+
job_info: Information about the running job.
|
142
|
+
|
143
|
+
"""
|
144
|
+
|
145
|
+
def __init__(self, name: str, job_info: "Any"):
|
146
|
+
"""Initialize a NotReadyObject.
|
147
|
+
|
148
|
+
Args:
|
149
|
+
name: The name of the method that was attempted to be called.
|
150
|
+
job_info: Information about the running job.
|
151
|
+
"""
|
116
152
|
self.name = name
|
117
153
|
self.job_info = job_info
|
118
|
-
#print(f"Not ready to call {name}")
|
154
|
+
# print(f"Not ready to call {name}")
|
119
155
|
|
120
156
|
def __repr__(self):
|
157
|
+
"""Generate a string representation showing the job is still running.
|
158
|
+
|
159
|
+
Returns:
|
160
|
+
str: A message indicating the job is still running, along with job details.
|
161
|
+
"""
|
121
162
|
message = """Results not ready - job still running on server."""
|
122
163
|
for key, value in self.job_info.creation_data.items():
|
123
164
|
message += f"\n{key}: {value}"
|
124
165
|
return message
|
125
166
|
|
126
167
|
def __getattr__(self, _):
|
168
|
+
"""Return self for any attribute access.
|
169
|
+
|
170
|
+
Args:
|
171
|
+
_: The attribute name (ignored).
|
172
|
+
|
173
|
+
Returns:
|
174
|
+
NotReadyObject: Returns self for chaining.
|
175
|
+
"""
|
127
176
|
return self
|
128
|
-
|
177
|
+
|
129
178
|
def __call__(self, *args, **kwargs):
|
179
|
+
"""Return self when called as a function.
|
180
|
+
|
181
|
+
Args:
|
182
|
+
*args: Positional arguments (ignored).
|
183
|
+
**kwargs: Keyword arguments (ignored).
|
184
|
+
|
185
|
+
Returns:
|
186
|
+
NotReadyObject: Returns self for chaining.
|
187
|
+
"""
|
130
188
|
return self
|
131
189
|
|
132
190
|
|
133
191
|
class Results(UserList, ResultsOperationsMixin, Base):
|
134
|
-
"""
|
135
|
-
|
136
|
-
|
192
|
+
"""A collection of Result objects with powerful data analysis capabilities.
|
193
|
+
|
137
194
|
The Results class is the primary container for working with data from EDSL surveys.
|
138
195
|
It provides a rich set of methods for data analysis, transformation, and visualization
|
139
|
-
inspired by data manipulation libraries like dplyr and pandas. The Results class
|
140
|
-
implements a functional, fluent interface for data manipulation where each method
|
196
|
+
inspired by data manipulation libraries like dplyr and pandas. The Results class
|
197
|
+
implements a functional, fluent interface for data manipulation where each method
|
141
198
|
returns a new Results object, allowing method chaining.
|
142
|
-
|
199
|
+
|
200
|
+
Attributes:
|
201
|
+
survey: The Survey object containing the questions used to generate results.
|
202
|
+
data: A list of Result objects containing the responses.
|
203
|
+
created_columns: A list of column names created through transformations.
|
204
|
+
cache: A Cache object for storing model responses.
|
205
|
+
completed: Whether the Results object is ready for use.
|
206
|
+
task_history: A TaskHistory object containing information about the tasks.
|
207
|
+
known_data_types: List of valid data type strings for accessing data.
|
208
|
+
|
143
209
|
Key features:
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
210
|
+
- List-like interface for accessing individual Result objects
|
211
|
+
- Selection of specific data columns with `select()`
|
212
|
+
- Filtering results with boolean expressions using `filter()`
|
213
|
+
- Creating new derived columns with `mutate()`
|
214
|
+
- Recoding values with `recode()` and `answer_truncate()`
|
215
|
+
- Sorting results with `order_by()`
|
216
|
+
- Converting to other formats (dataset, table, pandas DataFrame)
|
217
|
+
- Serialization for storage and retrieval
|
218
|
+
- Support for remote execution and result retrieval
|
219
|
+
|
155
220
|
Results objects have a hierarchical structure with the following components:
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
221
|
+
1. Each Results object contains multiple Result objects
|
222
|
+
2. Each Result object contains data organized by type (agent, scenario, model, answer, etc.)
|
223
|
+
3. Each data type contains multiple attributes (e.g., "how_feeling" in the answer type)
|
224
|
+
|
161
225
|
You can access data in a Results object using dot notation (`answer.how_feeling`) or
|
162
226
|
using just the attribute name if it's not ambiguous (`how_feeling`).
|
163
|
-
|
227
|
+
|
164
228
|
The Results class also tracks "created columns" - new derived values that aren't
|
165
229
|
part of the original data but were created through transformations.
|
230
|
+
|
231
|
+
Examples:
|
232
|
+
>>> # Create a simple Results object from example data
|
233
|
+
>>> r = Results.example()
|
234
|
+
>>> len(r) > 0 # Contains Result objects
|
235
|
+
True
|
236
|
+
>>> # Filter and transform data
|
237
|
+
>>> filtered = r.filter("how_feeling == 'Great'")
|
238
|
+
>>> # Access hierarchical data
|
239
|
+
>>> 'agent' in r.known_data_types
|
240
|
+
True
|
166
241
|
"""
|
167
242
|
|
168
243
|
__documentation__ = "https://docs.expectedparrot.com/en/latest/results.html"
|
@@ -185,9 +260,28 @@ class Results(UserList, ResultsOperationsMixin, Base):
|
|
185
260
|
]
|
186
261
|
|
187
262
|
@classmethod
|
188
|
-
def from_job_info(cls, job_info: dict) -> Results:
|
189
|
-
"""
|
190
|
-
|
263
|
+
def from_job_info(cls, job_info: dict) -> "Results":
|
264
|
+
"""Instantiate a Results object from a job info dictionary.
|
265
|
+
|
266
|
+
This method creates a Results object in a not-ready state that will
|
267
|
+
fetch its data from a remote source when methods are called on it.
|
268
|
+
|
269
|
+
Args:
|
270
|
+
job_info: Dictionary containing information about a remote job.
|
271
|
+
|
272
|
+
Returns:
|
273
|
+
Results: A new Results instance with completed=False that will
|
274
|
+
fetch remote data when needed.
|
275
|
+
|
276
|
+
Examples:
|
277
|
+
>>> # Create a job info dictionary
|
278
|
+
>>> job_info = {'job_uuid': '12345', 'creation_data': {'model': 'gpt-4'}}
|
279
|
+
>>> # Create a Results object from the job info
|
280
|
+
>>> results = Results.from_job_info(job_info)
|
281
|
+
>>> results.completed
|
282
|
+
False
|
283
|
+
>>> hasattr(results, 'job_info')
|
284
|
+
True
|
191
285
|
"""
|
192
286
|
results = cls()
|
193
287
|
results.completed = False
|
@@ -204,14 +298,37 @@ class Results(UserList, ResultsOperationsMixin, Base):
|
|
204
298
|
total_results: Optional[int] = None,
|
205
299
|
task_history: Optional[TaskHistory] = None,
|
206
300
|
):
|
207
|
-
"""Instantiate a
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
:
|
214
|
-
|
301
|
+
"""Instantiate a Results object with a survey and a list of Result objects.
|
302
|
+
|
303
|
+
This initializes a completed Results object with the provided data.
|
304
|
+
For creating a not-ready Results object from a job info dictionary,
|
305
|
+
use the from_job_info class method instead.
|
306
|
+
|
307
|
+
Args:
|
308
|
+
survey: A Survey object containing the questions used to generate results.
|
309
|
+
data: A list of Result objects containing the responses.
|
310
|
+
created_columns: A list of column names created through transformations.
|
311
|
+
cache: A Cache object for storing model responses.
|
312
|
+
job_uuid: A string representing the job UUID.
|
313
|
+
total_results: An integer representing the total number of results.
|
314
|
+
task_history: A TaskHistory object containing information about the tasks.
|
315
|
+
|
316
|
+
Examples:
|
317
|
+
>>> from ..results import Result
|
318
|
+
>>> # Create an empty Results object
|
319
|
+
>>> r = Results()
|
320
|
+
>>> r.completed
|
321
|
+
True
|
322
|
+
>>> len(r.created_columns)
|
323
|
+
0
|
324
|
+
|
325
|
+
>>> # Create a Results object with data
|
326
|
+
>>> from unittest.mock import Mock
|
327
|
+
>>> mock_survey = Mock()
|
328
|
+
>>> mock_result = Mock(spec=Result)
|
329
|
+
>>> r = Results(survey=mock_survey, data=[mock_result])
|
330
|
+
>>> len(r)
|
331
|
+
1
|
215
332
|
"""
|
216
333
|
self.completed = True
|
217
334
|
self._fetching = False
|
@@ -230,19 +347,26 @@ class Results(UserList, ResultsOperationsMixin, Base):
|
|
230
347
|
if hasattr(self, "_add_output_functions"):
|
231
348
|
self._add_output_functions()
|
232
349
|
|
233
|
-
|
234
350
|
def _fetch_list(self, data_type: str, key: str) -> list:
|
235
|
-
"""
|
236
|
-
Return a list of values from the data for a given data type and key.
|
351
|
+
"""Return a list of values from the data for a given data type and key.
|
237
352
|
|
238
353
|
Uses the filtered data, not the original data.
|
239
354
|
|
240
|
-
|
355
|
+
Args:
|
356
|
+
data_type: The type of data to fetch (e.g., 'answer', 'agent', 'scenario').
|
357
|
+
key: The key to fetch from each data type dictionary.
|
241
358
|
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
359
|
+
Returns:
|
360
|
+
list: A list of values, one from each result in the data.
|
361
|
+
|
362
|
+
Examples:
|
363
|
+
>>> from edsl.results import Results
|
364
|
+
>>> r = Results.example()
|
365
|
+
>>> values = r._fetch_list('answer', 'how_feeling')
|
366
|
+
>>> len(values) == len(r)
|
367
|
+
True
|
368
|
+
>>> all(isinstance(v, (str, type(None))) for v in values)
|
369
|
+
True
|
246
370
|
"""
|
247
371
|
returned_list = []
|
248
372
|
for row in self.data:
|
@@ -250,6 +374,25 @@ class Results(UserList, ResultsOperationsMixin, Base):
|
|
250
374
|
|
251
375
|
return returned_list
|
252
376
|
|
377
|
+
def get_answers(self, question_name: str) -> list:
|
378
|
+
"""Get the answers for a given question name.
|
379
|
+
|
380
|
+
Args:
|
381
|
+
question_name: The name of the question to fetch answers for.
|
382
|
+
|
383
|
+
Returns:
|
384
|
+
list: A list of answers, one from each result in the data.
|
385
|
+
|
386
|
+
Examples:
|
387
|
+
>>> from edsl.results import Results
|
388
|
+
>>> r = Results.example()
|
389
|
+
>>> answers = r.get_answers('how_feeling')
|
390
|
+
>>> isinstance(answers, list)
|
391
|
+
True
|
392
|
+
>>> len(answers) == len(r)
|
393
|
+
True
|
394
|
+
"""
|
395
|
+
return self._fetch_list("answer", question_name)
|
253
396
|
|
254
397
|
def _summary(self) -> dict:
|
255
398
|
import reprlib
|
@@ -301,8 +444,23 @@ class Results(UserList, ResultsOperationsMixin, Base):
|
|
301
444
|
self.insert(item)
|
302
445
|
|
303
446
|
def compute_job_cost(self, include_cached_responses_in_cost: bool = False) -> float:
|
304
|
-
"""
|
305
|
-
|
447
|
+
"""Compute the cost of a completed job in USD.
|
448
|
+
|
449
|
+
This method calculates the total cost of all model responses in the results.
|
450
|
+
By default, it only counts the cost of responses that were not cached.
|
451
|
+
|
452
|
+
Args:
|
453
|
+
include_cached_responses_in_cost: Whether to include the cost of cached
|
454
|
+
responses in the total. Defaults to False.
|
455
|
+
|
456
|
+
Returns:
|
457
|
+
float: The total cost in USD.
|
458
|
+
|
459
|
+
Examples:
|
460
|
+
>>> from edsl.results import Results
|
461
|
+
>>> r = Results.example()
|
462
|
+
>>> r.compute_job_cost()
|
463
|
+
0
|
306
464
|
"""
|
307
465
|
total_cost = 0
|
308
466
|
for result in self:
|
@@ -321,88 +479,55 @@ class Results(UserList, ResultsOperationsMixin, Base):
|
|
321
479
|
|
322
480
|
return total_cost
|
323
481
|
|
324
|
-
# def leaves(self):
|
325
|
-
# leaves = []
|
326
|
-
# for result in self:
|
327
|
-
# leaves.extend(result.leaves())
|
328
|
-
# return leaves
|
329
|
-
|
330
|
-
# def tree(self, node_list: Optional[List[str]] = None):
|
331
|
-
# return self.to_scenario_list().tree(node_list)
|
332
|
-
|
333
|
-
# def interactive_tree(
|
334
|
-
# self,
|
335
|
-
# fold_attributes: Optional[List[str]] = None,
|
336
|
-
# drop: Optional[List[str]] = None,
|
337
|
-
# open_file=True,
|
338
|
-
# ) -> dict:
|
339
|
-
# """Return the results as a tree."""
|
340
|
-
# from edsl.results.tree_explore import FoldableHTMLTableGenerator
|
341
|
-
|
342
|
-
# if drop is None:
|
343
|
-
# drop = []
|
344
|
-
|
345
|
-
# valid_attributes = [
|
346
|
-
# "model",
|
347
|
-
# "scenario",
|
348
|
-
# "agent",
|
349
|
-
# "answer",
|
350
|
-
# "question",
|
351
|
-
# "iteration",
|
352
|
-
# ]
|
353
|
-
# if fold_attributes is None:
|
354
|
-
# fold_attributes = []
|
355
|
-
|
356
|
-
# for attribute in fold_attributes:
|
357
|
-
# if attribute not in valid_attributes:
|
358
|
-
# raise ValueError(
|
359
|
-
# f"Invalid fold attribute: {attribute}; must be in {valid_attributes}"
|
360
|
-
# )
|
361
|
-
# data = self.leaves()
|
362
|
-
# generator = FoldableHTMLTableGenerator(data)
|
363
|
-
# tree = generator.tree(fold_attributes=fold_attributes, drop=drop)
|
364
|
-
# html_content = generator.generate_html(tree, fold_attributes)
|
365
|
-
# import tempfile
|
366
|
-
# from edsl.utilities.utilities import is_notebook
|
367
|
-
|
368
|
-
# from IPython.display import display, HTML
|
369
|
-
|
370
|
-
# if is_notebook():
|
371
|
-
# import html
|
372
|
-
# from IPython.display import display, HTML
|
373
|
-
|
374
|
-
# height = 1000
|
375
|
-
# width = 1000
|
376
|
-
# escaped_output = html.escape(html_content)
|
377
|
-
# # escaped_output = rendered_html
|
378
|
-
# iframe = f""""
|
379
|
-
# <iframe srcdoc="{ escaped_output }" style="width: {width}px; height: {height}px;"></iframe>
|
380
|
-
# """
|
381
|
-
# display(HTML(iframe))
|
382
|
-
# return None
|
383
|
-
|
384
|
-
# with tempfile.NamedTemporaryFile(suffix=".html", delete=False) as f:
|
385
|
-
# f.write(html_content.encode())
|
386
|
-
# print(f"HTML file has been generated: {f.name}")
|
387
|
-
|
388
|
-
# if open_file:
|
389
|
-
# import webbrowser
|
390
|
-
# import time
|
391
|
-
|
392
|
-
# time.sleep(1) # Wait for 1 second
|
393
|
-
# # webbrowser.open(f.name)
|
394
|
-
# import os
|
395
|
-
|
396
|
-
# filename = f.name
|
397
|
-
# webbrowser.open(f"file://{os.path.abspath(filename)}")
|
398
|
-
|
399
|
-
# else:
|
400
|
-
# return html_content
|
401
|
-
|
402
482
|
def code(self):
|
403
|
-
|
483
|
+
"""Method for generating code representations.
|
484
|
+
|
485
|
+
Raises:
|
486
|
+
ResultsError: This method is not implemented for Results objects.
|
487
|
+
|
488
|
+
Examples:
|
489
|
+
>>> from edsl.results import Results
|
490
|
+
>>> r = Results.example()
|
491
|
+
>>> try:
|
492
|
+
... r.code()
|
493
|
+
... except ResultsError as e:
|
494
|
+
... str(e).startswith("The code() method is not implemented")
|
495
|
+
True
|
496
|
+
"""
|
497
|
+
raise ResultsError("The code() method is not implemented for Results objects")
|
404
498
|
|
405
499
|
def __getitem__(self, i):
|
500
|
+
"""Get an item from the Results object by index, slice, or key.
|
501
|
+
|
502
|
+
Args:
|
503
|
+
i: An integer index, a slice, or a string key.
|
504
|
+
|
505
|
+
Returns:
|
506
|
+
The requested item, slice of results, or dictionary value.
|
507
|
+
|
508
|
+
Raises:
|
509
|
+
ResultsError: If the argument type is invalid for indexing.
|
510
|
+
|
511
|
+
Examples:
|
512
|
+
>>> from edsl.results import Results
|
513
|
+
>>> r = Results.example()
|
514
|
+
>>> # Get by integer index
|
515
|
+
>>> result = r[0]
|
516
|
+
>>> # Get by slice
|
517
|
+
>>> subset = r[0:2]
|
518
|
+
>>> len(subset) == 2
|
519
|
+
True
|
520
|
+
>>> # Get by string key
|
521
|
+
>>> data = r["data"]
|
522
|
+
>>> isinstance(data, list)
|
523
|
+
True
|
524
|
+
>>> # Invalid index type
|
525
|
+
>>> try:
|
526
|
+
... r[1.5]
|
527
|
+
... except ResultsError:
|
528
|
+
... True
|
529
|
+
True
|
530
|
+
"""
|
406
531
|
if isinstance(i, int):
|
407
532
|
return self.data[i]
|
408
533
|
|
@@ -412,18 +537,40 @@ class Results(UserList, ResultsOperationsMixin, Base):
|
|
412
537
|
if isinstance(i, str):
|
413
538
|
return self.to_dict()[i]
|
414
539
|
|
415
|
-
raise
|
540
|
+
raise ResultsError("Invalid argument type for indexing Results object")
|
416
541
|
|
417
542
|
def __add__(self, other: Results) -> Results:
|
418
543
|
"""Add two Results objects together.
|
419
|
-
They must have the same survey and created columns.
|
420
|
-
:param other: A Results object.
|
421
544
|
|
422
|
-
|
545
|
+
Combines two Results objects into a new one. Both objects must have the same
|
546
|
+
survey and created columns.
|
423
547
|
|
424
|
-
|
425
|
-
|
426
|
-
|
548
|
+
Args:
|
549
|
+
other: A Results object to add to this one.
|
550
|
+
|
551
|
+
Returns:
|
552
|
+
A new Results object containing data from both objects.
|
553
|
+
|
554
|
+
Raises:
|
555
|
+
ResultsError: If the surveys or created columns of the two objects don't match.
|
556
|
+
|
557
|
+
Examples:
|
558
|
+
>>> from edsl.results import Results
|
559
|
+
>>> r1 = Results.example()
|
560
|
+
>>> r2 = Results.example()
|
561
|
+
>>> # Combine two Results objects
|
562
|
+
>>> r3 = r1 + r2
|
563
|
+
>>> len(r3) == len(r1) + len(r2)
|
564
|
+
True
|
565
|
+
|
566
|
+
>>> # Attempting to add incompatible Results
|
567
|
+
>>> from unittest.mock import Mock
|
568
|
+
>>> r4 = Results(survey=Mock()) # Different survey
|
569
|
+
>>> try:
|
570
|
+
... r1 + r4
|
571
|
+
... except ResultsError:
|
572
|
+
... True
|
573
|
+
True
|
427
574
|
"""
|
428
575
|
if self.survey != other.survey:
|
429
576
|
raise ResultsError(
|
@@ -439,21 +586,17 @@ class Results(UserList, ResultsOperationsMixin, Base):
|
|
439
586
|
data=self.data + other.data,
|
440
587
|
created_columns=self.created_columns,
|
441
588
|
)
|
442
|
-
|
589
|
+
|
443
590
|
def _repr_html_(self):
|
444
591
|
if not self.completed:
|
445
592
|
if hasattr(self, "job_info"):
|
446
593
|
self.fetch_remote(self.job_info)
|
447
|
-
|
594
|
+
|
448
595
|
if not self.completed:
|
449
596
|
return "Results not ready to call"
|
450
|
-
|
597
|
+
|
451
598
|
return super()._repr_html_()
|
452
599
|
|
453
|
-
# @ensure_ready
|
454
|
-
# def __str__(self):
|
455
|
-
# super().__str__()
|
456
|
-
|
457
600
|
@ensure_ready
|
458
601
|
def __repr__(self) -> str:
|
459
602
|
return f"Results(data = {self.data}, survey = {repr(self.survey)}, created_columns = {self.created_columns})"
|
@@ -495,8 +638,8 @@ class Results(UserList, ResultsOperationsMixin, Base):
|
|
495
638
|
print_parameters=print_parameters,
|
496
639
|
)
|
497
640
|
)
|
498
|
-
|
499
|
-
def to_dataset(self) ->
|
641
|
+
|
642
|
+
def to_dataset(self) -> "Dataset":
|
500
643
|
return self.select()
|
501
644
|
|
502
645
|
def to_dict(
|
@@ -540,7 +683,7 @@ class Results(UserList, ResultsOperationsMixin, Base):
|
|
540
683
|
d.update({"task_history": self.task_history.to_dict()})
|
541
684
|
|
542
685
|
if add_edsl_version:
|
543
|
-
from
|
686
|
+
from .. import __version__
|
544
687
|
|
545
688
|
d["edsl_version"] = __version__
|
546
689
|
d["edsl_class_name"] = "Results"
|
@@ -564,12 +707,41 @@ class Results(UserList, ResultsOperationsMixin, Base):
|
|
564
707
|
"b_not_a": [other_results[i] for i in indices_other],
|
565
708
|
}
|
566
709
|
|
710
|
+
def initialize_cache_from_results(self):
|
711
|
+
cache = Cache(data={})
|
712
|
+
|
713
|
+
for result in self.data:
|
714
|
+
for key in result.data["prompt"]:
|
715
|
+
if key.endswith("_system_prompt"):
|
716
|
+
question_name = key.removesuffix("_system_prompt")
|
717
|
+
system_prompt = result.data["prompt"][key].text
|
718
|
+
user_key = f"{question_name}_user_prompt"
|
719
|
+
if user_key in result.data["prompt"]:
|
720
|
+
user_prompt = result.data["prompt"][user_key].text
|
721
|
+
else:
|
722
|
+
user_prompt = ""
|
723
|
+
|
724
|
+
# Get corresponding model response
|
725
|
+
response_key = f"{question_name}_raw_model_response"
|
726
|
+
output = result.data["raw_model_response"].get(response_key, "")
|
727
|
+
|
728
|
+
entry = CacheEntry(
|
729
|
+
model=result.model.model,
|
730
|
+
parameters=result.model.parameters,
|
731
|
+
system_prompt=system_prompt,
|
732
|
+
user_prompt=user_prompt,
|
733
|
+
output=json.dumps(output),
|
734
|
+
iteration=0,
|
735
|
+
)
|
736
|
+
cache.data[entry.key] = entry
|
737
|
+
|
738
|
+
self.cache = cache
|
739
|
+
|
567
740
|
@property
|
568
741
|
def has_unfixed_exceptions(self) -> bool:
|
569
742
|
return self.task_history.has_unfixed_exceptions
|
570
743
|
|
571
744
|
def __hash__(self) -> int:
|
572
|
-
|
573
745
|
return dict_hash(
|
574
746
|
self.to_dict(sort=True, add_edsl_version=False, include_cache_info=False)
|
575
747
|
)
|
@@ -695,7 +867,7 @@ class Results(UserList, ResultsOperationsMixin, Base):
|
|
695
867
|
['agent.agent_index', ...]
|
696
868
|
"""
|
697
869
|
column_names = [f"{v}.{k}" for k, v in self._key_to_data_type.items()]
|
698
|
-
from
|
870
|
+
from ..utilities.PrettyList import PrettyList
|
699
871
|
|
700
872
|
return PrettyList(sorted(column_names))
|
701
873
|
|
@@ -709,7 +881,7 @@ class Results(UserList, ResultsOperationsMixin, Base):
|
|
709
881
|
>>> r.answer_keys
|
710
882
|
{'how_feeling': 'How are you this {{ period }}?', 'how_feeling_yesterday': 'How were you feeling yesterday {{ period }}?'}
|
711
883
|
"""
|
712
|
-
from
|
884
|
+
from ..utilities.utilities import shorten_string
|
713
885
|
|
714
886
|
if not self.survey:
|
715
887
|
raise ResultsError("Survey is not defined so no answer keys are available.")
|
@@ -734,7 +906,7 @@ class Results(UserList, ResultsOperationsMixin, Base):
|
|
734
906
|
>>> r.agents
|
735
907
|
AgentList([Agent(traits = {'status': 'Joyful'}), Agent(traits = {'status': 'Joyful'}), Agent(traits = {'status': 'Sad'}), Agent(traits = {'status': 'Sad'})])
|
736
908
|
"""
|
737
|
-
from
|
909
|
+
from ..agents import AgentList
|
738
910
|
|
739
911
|
return AgentList([r.agent for r in self.data])
|
740
912
|
|
@@ -845,7 +1017,7 @@ class Results(UserList, ResultsOperationsMixin, Base):
|
|
845
1017
|
return self.data[0]
|
846
1018
|
|
847
1019
|
def answer_truncate(
|
848
|
-
self, column: str, top_n: int = 5, new_var_name: str = None
|
1020
|
+
self, column: str, top_n: int = 5, new_var_name: Optional[str] = None
|
849
1021
|
) -> Results:
|
850
1022
|
"""Create a new variable that truncates the answers to the top_n.
|
851
1023
|
|
@@ -976,24 +1148,23 @@ class Results(UserList, ResultsOperationsMixin, Base):
|
|
976
1148
|
def mutate(
|
977
1149
|
self, new_var_string: str, functions_dict: Optional[dict] = None
|
978
1150
|
) -> Results:
|
979
|
-
"""
|
980
|
-
|
981
|
-
|
1151
|
+
"""Create a new column based on a computational expression.
|
1152
|
+
|
982
1153
|
The mutate method allows you to create new derived variables based on existing data.
|
983
1154
|
You provide an assignment expression where the left side is the new column name
|
984
1155
|
and the right side is a Python expression that computes the value. The expression
|
985
1156
|
can reference any existing columns in the Results object.
|
986
|
-
|
987
|
-
|
988
|
-
new_var_string: A string containing an assignment expression in the form
|
989
|
-
|
990
|
-
|
991
|
-
functions_dict: Optional dictionary of custom functions that can be used in
|
992
|
-
|
993
|
-
|
1157
|
+
|
1158
|
+
Args:
|
1159
|
+
new_var_string: A string containing an assignment expression in the form
|
1160
|
+
"new_column_name = expression". The expression can reference
|
1161
|
+
any existing column and use standard Python syntax.
|
1162
|
+
functions_dict: Optional dictionary of custom functions that can be used in
|
1163
|
+
the expression. Keys are function names, values are function objects.
|
1164
|
+
|
994
1165
|
Returns:
|
995
1166
|
A new Results object with the additional column.
|
996
|
-
|
1167
|
+
|
997
1168
|
Notes:
|
998
1169
|
- The expression must contain an equals sign (=) separating the new column name
|
999
1170
|
from the computation expression
|
@@ -1002,22 +1173,22 @@ class Results(UserList, ResultsOperationsMixin, Base):
|
|
1002
1173
|
- The expression can access any data in the Result object using the column names
|
1003
1174
|
- New columns are added to the "answer" data type
|
1004
1175
|
- Created columns are tracked in the `created_columns` property
|
1005
|
-
|
1176
|
+
|
1006
1177
|
Examples:
|
1007
1178
|
>>> r = Results.example()
|
1008
|
-
|
1009
|
-
# Create a simple derived column
|
1179
|
+
|
1180
|
+
>>> # Create a simple derived column
|
1010
1181
|
>>> r.mutate('how_feeling_x = how_feeling + "x"').select('how_feeling_x')
|
1011
1182
|
Dataset([{'answer.how_feeling_x': ['OKx', 'Greatx', 'Terriblex', 'OKx']}])
|
1012
|
-
|
1013
|
-
# Create a binary indicator column
|
1183
|
+
|
1184
|
+
>>> # Create a binary indicator column
|
1014
1185
|
>>> r.mutate('is_great = 1 if how_feeling == "Great" else 0').select('is_great')
|
1015
1186
|
Dataset([{'answer.is_great': [0, 1, 0, 0]}])
|
1016
|
-
|
1017
|
-
# Create a column with custom functions
|
1187
|
+
|
1188
|
+
>>> # Create a column with custom functions
|
1018
1189
|
>>> def sentiment(text):
|
1019
1190
|
... return len(text) > 5
|
1020
|
-
>>> r.mutate('is_long = sentiment(how_feeling)',
|
1191
|
+
>>> r.mutate('is_long = sentiment(how_feeling)',
|
1021
1192
|
... functions_dict={'sentiment': sentiment}).select('is_long')
|
1022
1193
|
Dataset([{'answer.is_long': [False, False, True, False]}])
|
1023
1194
|
"""
|
@@ -1028,7 +1199,7 @@ class Results(UserList, ResultsOperationsMixin, Base):
|
|
1028
1199
|
)
|
1029
1200
|
raw_var_name, expression = new_var_string.split("=", 1)
|
1030
1201
|
var_name = raw_var_name.strip()
|
1031
|
-
from
|
1202
|
+
from ..utilities.utilities import is_valid_variable_name
|
1032
1203
|
|
1033
1204
|
if not is_valid_variable_name(var_name):
|
1034
1205
|
raise ResultsInvalidNameError(f"{var_name} is not a valid variable name.")
|
@@ -1116,10 +1287,14 @@ class Results(UserList, ResultsOperationsMixin, Base):
|
|
1116
1287
|
random.seed(seed)
|
1117
1288
|
|
1118
1289
|
if n is None and frac is None:
|
1119
|
-
|
1290
|
+
from .exceptions import ResultsError
|
1291
|
+
|
1292
|
+
raise ResultsError("You must specify either n or frac.")
|
1120
1293
|
|
1121
1294
|
if n is not None and frac is not None:
|
1122
|
-
|
1295
|
+
from .exceptions import ResultsError
|
1296
|
+
|
1297
|
+
raise ResultsError("You cannot specify both n and frac.")
|
1123
1298
|
|
1124
1299
|
if frac is not None and n is None:
|
1125
1300
|
n = int(frac * len(self.data))
|
@@ -1132,61 +1307,62 @@ class Results(UserList, ResultsOperationsMixin, Base):
|
|
1132
1307
|
return Results(survey=self.survey, data=new_data, created_columns=None)
|
1133
1308
|
|
1134
1309
|
@ensure_ready
|
1135
|
-
def select(self, *columns: Union[str, list[str]]) ->
|
1136
|
-
"""
|
1137
|
-
|
1138
|
-
|
1310
|
+
def select(self, *columns: Union[str, list[str]]) -> "Dataset":
|
1311
|
+
"""Extract specific columns from the Results into a Dataset.
|
1312
|
+
|
1139
1313
|
This method allows you to select specific columns from the Results object
|
1140
1314
|
and transforms the data into a Dataset for further analysis and visualization.
|
1141
1315
|
A Dataset is a more general-purpose data structure optimized for analysis
|
1142
1316
|
operations rather than the hierarchical structure of Result objects.
|
1143
|
-
|
1144
|
-
|
1317
|
+
|
1318
|
+
Args:
|
1145
1319
|
*columns: Column names to select. Each column can be:
|
1146
|
-
|
1147
|
-
|
1148
|
-
|
1149
|
-
|
1150
|
-
|
1320
|
+
- A simple attribute name (e.g., "how_feeling")
|
1321
|
+
- A fully qualified name with type (e.g., "answer.how_feeling")
|
1322
|
+
- A wildcard pattern (e.g., "answer.*" to select all answer fields)
|
1323
|
+
If no columns are provided, selects all data.
|
1324
|
+
|
1151
1325
|
Returns:
|
1152
1326
|
A Dataset object containing the selected data.
|
1153
|
-
|
1327
|
+
|
1154
1328
|
Notes:
|
1155
1329
|
- Column names are automatically disambiguated if needed
|
1156
1330
|
- When column names are ambiguous, specify the full path with data type
|
1157
1331
|
- You can use wildcard patterns with "*" to select multiple related fields
|
1158
1332
|
- Selecting with no arguments returns all data
|
1159
1333
|
- Results are restructured in a columnar format in the Dataset
|
1160
|
-
|
1334
|
+
|
1161
1335
|
Examples:
|
1162
1336
|
>>> results = Results.example()
|
1163
|
-
|
1164
|
-
# Select a single column by name
|
1337
|
+
|
1338
|
+
>>> # Select a single column by name
|
1165
1339
|
>>> results.select('how_feeling')
|
1166
1340
|
Dataset([{'answer.how_feeling': ['OK', 'Great', 'Terrible', 'OK']}])
|
1167
|
-
|
1168
|
-
# Select multiple columns
|
1341
|
+
|
1342
|
+
>>> # Select multiple columns
|
1169
1343
|
>>> ds = results.select('how_feeling', 'how_feeling_yesterday')
|
1170
1344
|
>>> sorted([list(d.keys())[0] for d in ds])
|
1171
1345
|
['answer.how_feeling', 'answer.how_feeling_yesterday']
|
1172
|
-
|
1173
|
-
# Using fully qualified names with data type
|
1346
|
+
|
1347
|
+
>>> # Using fully qualified names with data type
|
1174
1348
|
>>> results.select('answer.how_feeling')
|
1175
1349
|
Dataset([{'answer.how_feeling': ['OK', 'Great', 'Terrible', 'OK']}])
|
1176
|
-
|
1177
|
-
# Using partial matching for column names
|
1350
|
+
|
1351
|
+
>>> # Using partial matching for column names
|
1178
1352
|
>>> results.select('answer.how_feeling_y')
|
1179
1353
|
Dataset([{'answer.how_feeling_yesterday': ['Great', 'Good', 'OK', 'Terrible']}])
|
1180
|
-
|
1181
|
-
# Select all columns (same as calling select with no arguments)
|
1182
|
-
>>> results.select('*.*')
|
1354
|
+
|
1355
|
+
>>> # Select all columns (same as calling select with no arguments)
|
1356
|
+
>>> results.select('*.*')
|
1183
1357
|
Dataset([...])
|
1184
1358
|
"""
|
1185
1359
|
|
1186
|
-
from
|
1360
|
+
from .results_selector import Selector
|
1187
1361
|
|
1188
1362
|
if len(self) == 0:
|
1189
|
-
|
1363
|
+
from .exceptions import ResultsError
|
1364
|
+
|
1365
|
+
raise ResultsError("No data to select from---the Results object is empty.")
|
1190
1366
|
|
1191
1367
|
selector = Selector(
|
1192
1368
|
known_data_types=self.known_data_types,
|
@@ -1250,21 +1426,24 @@ class Results(UserList, ResultsOperationsMixin, Base):
|
|
1250
1426
|
|
1251
1427
|
@ensure_ready
|
1252
1428
|
def filter(self, expression: str) -> Results:
|
1253
|
-
"""
|
1254
|
-
|
1255
|
-
|
1429
|
+
"""Filter results based on a boolean expression.
|
1430
|
+
|
1256
1431
|
This method evaluates a boolean expression against each Result object in the
|
1257
1432
|
collection and returns a new Results object containing only those that match.
|
1258
1433
|
The expression can reference any column in the data and supports standard
|
1259
1434
|
Python operators and syntax.
|
1260
|
-
|
1261
|
-
|
1435
|
+
|
1436
|
+
Args:
|
1262
1437
|
expression: A string containing a Python expression that evaluates to a boolean.
|
1263
1438
|
The expression is applied to each Result object individually.
|
1264
|
-
|
1439
|
+
|
1265
1440
|
Returns:
|
1266
1441
|
A new Results object containing only the Result objects that satisfy the expression.
|
1267
|
-
|
1442
|
+
|
1443
|
+
Raises:
|
1444
|
+
ResultsFilterError: If the expression is invalid or uses improper syntax
|
1445
|
+
(like using '=' instead of '==').
|
1446
|
+
|
1268
1447
|
Notes:
|
1269
1448
|
- Column names can be specified with or without their data type prefix
|
1270
1449
|
(e.g., both "how_feeling" and "answer.how_feeling" work if unambiguous)
|
@@ -1273,23 +1452,23 @@ class Results(UserList, ResultsOperationsMixin, Base):
|
|
1273
1452
|
- You can use comparison operators like '==', '!=', '>', '<', '>=', '<='
|
1274
1453
|
- You can use membership tests with 'in'
|
1275
1454
|
- You can use string methods like '.startswith()', '.contains()', etc.
|
1276
|
-
|
1455
|
+
|
1277
1456
|
Examples:
|
1278
1457
|
>>> r = Results.example()
|
1279
|
-
|
1280
|
-
# Simple equality filter
|
1458
|
+
|
1459
|
+
>>> # Simple equality filter
|
1281
1460
|
>>> r.filter("how_feeling == 'Great'").select('how_feeling')
|
1282
1461
|
Dataset([{'answer.how_feeling': ['Great']}])
|
1283
|
-
|
1284
|
-
# Using OR condition
|
1462
|
+
|
1463
|
+
>>> # Using OR condition
|
1285
1464
|
>>> r.filter("how_feeling == 'Great' or how_feeling == 'Terrible'").select('how_feeling')
|
1286
1465
|
Dataset([{'answer.how_feeling': ['Great', 'Terrible']}])
|
1287
|
-
|
1288
|
-
# Filter on agent properties
|
1466
|
+
|
1467
|
+
>>> # Filter on agent properties
|
1289
1468
|
>>> r.filter("agent.status == 'Joyful'").select('agent.status')
|
1290
1469
|
Dataset([{'agent.status': ['Joyful', 'Joyful']}])
|
1291
|
-
|
1292
|
-
# Common error: using = instead of ==
|
1470
|
+
|
1471
|
+
>>> # Common error: using = instead of ==
|
1293
1472
|
>>> try:
|
1294
1473
|
... r.filter("how_feeling = 'Great'")
|
1295
1474
|
... except Exception as e:
|
@@ -1394,45 +1573,58 @@ class Results(UserList, ResultsOperationsMixin, Base):
|
|
1394
1573
|
[1, 1, 0, 0]
|
1395
1574
|
"""
|
1396
1575
|
return [r.score(f) for r in self.data]
|
1397
|
-
|
1576
|
+
|
1398
1577
|
def score_with_answer_key(self, answer_key: dict) -> list:
|
1399
1578
|
"""Score the results using an answer key.
|
1400
1579
|
|
1401
1580
|
:param answer_key: A dictionary that maps answer values to scores.
|
1402
1581
|
"""
|
1403
1582
|
return [r.score_with_answer_key(answer_key) for r in self.data]
|
1404
|
-
|
1405
1583
|
|
1406
1584
|
def fetch_remote(self, job_info: Any) -> None:
|
1407
|
-
"""
|
1408
|
-
|
1409
|
-
|
1410
|
-
This is useful when you have a Results object that was created locally but want to sync it with
|
1585
|
+
"""Fetch remote Results object and update this instance with the data.
|
1586
|
+
|
1587
|
+
This is useful when you have a Results object that was created locally but want to sync it with
|
1411
1588
|
the latest data from the remote server.
|
1412
|
-
|
1589
|
+
|
1413
1590
|
Args:
|
1414
1591
|
job_info: RemoteJobInfo object containing the job_uuid and other remote job details
|
1415
|
-
|
1592
|
+
|
1593
|
+
Returns:
|
1594
|
+
bool: True if the fetch was successful, False if the job is not yet completed.
|
1595
|
+
|
1596
|
+
Raises:
|
1597
|
+
ResultsError: If there's an error during the fetch process.
|
1598
|
+
|
1599
|
+
Examples:
|
1600
|
+
>>> # This is a simplified example since we can't actually test this without a remote server
|
1601
|
+
>>> from unittest.mock import Mock, patch
|
1602
|
+
>>> # Create a mock job_info and Results
|
1603
|
+
>>> job_info = Mock()
|
1604
|
+
>>> job_info.job_uuid = "test_uuid"
|
1605
|
+
>>> results = Results()
|
1606
|
+
>>> # In a real scenario:
|
1607
|
+
>>> # results.fetch_remote(job_info)
|
1608
|
+
>>> # results.completed # Would be True if successful
|
1416
1609
|
"""
|
1417
|
-
#print("Calling fetch_remote")
|
1418
1610
|
try:
|
1419
1611
|
from ..coop import Coop
|
1420
1612
|
from ..jobs import JobsRemoteInferenceHandler
|
1421
|
-
|
1613
|
+
|
1422
1614
|
# Get the remote job data
|
1423
1615
|
remote_job_data = JobsRemoteInferenceHandler.check_status(job_info.job_uuid)
|
1424
|
-
|
1616
|
+
|
1425
1617
|
if remote_job_data.get("status") not in ["completed", "failed"]:
|
1426
1618
|
return False
|
1427
|
-
#
|
1619
|
+
#
|
1428
1620
|
results_uuid = remote_job_data.get("results_uuid")
|
1429
1621
|
if not results_uuid:
|
1430
1622
|
raise ResultsError("No results_uuid found in remote job data")
|
1431
|
-
|
1623
|
+
|
1432
1624
|
# Fetch the remote Results object
|
1433
1625
|
coop = Coop()
|
1434
1626
|
remote_results = coop.get(results_uuid, expected_object_type="results")
|
1435
|
-
|
1627
|
+
|
1436
1628
|
# Update this instance with remote data
|
1437
1629
|
self.data = remote_results.data
|
1438
1630
|
self.survey = remote_results.survey
|
@@ -1440,10 +1632,10 @@ class Results(UserList, ResultsOperationsMixin, Base):
|
|
1440
1632
|
self.cache = remote_results.cache
|
1441
1633
|
self.task_history = remote_results.task_history
|
1442
1634
|
self.completed = True
|
1443
|
-
|
1635
|
+
|
1444
1636
|
# Set job_uuid and results_uuid from remote data
|
1445
1637
|
self.job_uuid = job_info.job_uuid
|
1446
|
-
if hasattr(remote_results,
|
1638
|
+
if hasattr(remote_results, "results_uuid"):
|
1447
1639
|
self.results_uuid = remote_results.results_uuid
|
1448
1640
|
|
1449
1641
|
return True
|
@@ -1451,39 +1643,60 @@ class Results(UserList, ResultsOperationsMixin, Base):
|
|
1451
1643
|
except Exception as e:
|
1452
1644
|
raise ResultsError(f"Failed to fetch remote results: {str(e)}")
|
1453
1645
|
|
1454
|
-
def fetch(self, polling_interval: [float, int] = 1.0) -> Results:
|
1455
|
-
"""
|
1456
|
-
|
1457
|
-
|
1646
|
+
def fetch(self, polling_interval: Union[float, int] = 1.0) -> Results:
|
1647
|
+
"""Poll the server for job completion and update this Results instance.
|
1648
|
+
|
1649
|
+
This method continuously polls the remote server until the job is completed or
|
1650
|
+
fails, then updates this Results object with the final data.
|
1651
|
+
|
1458
1652
|
Args:
|
1459
1653
|
polling_interval: Number of seconds to wait between polling attempts (default: 1.0)
|
1460
|
-
|
1654
|
+
|
1461
1655
|
Returns:
|
1462
1656
|
self: The updated Results instance
|
1657
|
+
|
1658
|
+
Raises:
|
1659
|
+
ResultsError: If no job info is available or if there's an error during fetch.
|
1660
|
+
|
1661
|
+
Examples:
|
1662
|
+
>>> # This is a simplified example since we can't actually test polling
|
1663
|
+
>>> from unittest.mock import Mock, patch
|
1664
|
+
>>> # Create a mock results object
|
1665
|
+
>>> results = Results()
|
1666
|
+
>>> # In a real scenario with a running job:
|
1667
|
+
>>> # results.job_info = remote_job_info
|
1668
|
+
>>> # results.fetch() # Would poll until complete
|
1669
|
+
>>> # results.completed # Would be True if successful
|
1463
1670
|
"""
|
1464
1671
|
if not hasattr(self, "job_info"):
|
1465
|
-
raise ResultsError(
|
1466
|
-
|
1672
|
+
raise ResultsError(
|
1673
|
+
"No job info available - this Results object wasn't created from a remote job"
|
1674
|
+
)
|
1675
|
+
|
1467
1676
|
from ..jobs import JobsRemoteInferenceHandler
|
1468
|
-
|
1677
|
+
|
1469
1678
|
try:
|
1470
1679
|
# Get the remote job data
|
1471
|
-
remote_job_data = JobsRemoteInferenceHandler.check_status(
|
1472
|
-
|
1680
|
+
remote_job_data = JobsRemoteInferenceHandler.check_status(
|
1681
|
+
self.job_info.job_uuid
|
1682
|
+
)
|
1683
|
+
|
1473
1684
|
while remote_job_data.get("status") not in ["completed", "failed"]:
|
1474
1685
|
print("Waiting for remote job to complete...")
|
1475
1686
|
import time
|
1687
|
+
|
1476
1688
|
time.sleep(polling_interval)
|
1477
|
-
remote_job_data = JobsRemoteInferenceHandler.check_status(
|
1478
|
-
|
1689
|
+
remote_job_data = JobsRemoteInferenceHandler.check_status(
|
1690
|
+
self.job_info.job_uuid
|
1691
|
+
)
|
1692
|
+
|
1479
1693
|
# Once complete, fetch the full results
|
1480
1694
|
self.fetch_remote(self.job_info)
|
1481
1695
|
return self
|
1482
|
-
|
1696
|
+
|
1483
1697
|
except Exception as e:
|
1484
1698
|
raise ResultsError(f"Failed to fetch remote results: {str(e)}")
|
1485
1699
|
|
1486
|
-
|
1487
1700
|
def spot_issues(self, models: Optional[ModelList] = None) -> Results:
|
1488
1701
|
"""Run a survey to spot issues and suggest improvements for prompts that had no model response, returning a new Results object.
|
1489
1702
|
Future version: Allow user to optionally pass a list of questions to review, regardless of whether they had a null model response.
|
@@ -1494,57 +1707,72 @@ class Results(UserList, ResultsOperationsMixin, Base):
|
|
1494
1707
|
from ..language_models import ModelList
|
1495
1708
|
import pandas as pd
|
1496
1709
|
|
1497
|
-
df = self.select(
|
1710
|
+
df = self.select(
|
1711
|
+
"agent.*", "scenario.*", "answer.*", "raw_model_response.*", "prompt.*"
|
1712
|
+
).to_pandas()
|
1498
1713
|
scenario_list = []
|
1499
1714
|
|
1500
1715
|
for _, row in df.iterrows():
|
1501
1716
|
for col in df.columns:
|
1502
1717
|
if col.endswith("_raw_model_response") and pd.isna(row[col]):
|
1503
|
-
q = col.split("_raw_model_response")[0].replace(
|
1504
|
-
|
1505
|
-
|
1506
|
-
|
1507
|
-
|
1508
|
-
|
1509
|
-
|
1510
|
-
|
1511
|
-
|
1718
|
+
q = col.split("_raw_model_response")[0].replace(
|
1719
|
+
"raw_model_response.", ""
|
1720
|
+
)
|
1721
|
+
|
1722
|
+
s = Scenario(
|
1723
|
+
{
|
1724
|
+
"original_question": q,
|
1725
|
+
"original_agent_index": row["agent.agent_index"],
|
1726
|
+
"original_scenario_index": row["scenario.scenario_index"],
|
1727
|
+
"original_prompts": f"User prompt: {row[f'prompt.{q}_user_prompt']}\nSystem prompt: {row[f'prompt.{q}_system_prompt']}",
|
1728
|
+
}
|
1729
|
+
)
|
1730
|
+
|
1512
1731
|
scenario_list.append(s)
|
1513
1732
|
|
1514
1733
|
sl = ScenarioList(set(scenario_list))
|
1515
1734
|
|
1516
1735
|
q1 = QuestionFreeText(
|
1517
|
-
question_name
|
1518
|
-
question_text
|
1736
|
+
question_name="issues",
|
1737
|
+
question_text="""
|
1519
1738
|
The following prompts generated a bad or null response: '{{ original_prompts }}'
|
1520
1739
|
What do you think was the likely issue(s)?
|
1521
|
-
"""
|
1740
|
+
""",
|
1522
1741
|
)
|
1523
1742
|
|
1524
1743
|
q2 = QuestionDict(
|
1525
|
-
question_name
|
1526
|
-
question_text
|
1744
|
+
question_name="revised",
|
1745
|
+
question_text="""
|
1527
1746
|
The following prompts generated a bad or null response: '{{ original_prompts }}'
|
1528
1747
|
You identified the issue(s) as '{{ issues.answer }}'.
|
1529
1748
|
Please revise the prompts to address the issue(s).
|
1530
1749
|
""",
|
1531
|
-
answer_keys
|
1750
|
+
answer_keys=["revised_user_prompt", "revised_system_prompt"],
|
1532
1751
|
)
|
1533
1752
|
|
1534
|
-
survey = Survey(questions
|
1753
|
+
survey = Survey(questions=[q1, q2])
|
1535
1754
|
|
1536
1755
|
if models is not None:
|
1537
1756
|
if not isinstance(models, ModelList):
|
1538
1757
|
raise ResultsError("models must be a ModelList")
|
1539
1758
|
results = survey.by(sl).by(models).run()
|
1540
1759
|
else:
|
1541
|
-
results = survey.by(sl).run()
|
1760
|
+
results = survey.by(sl).run() # use the default model
|
1542
1761
|
|
1543
1762
|
return results
|
1544
1763
|
|
1545
1764
|
|
1546
1765
|
def main(): # pragma: no cover
|
1547
|
-
"""
|
1766
|
+
"""Run example operations on a Results object.
|
1767
|
+
|
1768
|
+
This function demonstrates basic filtering and mutation operations on
|
1769
|
+
a Results object, printing the output.
|
1770
|
+
|
1771
|
+
Examples:
|
1772
|
+
>>> # This can be run directly as a script
|
1773
|
+
>>> # python -m edsl.results.results
|
1774
|
+
>>> # It will create example results and show filtering and mutation
|
1775
|
+
"""
|
1548
1776
|
from ..results import Results
|
1549
1777
|
|
1550
1778
|
results = Results.example(debug=True)
|
@@ -1554,4 +1782,5 @@ def main(): # pragma: no cover
|
|
1554
1782
|
|
1555
1783
|
if __name__ == "__main__":
|
1556
1784
|
import doctest
|
1785
|
+
|
1557
1786
|
doctest.testmod(optionflags=doctest.ELLIPSIS)
|