edsl 0.1.53__py3-none-any.whl → 0.1.55__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- edsl/__init__.py +8 -1
- edsl/__init__original.py +134 -0
- edsl/__version__.py +1 -1
- edsl/agents/agent.py +29 -0
- edsl/agents/agent_list.py +36 -1
- edsl/base/base_class.py +281 -151
- edsl/buckets/__init__.py +8 -3
- edsl/buckets/bucket_collection.py +9 -3
- edsl/buckets/model_buckets.py +4 -2
- edsl/buckets/token_bucket.py +2 -2
- edsl/buckets/token_bucket_client.py +5 -3
- edsl/caching/cache.py +131 -62
- edsl/caching/cache_entry.py +70 -58
- edsl/caching/sql_dict.py +17 -0
- edsl/cli.py +99 -0
- edsl/config/config_class.py +16 -0
- edsl/conversation/__init__.py +31 -0
- edsl/coop/coop.py +276 -242
- edsl/coop/coop_jobs_objects.py +59 -0
- edsl/coop/coop_objects.py +29 -0
- edsl/coop/coop_regular_objects.py +26 -0
- edsl/coop/utils.py +24 -19
- edsl/dataset/dataset.py +338 -101
- edsl/db_list/sqlite_list.py +349 -0
- edsl/inference_services/__init__.py +40 -5
- edsl/inference_services/exceptions.py +11 -0
- edsl/inference_services/services/anthropic_service.py +5 -2
- edsl/inference_services/services/aws_bedrock.py +6 -2
- edsl/inference_services/services/azure_ai.py +6 -2
- edsl/inference_services/services/google_service.py +3 -2
- edsl/inference_services/services/mistral_ai_service.py +6 -2
- edsl/inference_services/services/open_ai_service.py +6 -2
- edsl/inference_services/services/perplexity_service.py +6 -2
- edsl/inference_services/services/test_service.py +105 -7
- edsl/interviews/answering_function.py +167 -59
- edsl/interviews/interview.py +124 -72
- edsl/interviews/interview_task_manager.py +10 -0
- edsl/invigilators/invigilators.py +10 -1
- edsl/jobs/async_interview_runner.py +146 -104
- edsl/jobs/data_structures.py +6 -4
- edsl/jobs/decorators.py +61 -0
- edsl/jobs/fetch_invigilator.py +61 -18
- edsl/jobs/html_table_job_logger.py +14 -2
- edsl/jobs/jobs.py +180 -104
- edsl/jobs/jobs_component_constructor.py +2 -2
- edsl/jobs/jobs_interview_constructor.py +2 -0
- edsl/jobs/jobs_pricing_estimation.py +127 -46
- edsl/jobs/jobs_remote_inference_logger.py +4 -0
- edsl/jobs/jobs_runner_status.py +30 -25
- edsl/jobs/progress_bar_manager.py +79 -0
- edsl/jobs/remote_inference.py +35 -1
- edsl/key_management/key_lookup_builder.py +6 -1
- edsl/language_models/language_model.py +102 -12
- edsl/language_models/model.py +10 -3
- edsl/language_models/price_manager.py +45 -75
- edsl/language_models/registry.py +5 -0
- edsl/language_models/utilities.py +2 -1
- edsl/notebooks/notebook.py +77 -10
- edsl/questions/VALIDATION_README.md +134 -0
- edsl/questions/__init__.py +24 -1
- edsl/questions/exceptions.py +21 -0
- edsl/questions/question_check_box.py +171 -149
- edsl/questions/question_dict.py +243 -51
- edsl/questions/question_multiple_choice_with_other.py +624 -0
- edsl/questions/question_registry.py +2 -1
- edsl/questions/templates/multiple_choice_with_other/__init__.py +0 -0
- edsl/questions/templates/multiple_choice_with_other/answering_instructions.jinja +15 -0
- edsl/questions/templates/multiple_choice_with_other/question_presentation.jinja +17 -0
- edsl/questions/validation_analysis.py +185 -0
- edsl/questions/validation_cli.py +131 -0
- edsl/questions/validation_html_report.py +404 -0
- edsl/questions/validation_logger.py +136 -0
- edsl/results/result.py +63 -16
- edsl/results/results.py +702 -171
- edsl/scenarios/construct_download_link.py +16 -3
- edsl/scenarios/directory_scanner.py +226 -226
- edsl/scenarios/file_methods.py +5 -0
- edsl/scenarios/file_store.py +117 -6
- edsl/scenarios/handlers/__init__.py +5 -1
- edsl/scenarios/handlers/mp4_file_store.py +104 -0
- edsl/scenarios/handlers/webm_file_store.py +104 -0
- edsl/scenarios/scenario.py +120 -101
- edsl/scenarios/scenario_list.py +800 -727
- edsl/scenarios/scenario_list_gc_test.py +146 -0
- edsl/scenarios/scenario_list_memory_test.py +214 -0
- edsl/scenarios/scenario_list_source_refactor.md +35 -0
- edsl/scenarios/scenario_selector.py +5 -4
- edsl/scenarios/scenario_source.py +1990 -0
- edsl/scenarios/tests/test_scenario_list_sources.py +52 -0
- edsl/surveys/survey.py +22 -0
- edsl/tasks/__init__.py +4 -2
- edsl/tasks/task_history.py +198 -36
- edsl/tests/scenarios/test_ScenarioSource.py +51 -0
- edsl/tests/scenarios/test_scenario_list_sources.py +51 -0
- edsl/utilities/__init__.py +2 -1
- edsl/utilities/decorators.py +121 -0
- edsl/utilities/memory_debugger.py +1010 -0
- {edsl-0.1.53.dist-info → edsl-0.1.55.dist-info}/METADATA +52 -76
- {edsl-0.1.53.dist-info → edsl-0.1.55.dist-info}/RECORD +102 -78
- edsl/jobs/jobs_runner_asyncio.py +0 -281
- edsl/language_models/unused/fake_openai_service.py +0 -60
- {edsl-0.1.53.dist-info → edsl-0.1.55.dist-info}/LICENSE +0 -0
- {edsl-0.1.53.dist-info → edsl-0.1.55.dist-info}/WHEEL +0 -0
- {edsl-0.1.53.dist-info → edsl-0.1.55.dist-info}/entry_points.txt +0 -0
edsl/dataset/dataset.py
CHANGED
@@ -127,29 +127,85 @@ class Dataset(UserList, DatasetOperationsMixin, PersistenceMixin, HashingMixin):
|
|
127
127
|
def keys(self) -> list[str]:
|
128
128
|
"""Return the keys of the dataset.
|
129
129
|
|
130
|
-
|
131
|
-
|
132
|
-
|
130
|
+
Examples:
|
131
|
+
>>> d = Dataset([{'a': [1, 2, 3, 4]}, {'b': [5, 6, 7, 8]}])
|
132
|
+
>>> d.keys()
|
133
|
+
['a', 'b']
|
134
|
+
|
135
|
+
>>> d = Dataset([{'x.y': [1, 2]}, {'z.w': [3, 4]}])
|
136
|
+
>>> d.keys()
|
137
|
+
['x.y', 'z.w']
|
138
|
+
"""
|
139
|
+
return [list(o.keys())[0] for o in self]
|
133
140
|
|
134
|
-
|
135
|
-
|
136
|
-
['a.b', 'c.d']
|
141
|
+
def filter(self, expression) -> "Dataset":
|
142
|
+
"""Filter the dataset based on a boolean expression.
|
137
143
|
|
144
|
+
Args:
|
145
|
+
expression: A string expression that evaluates to a boolean value.
|
146
|
+
Can reference column names in the dataset.
|
138
147
|
|
139
|
-
|
148
|
+
Examples:
|
149
|
+
>>> d = Dataset([{'a': [1, 2, 3, 4]}, {'b': [5, 6, 7, 8]}])
|
150
|
+
>>> d.filter('a > 2').data
|
151
|
+
[{'a': [3, 4]}, {'b': [7, 8]}]
|
152
|
+
|
153
|
+
>>> d = Dataset([{'x': ['a', 'b', 'c']}, {'y': [1, 2, 3]}])
|
154
|
+
>>> d.filter('y < 3').data
|
155
|
+
[{'x': ['a', 'b']}, {'y': [1, 2]}]
|
140
156
|
"""
|
141
|
-
return [list(o.keys())[0] for o in self]
|
142
|
-
|
143
|
-
def filter(self, expression):
|
144
157
|
return self.to_scenario_list().filter(expression).to_dataset()
|
145
158
|
|
146
159
|
def mutate(self, new_var_string: str, functions_dict: Optional[dict[str, Callable]] = None) -> "Dataset":
|
160
|
+
"""Create new columns by applying functions to existing columns.
|
161
|
+
|
162
|
+
Args:
|
163
|
+
new_var_string: A string expression defining the new variable.
|
164
|
+
Can reference existing column names.
|
165
|
+
functions_dict: Optional dictionary of custom functions to use in the expression.
|
166
|
+
|
167
|
+
Examples:
|
168
|
+
>>> d = Dataset([{'a': [1, 2, 3]}, {'b': [4, 5, 6]}])
|
169
|
+
>>> d.mutate('c = a + b').data
|
170
|
+
[{'a': [1, 2, 3]}, {'b': [4, 5, 6]}, {'c': [5, 7, 9]}]
|
171
|
+
|
172
|
+
>>> d = Dataset([{'x': [1, 2, 3]}])
|
173
|
+
>>> d.mutate('y = x * 2').data
|
174
|
+
[{'x': [1, 2, 3]}, {'y': [2, 4, 6]}]
|
175
|
+
"""
|
147
176
|
return self.to_scenario_list().mutate(new_var_string, functions_dict).to_dataset()
|
148
177
|
|
149
|
-
def collapse(self, field:str, separator: Optional[str] = None) -> "Dataset":
|
178
|
+
def collapse(self, field: str, separator: Optional[str] = None) -> "Dataset":
|
179
|
+
"""Collapse multiple values in a field into a single value using a separator.
|
180
|
+
|
181
|
+
Args:
|
182
|
+
field: The name of the field to collapse.
|
183
|
+
separator: Optional string to use as a separator between values.
|
184
|
+
Defaults to a space if not specified.
|
185
|
+
|
186
|
+
Examples:
|
187
|
+
>>> d = Dataset([{'words': [['hello', 'world'], ['good', 'morning']]}])
|
188
|
+
>>> d.collapse('words').data
|
189
|
+
[{'words': [[['hello', 'world'], ['good', 'morning']]]}]
|
190
|
+
|
191
|
+
>>> d = Dataset([{'numbers': [1, 2, 3]}])
|
192
|
+
>>> d.collapse('numbers', separator=',').data
|
193
|
+
[{'numbers': ['1,2,3']}]
|
194
|
+
"""
|
150
195
|
return self.to_scenario_list().collapse(field, separator).to_dataset()
|
151
196
|
|
152
197
|
def long(self, exclude_fields: list[str] = None) -> Dataset:
|
198
|
+
"""Convert the dataset from wide to long format.
|
199
|
+
|
200
|
+
Examples:
|
201
|
+
>>> d = Dataset([{'a': [1, 2], 'b': [3, 4]}])
|
202
|
+
>>> d.long().data
|
203
|
+
[{'row': [0, 0, 1, 1]}, {'key': ['a', 'b', 'a', 'b']}, {'value': [1, 3, 2, 4]}]
|
204
|
+
|
205
|
+
>>> d = Dataset([{'x': [1, 2], 'y': [3, 4], 'z': [5, 6]}])
|
206
|
+
>>> d.long(exclude_fields=['z']).data
|
207
|
+
[{'row': [0, 0, 1, 1]}, {'key': ['x', 'y', 'x', 'y']}, {'value': [1, 3, 2, 4]}, {'z': [5, 5, 6, 6]}]
|
208
|
+
"""
|
153
209
|
headers, data = self._tabular()
|
154
210
|
exclude_fields = exclude_fields or []
|
155
211
|
|
@@ -185,14 +241,14 @@ class Dataset(UserList, DatasetOperationsMixin, PersistenceMixin, HashingMixin):
|
|
185
241
|
"""
|
186
242
|
Convert a long-format dataset (with row, key, value columns) to wide format.
|
187
243
|
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
244
|
+
Examples:
|
245
|
+
>>> d = Dataset([{'row': [0, 0, 1, 1]}, {'key': ['a', 'b', 'a', 'b']}, {'value': [1, 3, 2, 4]}])
|
246
|
+
>>> d.wide().data
|
247
|
+
[{'a': [1, 2]}, {'b': [3, 4]}]
|
248
|
+
|
249
|
+
>>> d = Dataset([{'row': [0, 0, 1, 1]}, {'key': ['x', 'y', 'x', 'y']}, {'value': [1, 3, 2, 4]}, {'z': [5, 5, 6, 6]}])
|
250
|
+
>>> d.wide().data
|
251
|
+
[{'x': [1, 2]}, {'y': [3, 4]}, {'z': [5, 6]}]
|
196
252
|
"""
|
197
253
|
# Extract the component arrays
|
198
254
|
row_dict = next(col for col in self if "row" in col)
|
@@ -219,22 +275,84 @@ class Dataset(UserList, DatasetOperationsMixin, PersistenceMixin, HashingMixin):
|
|
219
275
|
output_row_idx = unique_rows.index(row_idx)
|
220
276
|
result[key][output_row_idx] = value
|
221
277
|
|
278
|
+
# Add any additional columns that weren't part of the key-value transformation
|
279
|
+
additional_columns = []
|
280
|
+
for col in self:
|
281
|
+
col_key = list(col.keys())[0]
|
282
|
+
if col_key not in ['row', 'key', 'value']:
|
283
|
+
# Get unique values for this column
|
284
|
+
unique_values = []
|
285
|
+
for row_idx in unique_rows:
|
286
|
+
# Find the first occurrence of this row index
|
287
|
+
for i, r in enumerate(rows):
|
288
|
+
if r == row_idx:
|
289
|
+
unique_values.append(col[col_key][i])
|
290
|
+
break
|
291
|
+
additional_columns.append({col_key: unique_values})
|
292
|
+
|
222
293
|
# Convert to list of column dictionaries format
|
223
|
-
|
294
|
+
result_columns = [{key: values} for key, values in result.items()]
|
295
|
+
return Dataset(result_columns + additional_columns)
|
224
296
|
|
225
297
|
def __repr__(self) -> str:
|
226
|
-
"""Return a string representation of the dataset.
|
298
|
+
"""Return a string representation of the dataset.
|
299
|
+
|
300
|
+
Examples:
|
301
|
+
>>> d = Dataset([{'a': [1, 2, 3]}, {'b': [4, 5, 6]}])
|
302
|
+
>>> repr(d)
|
303
|
+
"Dataset([{'a': [1, 2, 3]}, {'b': [4, 5, 6]}])"
|
304
|
+
|
305
|
+
>>> d = Dataset([{'x': ['a', 'b']}])
|
306
|
+
>>> repr(d)
|
307
|
+
"Dataset([{'x': ['a', 'b']}])"
|
308
|
+
"""
|
227
309
|
return f"Dataset({self.data})"
|
228
310
|
|
229
311
|
def write(self, filename: str, tablefmt: Optional[str] = None) -> None:
|
312
|
+
"""Write the dataset to a file in the specified format.
|
313
|
+
|
314
|
+
Args:
|
315
|
+
filename: The name of the file to write to.
|
316
|
+
tablefmt: Optional format for the table (e.g., 'csv', 'html', 'latex').
|
317
|
+
"""
|
230
318
|
return self.table(tablefmt=tablefmt).write(filename)
|
231
319
|
|
232
320
|
def _repr_html_(self):
|
233
|
-
|
321
|
+
"""Return an HTML representation of the dataset for Jupyter notebooks.
|
322
|
+
|
323
|
+
Examples:
|
324
|
+
>>> d = Dataset([{'a': [1, 2, 3]}, {'b': [4, 5, 6]}])
|
325
|
+
>>> html = d._repr_html_()
|
326
|
+
>>> isinstance(html, str)
|
327
|
+
True
|
328
|
+
>>> '<table' in html
|
329
|
+
True
|
330
|
+
"""
|
234
331
|
return self.table(print_parameters=self.print_parameters)._repr_html_()
|
235
|
-
# return TableDisplay(headers=headers, data=data, raw_data_set=self)
|
236
332
|
|
237
333
|
def _tabular(self) -> tuple[list[str], list[list[Any]]]:
|
334
|
+
"""Convert the dataset to a tabular format (headers and rows).
|
335
|
+
|
336
|
+
Returns:
|
337
|
+
A tuple containing:
|
338
|
+
- List of column headers
|
339
|
+
- List of rows, where each row is a list of values
|
340
|
+
|
341
|
+
Examples:
|
342
|
+
>>> d = Dataset([{'a': [1, 2, 3]}, {'b': [4, 5, 6]}])
|
343
|
+
>>> headers, rows = d._tabular()
|
344
|
+
>>> headers
|
345
|
+
['a', 'b']
|
346
|
+
>>> rows
|
347
|
+
[[1, 4], [2, 5], [3, 6]]
|
348
|
+
|
349
|
+
>>> d = Dataset([{'x': ['a', 'b']}, {'y': [1, 2]}])
|
350
|
+
>>> headers, rows = d._tabular()
|
351
|
+
>>> headers
|
352
|
+
['x', 'y']
|
353
|
+
>>> rows
|
354
|
+
[['a', 1], ['b', 2]]
|
355
|
+
"""
|
238
356
|
# Extract headers
|
239
357
|
headers = []
|
240
358
|
for entry in self.data:
|
@@ -261,9 +379,20 @@ class Dataset(UserList, DatasetOperationsMixin, PersistenceMixin, HashingMixin):
|
|
261
379
|
def _key_to_value(self, key: str) -> Any:
|
262
380
|
"""Retrieve the value associated with the given key from the dataset.
|
263
381
|
|
264
|
-
|
265
|
-
|
266
|
-
|
382
|
+
Args:
|
383
|
+
key: The key to look up in the dataset.
|
384
|
+
|
385
|
+
Returns:
|
386
|
+
The list of values associated with the key.
|
387
|
+
|
388
|
+
Examples:
|
389
|
+
>>> d = Dataset([{'a.b': [1, 2, 3, 4]}])
|
390
|
+
>>> d._key_to_value('a.b')
|
391
|
+
[1, 2, 3, 4]
|
392
|
+
|
393
|
+
>>> d = Dataset([{'x.y': [1, 2]}, {'z.w': [3, 4]}])
|
394
|
+
>>> d._key_to_value('w')
|
395
|
+
[3, 4]
|
267
396
|
"""
|
268
397
|
potential_matches = []
|
269
398
|
for data_dict in self.data:
|
@@ -287,11 +416,15 @@ class Dataset(UserList, DatasetOperationsMixin, PersistenceMixin, HashingMixin):
|
|
287
416
|
def first(self) -> dict[str, Any]:
|
288
417
|
"""Get the first value of the first key in the first dictionary.
|
289
418
|
|
290
|
-
|
291
|
-
|
292
|
-
|
419
|
+
Examples:
|
420
|
+
>>> d = Dataset([{'a': [1, 2, 3, 4]}, {'b': [5, 6, 7, 8]}])
|
421
|
+
>>> d.first()
|
422
|
+
1
|
423
|
+
|
424
|
+
>>> d = Dataset([{'x': ['first', 'second']}])
|
425
|
+
>>> d.first()
|
426
|
+
'first'
|
293
427
|
"""
|
294
|
-
|
295
428
|
def get_values(d):
|
296
429
|
"""Get the values of the first key in the dictionary."""
|
297
430
|
return list(d.values())[0]
|
@@ -299,9 +432,27 @@ class Dataset(UserList, DatasetOperationsMixin, PersistenceMixin, HashingMixin):
|
|
299
432
|
return get_values(self.data[0])[0]
|
300
433
|
|
301
434
|
def latex(self, **kwargs):
|
435
|
+
"""Return a LaTeX representation of the dataset.
|
436
|
+
|
437
|
+
Args:
|
438
|
+
**kwargs: Additional arguments to pass to the table formatter.
|
439
|
+
|
440
|
+
|
441
|
+
"""
|
302
442
|
return self.table().latex()
|
303
443
|
|
304
444
|
def remove_prefix(self) -> Dataset:
|
445
|
+
"""Remove the prefix from column names that contain dots.
|
446
|
+
|
447
|
+
Examples:
|
448
|
+
>>> d = Dataset([{'a.b': [1, 2, 3]}, {'c.d': [4, 5, 6]}])
|
449
|
+
>>> d.remove_prefix().data
|
450
|
+
[{'b': [1, 2, 3]}, {'d': [4, 5, 6]}]
|
451
|
+
|
452
|
+
>>> d = Dataset([{'x.y.z': [1, 2]}, {'a.b.c': [3, 4]}])
|
453
|
+
>>> d.remove_prefix().data
|
454
|
+
[{'y': [1, 2]}, {'b': [3, 4]}]
|
455
|
+
"""
|
305
456
|
new_data = []
|
306
457
|
for observation in self.data:
|
307
458
|
key, values = list(observation.items())[0]
|
@@ -323,6 +474,17 @@ class Dataset(UserList, DatasetOperationsMixin, PersistenceMixin, HashingMixin):
|
|
323
474
|
|
324
475
|
Returns:
|
325
476
|
TableDisplay object
|
477
|
+
|
478
|
+
Examples:
|
479
|
+
>>> d = Dataset([{'a': [1, 2, 3]}, {'b': [4, 5, 6]}])
|
480
|
+
>>> display = d.print(format='rich')
|
481
|
+
>>> display is not None
|
482
|
+
True
|
483
|
+
|
484
|
+
>>> d = Dataset([{'long_column_name': [1, 2]}])
|
485
|
+
>>> display = d.print(pretty_labels={'long_column_name': 'Short'})
|
486
|
+
>>> display is not None
|
487
|
+
True
|
326
488
|
"""
|
327
489
|
if "format" in kwargs:
|
328
490
|
if kwargs["format"] not in ["html", "markdown", "rich", "latex"]:
|
@@ -342,6 +504,17 @@ class Dataset(UserList, DatasetOperationsMixin, PersistenceMixin, HashingMixin):
|
|
342
504
|
return self.table(tablefmt=tablefmt)
|
343
505
|
|
344
506
|
def rename(self, rename_dic) -> Dataset:
|
507
|
+
"""Rename columns in the dataset according to the provided dictionary.
|
508
|
+
|
509
|
+
Examples:
|
510
|
+
>>> d = Dataset([{'a': [1, 2, 3]}, {'b': [4, 5, 6]}])
|
511
|
+
>>> d.rename({'a': 'x', 'b': 'y'}).data
|
512
|
+
[{'x': [1, 2, 3]}, {'y': [4, 5, 6]}]
|
513
|
+
|
514
|
+
>>> d = Dataset([{'old_name': [1, 2]}])
|
515
|
+
>>> d.rename({'old_name': 'new_name'}).data
|
516
|
+
[{'new_name': [1, 2]}]
|
517
|
+
"""
|
345
518
|
new_data = []
|
346
519
|
for observation in self.data:
|
347
520
|
key, values = list(observation.items())[0]
|
@@ -350,9 +523,20 @@ class Dataset(UserList, DatasetOperationsMixin, PersistenceMixin, HashingMixin):
|
|
350
523
|
return Dataset(new_data)
|
351
524
|
|
352
525
|
def merge(self, other: Dataset, by_x, by_y) -> Dataset:
|
353
|
-
"""Merge the dataset with another dataset on the given keys.
|
526
|
+
"""Merge the dataset with another dataset on the given keys.
|
354
527
|
|
355
|
-
|
528
|
+
Examples:
|
529
|
+
>>> d1 = Dataset([{'key': [1, 2, 3]}, {'value1': ['a', 'b', 'c']}])
|
530
|
+
>>> d2 = Dataset([{'key': [2, 3, 4]}, {'value2': ['x', 'y', 'z']}])
|
531
|
+
>>> merged = d1.merge(d2, 'key', 'key')
|
532
|
+
>>> len(merged.data[0]['key'])
|
533
|
+
3
|
534
|
+
|
535
|
+
>>> d1 = Dataset([{'id': [1, 2]}, {'name': ['Alice', 'Bob']}])
|
536
|
+
>>> d2 = Dataset([{'id': [2, 3]}, {'age': [25, 30]}])
|
537
|
+
>>> merged = d1.merge(d2, 'id', 'id')
|
538
|
+
>>> len(merged.data[0]['id'])
|
539
|
+
2
|
356
540
|
"""
|
357
541
|
df1 = self.to_pandas()
|
358
542
|
df2 = other.to_pandas()
|
@@ -360,17 +544,23 @@ class Dataset(UserList, DatasetOperationsMixin, PersistenceMixin, HashingMixin):
|
|
360
544
|
return Dataset.from_pandas_dataframe(merged_df)
|
361
545
|
|
362
546
|
def to(self, survey_or_question: Union["Survey", "QuestionBase"]) -> "Job":
|
363
|
-
"""
|
364
|
-
|
365
|
-
|
366
|
-
|
367
|
-
|
368
|
-
|
369
|
-
|
370
|
-
|
547
|
+
"""Transform the dataset using a survey or question.
|
548
|
+
|
549
|
+
Args:
|
550
|
+
survey_or_question: Either a Survey or QuestionBase object to apply to the dataset.
|
551
|
+
|
552
|
+
Examples:
|
553
|
+
>>> from edsl import QuestionFreeText
|
554
|
+
>>> from edsl.jobs import Jobs
|
555
|
+
>>> d = Dataset([{'name': ['Alice', 'Bob']}])
|
556
|
+
>>> q = QuestionFreeText(question_text="How are you, {{ name }}?", question_name="how_feeling")
|
557
|
+
>>> job = d.to(q)
|
558
|
+
>>> isinstance(job, Jobs)
|
559
|
+
True
|
371
560
|
"""
|
372
561
|
from ..surveys import Survey
|
373
562
|
from ..questions import QuestionBase
|
563
|
+
from ..jobs import Jobs
|
374
564
|
|
375
565
|
if isinstance(survey_or_question, Survey):
|
376
566
|
return survey_or_question.by(self.to_scenario_list())
|
@@ -380,15 +570,14 @@ class Dataset(UserList, DatasetOperationsMixin, PersistenceMixin, HashingMixin):
|
|
380
570
|
def select(self, *keys) -> Dataset:
|
381
571
|
"""Return a new dataset with only the selected keys.
|
382
572
|
|
383
|
-
:
|
384
|
-
|
385
|
-
|
386
|
-
|
387
|
-
|
388
|
-
|
389
|
-
|
390
|
-
|
391
|
-
|
573
|
+
Examples:
|
574
|
+
>>> d = Dataset([{'a': [1, 2, 3, 4]}, {'b': [5, 6, 7, 8]}, {'c': [9, 10, 11, 12]}])
|
575
|
+
>>> d.select('a', 'c').data
|
576
|
+
[{'a': [1, 2, 3, 4]}, {'c': [9, 10, 11, 12]}]
|
577
|
+
|
578
|
+
>>> d = Dataset([{'x': [1, 2]}, {'y': [3, 4]}])
|
579
|
+
>>> d.select('x').data
|
580
|
+
[{'x': [1, 2]}]
|
392
581
|
"""
|
393
582
|
for key in keys:
|
394
583
|
if key not in self.keys():
|
@@ -410,9 +599,14 @@ class Dataset(UserList, DatasetOperationsMixin, PersistenceMixin, HashingMixin):
|
|
410
599
|
def to_json(self):
|
411
600
|
"""Return a JSON representation of the dataset.
|
412
601
|
|
413
|
-
|
414
|
-
|
415
|
-
|
602
|
+
Examples:
|
603
|
+
>>> d = Dataset([{'a': [1, 2, 3]}, {'b': [4, 5, 6]}])
|
604
|
+
>>> d.to_json()
|
605
|
+
[{'a': [1, 2, 3]}, {'b': [4, 5, 6]}]
|
606
|
+
|
607
|
+
>>> d = Dataset([{'x': ['a', 'b']}])
|
608
|
+
>>> d.to_json()
|
609
|
+
[{'x': ['a', 'b']}]
|
416
610
|
"""
|
417
611
|
return json.loads(
|
418
612
|
json.dumps(self.data)
|
@@ -421,9 +615,16 @@ class Dataset(UserList, DatasetOperationsMixin, PersistenceMixin, HashingMixin):
|
|
421
615
|
def shuffle(self, seed=None) -> Dataset:
|
422
616
|
"""Return a new dataset with the observations shuffled.
|
423
617
|
|
424
|
-
|
425
|
-
|
426
|
-
|
618
|
+
Examples:
|
619
|
+
>>> d = Dataset([{'a': [1, 2, 3, 4]}, {'b': [5, 6, 7, 8]}])
|
620
|
+
>>> shuffled = d.shuffle(seed=42)
|
621
|
+
>>> len(shuffled.data[0]['a']) == len(d.data[0]['a'])
|
622
|
+
True
|
623
|
+
|
624
|
+
>>> d = Dataset([{'x': ['a', 'b', 'c']}])
|
625
|
+
>>> shuffled = d.shuffle(seed=123)
|
626
|
+
>>> set(shuffled.data[0]['x']) == set(d.data[0]['x'])
|
627
|
+
True
|
427
628
|
"""
|
428
629
|
if seed is not None:
|
429
630
|
random.seed(seed)
|
@@ -455,14 +656,16 @@ class Dataset(UserList, DatasetOperationsMixin, PersistenceMixin, HashingMixin):
|
|
455
656
|
) -> Dataset:
|
456
657
|
"""Return a new dataset with a sample of the observations.
|
457
658
|
|
458
|
-
:
|
459
|
-
|
460
|
-
|
461
|
-
|
462
|
-
|
463
|
-
|
464
|
-
|
465
|
-
|
659
|
+
Examples:
|
660
|
+
>>> d = Dataset([{'a': [1, 2, 3, 4, 5]}, {'b': [6, 7, 8, 9, 10]}])
|
661
|
+
>>> sampled = d.sample(n=3, seed=42)
|
662
|
+
>>> len(sampled.data[0]['a'])
|
663
|
+
3
|
664
|
+
|
665
|
+
>>> d = Dataset([{'x': ['a', 'b', 'c', 'd']}])
|
666
|
+
>>> sampled = d.sample(frac=0.5, seed=123)
|
667
|
+
>>> len(sampled.data[0]['x'])
|
668
|
+
2
|
466
669
|
"""
|
467
670
|
if seed is not None:
|
468
671
|
random.seed(seed)
|
@@ -503,7 +706,7 @@ class Dataset(UserList, DatasetOperationsMixin, PersistenceMixin, HashingMixin):
|
|
503
706
|
|
504
707
|
return self
|
505
708
|
|
506
|
-
def get_sort_indices(self, lst: list[Any], reverse: bool = False
|
709
|
+
def get_sort_indices(self, lst: list[Any], reverse: bool = False) -> list[int]:
|
507
710
|
"""
|
508
711
|
Return the indices that would sort the list, using either numpy or pure Python.
|
509
712
|
None values are placed at the end of the sorted list.
|
@@ -515,44 +718,35 @@ class Dataset(UserList, DatasetOperationsMixin, PersistenceMixin, HashingMixin):
|
|
515
718
|
|
516
719
|
Returns:
|
517
720
|
A list of indices that would sort the list
|
518
|
-
"""
|
519
|
-
if use_numpy:
|
520
|
-
try:
|
521
|
-
import numpy as np
|
522
|
-
# Convert list to numpy array
|
523
|
-
arr = np.array(lst, dtype=object)
|
524
|
-
# Get mask of non-None values
|
525
|
-
mask = ~(arr is None)
|
526
|
-
# Get indices of non-None and None values
|
527
|
-
non_none_indices = np.where(mask)[0]
|
528
|
-
none_indices = np.where(~mask)[0]
|
529
|
-
# Sort non-None values
|
530
|
-
sorted_indices = non_none_indices[np.argsort(arr[mask])]
|
531
|
-
# Combine sorted non-None indices with None indices
|
532
|
-
indices = np.concatenate([sorted_indices, none_indices]).tolist()
|
533
|
-
if reverse:
|
534
|
-
# When reversing, keep None values at end
|
535
|
-
indices = sorted_indices[::-1].tolist() + none_indices.tolist()
|
536
|
-
return indices
|
537
|
-
except ImportError:
|
538
|
-
# Fallback to pure Python if numpy is not available
|
539
|
-
pass
|
540
|
-
|
541
|
-
# Pure Python implementation
|
721
|
+
"""
|
542
722
|
enumerated = list(enumerate(lst))
|
543
|
-
# Sort None values to end by using (is_none, value) as sort key
|
544
723
|
sorted_pairs = sorted(enumerated,
|
545
724
|
key=lambda x: (x[1] is None, x[1]),
|
546
725
|
reverse=reverse)
|
547
726
|
return [index for index, _ in sorted_pairs]
|
548
727
|
|
549
|
-
def order_by(self, sort_key: str, reverse: bool = False
|
728
|
+
def order_by(self, sort_key: str, reverse: bool = False) -> Dataset:
|
550
729
|
"""Return a new dataset with the observations sorted by the given key.
|
551
730
|
|
552
731
|
Args:
|
553
732
|
sort_key: The key to sort the observations by
|
554
733
|
reverse: Whether to sort in reverse order
|
555
|
-
|
734
|
+
|
735
|
+
Examples:
|
736
|
+
>>> d = Dataset([{'a': [3, 1, 4, 1, 5]}, {'b': ['x', 'y', 'z', 'w', 'v']}])
|
737
|
+
>>> sorted_d = d.order_by('a')
|
738
|
+
>>> sorted_d.data
|
739
|
+
[{'a': [1, 1, 3, 4, 5]}, {'b': ['y', 'w', 'x', 'z', 'v']}]
|
740
|
+
|
741
|
+
>>> d = Dataset([{'a': [3, 1, 4, 1, 5]}, {'b': ['x', 'y', 'z', 'w', 'v']}])
|
742
|
+
>>> sorted_d = d.order_by('a', reverse=True)
|
743
|
+
>>> sorted_d.data
|
744
|
+
[{'a': [5, 4, 3, 1, 1]}, {'b': ['v', 'z', 'x', 'y', 'w']}]
|
745
|
+
|
746
|
+
>>> d = Dataset([{'a': [3, None, 1, 4, None]}, {'b': ['x', 'y', 'z', 'w', 'v']}])
|
747
|
+
>>> sorted_d = d.order_by('a')
|
748
|
+
>>> sorted_d.data
|
749
|
+
[{'a': [1, 3, 4, None, None]}, {'b': ['z', 'x', 'w', 'y', 'v']}]
|
556
750
|
"""
|
557
751
|
number_found = 0
|
558
752
|
for obs in self.data:
|
@@ -566,7 +760,7 @@ class Dataset(UserList, DatasetOperationsMixin, PersistenceMixin, HashingMixin):
|
|
566
760
|
elif number_found > 1:
|
567
761
|
raise DatasetKeyError(f"Key '{sort_key}' found in more than one dictionary.")
|
568
762
|
|
569
|
-
sort_indices_list = self.get_sort_indices(relevant_values, reverse=reverse
|
763
|
+
sort_indices_list = self.get_sort_indices(relevant_values, reverse=reverse)
|
570
764
|
new_data = []
|
571
765
|
for observation in self.data:
|
572
766
|
key, values = list(observation.items())[0]
|
@@ -663,15 +857,26 @@ class Dataset(UserList, DatasetOperationsMixin, PersistenceMixin, HashingMixin):
|
|
663
857
|
data=data, headers=headers, tablefmt=tablefmt, raw_data_set=self
|
664
858
|
)
|
665
859
|
|
666
|
-
def summary(self):
|
667
|
-
|
860
|
+
def summary(self) -> "Dataset":
|
861
|
+
"""Return a summary of the dataset.
|
862
|
+
|
863
|
+
Examples:
|
864
|
+
>>> d = Dataset([{'a': [1, 2, 3]}, {'b': [4, 5, 6]}])
|
865
|
+
>>> d.summary().data
|
866
|
+
[{'num_observations': [3]}, {'keys': [['a', 'b']]}]
|
867
|
+
"""
|
868
|
+
return Dataset([{"num_observations": [len(self)]}, {"keys": [self.keys()]}])
|
668
869
|
|
669
870
|
@classmethod
|
670
|
-
def example(self, n: int = None):
|
871
|
+
def example(self, n: int = None) -> "Dataset":
|
671
872
|
"""Return an example dataset.
|
672
873
|
|
673
|
-
|
674
|
-
|
874
|
+
Examples:
|
875
|
+
>>> Dataset.example()
|
876
|
+
Dataset([{'a': [1, 2, 3, 4]}, {'b': [4, 3, 2, 1]}])
|
877
|
+
|
878
|
+
>>> Dataset.example(n=2)
|
879
|
+
Dataset([{'a': [1, 1]}, {'b': [2, 2]}])
|
675
880
|
"""
|
676
881
|
if n is None:
|
677
882
|
return Dataset([{"a": [1, 2, 3, 4]}, {"b": [4, 3, 2, 1]}])
|
@@ -691,6 +896,15 @@ class Dataset(UserList, DatasetOperationsMixin, PersistenceMixin, HashingMixin):
|
|
691
896
|
def to_dict(self) -> dict:
|
692
897
|
"""
|
693
898
|
Convert the dataset to a dictionary.
|
899
|
+
|
900
|
+
Examples:
|
901
|
+
>>> d = Dataset([{'a': [1, 2, 3]}, {'b': [4, 5, 6]}])
|
902
|
+
>>> d.to_dict()
|
903
|
+
{'data': [{'a': [1, 2, 3]}, {'b': [4, 5, 6]}]}
|
904
|
+
|
905
|
+
>>> d = Dataset([{'x': ['a', 'b']}])
|
906
|
+
>>> d.to_dict()
|
907
|
+
{'data': [{'x': ['a', 'b']}]}
|
694
908
|
"""
|
695
909
|
return {'data': self.data}
|
696
910
|
|
@@ -698,6 +912,17 @@ class Dataset(UserList, DatasetOperationsMixin, PersistenceMixin, HashingMixin):
|
|
698
912
|
def from_dict(cls, data: dict) -> 'Dataset':
|
699
913
|
"""
|
700
914
|
Convert a dictionary to a dataset.
|
915
|
+
|
916
|
+
Examples:
|
917
|
+
>>> d = Dataset.from_dict({'data': [{'a': [1, 2, 3]}, {'b': [4, 5, 6]}]})
|
918
|
+
>>> isinstance(d, Dataset)
|
919
|
+
True
|
920
|
+
>>> d.data
|
921
|
+
[{'a': [1, 2, 3]}, {'b': [4, 5, 6]}]
|
922
|
+
|
923
|
+
>>> d = Dataset.from_dict({'data': [{'x': ['a', 'b']}]})
|
924
|
+
>>> d.data
|
925
|
+
[{'x': ['a', 'b']}]
|
701
926
|
"""
|
702
927
|
return cls(data['data'])
|
703
928
|
|
@@ -708,6 +933,15 @@ class Dataset(UserList, DatasetOperationsMixin, PersistenceMixin, HashingMixin):
|
|
708
933
|
Args:
|
709
934
|
output_file (str): Path to save the Word document
|
710
935
|
title (str, optional): Title for the document
|
936
|
+
|
937
|
+
Examples:
|
938
|
+
>>> import tempfile
|
939
|
+
>>> d = Dataset([{'a': [1, 2, 3]}, {'b': [4, 5, 6]}])
|
940
|
+
>>> with tempfile.NamedTemporaryFile(suffix='.docx') as tmp:
|
941
|
+
... d.to_docx(tmp.name, title='Test Document')
|
942
|
+
... import os
|
943
|
+
... os.path.exists(tmp.name)
|
944
|
+
True
|
711
945
|
"""
|
712
946
|
from docx import Document
|
713
947
|
from docx.shared import Inches
|
@@ -762,11 +996,14 @@ class Dataset(UserList, DatasetOperationsMixin, PersistenceMixin, HashingMixin):
|
|
762
996
|
Returns:
|
763
997
|
A new Dataset with the expanded rows
|
764
998
|
|
765
|
-
|
766
|
-
>>> from edsl.dataset import Dataset
|
999
|
+
Examples:
|
767
1000
|
>>> d = Dataset([{'a': [[1, 2, 3], [4, 5, 6]]}, {'b': ['x', 'y']}])
|
768
|
-
>>> d.expand('a')
|
769
|
-
|
1001
|
+
>>> d.expand('a').data
|
1002
|
+
[{'a': [1, 2, 3, 4, 5, 6]}, {'b': ['x', 'x', 'x', 'y', 'y', 'y']}]
|
1003
|
+
|
1004
|
+
>>> d = Dataset([{'items': [['apple', 'banana'], ['orange']]}, {'id': [1, 2]}])
|
1005
|
+
>>> d.expand('items', number_field=True).data
|
1006
|
+
[{'items': ['apple', 'banana', 'orange']}, {'id': [1, 1, 2]}, {'items_number': [1, 2, 1]}]
|
770
1007
|
"""
|
771
1008
|
from collections.abc import Iterable
|
772
1009
|
|