edsl 0.1.53__py3-none-any.whl → 0.1.55__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (104) hide show
  1. edsl/__init__.py +8 -1
  2. edsl/__init__original.py +134 -0
  3. edsl/__version__.py +1 -1
  4. edsl/agents/agent.py +29 -0
  5. edsl/agents/agent_list.py +36 -1
  6. edsl/base/base_class.py +281 -151
  7. edsl/buckets/__init__.py +8 -3
  8. edsl/buckets/bucket_collection.py +9 -3
  9. edsl/buckets/model_buckets.py +4 -2
  10. edsl/buckets/token_bucket.py +2 -2
  11. edsl/buckets/token_bucket_client.py +5 -3
  12. edsl/caching/cache.py +131 -62
  13. edsl/caching/cache_entry.py +70 -58
  14. edsl/caching/sql_dict.py +17 -0
  15. edsl/cli.py +99 -0
  16. edsl/config/config_class.py +16 -0
  17. edsl/conversation/__init__.py +31 -0
  18. edsl/coop/coop.py +276 -242
  19. edsl/coop/coop_jobs_objects.py +59 -0
  20. edsl/coop/coop_objects.py +29 -0
  21. edsl/coop/coop_regular_objects.py +26 -0
  22. edsl/coop/utils.py +24 -19
  23. edsl/dataset/dataset.py +338 -101
  24. edsl/db_list/sqlite_list.py +349 -0
  25. edsl/inference_services/__init__.py +40 -5
  26. edsl/inference_services/exceptions.py +11 -0
  27. edsl/inference_services/services/anthropic_service.py +5 -2
  28. edsl/inference_services/services/aws_bedrock.py +6 -2
  29. edsl/inference_services/services/azure_ai.py +6 -2
  30. edsl/inference_services/services/google_service.py +3 -2
  31. edsl/inference_services/services/mistral_ai_service.py +6 -2
  32. edsl/inference_services/services/open_ai_service.py +6 -2
  33. edsl/inference_services/services/perplexity_service.py +6 -2
  34. edsl/inference_services/services/test_service.py +105 -7
  35. edsl/interviews/answering_function.py +167 -59
  36. edsl/interviews/interview.py +124 -72
  37. edsl/interviews/interview_task_manager.py +10 -0
  38. edsl/invigilators/invigilators.py +10 -1
  39. edsl/jobs/async_interview_runner.py +146 -104
  40. edsl/jobs/data_structures.py +6 -4
  41. edsl/jobs/decorators.py +61 -0
  42. edsl/jobs/fetch_invigilator.py +61 -18
  43. edsl/jobs/html_table_job_logger.py +14 -2
  44. edsl/jobs/jobs.py +180 -104
  45. edsl/jobs/jobs_component_constructor.py +2 -2
  46. edsl/jobs/jobs_interview_constructor.py +2 -0
  47. edsl/jobs/jobs_pricing_estimation.py +127 -46
  48. edsl/jobs/jobs_remote_inference_logger.py +4 -0
  49. edsl/jobs/jobs_runner_status.py +30 -25
  50. edsl/jobs/progress_bar_manager.py +79 -0
  51. edsl/jobs/remote_inference.py +35 -1
  52. edsl/key_management/key_lookup_builder.py +6 -1
  53. edsl/language_models/language_model.py +102 -12
  54. edsl/language_models/model.py +10 -3
  55. edsl/language_models/price_manager.py +45 -75
  56. edsl/language_models/registry.py +5 -0
  57. edsl/language_models/utilities.py +2 -1
  58. edsl/notebooks/notebook.py +77 -10
  59. edsl/questions/VALIDATION_README.md +134 -0
  60. edsl/questions/__init__.py +24 -1
  61. edsl/questions/exceptions.py +21 -0
  62. edsl/questions/question_check_box.py +171 -149
  63. edsl/questions/question_dict.py +243 -51
  64. edsl/questions/question_multiple_choice_with_other.py +624 -0
  65. edsl/questions/question_registry.py +2 -1
  66. edsl/questions/templates/multiple_choice_with_other/__init__.py +0 -0
  67. edsl/questions/templates/multiple_choice_with_other/answering_instructions.jinja +15 -0
  68. edsl/questions/templates/multiple_choice_with_other/question_presentation.jinja +17 -0
  69. edsl/questions/validation_analysis.py +185 -0
  70. edsl/questions/validation_cli.py +131 -0
  71. edsl/questions/validation_html_report.py +404 -0
  72. edsl/questions/validation_logger.py +136 -0
  73. edsl/results/result.py +63 -16
  74. edsl/results/results.py +702 -171
  75. edsl/scenarios/construct_download_link.py +16 -3
  76. edsl/scenarios/directory_scanner.py +226 -226
  77. edsl/scenarios/file_methods.py +5 -0
  78. edsl/scenarios/file_store.py +117 -6
  79. edsl/scenarios/handlers/__init__.py +5 -1
  80. edsl/scenarios/handlers/mp4_file_store.py +104 -0
  81. edsl/scenarios/handlers/webm_file_store.py +104 -0
  82. edsl/scenarios/scenario.py +120 -101
  83. edsl/scenarios/scenario_list.py +800 -727
  84. edsl/scenarios/scenario_list_gc_test.py +146 -0
  85. edsl/scenarios/scenario_list_memory_test.py +214 -0
  86. edsl/scenarios/scenario_list_source_refactor.md +35 -0
  87. edsl/scenarios/scenario_selector.py +5 -4
  88. edsl/scenarios/scenario_source.py +1990 -0
  89. edsl/scenarios/tests/test_scenario_list_sources.py +52 -0
  90. edsl/surveys/survey.py +22 -0
  91. edsl/tasks/__init__.py +4 -2
  92. edsl/tasks/task_history.py +198 -36
  93. edsl/tests/scenarios/test_ScenarioSource.py +51 -0
  94. edsl/tests/scenarios/test_scenario_list_sources.py +51 -0
  95. edsl/utilities/__init__.py +2 -1
  96. edsl/utilities/decorators.py +121 -0
  97. edsl/utilities/memory_debugger.py +1010 -0
  98. {edsl-0.1.53.dist-info → edsl-0.1.55.dist-info}/METADATA +52 -76
  99. {edsl-0.1.53.dist-info → edsl-0.1.55.dist-info}/RECORD +102 -78
  100. edsl/jobs/jobs_runner_asyncio.py +0 -281
  101. edsl/language_models/unused/fake_openai_service.py +0 -60
  102. {edsl-0.1.53.dist-info → edsl-0.1.55.dist-info}/LICENSE +0 -0
  103. {edsl-0.1.53.dist-info → edsl-0.1.55.dist-info}/WHEEL +0 -0
  104. {edsl-0.1.53.dist-info → edsl-0.1.55.dist-info}/entry_points.txt +0 -0
edsl/dataset/dataset.py CHANGED
@@ -127,29 +127,85 @@ class Dataset(UserList, DatasetOperationsMixin, PersistenceMixin, HashingMixin):
127
127
  def keys(self) -> list[str]:
128
128
  """Return the keys of the dataset.
129
129
 
130
- >>> d = Dataset([{'a.b':[1,2,3,4]}])
131
- >>> d.keys()
132
- ['a.b']
130
+ Examples:
131
+ >>> d = Dataset([{'a': [1, 2, 3, 4]}, {'b': [5, 6, 7, 8]}])
132
+ >>> d.keys()
133
+ ['a', 'b']
134
+
135
+ >>> d = Dataset([{'x.y': [1, 2]}, {'z.w': [3, 4]}])
136
+ >>> d.keys()
137
+ ['x.y', 'z.w']
138
+ """
139
+ return [list(o.keys())[0] for o in self]
133
140
 
134
- >>> d = Dataset([{'a.b':[1,2,3,4]}, {'c.d':[5,6,7,8]}])
135
- >>> d.keys()
136
- ['a.b', 'c.d']
141
+ def filter(self, expression) -> "Dataset":
142
+ """Filter the dataset based on a boolean expression.
137
143
 
144
+ Args:
145
+ expression: A string expression that evaluates to a boolean value.
146
+ Can reference column names in the dataset.
138
147
 
139
- ['a.b']
148
+ Examples:
149
+ >>> d = Dataset([{'a': [1, 2, 3, 4]}, {'b': [5, 6, 7, 8]}])
150
+ >>> d.filter('a > 2').data
151
+ [{'a': [3, 4]}, {'b': [7, 8]}]
152
+
153
+ >>> d = Dataset([{'x': ['a', 'b', 'c']}, {'y': [1, 2, 3]}])
154
+ >>> d.filter('y < 3').data
155
+ [{'x': ['a', 'b']}, {'y': [1, 2]}]
140
156
  """
141
- return [list(o.keys())[0] for o in self]
142
-
143
- def filter(self, expression):
144
157
  return self.to_scenario_list().filter(expression).to_dataset()
145
158
 
146
159
  def mutate(self, new_var_string: str, functions_dict: Optional[dict[str, Callable]] = None) -> "Dataset":
160
+ """Create new columns by applying functions to existing columns.
161
+
162
+ Args:
163
+ new_var_string: A string expression defining the new variable.
164
+ Can reference existing column names.
165
+ functions_dict: Optional dictionary of custom functions to use in the expression.
166
+
167
+ Examples:
168
+ >>> d = Dataset([{'a': [1, 2, 3]}, {'b': [4, 5, 6]}])
169
+ >>> d.mutate('c = a + b').data
170
+ [{'a': [1, 2, 3]}, {'b': [4, 5, 6]}, {'c': [5, 7, 9]}]
171
+
172
+ >>> d = Dataset([{'x': [1, 2, 3]}])
173
+ >>> d.mutate('y = x * 2').data
174
+ [{'x': [1, 2, 3]}, {'y': [2, 4, 6]}]
175
+ """
147
176
  return self.to_scenario_list().mutate(new_var_string, functions_dict).to_dataset()
148
177
 
149
- def collapse(self, field:str, separator: Optional[str] = None) -> "Dataset":
178
+ def collapse(self, field: str, separator: Optional[str] = None) -> "Dataset":
179
+ """Collapse multiple values in a field into a single value using a separator.
180
+
181
+ Args:
182
+ field: The name of the field to collapse.
183
+ separator: Optional string to use as a separator between values.
184
+ Defaults to a space if not specified.
185
+
186
+ Examples:
187
+ >>> d = Dataset([{'words': [['hello', 'world'], ['good', 'morning']]}])
188
+ >>> d.collapse('words').data
189
+ [{'words': [[['hello', 'world'], ['good', 'morning']]]}]
190
+
191
+ >>> d = Dataset([{'numbers': [1, 2, 3]}])
192
+ >>> d.collapse('numbers', separator=',').data
193
+ [{'numbers': ['1,2,3']}]
194
+ """
150
195
  return self.to_scenario_list().collapse(field, separator).to_dataset()
151
196
 
152
197
  def long(self, exclude_fields: list[str] = None) -> Dataset:
198
+ """Convert the dataset from wide to long format.
199
+
200
+ Examples:
201
+ >>> d = Dataset([{'a': [1, 2], 'b': [3, 4]}])
202
+ >>> d.long().data
203
+ [{'row': [0, 0, 1, 1]}, {'key': ['a', 'b', 'a', 'b']}, {'value': [1, 3, 2, 4]}]
204
+
205
+ >>> d = Dataset([{'x': [1, 2], 'y': [3, 4], 'z': [5, 6]}])
206
+ >>> d.long(exclude_fields=['z']).data
207
+ [{'row': [0, 0, 1, 1]}, {'key': ['x', 'y', 'x', 'y']}, {'value': [1, 3, 2, 4]}, {'z': [5, 5, 6, 6]}]
208
+ """
153
209
  headers, data = self._tabular()
154
210
  exclude_fields = exclude_fields or []
155
211
 
@@ -185,14 +241,14 @@ class Dataset(UserList, DatasetOperationsMixin, PersistenceMixin, HashingMixin):
185
241
  """
186
242
  Convert a long-format dataset (with row, key, value columns) to wide format.
187
243
 
188
- Expected input format:
189
- - A dataset with three columns containing dictionaries:
190
- - row: list of row indices
191
- - key: list of column names
192
- - value: list of values
193
-
194
- Returns:
195
- - Dataset: A new dataset with columns corresponding to unique keys
244
+ Examples:
245
+ >>> d = Dataset([{'row': [0, 0, 1, 1]}, {'key': ['a', 'b', 'a', 'b']}, {'value': [1, 3, 2, 4]}])
246
+ >>> d.wide().data
247
+ [{'a': [1, 2]}, {'b': [3, 4]}]
248
+
249
+ >>> d = Dataset([{'row': [0, 0, 1, 1]}, {'key': ['x', 'y', 'x', 'y']}, {'value': [1, 3, 2, 4]}, {'z': [5, 5, 6, 6]}])
250
+ >>> d.wide().data
251
+ [{'x': [1, 2]}, {'y': [3, 4]}, {'z': [5, 6]}]
196
252
  """
197
253
  # Extract the component arrays
198
254
  row_dict = next(col for col in self if "row" in col)
@@ -219,22 +275,84 @@ class Dataset(UserList, DatasetOperationsMixin, PersistenceMixin, HashingMixin):
219
275
  output_row_idx = unique_rows.index(row_idx)
220
276
  result[key][output_row_idx] = value
221
277
 
278
+ # Add any additional columns that weren't part of the key-value transformation
279
+ additional_columns = []
280
+ for col in self:
281
+ col_key = list(col.keys())[0]
282
+ if col_key not in ['row', 'key', 'value']:
283
+ # Get unique values for this column
284
+ unique_values = []
285
+ for row_idx in unique_rows:
286
+ # Find the first occurrence of this row index
287
+ for i, r in enumerate(rows):
288
+ if r == row_idx:
289
+ unique_values.append(col[col_key][i])
290
+ break
291
+ additional_columns.append({col_key: unique_values})
292
+
222
293
  # Convert to list of column dictionaries format
223
- return Dataset([{key: values} for key, values in result.items()])
294
+ result_columns = [{key: values} for key, values in result.items()]
295
+ return Dataset(result_columns + additional_columns)
224
296
 
225
297
  def __repr__(self) -> str:
226
- """Return a string representation of the dataset."""
298
+ """Return a string representation of the dataset.
299
+
300
+ Examples:
301
+ >>> d = Dataset([{'a': [1, 2, 3]}, {'b': [4, 5, 6]}])
302
+ >>> repr(d)
303
+ "Dataset([{'a': [1, 2, 3]}, {'b': [4, 5, 6]}])"
304
+
305
+ >>> d = Dataset([{'x': ['a', 'b']}])
306
+ >>> repr(d)
307
+ "Dataset([{'x': ['a', 'b']}])"
308
+ """
227
309
  return f"Dataset({self.data})"
228
310
 
229
311
  def write(self, filename: str, tablefmt: Optional[str] = None) -> None:
312
+ """Write the dataset to a file in the specified format.
313
+
314
+ Args:
315
+ filename: The name of the file to write to.
316
+ tablefmt: Optional format for the table (e.g., 'csv', 'html', 'latex').
317
+ """
230
318
  return self.table(tablefmt=tablefmt).write(filename)
231
319
 
232
320
  def _repr_html_(self):
233
- # headers, data = self._tabular()
321
+ """Return an HTML representation of the dataset for Jupyter notebooks.
322
+
323
+ Examples:
324
+ >>> d = Dataset([{'a': [1, 2, 3]}, {'b': [4, 5, 6]}])
325
+ >>> html = d._repr_html_()
326
+ >>> isinstance(html, str)
327
+ True
328
+ >>> '<table' in html
329
+ True
330
+ """
234
331
  return self.table(print_parameters=self.print_parameters)._repr_html_()
235
- # return TableDisplay(headers=headers, data=data, raw_data_set=self)
236
332
 
237
333
  def _tabular(self) -> tuple[list[str], list[list[Any]]]:
334
+ """Convert the dataset to a tabular format (headers and rows).
335
+
336
+ Returns:
337
+ A tuple containing:
338
+ - List of column headers
339
+ - List of rows, where each row is a list of values
340
+
341
+ Examples:
342
+ >>> d = Dataset([{'a': [1, 2, 3]}, {'b': [4, 5, 6]}])
343
+ >>> headers, rows = d._tabular()
344
+ >>> headers
345
+ ['a', 'b']
346
+ >>> rows
347
+ [[1, 4], [2, 5], [3, 6]]
348
+
349
+ >>> d = Dataset([{'x': ['a', 'b']}, {'y': [1, 2]}])
350
+ >>> headers, rows = d._tabular()
351
+ >>> headers
352
+ ['x', 'y']
353
+ >>> rows
354
+ [['a', 1], ['b', 2]]
355
+ """
238
356
  # Extract headers
239
357
  headers = []
240
358
  for entry in self.data:
@@ -261,9 +379,20 @@ class Dataset(UserList, DatasetOperationsMixin, PersistenceMixin, HashingMixin):
261
379
  def _key_to_value(self, key: str) -> Any:
262
380
  """Retrieve the value associated with the given key from the dataset.
263
381
 
264
- >>> d = Dataset([{'a.b':[1,2,3,4]}])
265
- >>> d._key_to_value('a.b')
266
- [1, 2, 3, 4]
382
+ Args:
383
+ key: The key to look up in the dataset.
384
+
385
+ Returns:
386
+ The list of values associated with the key.
387
+
388
+ Examples:
389
+ >>> d = Dataset([{'a.b': [1, 2, 3, 4]}])
390
+ >>> d._key_to_value('a.b')
391
+ [1, 2, 3, 4]
392
+
393
+ >>> d = Dataset([{'x.y': [1, 2]}, {'z.w': [3, 4]}])
394
+ >>> d._key_to_value('w')
395
+ [3, 4]
267
396
  """
268
397
  potential_matches = []
269
398
  for data_dict in self.data:
@@ -287,11 +416,15 @@ class Dataset(UserList, DatasetOperationsMixin, PersistenceMixin, HashingMixin):
287
416
  def first(self) -> dict[str, Any]:
288
417
  """Get the first value of the first key in the first dictionary.
289
418
 
290
- >>> d = Dataset([{'a.b':[1,2,3,4]}])
291
- >>> d.first()
292
- 1
419
+ Examples:
420
+ >>> d = Dataset([{'a': [1, 2, 3, 4]}, {'b': [5, 6, 7, 8]}])
421
+ >>> d.first()
422
+ 1
423
+
424
+ >>> d = Dataset([{'x': ['first', 'second']}])
425
+ >>> d.first()
426
+ 'first'
293
427
  """
294
-
295
428
  def get_values(d):
296
429
  """Get the values of the first key in the dictionary."""
297
430
  return list(d.values())[0]
@@ -299,9 +432,27 @@ class Dataset(UserList, DatasetOperationsMixin, PersistenceMixin, HashingMixin):
299
432
  return get_values(self.data[0])[0]
300
433
 
301
434
  def latex(self, **kwargs):
435
+ """Return a LaTeX representation of the dataset.
436
+
437
+ Args:
438
+ **kwargs: Additional arguments to pass to the table formatter.
439
+
440
+
441
+ """
302
442
  return self.table().latex()
303
443
 
304
444
  def remove_prefix(self) -> Dataset:
445
+ """Remove the prefix from column names that contain dots.
446
+
447
+ Examples:
448
+ >>> d = Dataset([{'a.b': [1, 2, 3]}, {'c.d': [4, 5, 6]}])
449
+ >>> d.remove_prefix().data
450
+ [{'b': [1, 2, 3]}, {'d': [4, 5, 6]}]
451
+
452
+ >>> d = Dataset([{'x.y.z': [1, 2]}, {'a.b.c': [3, 4]}])
453
+ >>> d.remove_prefix().data
454
+ [{'y': [1, 2]}, {'b': [3, 4]}]
455
+ """
305
456
  new_data = []
306
457
  for observation in self.data:
307
458
  key, values = list(observation.items())[0]
@@ -323,6 +474,17 @@ class Dataset(UserList, DatasetOperationsMixin, PersistenceMixin, HashingMixin):
323
474
 
324
475
  Returns:
325
476
  TableDisplay object
477
+
478
+ Examples:
479
+ >>> d = Dataset([{'a': [1, 2, 3]}, {'b': [4, 5, 6]}])
480
+ >>> display = d.print(format='rich')
481
+ >>> display is not None
482
+ True
483
+
484
+ >>> d = Dataset([{'long_column_name': [1, 2]}])
485
+ >>> display = d.print(pretty_labels={'long_column_name': 'Short'})
486
+ >>> display is not None
487
+ True
326
488
  """
327
489
  if "format" in kwargs:
328
490
  if kwargs["format"] not in ["html", "markdown", "rich", "latex"]:
@@ -342,6 +504,17 @@ class Dataset(UserList, DatasetOperationsMixin, PersistenceMixin, HashingMixin):
342
504
  return self.table(tablefmt=tablefmt)
343
505
 
344
506
  def rename(self, rename_dic) -> Dataset:
507
+ """Rename columns in the dataset according to the provided dictionary.
508
+
509
+ Examples:
510
+ >>> d = Dataset([{'a': [1, 2, 3]}, {'b': [4, 5, 6]}])
511
+ >>> d.rename({'a': 'x', 'b': 'y'}).data
512
+ [{'x': [1, 2, 3]}, {'y': [4, 5, 6]}]
513
+
514
+ >>> d = Dataset([{'old_name': [1, 2]}])
515
+ >>> d.rename({'old_name': 'new_name'}).data
516
+ [{'new_name': [1, 2]}]
517
+ """
345
518
  new_data = []
346
519
  for observation in self.data:
347
520
  key, values = list(observation.items())[0]
@@ -350,9 +523,20 @@ class Dataset(UserList, DatasetOperationsMixin, PersistenceMixin, HashingMixin):
350
523
  return Dataset(new_data)
351
524
 
352
525
  def merge(self, other: Dataset, by_x, by_y) -> Dataset:
353
- """Merge the dataset with another dataset on the given keys.""
526
+ """Merge the dataset with another dataset on the given keys.
354
527
 
355
- merged_df = df1.merge(df2, how="left", on=["key1", "key2"])
528
+ Examples:
529
+ >>> d1 = Dataset([{'key': [1, 2, 3]}, {'value1': ['a', 'b', 'c']}])
530
+ >>> d2 = Dataset([{'key': [2, 3, 4]}, {'value2': ['x', 'y', 'z']}])
531
+ >>> merged = d1.merge(d2, 'key', 'key')
532
+ >>> len(merged.data[0]['key'])
533
+ 3
534
+
535
+ >>> d1 = Dataset([{'id': [1, 2]}, {'name': ['Alice', 'Bob']}])
536
+ >>> d2 = Dataset([{'id': [2, 3]}, {'age': [25, 30]}])
537
+ >>> merged = d1.merge(d2, 'id', 'id')
538
+ >>> len(merged.data[0]['id'])
539
+ 2
356
540
  """
357
541
  df1 = self.to_pandas()
358
542
  df2 = other.to_pandas()
@@ -360,17 +544,23 @@ class Dataset(UserList, DatasetOperationsMixin, PersistenceMixin, HashingMixin):
360
544
  return Dataset.from_pandas_dataframe(merged_df)
361
545
 
362
546
  def to(self, survey_or_question: Union["Survey", "QuestionBase"]) -> "Job":
363
- """Return a new dataset with the observations transformed by the given survey or question.
364
-
365
- >>> d = Dataset([{'person_name':["John"]}])
366
- >>> from edsl import QuestionFreeText
367
- >>> q = QuestionFreeText(question_text = "How are you, {{ person_name ?}}?", question_name = "how_feeling")
368
- >>> jobs = d.to(q)
369
- >>> isinstance(jobs, object)
370
- True
547
+ """Transform the dataset using a survey or question.
548
+
549
+ Args:
550
+ survey_or_question: Either a Survey or QuestionBase object to apply to the dataset.
551
+
552
+ Examples:
553
+ >>> from edsl import QuestionFreeText
554
+ >>> from edsl.jobs import Jobs
555
+ >>> d = Dataset([{'name': ['Alice', 'Bob']}])
556
+ >>> q = QuestionFreeText(question_text="How are you, {{ name }}?", question_name="how_feeling")
557
+ >>> job = d.to(q)
558
+ >>> isinstance(job, Jobs)
559
+ True
371
560
  """
372
561
  from ..surveys import Survey
373
562
  from ..questions import QuestionBase
563
+ from ..jobs import Jobs
374
564
 
375
565
  if isinstance(survey_or_question, Survey):
376
566
  return survey_or_question.by(self.to_scenario_list())
@@ -380,15 +570,14 @@ class Dataset(UserList, DatasetOperationsMixin, PersistenceMixin, HashingMixin):
380
570
  def select(self, *keys) -> Dataset:
381
571
  """Return a new dataset with only the selected keys.
382
572
 
383
- :param keys: The keys to select.
384
-
385
- >>> d = Dataset([{'a.b':[1,2,3,4]}, {'c.d':[5,6,7,8]}])
386
- >>> d.select('a.b')
387
- Dataset([{'a.b': [1, 2, 3, 4]}])
388
-
389
- >>> d.select('a.b', 'c.d')
390
- Dataset([{'a.b': [1, 2, 3, 4]}, {'c.d': [5, 6, 7, 8]}])
391
-
573
+ Examples:
574
+ >>> d = Dataset([{'a': [1, 2, 3, 4]}, {'b': [5, 6, 7, 8]}, {'c': [9, 10, 11, 12]}])
575
+ >>> d.select('a', 'c').data
576
+ [{'a': [1, 2, 3, 4]}, {'c': [9, 10, 11, 12]}]
577
+
578
+ >>> d = Dataset([{'x': [1, 2]}, {'y': [3, 4]}])
579
+ >>> d.select('x').data
580
+ [{'x': [1, 2]}]
392
581
  """
393
582
  for key in keys:
394
583
  if key not in self.keys():
@@ -410,9 +599,14 @@ class Dataset(UserList, DatasetOperationsMixin, PersistenceMixin, HashingMixin):
410
599
  def to_json(self):
411
600
  """Return a JSON representation of the dataset.
412
601
 
413
- >>> d = Dataset([{'a.b':[1,2,3,4]}])
414
- >>> d.to_json()
415
- [{'a.b': [1, 2, 3, 4]}]
602
+ Examples:
603
+ >>> d = Dataset([{'a': [1, 2, 3]}, {'b': [4, 5, 6]}])
604
+ >>> d.to_json()
605
+ [{'a': [1, 2, 3]}, {'b': [4, 5, 6]}]
606
+
607
+ >>> d = Dataset([{'x': ['a', 'b']}])
608
+ >>> d.to_json()
609
+ [{'x': ['a', 'b']}]
416
610
  """
417
611
  return json.loads(
418
612
  json.dumps(self.data)
@@ -421,9 +615,16 @@ class Dataset(UserList, DatasetOperationsMixin, PersistenceMixin, HashingMixin):
421
615
  def shuffle(self, seed=None) -> Dataset:
422
616
  """Return a new dataset with the observations shuffled.
423
617
 
424
- >>> d = Dataset([{'a.b':[1,2,3,4]}])
425
- >>> d.shuffle(seed=0)
426
- Dataset([{'a.b': [3, 1, 2, 4]}])
618
+ Examples:
619
+ >>> d = Dataset([{'a': [1, 2, 3, 4]}, {'b': [5, 6, 7, 8]}])
620
+ >>> shuffled = d.shuffle(seed=42)
621
+ >>> len(shuffled.data[0]['a']) == len(d.data[0]['a'])
622
+ True
623
+
624
+ >>> d = Dataset([{'x': ['a', 'b', 'c']}])
625
+ >>> shuffled = d.shuffle(seed=123)
626
+ >>> set(shuffled.data[0]['x']) == set(d.data[0]['x'])
627
+ True
427
628
  """
428
629
  if seed is not None:
429
630
  random.seed(seed)
@@ -455,14 +656,16 @@ class Dataset(UserList, DatasetOperationsMixin, PersistenceMixin, HashingMixin):
455
656
  ) -> Dataset:
456
657
  """Return a new dataset with a sample of the observations.
457
658
 
458
- :param n: The number of samples to take.
459
- :param frac: The fraction of samples to take.
460
- :param with_replacement: Whether to sample with replacement.
461
- :param seed: The seed for the random number generator.
462
-
463
- >>> d = Dataset([{'a.b':[1,2,3,4]}])
464
- >>> d.sample(n=2, seed=0, with_replacement=True)
465
- Dataset([{'a.b': [4, 4]}])
659
+ Examples:
660
+ >>> d = Dataset([{'a': [1, 2, 3, 4, 5]}, {'b': [6, 7, 8, 9, 10]}])
661
+ >>> sampled = d.sample(n=3, seed=42)
662
+ >>> len(sampled.data[0]['a'])
663
+ 3
664
+
665
+ >>> d = Dataset([{'x': ['a', 'b', 'c', 'd']}])
666
+ >>> sampled = d.sample(frac=0.5, seed=123)
667
+ >>> len(sampled.data[0]['x'])
668
+ 2
466
669
  """
467
670
  if seed is not None:
468
671
  random.seed(seed)
@@ -503,7 +706,7 @@ class Dataset(UserList, DatasetOperationsMixin, PersistenceMixin, HashingMixin):
503
706
 
504
707
  return self
505
708
 
506
- def get_sort_indices(self, lst: list[Any], reverse: bool = False, use_numpy: bool = True) -> list[int]:
709
+ def get_sort_indices(self, lst: list[Any], reverse: bool = False) -> list[int]:
507
710
  """
508
711
  Return the indices that would sort the list, using either numpy or pure Python.
509
712
  None values are placed at the end of the sorted list.
@@ -515,44 +718,35 @@ class Dataset(UserList, DatasetOperationsMixin, PersistenceMixin, HashingMixin):
515
718
 
516
719
  Returns:
517
720
  A list of indices that would sort the list
518
- """
519
- if use_numpy:
520
- try:
521
- import numpy as np
522
- # Convert list to numpy array
523
- arr = np.array(lst, dtype=object)
524
- # Get mask of non-None values
525
- mask = ~(arr is None)
526
- # Get indices of non-None and None values
527
- non_none_indices = np.where(mask)[0]
528
- none_indices = np.where(~mask)[0]
529
- # Sort non-None values
530
- sorted_indices = non_none_indices[np.argsort(arr[mask])]
531
- # Combine sorted non-None indices with None indices
532
- indices = np.concatenate([sorted_indices, none_indices]).tolist()
533
- if reverse:
534
- # When reversing, keep None values at end
535
- indices = sorted_indices[::-1].tolist() + none_indices.tolist()
536
- return indices
537
- except ImportError:
538
- # Fallback to pure Python if numpy is not available
539
- pass
540
-
541
- # Pure Python implementation
721
+ """
542
722
  enumerated = list(enumerate(lst))
543
- # Sort None values to end by using (is_none, value) as sort key
544
723
  sorted_pairs = sorted(enumerated,
545
724
  key=lambda x: (x[1] is None, x[1]),
546
725
  reverse=reverse)
547
726
  return [index for index, _ in sorted_pairs]
548
727
 
549
- def order_by(self, sort_key: str, reverse: bool = False, use_numpy: bool = True) -> Dataset:
728
+ def order_by(self, sort_key: str, reverse: bool = False) -> Dataset:
550
729
  """Return a new dataset with the observations sorted by the given key.
551
730
 
552
731
  Args:
553
732
  sort_key: The key to sort the observations by
554
733
  reverse: Whether to sort in reverse order
555
- use_numpy: Whether to use numpy for sorting (faster for large lists)
734
+
735
+ Examples:
736
+ >>> d = Dataset([{'a': [3, 1, 4, 1, 5]}, {'b': ['x', 'y', 'z', 'w', 'v']}])
737
+ >>> sorted_d = d.order_by('a')
738
+ >>> sorted_d.data
739
+ [{'a': [1, 1, 3, 4, 5]}, {'b': ['y', 'w', 'x', 'z', 'v']}]
740
+
741
+ >>> d = Dataset([{'a': [3, 1, 4, 1, 5]}, {'b': ['x', 'y', 'z', 'w', 'v']}])
742
+ >>> sorted_d = d.order_by('a', reverse=True)
743
+ >>> sorted_d.data
744
+ [{'a': [5, 4, 3, 1, 1]}, {'b': ['v', 'z', 'x', 'y', 'w']}]
745
+
746
+ >>> d = Dataset([{'a': [3, None, 1, 4, None]}, {'b': ['x', 'y', 'z', 'w', 'v']}])
747
+ >>> sorted_d = d.order_by('a')
748
+ >>> sorted_d.data
749
+ [{'a': [1, 3, 4, None, None]}, {'b': ['z', 'x', 'w', 'y', 'v']}]
556
750
  """
557
751
  number_found = 0
558
752
  for obs in self.data:
@@ -566,7 +760,7 @@ class Dataset(UserList, DatasetOperationsMixin, PersistenceMixin, HashingMixin):
566
760
  elif number_found > 1:
567
761
  raise DatasetKeyError(f"Key '{sort_key}' found in more than one dictionary.")
568
762
 
569
- sort_indices_list = self.get_sort_indices(relevant_values, reverse=reverse, use_numpy=use_numpy)
763
+ sort_indices_list = self.get_sort_indices(relevant_values, reverse=reverse)
570
764
  new_data = []
571
765
  for observation in self.data:
572
766
  key, values = list(observation.items())[0]
@@ -663,15 +857,26 @@ class Dataset(UserList, DatasetOperationsMixin, PersistenceMixin, HashingMixin):
663
857
  data=data, headers=headers, tablefmt=tablefmt, raw_data_set=self
664
858
  )
665
859
 
666
- def summary(self):
667
- return Dataset([{"num_observations": [len(self)], "keys": [self.keys()]}])
860
+ def summary(self) -> "Dataset":
861
+ """Return a summary of the dataset.
862
+
863
+ Examples:
864
+ >>> d = Dataset([{'a': [1, 2, 3]}, {'b': [4, 5, 6]}])
865
+ >>> d.summary().data
866
+ [{'num_observations': [3]}, {'keys': [['a', 'b']]}]
867
+ """
868
+ return Dataset([{"num_observations": [len(self)]}, {"keys": [self.keys()]}])
668
869
 
669
870
  @classmethod
670
- def example(self, n: int = None):
871
+ def example(self, n: int = None) -> "Dataset":
671
872
  """Return an example dataset.
672
873
 
673
- >>> Dataset.example()
674
- Dataset([{'a': [1, 2, 3, 4]}, {'b': [4, 3, 2, 1]}])
874
+ Examples:
875
+ >>> Dataset.example()
876
+ Dataset([{'a': [1, 2, 3, 4]}, {'b': [4, 3, 2, 1]}])
877
+
878
+ >>> Dataset.example(n=2)
879
+ Dataset([{'a': [1, 1]}, {'b': [2, 2]}])
675
880
  """
676
881
  if n is None:
677
882
  return Dataset([{"a": [1, 2, 3, 4]}, {"b": [4, 3, 2, 1]}])
@@ -691,6 +896,15 @@ class Dataset(UserList, DatasetOperationsMixin, PersistenceMixin, HashingMixin):
691
896
  def to_dict(self) -> dict:
692
897
  """
693
898
  Convert the dataset to a dictionary.
899
+
900
+ Examples:
901
+ >>> d = Dataset([{'a': [1, 2, 3]}, {'b': [4, 5, 6]}])
902
+ >>> d.to_dict()
903
+ {'data': [{'a': [1, 2, 3]}, {'b': [4, 5, 6]}]}
904
+
905
+ >>> d = Dataset([{'x': ['a', 'b']}])
906
+ >>> d.to_dict()
907
+ {'data': [{'x': ['a', 'b']}]}
694
908
  """
695
909
  return {'data': self.data}
696
910
 
@@ -698,6 +912,17 @@ class Dataset(UserList, DatasetOperationsMixin, PersistenceMixin, HashingMixin):
698
912
  def from_dict(cls, data: dict) -> 'Dataset':
699
913
  """
700
914
  Convert a dictionary to a dataset.
915
+
916
+ Examples:
917
+ >>> d = Dataset.from_dict({'data': [{'a': [1, 2, 3]}, {'b': [4, 5, 6]}]})
918
+ >>> isinstance(d, Dataset)
919
+ True
920
+ >>> d.data
921
+ [{'a': [1, 2, 3]}, {'b': [4, 5, 6]}]
922
+
923
+ >>> d = Dataset.from_dict({'data': [{'x': ['a', 'b']}]})
924
+ >>> d.data
925
+ [{'x': ['a', 'b']}]
701
926
  """
702
927
  return cls(data['data'])
703
928
 
@@ -708,6 +933,15 @@ class Dataset(UserList, DatasetOperationsMixin, PersistenceMixin, HashingMixin):
708
933
  Args:
709
934
  output_file (str): Path to save the Word document
710
935
  title (str, optional): Title for the document
936
+
937
+ Examples:
938
+ >>> import tempfile
939
+ >>> d = Dataset([{'a': [1, 2, 3]}, {'b': [4, 5, 6]}])
940
+ >>> with tempfile.NamedTemporaryFile(suffix='.docx') as tmp:
941
+ ... d.to_docx(tmp.name, title='Test Document')
942
+ ... import os
943
+ ... os.path.exists(tmp.name)
944
+ True
711
945
  """
712
946
  from docx import Document
713
947
  from docx.shared import Inches
@@ -762,11 +996,14 @@ class Dataset(UserList, DatasetOperationsMixin, PersistenceMixin, HashingMixin):
762
996
  Returns:
763
997
  A new Dataset with the expanded rows
764
998
 
765
- Example:
766
- >>> from edsl.dataset import Dataset
999
+ Examples:
767
1000
  >>> d = Dataset([{'a': [[1, 2, 3], [4, 5, 6]]}, {'b': ['x', 'y']}])
768
- >>> d.expand('a')
769
- Dataset([{'a': [1, 2, 3, 4, 5, 6]}, {'b': ['x', 'x', 'x', 'y', 'y', 'y']}])
1001
+ >>> d.expand('a').data
1002
+ [{'a': [1, 2, 3, 4, 5, 6]}, {'b': ['x', 'x', 'x', 'y', 'y', 'y']}]
1003
+
1004
+ >>> d = Dataset([{'items': [['apple', 'banana'], ['orange']]}, {'id': [1, 2]}])
1005
+ >>> d.expand('items', number_field=True).data
1006
+ [{'items': ['apple', 'banana', 'orange']}, {'id': [1, 1, 2]}, {'items_number': [1, 2, 1]}]
770
1007
  """
771
1008
  from collections.abc import Iterable
772
1009