edsl 0.1.46__py3-none-any.whl → 0.1.48__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (328) hide show
  1. edsl/__init__.py +44 -39
  2. edsl/__version__.py +1 -1
  3. edsl/agents/__init__.py +4 -2
  4. edsl/agents/{Agent.py → agent.py} +442 -152
  5. edsl/agents/{AgentList.py → agent_list.py} +220 -162
  6. edsl/agents/descriptors.py +46 -7
  7. edsl/{exceptions/agents.py → agents/exceptions.py} +3 -12
  8. edsl/base/__init__.py +75 -0
  9. edsl/base/base_class.py +1303 -0
  10. edsl/base/data_transfer_models.py +114 -0
  11. edsl/base/enums.py +215 -0
  12. edsl/base.py +8 -0
  13. edsl/buckets/__init__.py +25 -0
  14. edsl/buckets/bucket_collection.py +324 -0
  15. edsl/buckets/model_buckets.py +206 -0
  16. edsl/buckets/token_bucket.py +502 -0
  17. edsl/{jobs/buckets/TokenBucketAPI.py → buckets/token_bucket_api.py} +1 -1
  18. edsl/buckets/token_bucket_client.py +509 -0
  19. edsl/caching/__init__.py +20 -0
  20. edsl/caching/cache.py +814 -0
  21. edsl/caching/cache_entry.py +427 -0
  22. edsl/{data/CacheHandler.py → caching/cache_handler.py} +14 -15
  23. edsl/caching/exceptions.py +24 -0
  24. edsl/caching/orm.py +30 -0
  25. edsl/{data/RemoteCacheSync.py → caching/remote_cache_sync.py} +3 -3
  26. edsl/caching/sql_dict.py +441 -0
  27. edsl/config/__init__.py +8 -0
  28. edsl/config/config_class.py +177 -0
  29. edsl/config.py +4 -176
  30. edsl/conversation/Conversation.py +7 -7
  31. edsl/conversation/car_buying.py +4 -4
  32. edsl/conversation/chips.py +6 -6
  33. edsl/coop/__init__.py +25 -2
  34. edsl/coop/coop.py +430 -113
  35. edsl/coop/{ExpectedParrotKeyHandler.py → ep_key_handling.py} +86 -10
  36. edsl/coop/exceptions.py +62 -0
  37. edsl/coop/price_fetcher.py +126 -0
  38. edsl/coop/utils.py +89 -24
  39. edsl/data_transfer_models.py +5 -72
  40. edsl/dataset/__init__.py +10 -0
  41. edsl/{results/Dataset.py → dataset/dataset.py} +116 -36
  42. edsl/dataset/dataset_operations_mixin.py +1492 -0
  43. edsl/{results/DatasetTree.py → dataset/dataset_tree.py} +156 -75
  44. edsl/{results/TableDisplay.py → dataset/display/table_display.py} +18 -7
  45. edsl/{results → dataset/display}/table_renderers.py +58 -2
  46. edsl/{results → dataset}/file_exports.py +4 -5
  47. edsl/{results → dataset}/smart_objects.py +2 -2
  48. edsl/enums.py +5 -205
  49. edsl/inference_services/__init__.py +5 -0
  50. edsl/inference_services/{AvailableModelCacheHandler.py → available_model_cache_handler.py} +2 -3
  51. edsl/inference_services/{AvailableModelFetcher.py → available_model_fetcher.py} +8 -14
  52. edsl/inference_services/data_structures.py +3 -2
  53. edsl/{exceptions/inference_services.py → inference_services/exceptions.py} +1 -1
  54. edsl/inference_services/{InferenceServiceABC.py → inference_service_abc.py} +1 -1
  55. edsl/inference_services/{InferenceServicesCollection.py → inference_services_collection.py} +8 -7
  56. edsl/inference_services/registry.py +4 -41
  57. edsl/inference_services/{ServiceAvailability.py → service_availability.py} +5 -25
  58. edsl/inference_services/services/__init__.py +31 -0
  59. edsl/inference_services/{AnthropicService.py → services/anthropic_service.py} +3 -3
  60. edsl/inference_services/{AwsBedrock.py → services/aws_bedrock.py} +2 -2
  61. edsl/inference_services/{AzureAI.py → services/azure_ai.py} +2 -2
  62. edsl/inference_services/{DeepInfraService.py → services/deep_infra_service.py} +1 -3
  63. edsl/inference_services/{DeepSeekService.py → services/deep_seek_service.py} +2 -4
  64. edsl/inference_services/{GoogleService.py → services/google_service.py} +5 -4
  65. edsl/inference_services/{GroqService.py → services/groq_service.py} +1 -1
  66. edsl/inference_services/{MistralAIService.py → services/mistral_ai_service.py} +3 -3
  67. edsl/inference_services/{OllamaService.py → services/ollama_service.py} +1 -7
  68. edsl/inference_services/{OpenAIService.py → services/open_ai_service.py} +5 -6
  69. edsl/inference_services/{PerplexityService.py → services/perplexity_service.py} +12 -12
  70. edsl/inference_services/{TestService.py → services/test_service.py} +7 -6
  71. edsl/inference_services/{TogetherAIService.py → services/together_ai_service.py} +2 -6
  72. edsl/inference_services/{XAIService.py → services/xai_service.py} +1 -1
  73. edsl/inference_services/write_available.py +1 -2
  74. edsl/instructions/__init__.py +6 -0
  75. edsl/{surveys/instructions/Instruction.py → instructions/instruction.py} +11 -6
  76. edsl/{surveys/instructions/InstructionCollection.py → instructions/instruction_collection.py} +10 -5
  77. edsl/{surveys/InstructionHandler.py → instructions/instruction_handler.py} +3 -3
  78. edsl/{jobs/interviews → interviews}/ReportErrors.py +2 -2
  79. edsl/interviews/__init__.py +4 -0
  80. edsl/{jobs/AnswerQuestionFunctionConstructor.py → interviews/answering_function.py} +45 -18
  81. edsl/{jobs/interviews/InterviewExceptionEntry.py → interviews/exception_tracking.py} +107 -22
  82. edsl/interviews/interview.py +638 -0
  83. edsl/{jobs/interviews/InterviewStatusDictionary.py → interviews/interview_status_dictionary.py} +21 -12
  84. edsl/{jobs/interviews/InterviewStatusLog.py → interviews/interview_status_log.py} +16 -7
  85. edsl/{jobs/InterviewTaskManager.py → interviews/interview_task_manager.py} +12 -7
  86. edsl/{jobs/RequestTokenEstimator.py → interviews/request_token_estimator.py} +8 -3
  87. edsl/{jobs/interviews/InterviewStatistic.py → interviews/statistics.py} +36 -10
  88. edsl/invigilators/__init__.py +38 -0
  89. edsl/invigilators/invigilator_base.py +477 -0
  90. edsl/{agents/Invigilator.py → invigilators/invigilators.py} +263 -10
  91. edsl/invigilators/prompt_constructor.py +476 -0
  92. edsl/{agents → invigilators}/prompt_helpers.py +2 -1
  93. edsl/{agents/QuestionInstructionPromptBuilder.py → invigilators/question_instructions_prompt_builder.py} +18 -13
  94. edsl/{agents → invigilators}/question_option_processor.py +96 -21
  95. edsl/{agents/QuestionTemplateReplacementsBuilder.py → invigilators/question_template_replacements_builder.py} +64 -12
  96. edsl/jobs/__init__.py +7 -1
  97. edsl/jobs/async_interview_runner.py +99 -35
  98. edsl/jobs/check_survey_scenario_compatibility.py +7 -5
  99. edsl/jobs/data_structures.py +153 -22
  100. edsl/{exceptions/jobs.py → jobs/exceptions.py} +2 -1
  101. edsl/jobs/{FetchInvigilator.py → fetch_invigilator.py} +4 -4
  102. edsl/jobs/{loggers/HTMLTableJobLogger.py → html_table_job_logger.py} +6 -2
  103. edsl/jobs/{Jobs.py → jobs.py} +321 -155
  104. edsl/jobs/{JobsChecks.py → jobs_checks.py} +15 -7
  105. edsl/jobs/{JobsComponentConstructor.py → jobs_component_constructor.py} +20 -17
  106. edsl/jobs/{InterviewsConstructor.py → jobs_interview_constructor.py} +10 -5
  107. edsl/jobs/jobs_pricing_estimation.py +347 -0
  108. edsl/jobs/{JobsRemoteInferenceLogger.py → jobs_remote_inference_logger.py} +4 -3
  109. edsl/jobs/jobs_runner_asyncio.py +282 -0
  110. edsl/jobs/{JobsRemoteInferenceHandler.py → remote_inference.py} +19 -22
  111. edsl/jobs/results_exceptions_handler.py +2 -2
  112. edsl/key_management/__init__.py +28 -0
  113. edsl/key_management/key_lookup.py +161 -0
  114. edsl/{language_models/key_management/KeyLookupBuilder.py → key_management/key_lookup_builder.py} +118 -47
  115. edsl/key_management/key_lookup_collection.py +82 -0
  116. edsl/key_management/models.py +218 -0
  117. edsl/language_models/__init__.py +7 -2
  118. edsl/language_models/{ComputeCost.py → compute_cost.py} +18 -3
  119. edsl/{exceptions/language_models.py → language_models/exceptions.py} +2 -1
  120. edsl/language_models/language_model.py +1080 -0
  121. edsl/language_models/model.py +10 -25
  122. edsl/language_models/{ModelList.py → model_list.py} +9 -14
  123. edsl/language_models/{RawResponseHandler.py → raw_response_handler.py} +1 -1
  124. edsl/language_models/{RegisterLanguageModelsMeta.py → registry.py} +1 -1
  125. edsl/language_models/repair.py +4 -4
  126. edsl/language_models/utilities.py +4 -4
  127. edsl/notebooks/__init__.py +3 -1
  128. edsl/notebooks/{Notebook.py → notebook.py} +7 -8
  129. edsl/prompts/__init__.py +1 -1
  130. edsl/{exceptions/prompts.py → prompts/exceptions.py} +3 -1
  131. edsl/prompts/{Prompt.py → prompt.py} +101 -95
  132. edsl/questions/HTMLQuestion.py +1 -1
  133. edsl/questions/__init__.py +154 -25
  134. edsl/questions/answer_validator_mixin.py +1 -1
  135. edsl/questions/compose_questions.py +4 -3
  136. edsl/questions/derived/question_likert_five.py +166 -0
  137. edsl/questions/derived/{QuestionLinearScale.py → question_linear_scale.py} +4 -4
  138. edsl/questions/derived/{QuestionTopK.py → question_top_k.py} +4 -4
  139. edsl/questions/derived/{QuestionYesNo.py → question_yes_no.py} +4 -5
  140. edsl/questions/descriptors.py +24 -30
  141. edsl/questions/loop_processor.py +65 -19
  142. edsl/questions/question_base.py +881 -0
  143. edsl/questions/question_base_gen_mixin.py +15 -16
  144. edsl/questions/{QuestionBasePromptsMixin.py → question_base_prompts_mixin.py} +2 -2
  145. edsl/questions/{QuestionBudget.py → question_budget.py} +3 -4
  146. edsl/questions/{QuestionCheckBox.py → question_check_box.py} +16 -16
  147. edsl/questions/{QuestionDict.py → question_dict.py} +39 -5
  148. edsl/questions/{QuestionExtract.py → question_extract.py} +9 -9
  149. edsl/questions/question_free_text.py +282 -0
  150. edsl/questions/{QuestionFunctional.py → question_functional.py} +6 -5
  151. edsl/questions/{QuestionList.py → question_list.py} +6 -7
  152. edsl/questions/{QuestionMatrix.py → question_matrix.py} +6 -5
  153. edsl/questions/{QuestionMultipleChoice.py → question_multiple_choice.py} +126 -21
  154. edsl/questions/{QuestionNumerical.py → question_numerical.py} +5 -5
  155. edsl/questions/{QuestionRank.py → question_rank.py} +6 -6
  156. edsl/questions/question_registry.py +10 -16
  157. edsl/questions/register_questions_meta.py +8 -4
  158. edsl/questions/response_validator_abc.py +17 -16
  159. edsl/results/__init__.py +4 -1
  160. edsl/{exceptions/results.py → results/exceptions.py} +1 -1
  161. edsl/results/report.py +197 -0
  162. edsl/results/{Result.py → result.py} +131 -45
  163. edsl/results/{Results.py → results.py} +420 -216
  164. edsl/results/results_selector.py +344 -25
  165. edsl/scenarios/__init__.py +30 -3
  166. edsl/scenarios/{ConstructDownloadLink.py → construct_download_link.py} +7 -0
  167. edsl/scenarios/directory_scanner.py +156 -13
  168. edsl/scenarios/document_chunker.py +186 -0
  169. edsl/scenarios/exceptions.py +101 -0
  170. edsl/scenarios/file_methods.py +2 -3
  171. edsl/scenarios/file_store.py +755 -0
  172. edsl/scenarios/handlers/__init__.py +14 -14
  173. edsl/scenarios/handlers/{csv.py → csv_file_store.py} +1 -2
  174. edsl/scenarios/handlers/{docx.py → docx_file_store.py} +8 -7
  175. edsl/scenarios/handlers/{html.py → html_file_store.py} +1 -2
  176. edsl/scenarios/handlers/{jpeg.py → jpeg_file_store.py} +1 -1
  177. edsl/scenarios/handlers/{json.py → json_file_store.py} +1 -1
  178. edsl/scenarios/handlers/latex_file_store.py +5 -0
  179. edsl/scenarios/handlers/{md.py → md_file_store.py} +1 -1
  180. edsl/scenarios/handlers/{pdf.py → pdf_file_store.py} +2 -2
  181. edsl/scenarios/handlers/{png.py → png_file_store.py} +1 -1
  182. edsl/scenarios/handlers/{pptx.py → pptx_file_store.py} +8 -7
  183. edsl/scenarios/handlers/{py.py → py_file_store.py} +1 -3
  184. edsl/scenarios/handlers/{sql.py → sql_file_store.py} +2 -1
  185. edsl/scenarios/handlers/{sqlite.py → sqlite_file_store.py} +2 -3
  186. edsl/scenarios/handlers/{txt.py → txt_file_store.py} +1 -1
  187. edsl/scenarios/scenario.py +928 -0
  188. edsl/scenarios/scenario_join.py +18 -5
  189. edsl/scenarios/{ScenarioList.py → scenario_list.py} +424 -106
  190. edsl/scenarios/{ScenarioListPdfMixin.py → scenario_list_pdf_tools.py} +16 -15
  191. edsl/scenarios/scenario_selector.py +5 -1
  192. edsl/study/ObjectEntry.py +2 -2
  193. edsl/study/SnapShot.py +5 -5
  194. edsl/study/Study.py +20 -21
  195. edsl/study/__init__.py +6 -4
  196. edsl/surveys/__init__.py +7 -4
  197. edsl/surveys/dag/__init__.py +2 -0
  198. edsl/surveys/{ConstructDAG.py → dag/construct_dag.py} +3 -3
  199. edsl/surveys/{DAG.py → dag/dag.py} +13 -10
  200. edsl/surveys/descriptors.py +1 -1
  201. edsl/surveys/{EditSurvey.py → edit_survey.py} +9 -9
  202. edsl/{exceptions/surveys.py → surveys/exceptions.py} +1 -2
  203. edsl/surveys/memory/__init__.py +3 -0
  204. edsl/surveys/{MemoryPlan.py → memory/memory_plan.py} +10 -9
  205. edsl/surveys/rules/__init__.py +3 -0
  206. edsl/surveys/{Rule.py → rules/rule.py} +103 -43
  207. edsl/surveys/{RuleCollection.py → rules/rule_collection.py} +21 -30
  208. edsl/surveys/{RuleManager.py → rules/rule_manager.py} +19 -13
  209. edsl/surveys/survey.py +1743 -0
  210. edsl/surveys/{SurveyExportMixin.py → survey_export.py} +22 -27
  211. edsl/surveys/{SurveyFlowVisualization.py → survey_flow_visualization.py} +11 -2
  212. edsl/surveys/{Simulator.py → survey_simulator.py} +10 -3
  213. edsl/tasks/__init__.py +32 -0
  214. edsl/{jobs/tasks/QuestionTaskCreator.py → tasks/question_task_creator.py} +115 -57
  215. edsl/tasks/task_creators.py +135 -0
  216. edsl/{jobs/tasks/TaskHistory.py → tasks/task_history.py} +86 -47
  217. edsl/{jobs/tasks → tasks}/task_status_enum.py +91 -7
  218. edsl/tasks/task_status_log.py +85 -0
  219. edsl/tokens/__init__.py +2 -0
  220. edsl/tokens/interview_token_usage.py +53 -0
  221. edsl/utilities/PrettyList.py +1 -1
  222. edsl/utilities/SystemInfo.py +25 -22
  223. edsl/utilities/__init__.py +29 -21
  224. edsl/utilities/gcp_bucket/__init__.py +2 -0
  225. edsl/utilities/gcp_bucket/cloud_storage.py +99 -96
  226. edsl/utilities/interface.py +44 -536
  227. edsl/{results/MarkdownToPDF.py → utilities/markdown_to_pdf.py} +13 -5
  228. edsl/utilities/repair_functions.py +1 -1
  229. {edsl-0.1.46.dist-info → edsl-0.1.48.dist-info}/METADATA +3 -2
  230. edsl-0.1.48.dist-info/RECORD +347 -0
  231. edsl/Base.py +0 -426
  232. edsl/BaseDiff.py +0 -260
  233. edsl/agents/InvigilatorBase.py +0 -260
  234. edsl/agents/PromptConstructor.py +0 -318
  235. edsl/auto/AutoStudy.py +0 -130
  236. edsl/auto/StageBase.py +0 -243
  237. edsl/auto/StageGenerateSurvey.py +0 -178
  238. edsl/auto/StageLabelQuestions.py +0 -125
  239. edsl/auto/StagePersona.py +0 -61
  240. edsl/auto/StagePersonaDimensionValueRanges.py +0 -88
  241. edsl/auto/StagePersonaDimensionValues.py +0 -74
  242. edsl/auto/StagePersonaDimensions.py +0 -69
  243. edsl/auto/StageQuestions.py +0 -74
  244. edsl/auto/SurveyCreatorPipeline.py +0 -21
  245. edsl/auto/utilities.py +0 -218
  246. edsl/base/Base.py +0 -279
  247. edsl/coop/PriceFetcher.py +0 -54
  248. edsl/data/Cache.py +0 -580
  249. edsl/data/CacheEntry.py +0 -230
  250. edsl/data/SQLiteDict.py +0 -292
  251. edsl/data/__init__.py +0 -5
  252. edsl/data/orm.py +0 -10
  253. edsl/exceptions/cache.py +0 -5
  254. edsl/exceptions/coop.py +0 -14
  255. edsl/exceptions/data.py +0 -14
  256. edsl/exceptions/scenarios.py +0 -29
  257. edsl/jobs/Answers.py +0 -43
  258. edsl/jobs/JobsPrompts.py +0 -354
  259. edsl/jobs/buckets/BucketCollection.py +0 -134
  260. edsl/jobs/buckets/ModelBuckets.py +0 -65
  261. edsl/jobs/buckets/TokenBucket.py +0 -283
  262. edsl/jobs/buckets/TokenBucketClient.py +0 -191
  263. edsl/jobs/interviews/Interview.py +0 -395
  264. edsl/jobs/interviews/InterviewExceptionCollection.py +0 -99
  265. edsl/jobs/interviews/InterviewStatisticsCollection.py +0 -25
  266. edsl/jobs/runners/JobsRunnerAsyncio.py +0 -163
  267. edsl/jobs/runners/JobsRunnerStatusData.py +0 -0
  268. edsl/jobs/tasks/TaskCreators.py +0 -64
  269. edsl/jobs/tasks/TaskStatusLog.py +0 -23
  270. edsl/jobs/tokens/InterviewTokenUsage.py +0 -27
  271. edsl/language_models/LanguageModel.py +0 -635
  272. edsl/language_models/ServiceDataSources.py +0 -0
  273. edsl/language_models/key_management/KeyLookup.py +0 -63
  274. edsl/language_models/key_management/KeyLookupCollection.py +0 -38
  275. edsl/language_models/key_management/models.py +0 -137
  276. edsl/questions/QuestionBase.py +0 -539
  277. edsl/questions/QuestionFreeText.py +0 -130
  278. edsl/questions/derived/QuestionLikertFive.py +0 -76
  279. edsl/results/DatasetExportMixin.py +0 -911
  280. edsl/results/ResultsExportMixin.py +0 -45
  281. edsl/results/TextEditor.py +0 -50
  282. edsl/results/results_fetch_mixin.py +0 -33
  283. edsl/results/results_tools_mixin.py +0 -98
  284. edsl/scenarios/DocumentChunker.py +0 -104
  285. edsl/scenarios/FileStore.py +0 -564
  286. edsl/scenarios/Scenario.py +0 -548
  287. edsl/scenarios/ScenarioHtmlMixin.py +0 -65
  288. edsl/scenarios/ScenarioListExportMixin.py +0 -45
  289. edsl/scenarios/handlers/latex.py +0 -5
  290. edsl/shared.py +0 -1
  291. edsl/surveys/Survey.py +0 -1306
  292. edsl/surveys/SurveyQualtricsImport.py +0 -284
  293. edsl/surveys/SurveyToApp.py +0 -141
  294. edsl/surveys/instructions/__init__.py +0 -0
  295. edsl/tools/__init__.py +0 -1
  296. edsl/tools/clusters.py +0 -192
  297. edsl/tools/embeddings.py +0 -27
  298. edsl/tools/embeddings_plotting.py +0 -118
  299. edsl/tools/plotting.py +0 -112
  300. edsl/tools/summarize.py +0 -18
  301. edsl/utilities/data/Registry.py +0 -6
  302. edsl/utilities/data/__init__.py +0 -1
  303. edsl/utilities/data/scooter_results.json +0 -1
  304. edsl-0.1.46.dist-info/RECORD +0 -366
  305. /edsl/coop/{CoopFunctionsMixin.py → coop_functions.py} +0 -0
  306. /edsl/{results → dataset/display}/CSSParameterizer.py +0 -0
  307. /edsl/{language_models/key_management → dataset/display}/__init__.py +0 -0
  308. /edsl/{results → dataset/display}/table_data_class.py +0 -0
  309. /edsl/{results → dataset/display}/table_display.css +0 -0
  310. /edsl/{results/ResultsGGMixin.py → dataset/r/ggplot.py} +0 -0
  311. /edsl/{results → dataset}/tree_explore.py +0 -0
  312. /edsl/{surveys/instructions/ChangeInstruction.py → instructions/change_instruction.py} +0 -0
  313. /edsl/{jobs/interviews → interviews}/interview_status_enum.py +0 -0
  314. /edsl/jobs/{runners/JobsRunnerStatus.py → jobs_runner_status.py} +0 -0
  315. /edsl/language_models/{PriceManager.py → price_manager.py} +0 -0
  316. /edsl/language_models/{fake_openai_call.py → unused/fake_openai_call.py} +0 -0
  317. /edsl/language_models/{fake_openai_service.py → unused/fake_openai_service.py} +0 -0
  318. /edsl/notebooks/{NotebookToLaTeX.py → notebook_to_latex.py} +0 -0
  319. /edsl/{exceptions/questions.py → questions/exceptions.py} +0 -0
  320. /edsl/questions/{SimpleAskMixin.py → simple_ask_mixin.py} +0 -0
  321. /edsl/surveys/{Memory.py → memory/memory.py} +0 -0
  322. /edsl/surveys/{MemoryManagement.py → memory/memory_management.py} +0 -0
  323. /edsl/surveys/{SurveyCSS.py → survey_css.py} +0 -0
  324. /edsl/{jobs/tokens/TokenUsage.py → tokens/token_usage.py} +0 -0
  325. /edsl/{results/MarkdownToDocx.py → utilities/markdown_to_docx.py} +0 -0
  326. /edsl/{TemplateLoader.py → utilities/template_loader.py} +0 -0
  327. {edsl-0.1.46.dist-info → edsl-0.1.48.dist-info}/LICENSE +0 -0
  328. {edsl-0.1.46.dist-info → edsl-0.1.48.dist-info}/WHEEL +0 -0
@@ -1,911 +0,0 @@
1
- """Mixin class for exporting results."""
2
-
3
- import io
4
- import warnings
5
- import textwrap
6
- from typing import Optional, Tuple, Union, List
7
-
8
- from edsl.results.file_exports import CSVExport, ExcelExport, JSONLExport, SQLiteExport
9
-
10
-
11
- class DatasetExportMixin:
12
- """Mixin class for exporting Dataset objects."""
13
-
14
- def relevant_columns(
15
- self, data_type: Optional[str] = None, remove_prefix=False
16
- ) -> list:
17
- """Return the set of keys that are present in the dataset.
18
-
19
- :param data_type: The data type to filter by.
20
- :param remove_prefix: Whether to remove the prefix from the column names.
21
-
22
- >>> from edsl.results.Dataset import Dataset
23
- >>> d = Dataset([{'a.b':[1,2,3,4]}])
24
- >>> d.relevant_columns()
25
- ['a.b']
26
-
27
- >>> d.relevant_columns(remove_prefix=True)
28
- ['b']
29
-
30
- >>> d = Dataset([{'a':[1,2,3,4]}, {'b':[5,6,7,8]}])
31
- >>> d.relevant_columns()
32
- ['a', 'b']
33
-
34
- >>> from edsl.results import Results; Results.example().select('how_feeling', 'how_feeling_yesterday').relevant_columns()
35
- ['answer.how_feeling', 'answer.how_feeling_yesterday']
36
-
37
- >>> from edsl.results import Results
38
- >>> sorted(Results.example().select().relevant_columns(data_type = "model"))
39
- ['model.frequency_penalty', ...]
40
-
41
- >>> Results.example().relevant_columns(data_type = "flimflam")
42
- Traceback (most recent call last):
43
- ...
44
- ValueError: No columns found for data type: flimflam. Available data types are: ...
45
- """
46
- columns = [list(x.keys())[0] for x in self]
47
- if remove_prefix:
48
- columns = [column.split(".")[-1] for column in columns]
49
-
50
- def get_data_type(column):
51
- if "." in column:
52
- return column.split(".")[0]
53
- else:
54
- return None
55
-
56
- if data_type:
57
- all_columns = columns[:]
58
- columns = [
59
- column for column in columns if get_data_type(column) == data_type
60
- ]
61
- if len(columns) == 0:
62
- all_data_types = sorted(
63
- list(set(get_data_type(column) for column in all_columns))
64
- )
65
- raise ValueError(
66
- f"No columns found for data type: {data_type}. Available data types are: {all_data_types}."
67
- )
68
-
69
- return columns
70
-
71
- def num_observations(self):
72
- """Return the number of observations in the dataset.
73
-
74
- >>> from edsl.results.Results import Results
75
- >>> Results.example().num_observations()
76
- 4
77
- """
78
- _num_observations = None
79
- for entry in self:
80
- key, values = list(entry.items())[0]
81
- if _num_observations is None:
82
- _num_observations = len(values)
83
- else:
84
- if len(values) != _num_observations:
85
- raise ValueError(
86
- f"The number of observations is not consistent across columns. "
87
- f"Column '{key}' has {len(values)} observations, but previous columns had {_num_observations} observations."
88
- )
89
-
90
- return _num_observations
91
-
92
- def _make_tabular(
93
- self, remove_prefix: bool, pretty_labels: Optional[dict] = None
94
- ) -> tuple[list, List[list]]:
95
- """Turn the results into a tabular format.
96
-
97
- :param remove_prefix: Whether to remove the prefix from the column names.
98
-
99
- >>> from edsl.results import Results
100
- >>> r = Results.example()
101
- >>> r.select('how_feeling')._make_tabular(remove_prefix = True)
102
- (['how_feeling'], [['OK'], ['Great'], ['Terrible'], ['OK']])
103
-
104
- >>> r.select('how_feeling')._make_tabular(remove_prefix = True, pretty_labels = {'how_feeling': "How are you feeling"})
105
- (['How are you feeling'], [['OK'], ['Great'], ['Terrible'], ['OK']])
106
- """
107
-
108
- def create_dict_from_list_of_dicts(list_of_dicts):
109
- for entry in list_of_dicts:
110
- key, list_of_values = list(entry.items())[0]
111
- yield key, list_of_values
112
-
113
- tabular_repr = dict(create_dict_from_list_of_dicts(self.data))
114
-
115
- full_header = [list(x.keys())[0] for x in self]
116
-
117
- rows = []
118
- for i in range(self.num_observations()):
119
- row = [tabular_repr[h][i] for h in full_header]
120
- rows.append(row)
121
-
122
- if remove_prefix:
123
- header = [h.split(".")[-1] for h in full_header]
124
- else:
125
- header = full_header
126
-
127
- if pretty_labels is not None:
128
- header = [pretty_labels.get(h, h) for h in header]
129
-
130
- return header, rows
131
-
132
- def print_long(self):
133
- """Print the results in a long format.
134
- >>> from edsl.results import Results
135
- >>> r = Results.example()
136
- >>> r.select('how_feeling').print_long()
137
- answer.how_feeling: OK
138
- answer.how_feeling: Great
139
- answer.how_feeling: Terrible
140
- answer.how_feeling: OK
141
- """
142
- for entry in self:
143
- key, list_of_values = list(entry.items())[0]
144
- for value in list_of_values:
145
- print(f"{key}: {value}")
146
-
147
- def _get_tabular_data(
148
- self,
149
- remove_prefix: bool = False,
150
- pretty_labels: Optional[dict] = None,
151
- ) -> Tuple[List[str], List[List]]:
152
- """Internal method to get tabular data in a standard format.
153
-
154
- Args:
155
- remove_prefix: Whether to remove the prefix from column names
156
- pretty_labels: Dictionary mapping original column names to pretty labels
157
-
158
- Returns:
159
- Tuple containing (header_row, data_rows)
160
- """
161
- if pretty_labels is None:
162
- pretty_labels = {}
163
-
164
- return self._make_tabular(
165
- remove_prefix=remove_prefix, pretty_labels=pretty_labels
166
- )
167
-
168
- def to_jsonl(self, filename: Optional[str] = None) -> Optional["FileStore"]:
169
- """Export the results to a FileStore instance containing JSONL data."""
170
- exporter = JSONLExport(data=self, filename=filename)
171
- return exporter.export()
172
-
173
- def to_sqlite(
174
- self,
175
- filename: Optional[str] = None,
176
- remove_prefix: bool = False,
177
- pretty_labels: Optional[dict] = None,
178
- table_name: str = "results",
179
- if_exists: str = "replace",
180
- ) -> Optional["FileStore"]:
181
- """Export the results to a SQLite database file."""
182
- exporter = SQLiteExport(
183
- data=self,
184
- filename=filename,
185
- remove_prefix=remove_prefix,
186
- pretty_labels=pretty_labels,
187
- table_name=table_name,
188
- if_exists=if_exists,
189
- )
190
- return exporter.export()
191
-
192
- def to_csv(
193
- self,
194
- filename: Optional[str] = None,
195
- remove_prefix: bool = False,
196
- pretty_labels: Optional[dict] = None,
197
- ) -> Optional["FileStore"]:
198
- """Export the results to a FileStore instance containing CSV data."""
199
- exporter = CSVExport(
200
- data=self,
201
- filename=filename,
202
- remove_prefix=remove_prefix,
203
- pretty_labels=pretty_labels,
204
- )
205
- return exporter.export()
206
-
207
- def to_excel(
208
- self,
209
- filename: Optional[str] = None,
210
- remove_prefix: bool = False,
211
- pretty_labels: Optional[dict] = None,
212
- sheet_name: Optional[str] = None,
213
- ) -> Optional["FileStore"]:
214
- """Export the results to a FileStore instance containing Excel data."""
215
- exporter = ExcelExport(
216
- data=self,
217
- filename=filename,
218
- remove_prefix=remove_prefix,
219
- pretty_labels=pretty_labels,
220
- sheet_name=sheet_name,
221
- )
222
- return exporter.export()
223
-
224
- def _db(
225
- self, remove_prefix: bool = True, shape: str = "wide"
226
- ) -> "sqlalchemy.engine.Engine":
227
- """Create a SQLite database in memory and return the connection.
228
-
229
- Args:
230
- remove_prefix: Whether to remove the prefix from the column names
231
- shape: The shape of the data in the database ("wide" or "long")
232
-
233
- Returns:
234
- A database connection
235
- >>> from sqlalchemy import text
236
- >>> from edsl import Results
237
- >>> engine = Results.example()._db()
238
- >>> len(engine.execute(text("SELECT * FROM self")).fetchall())
239
- 4
240
- >>> engine = Results.example()._db(shape = "long")
241
- >>> len(engine.execute(text("SELECT * FROM self")).fetchall())
242
- 172
243
- """
244
- from sqlalchemy import create_engine, text
245
-
246
- engine = create_engine("sqlite:///:memory:")
247
- if remove_prefix and shape == "wide":
248
- df = self.remove_prefix().to_pandas(lists_as_strings=True)
249
- else:
250
- df = self.to_pandas(lists_as_strings=True)
251
-
252
- if shape == "long":
253
- # Melt the dataframe to convert it to long format
254
- df = df.melt(var_name="key", value_name="value")
255
- # Add a row number column for reference
256
- df.insert(0, "row_number", range(1, len(df) + 1))
257
-
258
- # Split the key into data_type and key
259
- df["data_type"] = df["key"].apply(
260
- lambda x: x.split(".")[0] if "." in x else None
261
- )
262
- df["key"] = df["key"].apply(
263
- lambda x: ".".join(x.split(".")[1:]) if "." in x else x
264
- )
265
-
266
- df.to_sql(
267
- "self",
268
- engine,
269
- index=False,
270
- if_exists="replace",
271
- )
272
- return engine.connect()
273
-
274
- def sql(
275
- self,
276
- query: str,
277
- transpose: bool = None,
278
- transpose_by: str = None,
279
- remove_prefix: bool = True,
280
- shape: str = "wide",
281
- ) -> Union["pd.DataFrame", str]:
282
- """Execute a SQL query and return the results as a DataFrame.
283
-
284
- Args:
285
- query: The SQL query to execute
286
- shape: The shape of the data in the database (wide or long)
287
- remove_prefix: Whether to remove the prefix from the column names
288
- transpose: Whether to transpose the DataFrame
289
- transpose_by: The column to use as the index when transposing
290
- csv: Whether to return the DataFrame as a CSV string
291
- to_list: Whether to return the results as a list
292
- to_latex: Whether to return the results as LaTeX
293
- filename: Optional filename to save the results to
294
-
295
- Returns:
296
- DataFrame, CSV string, list, or LaTeX string depending on parameters
297
-
298
- Examples:
299
- >>> from edsl import Results
300
- >>> r = Results.example();
301
- >>> len(r.sql("SELECT * FROM self", shape = "wide"))
302
- 4
303
- >>> len(r.sql("SELECT * FROM self", shape = "long"))
304
- 172
305
- """
306
- import pandas as pd
307
-
308
- conn = self._db(remove_prefix=remove_prefix, shape=shape)
309
- df = pd.read_sql_query(query, conn)
310
-
311
- # Transpose the DataFrame if transpose is True
312
- if transpose or transpose_by:
313
- df = pd.DataFrame(df)
314
- if transpose_by:
315
- df = df.set_index(transpose_by)
316
- else:
317
- df = df.set_index(df.columns[0])
318
- df = df.transpose()
319
- from edsl.results.Dataset import Dataset
320
-
321
- return Dataset.from_pandas_dataframe(df)
322
-
323
- def to_pandas(
324
- self, remove_prefix: bool = False, lists_as_strings=False
325
- ) -> "DataFrame":
326
- """Convert the results to a pandas DataFrame, ensuring that lists remain as lists.
327
-
328
- :param remove_prefix: Whether to remove the prefix from the column names.
329
-
330
- """
331
- return self._to_pandas_strings(remove_prefix)
332
-
333
- def _to_pandas_strings(self, remove_prefix: bool = False) -> "pd.DataFrame":
334
- """Convert the results to a pandas DataFrame.
335
-
336
- :param remove_prefix: Whether to remove the prefix from the column names.
337
-
338
- >>> from edsl.results import Results
339
- >>> r = Results.example()
340
- >>> r.select('how_feeling').to_pandas()
341
- answer.how_feeling
342
- 0 OK
343
- 1 Great
344
- 2 Terrible
345
- 3 OK
346
- """
347
-
348
- import pandas as pd
349
-
350
- csv_string = self.to_csv(remove_prefix=remove_prefix).text
351
- csv_buffer = io.StringIO(csv_string)
352
- df = pd.read_csv(csv_buffer)
353
- # df_sorted = df.sort_index(axis=1) # Sort columns alphabetically
354
- return df
355
-
356
- def to_polars(
357
- self, remove_prefix: bool = False, lists_as_strings=False
358
- ) -> "pl.DataFrame":
359
- """Convert the results to a Polars DataFrame.
360
-
361
- :param remove_prefix: Whether to remove the prefix from the column names.
362
- """
363
- return self._to_polars_strings(remove_prefix)
364
-
365
- def _to_polars_strings(self, remove_prefix: bool = False) -> "pl.DataFrame":
366
- """Convert the results to a Polars DataFrame.
367
-
368
- :param remove_prefix: Whether to remove the prefix from the column names.
369
- """
370
- import polars as pl
371
-
372
- csv_string = self.to_csv(remove_prefix=remove_prefix).text
373
- df = pl.read_csv(io.StringIO(csv_string))
374
- return df
375
-
376
def to_scenario_list(self, remove_prefix: bool = True) -> "ScenarioList":
    """Convert the results to a ScenarioList with one Scenario per row.

    Each row dictionary produced by ``to_dicts`` is wrapped in a ``Scenario``.

    :param remove_prefix: Whether to remove the prefix from the column names.
    :return: A ``ScenarioList`` (not a plain list of dicts).

    >>> from edsl.results import Results
    >>> r = Results.example()
    >>> r.select('how_feeling').to_scenario_list()
    ScenarioList([Scenario({'how_feeling': 'OK'}), Scenario({'how_feeling': 'Great'}), Scenario({'how_feeling': 'Terrible'}), Scenario({'how_feeling': 'OK'})])
    """
    from edsl.scenarios.ScenarioList import ScenarioList
    from edsl.scenarios.Scenario import Scenario

    rows = self.to_dicts(remove_prefix=remove_prefix)
    return ScenarioList([Scenario(row) for row in rows])
394
-
395
def to_agent_list(self, remove_prefix: bool = True) -> "AgentList":
    """Convert the results to an AgentList with exactly one Agent per row.

    Each row dictionary produced by ``to_dicts`` becomes the traits of one
    agent. Two keys are treated specially:

    * ``name`` is moved to ``agent_name`` in the traits and is also used as
      the agent's name.
    * ``agent_parameters`` is removed from the traits; its ``name`` and
      ``instruction`` entries are passed to the ``Agent`` constructor. When
      it carries no name, the row's ``name`` value (if any) is used instead.

    :param remove_prefix: Whether to remove the prefix from the column names.

    >>> from edsl.results import Results
    >>> r = Results.example()
    >>> r.select('how_feeling').to_agent_list()
    AgentList([Agent(traits = {'how_feeling': 'OK'}), Agent(traits = {'how_feeling': 'Great'}), Agent(traits = {'how_feeling': 'Terrible'}), Agent(traits = {'how_feeling': 'OK'})])
    """
    from edsl.agents import Agent
    from edsl.agents.AgentList import AgentList

    agents = []
    for row in self.to_dicts(remove_prefix=remove_prefix):
        has_name = "name" in row
        row_name = None
        if has_name:
            row_name = row.pop("name")
            row["agent_name"] = row_name

        # The branches below are mutually exclusive so that a row never
        # produces more than one agent.
        if "agent_parameters" in row:
            agent_parameters = row.pop("agent_parameters")
            agent_name = agent_parameters.get("name", None)
            if agent_name is None:
                agent_name = row_name
            instruction = agent_parameters.get("instruction", None)
            agents.append(Agent(row, name=agent_name, instruction=instruction))
        elif has_name:
            agents.append(Agent(row, name=row_name))
        else:
            agents.append(Agent(row))
    return AgentList(agents)
422
-
423
def to_dicts(self, remove_prefix: bool = True) -> list[dict]:
    """Convert the column-oriented results into a list of row dictionaries.

    Iterating ``self`` yields one mapping per column; the first item of each
    mapping is taken as ``column name -> list of values``. The columns are
    then zipped together so that each output dictionary is one row.

    :param remove_prefix: Whether to remove the prefix from the column names
        (everything up to and including the last ``.``).

    >>> from edsl.results import Results
    >>> r = Results.example()
    >>> r.select('how_feeling').to_dicts()
    [{'how_feeling': 'OK'}, {'how_feeling': 'Great'}, {'how_feeling': 'Terrible'}, {'how_feeling': 'OK'}]

    """
    column_names = []
    columns = []
    for entry in self:
        name, values = list(entry.items())[0]
        column_names.append(name)
        columns.append(values)

    if remove_prefix:
        column_names = [name.rsplit(".", 1)[-1] for name in column_names]

    # zip(*columns) transposes the columns into rows.
    return [dict(zip(column_names, row)) for row in zip(*columns)]
449
-
450
def to_list(self, flatten=False, remove_none=False, unzipped=False) -> list[list]:
    """Convert the results to a list of values, or a list of row tuples.

    With a single selected column the column's values are returned; with
    several columns each element is a tuple holding one row.

    :param flatten: Whether to flatten the list of lists (only one level, and
        only when a single column is selected).
    :param remove_none: Whether to remove None values from the list.
    :param unzipped: Accepted for signature compatibility; this implementation
        does not use it.
    :raises ValueError: If ``flatten`` is requested with more than one column.

    >>> from edsl.results import Results
    >>> Results.example().select('how_feeling', 'how_feeling_yesterday')
    Dataset([{'answer.how_feeling': ['OK', 'Great', 'Terrible', 'OK']}, {'answer.how_feeling_yesterday': ['Great', 'Good', 'OK', 'Terrible']}])

    >>> Results.example().select('how_feeling', 'how_feeling_yesterday').to_list()
    [('OK', 'Great'), ('Great', 'Good'), ('Terrible', 'OK'), ('OK', 'Terrible')]

    >>> r = Results.example()
    >>> r.select('how_feeling').to_list()
    ['OK', 'Great', 'Terrible', 'OK']

    >>> from edsl.results.Dataset import Dataset
    >>> Dataset([{'a.b': [[1, 9], 2, 3, 4]}]).select('a.b').to_list(flatten = True)
    [1, 9, 2, 3, 4]

    >>> from edsl.results.Dataset import Dataset
    >>> Dataset([{'a.b': [[1, 9], 2, 3, 4]}, {'c': [6, 2, 3, 4]}]).select('a.b', 'c').to_list(flatten = True)
    Traceback (most recent call last):
    ...
    ValueError: Cannot flatten a list of lists when there are multiple columns selected.


    """
    columns = self.relevant_columns()
    if len(columns) > 1 and flatten:
        raise ValueError(
            "Cannot flatten a list of lists when there are multiple columns selected."
        )

    if len(columns) == 1:
        # Single column (the typical use of this method): return its values.
        items = list(self[0].values())[0]
    else:
        rows = self.to_dicts(remove_prefix=False)
        items = [tuple(row[column] for column in columns) for row in rows]

    if remove_none:
        items = [item for item in items if item is not None]

    if flatten:
        # Expand list elements by one level; leave everything else as is.
        items = [
            element
            for item in items
            for element in (item if isinstance(item, list) else [item])
        ]

    from edsl.utilities.PrettyList import PrettyList

    return PrettyList(items)
509
-
510
def html(
    self,
    filename: Optional[str] = None,
    cta: str = "Open in browser",
    return_link: bool = False,
):
    """Write the results as an HTML table and open or link to the file.

    The data is converted with ``self.to_pandas()`` and rendered with
    ``DataFrame.to_html()``. In a notebook a clickable link is displayed;
    otherwise the path is printed and the file is opened in the web browser.

    :param filename: Destination path. When None, a new ``.html`` file is
        created in the current working directory.
    :param cta: Link text shown in notebooks.
    :param return_link: If True, return the path of the written file.
    :return: The file path when ``return_link`` is True, otherwise None.
    """
    import os
    import tempfile
    from edsl.utilities.utilities import is_notebook

    df = self.to_pandas()

    if filename is None:
        # Only the unique name is needed here; closing the handle right away
        # avoids leaking an open file descriptor (delete=False keeps the file).
        with tempfile.NamedTemporaryFile(
            "w", delete=False, suffix=".html", dir=os.getcwd()
        ) as tmp:
            filename = tmp.name

    with open(filename, "w") as f:
        f.write(df.to_html())

    if is_notebook():
        # IPython is only required on the notebook path.
        from IPython.display import HTML, display

        html_url = f"/files/{filename}"
        html_link = f'<a href="{html_url}" target="_blank">{cta}</a>'
        display(HTML(html_link))
    else:
        import webbrowser

        print(f"Saved to {filename}")
        webbrowser.open(f"file://{os.path.abspath(filename)}")

    if return_link:
        return filename
    return None
546
-
547
def report(self, *fields: Optional[str], top_n: Optional[int] = None,
           header_fields: Optional[List[str]] = None, divider: bool = True,
           return_string: bool = False) -> Optional[str]:
    """Render the dataset row by row as a markdown report.

    Every observation starts with a ``# Observation: <row number>`` heading.
    Each requested field then gets a ``##`` heading with its name, followed by
    its value (lists and dicts are shown as indented JSON in a fenced block).
    In a Jupyter notebook the report is also displayed as markdown.

    Args:
        *fields: The fields to include in the report. If none provided, all fields are used.
        top_n: Optional limit on the number of observations to include.
        header_fields: Optional list of fields to include in the main header instead of as sections.
        divider: If True, adds a horizontal rule between observations for better visual separation.
        return_string: If True, returns the markdown string. If False (default in notebooks),
            only displays the markdown without returning.

    Returns:
        The markdown report when ``return_string`` is True or when not running
        in a notebook; otherwise None.

    Raises:
        ValueError: If a requested field is not one of the dataset's columns.

    Examples:
        >>> from edsl.results import Results
        >>> r = Results.example()
        >>> report = r.select('how_feeling', 'how_feeling_yesterday').report(return_string=True)
        >>> "# Observation: 1" in report
        True
        >>> "## answer.how_feeling" in report
        True
        >>> report = r.select('how_feeling').report(header_fields=['answer.how_feeling'], return_string=True)
        >>> "# Observation: 1 (`how_feeling`: OK)" in report
        True
    """
    import json

    from edsl.utilities.utilities import is_notebook

    # Default to every column, and to no header fields.
    if not fields:
        fields = self.relevant_columns()
    if header_fields is None:
        header_fields = []

    # Validate section fields and header-only fields alike.
    all_fields = list(fields) + [f for f in header_fields if f not in fields]
    for name in all_fields:
        if name not in self.relevant_columns():
            raise ValueError(f"Field '{name}' not found in dataset")

    # Map each field to the value list of the first entry that holds it.
    field_data = {}
    for name in all_fields:
        holder = next((entry for entry in self if name in entry), None)
        if holder is not None:
            field_data[name] = holder[name]

    num_obs = self.num_observations()
    if top_n is not None:
        num_obs = min(num_obs, top_n)

    report_lines = []
    for i in range(num_obs):
        header = f"# Observation: {i+1}"
        if header_fields:
            # Header fields are shown without their prefix, in backticks.
            header_parts = [
                f"`{name.split('.')[-1]}`: {field_data[name][i]}"
                for name in header_fields
            ]
            header += f" ({', '.join(header_parts)})"
        report_lines.append(header)

        for name in fields:
            if name in header_fields:
                continue
            report_lines.append(f"## {name}")
            value = field_data[name][i]
            if isinstance(value, (list, dict)):
                report_lines.append(f"```\n{json.dumps(value, indent=2)}\n```")
            else:
                report_lines.append(str(value))

        # A rule separates observations; the last one gets only a blank line.
        is_last = i >= num_obs - 1
        report_lines.append("\n---\n" if divider and not is_last else "")

    report_text = "\n".join(report_lines)

    in_notebook = is_notebook()
    if in_notebook:
        from IPython.display import Markdown, display
        display(Markdown(report_text))

    if in_notebook and not return_string:
        return None
    return report_text
656
-
657
def tally(
    self, *fields: Optional[str], top_n: Optional[int] = None, output="Dataset"
) -> Union[dict, "Dataset"]:
    """Tally the values of a field or perform a cross-tab of multiple fields.

    Values that are lists are counted as tuples, because lists are unhashable
    and cannot be used as counter keys.

    :param fields: The field(s) to tally, multiple fields for cross-tabulation.
        Names may be given with or without their prefix. If none are given,
        all columns are used.
    :param top_n: If given, keep only the ``top_n`` most frequent values.
    :param output: ``"Dataset"`` (default) or ``"dict"``. Any other value
        makes the method return None.
    :raises ValueError: If a requested field is not in the dataset.

    >>> from edsl.results import Results
    >>> r = Results.example()
    >>> r.select('how_feeling').tally('answer.how_feeling', output = "dict")
    {'OK': 2, 'Great': 1, 'Terrible': 1}
    >>> from edsl.results.Dataset import Dataset
    >>> expected = Dataset([{'answer.how_feeling': ['OK', 'Great', 'Terrible']}, {'count': [2, 1, 1]}])
    >>> r.select('how_feeling').tally('answer.how_feeling', output = "Dataset") == expected
    True
    """
    from collections import Counter

    if len(fields) == 0:
        fields = self.relevant_columns()

    relevant_columns = self.relevant_columns()
    relevant_columns_without_prefix = [
        column.split(".")[-1] for column in relevant_columns
    ]

    if not all(
        f in relevant_columns or f in relevant_columns_without_prefix
        for f in fields
    ):
        raise ValueError("One or more specified fields are not in the dataset.")

    if len(fields) == 1:
        values = self._key_to_value(fields[0])
    else:
        values = list(zip(*(self._key_to_value(field) for field in fields)))

    # Build a new list: rebinding a loop variable would leave `values`
    # untouched and Counter would fail on the unhashable lists.
    values = [tuple(v) if isinstance(v, list) else v for v in values]

    # most_common() sorts by descending count and keeps first-seen order
    # among ties.
    sorted_tally = dict(Counter(values).most_common())
    if top_n is not None:
        sorted_tally = dict(list(sorted_tally.items())[:top_n])

    if output == "dict":
        warnings.warn(
            textwrap.dedent(
                """\
                The default output from tally will change to Dataset in the future.
                Use output='Dataset' to get the Dataset object for now.
                """
            )
        )
        return sorted_tally
    elif output == "Dataset":
        # Imported here so the dict path does not need the Dataset class.
        from edsl.results.Dataset import Dataset

        dataset = Dataset(
            [
                {"value": list(sorted_tally.keys())},
                {"count": list(sorted_tally.values())},
            ]
        )
        # NOTE(review): how unpack() treats tuple keys from a single
        # list-valued field is not visible here -- confirm.
        sl = dataset.to_scenario_list().unpack(
            "value",
            new_names=[fields] if isinstance(fields, str) else fields,
            keep_original=False,
        )
        # Move the "count" column to the end.
        keys = list(sl[0].keys())
        keys.remove("count")
        keys.append("count")
        return sl.reorder_keys(keys).to_dataset()
733
-
734
def flatten(self, field, keep_original=False):
    """
    Flatten a field containing a list of dictionaries into separate fields.

    Every key found in the field's dictionaries becomes a new column named
    ``<field>.<key>``; rows whose value is not a dictionary, or lacks the key,
    get None in that column.

    >>> from edsl.results.Dataset import Dataset
    >>> Dataset([{'a': [{'a': 1, 'b': 2}]}, {'c': [5] }]).flatten('a')
    Dataset([{'c': [5]}, {'a.a': [1]}, {'a.b': [2]}])


    >>> Dataset([{'answer.example': [{'a': 1, 'b': 2}]}, {'c': [5] }]).flatten('answer.example')
    Dataset([{'c': [5]}, {'answer.example.a': [1]}, {'answer.example.b': [2]}])


    Args:
        field: The field to flatten
        keep_original: If True, keeps the original field in the dataset

    Returns:
        A new dataset with the flattened fields. A copy of the original is
        returned (with a warning) when the field is missing or holds
        non-dictionary values, and (silently) when the dataset is empty.

    Raises:
        ValueError: If ``field`` matches more than one column name.
    """
    from edsl.results.Dataset import Dataset

    # Nothing to do for an empty dataset.
    if not self.data:
        return self.copy()

    def _refers_to(col_name):
        # Exact match, or `field` is a dotted prefix/suffix of the column.
        if col_name == field:
            return True
        return "." in col_name and (
            col_name.endswith("." + field) or col_name.startswith(field + ".")
        )

    column_names = [next(iter(entry.keys())) for entry in self.data]
    matching_cols = [name for name in column_names if _refers_to(name)]
    if len(matching_cols) > 1:
        raise ValueError(
            f"Ambiguous field name '{field}'. It matches multiple columns: {matching_cols}. "
            f"Please specify the full column name to flatten."
        )

    num_observations = self.num_observations()

    # NOTE(review): the lookup is exact, unlike the ambiguity check above, so
    # a single partial match still ends in "not found" -- confirm intent.
    field_entry = next((entry for entry in self.data if field in entry), None)
    if field_entry is None:
        warnings.warn(
            f"Field '{field}' not found in dataset, returning original dataset"
        )
        return self.copy()

    # Carry over the other columns (and the original one if requested).
    flattened_data = [
        entry.copy()
        for entry in self.data
        if next(iter(entry.keys())) != field or keep_original
    ]

    field_values = field_entry[field]
    has_invalid = any(
        item is not None and not isinstance(item, dict) for item in field_values
    )
    if has_invalid:
        warnings.warn(
            f"Field '{field}' contains non-dictionary values that cannot be flattened"
        )
        return self.copy()

    all_keys = {
        key for item in field_values if isinstance(item, dict) for key in item.keys()
    }

    # Sorted so that the column order is deterministic.
    for key in sorted(all_keys):
        new_values = [
            field_values[i].get(key, None)
            if i < len(field_values) and isinstance(field_values[i], dict)
            else None
            for i in range(num_observations)
        ]
        flattened_data.append({f"{field}.{key}": new_values})

    return Dataset(flattened_data)
831
-
832
def unpack_list(
    self,
    field: str,
    new_names: Optional[List[str]] = None,
    keep_original: bool = True,
) -> "Dataset":
    """Unpack list columns into separate columns with provided names or numeric suffixes.

    For example, if a dataset contains the columns:
        {'data': [[1, 2, 3], [4, 5, 6]]} and {'other': ['x', 'y']}

    then d.unpack_list('data', keep_original=False) yields the columns:
        {'other': ['x', 'y']}, {'data_1': [1, 4]}, {'data_2': [2, 5]}, {'data_3': [3, 6]}

    With the default keep_original=True the 'data' column is kept as well.
    Lists shorter than the longest one are padded with None. A column with no
    rows produces no new columns.

    Args:
        field: The field containing lists to unpack
        new_names: Optional list of names for the unpacked fields. If None, uses numeric suffixes.
            Positions beyond the supplied names also fall back to numeric suffixes.
        keep_original: If True, keeps the original field in the dataset

    Returns:
        A new Dataset with unpacked columns

    Raises:
        ValueError: If the field is missing or any of its values is not a list.

    Examples:
        >>> from edsl.results.Dataset import Dataset
        >>> d = Dataset([{'data': [[1, 2, 3], [4, 5, 6]]}])
        >>> d.unpack_list('data')
        Dataset([{'data': [[1, 2, 3], [4, 5, 6]]}, {'data_1': [1, 4]}, {'data_2': [2, 5]}, {'data_3': [3, 6]}])

        >>> d.unpack_list('data', new_names=['first', 'second', 'third'])
        Dataset([{'data': [[1, 2, 3], [4, 5, 6]]}, {'first': [1, 4]}, {'second': [2, 5]}, {'third': [3, 6]}])
    """
    from edsl.results.Dataset import Dataset

    # Copy the outer list so appended columns do not touch self.data.
    result = Dataset(self.data.copy())

    field_index = next(
        (i for i, entry in enumerate(result.data) if field in entry), None
    )
    if field_index is None:
        raise ValueError(f"Field '{field}' not found in dataset")

    field_data = result.data[field_index][field]

    if not all(isinstance(v, list) for v in field_data):
        raise ValueError(f"Field '{field}' does not contain lists in all entries")

    # default=0 keeps an empty column from raising
    # "max() arg is an empty sequence".
    max_len = max((len(v) for v in field_data), default=0)

    for i in range(max_len):
        if new_names and i < len(new_names):
            new_field = new_names[i]
        else:
            new_field = f"{field}_{i+1}"

        # i-th element of every list, None where a list is too short.
        new_values = [item[i] if i < len(item) else None for item in field_data]
        result.data.append({new_field: new_values})

    if not keep_original:
        result.data.pop(field_index)

    return result
906
-
907
-
908
if __name__ == "__main__":
    # Run the doctests embedded in this module's docstrings. ELLIPSIS lets
    # "..." in expected output match arbitrary text.
    from doctest import ELLIPSIS, testmod

    testmod(optionflags=ELLIPSIS)