edsl 0.1.46__py3-none-any.whl → 0.1.48__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (328)
  1. edsl/__init__.py +44 -39
  2. edsl/__version__.py +1 -1
  3. edsl/agents/__init__.py +4 -2
  4. edsl/agents/{Agent.py → agent.py} +442 -152
  5. edsl/agents/{AgentList.py → agent_list.py} +220 -162
  6. edsl/agents/descriptors.py +46 -7
  7. edsl/{exceptions/agents.py → agents/exceptions.py} +3 -12
  8. edsl/base/__init__.py +75 -0
  9. edsl/base/base_class.py +1303 -0
  10. edsl/base/data_transfer_models.py +114 -0
  11. edsl/base/enums.py +215 -0
  12. edsl/base.py +8 -0
  13. edsl/buckets/__init__.py +25 -0
  14. edsl/buckets/bucket_collection.py +324 -0
  15. edsl/buckets/model_buckets.py +206 -0
  16. edsl/buckets/token_bucket.py +502 -0
  17. edsl/{jobs/buckets/TokenBucketAPI.py → buckets/token_bucket_api.py} +1 -1
  18. edsl/buckets/token_bucket_client.py +509 -0
  19. edsl/caching/__init__.py +20 -0
  20. edsl/caching/cache.py +814 -0
  21. edsl/caching/cache_entry.py +427 -0
  22. edsl/{data/CacheHandler.py → caching/cache_handler.py} +14 -15
  23. edsl/caching/exceptions.py +24 -0
  24. edsl/caching/orm.py +30 -0
  25. edsl/{data/RemoteCacheSync.py → caching/remote_cache_sync.py} +3 -3
  26. edsl/caching/sql_dict.py +441 -0
  27. edsl/config/__init__.py +8 -0
  28. edsl/config/config_class.py +177 -0
  29. edsl/config.py +4 -176
  30. edsl/conversation/Conversation.py +7 -7
  31. edsl/conversation/car_buying.py +4 -4
  32. edsl/conversation/chips.py +6 -6
  33. edsl/coop/__init__.py +25 -2
  34. edsl/coop/coop.py +430 -113
  35. edsl/coop/{ExpectedParrotKeyHandler.py → ep_key_handling.py} +86 -10
  36. edsl/coop/exceptions.py +62 -0
  37. edsl/coop/price_fetcher.py +126 -0
  38. edsl/coop/utils.py +89 -24
  39. edsl/data_transfer_models.py +5 -72
  40. edsl/dataset/__init__.py +10 -0
  41. edsl/{results/Dataset.py → dataset/dataset.py} +116 -36
  42. edsl/dataset/dataset_operations_mixin.py +1492 -0
  43. edsl/{results/DatasetTree.py → dataset/dataset_tree.py} +156 -75
  44. edsl/{results/TableDisplay.py → dataset/display/table_display.py} +18 -7
  45. edsl/{results → dataset/display}/table_renderers.py +58 -2
  46. edsl/{results → dataset}/file_exports.py +4 -5
  47. edsl/{results → dataset}/smart_objects.py +2 -2
  48. edsl/enums.py +5 -205
  49. edsl/inference_services/__init__.py +5 -0
  50. edsl/inference_services/{AvailableModelCacheHandler.py → available_model_cache_handler.py} +2 -3
  51. edsl/inference_services/{AvailableModelFetcher.py → available_model_fetcher.py} +8 -14
  52. edsl/inference_services/data_structures.py +3 -2
  53. edsl/{exceptions/inference_services.py → inference_services/exceptions.py} +1 -1
  54. edsl/inference_services/{InferenceServiceABC.py → inference_service_abc.py} +1 -1
  55. edsl/inference_services/{InferenceServicesCollection.py → inference_services_collection.py} +8 -7
  56. edsl/inference_services/registry.py +4 -41
  57. edsl/inference_services/{ServiceAvailability.py → service_availability.py} +5 -25
  58. edsl/inference_services/services/__init__.py +31 -0
  59. edsl/inference_services/{AnthropicService.py → services/anthropic_service.py} +3 -3
  60. edsl/inference_services/{AwsBedrock.py → services/aws_bedrock.py} +2 -2
  61. edsl/inference_services/{AzureAI.py → services/azure_ai.py} +2 -2
  62. edsl/inference_services/{DeepInfraService.py → services/deep_infra_service.py} +1 -3
  63. edsl/inference_services/{DeepSeekService.py → services/deep_seek_service.py} +2 -4
  64. edsl/inference_services/{GoogleService.py → services/google_service.py} +5 -4
  65. edsl/inference_services/{GroqService.py → services/groq_service.py} +1 -1
  66. edsl/inference_services/{MistralAIService.py → services/mistral_ai_service.py} +3 -3
  67. edsl/inference_services/{OllamaService.py → services/ollama_service.py} +1 -7
  68. edsl/inference_services/{OpenAIService.py → services/open_ai_service.py} +5 -6
  69. edsl/inference_services/{PerplexityService.py → services/perplexity_service.py} +12 -12
  70. edsl/inference_services/{TestService.py → services/test_service.py} +7 -6
  71. edsl/inference_services/{TogetherAIService.py → services/together_ai_service.py} +2 -6
  72. edsl/inference_services/{XAIService.py → services/xai_service.py} +1 -1
  73. edsl/inference_services/write_available.py +1 -2
  74. edsl/instructions/__init__.py +6 -0
  75. edsl/{surveys/instructions/Instruction.py → instructions/instruction.py} +11 -6
  76. edsl/{surveys/instructions/InstructionCollection.py → instructions/instruction_collection.py} +10 -5
  77. edsl/{surveys/InstructionHandler.py → instructions/instruction_handler.py} +3 -3
  78. edsl/{jobs/interviews → interviews}/ReportErrors.py +2 -2
  79. edsl/interviews/__init__.py +4 -0
  80. edsl/{jobs/AnswerQuestionFunctionConstructor.py → interviews/answering_function.py} +45 -18
  81. edsl/{jobs/interviews/InterviewExceptionEntry.py → interviews/exception_tracking.py} +107 -22
  82. edsl/interviews/interview.py +638 -0
  83. edsl/{jobs/interviews/InterviewStatusDictionary.py → interviews/interview_status_dictionary.py} +21 -12
  84. edsl/{jobs/interviews/InterviewStatusLog.py → interviews/interview_status_log.py} +16 -7
  85. edsl/{jobs/InterviewTaskManager.py → interviews/interview_task_manager.py} +12 -7
  86. edsl/{jobs/RequestTokenEstimator.py → interviews/request_token_estimator.py} +8 -3
  87. edsl/{jobs/interviews/InterviewStatistic.py → interviews/statistics.py} +36 -10
  88. edsl/invigilators/__init__.py +38 -0
  89. edsl/invigilators/invigilator_base.py +477 -0
  90. edsl/{agents/Invigilator.py → invigilators/invigilators.py} +263 -10
  91. edsl/invigilators/prompt_constructor.py +476 -0
  92. edsl/{agents → invigilators}/prompt_helpers.py +2 -1
  93. edsl/{agents/QuestionInstructionPromptBuilder.py → invigilators/question_instructions_prompt_builder.py} +18 -13
  94. edsl/{agents → invigilators}/question_option_processor.py +96 -21
  95. edsl/{agents/QuestionTemplateReplacementsBuilder.py → invigilators/question_template_replacements_builder.py} +64 -12
  96. edsl/jobs/__init__.py +7 -1
  97. edsl/jobs/async_interview_runner.py +99 -35
  98. edsl/jobs/check_survey_scenario_compatibility.py +7 -5
  99. edsl/jobs/data_structures.py +153 -22
  100. edsl/{exceptions/jobs.py → jobs/exceptions.py} +2 -1
  101. edsl/jobs/{FetchInvigilator.py → fetch_invigilator.py} +4 -4
  102. edsl/jobs/{loggers/HTMLTableJobLogger.py → html_table_job_logger.py} +6 -2
  103. edsl/jobs/{Jobs.py → jobs.py} +321 -155
  104. edsl/jobs/{JobsChecks.py → jobs_checks.py} +15 -7
  105. edsl/jobs/{JobsComponentConstructor.py → jobs_component_constructor.py} +20 -17
  106. edsl/jobs/{InterviewsConstructor.py → jobs_interview_constructor.py} +10 -5
  107. edsl/jobs/jobs_pricing_estimation.py +347 -0
  108. edsl/jobs/{JobsRemoteInferenceLogger.py → jobs_remote_inference_logger.py} +4 -3
  109. edsl/jobs/jobs_runner_asyncio.py +282 -0
  110. edsl/jobs/{JobsRemoteInferenceHandler.py → remote_inference.py} +19 -22
  111. edsl/jobs/results_exceptions_handler.py +2 -2
  112. edsl/key_management/__init__.py +28 -0
  113. edsl/key_management/key_lookup.py +161 -0
  114. edsl/{language_models/key_management/KeyLookupBuilder.py → key_management/key_lookup_builder.py} +118 -47
  115. edsl/key_management/key_lookup_collection.py +82 -0
  116. edsl/key_management/models.py +218 -0
  117. edsl/language_models/__init__.py +7 -2
  118. edsl/language_models/{ComputeCost.py → compute_cost.py} +18 -3
  119. edsl/{exceptions/language_models.py → language_models/exceptions.py} +2 -1
  120. edsl/language_models/language_model.py +1080 -0
  121. edsl/language_models/model.py +10 -25
  122. edsl/language_models/{ModelList.py → model_list.py} +9 -14
  123. edsl/language_models/{RawResponseHandler.py → raw_response_handler.py} +1 -1
  124. edsl/language_models/{RegisterLanguageModelsMeta.py → registry.py} +1 -1
  125. edsl/language_models/repair.py +4 -4
  126. edsl/language_models/utilities.py +4 -4
  127. edsl/notebooks/__init__.py +3 -1
  128. edsl/notebooks/{Notebook.py → notebook.py} +7 -8
  129. edsl/prompts/__init__.py +1 -1
  130. edsl/{exceptions/prompts.py → prompts/exceptions.py} +3 -1
  131. edsl/prompts/{Prompt.py → prompt.py} +101 -95
  132. edsl/questions/HTMLQuestion.py +1 -1
  133. edsl/questions/__init__.py +154 -25
  134. edsl/questions/answer_validator_mixin.py +1 -1
  135. edsl/questions/compose_questions.py +4 -3
  136. edsl/questions/derived/question_likert_five.py +166 -0
  137. edsl/questions/derived/{QuestionLinearScale.py → question_linear_scale.py} +4 -4
  138. edsl/questions/derived/{QuestionTopK.py → question_top_k.py} +4 -4
  139. edsl/questions/derived/{QuestionYesNo.py → question_yes_no.py} +4 -5
  140. edsl/questions/descriptors.py +24 -30
  141. edsl/questions/loop_processor.py +65 -19
  142. edsl/questions/question_base.py +881 -0
  143. edsl/questions/question_base_gen_mixin.py +15 -16
  144. edsl/questions/{QuestionBasePromptsMixin.py → question_base_prompts_mixin.py} +2 -2
  145. edsl/questions/{QuestionBudget.py → question_budget.py} +3 -4
  146. edsl/questions/{QuestionCheckBox.py → question_check_box.py} +16 -16
  147. edsl/questions/{QuestionDict.py → question_dict.py} +39 -5
  148. edsl/questions/{QuestionExtract.py → question_extract.py} +9 -9
  149. edsl/questions/question_free_text.py +282 -0
  150. edsl/questions/{QuestionFunctional.py → question_functional.py} +6 -5
  151. edsl/questions/{QuestionList.py → question_list.py} +6 -7
  152. edsl/questions/{QuestionMatrix.py → question_matrix.py} +6 -5
  153. edsl/questions/{QuestionMultipleChoice.py → question_multiple_choice.py} +126 -21
  154. edsl/questions/{QuestionNumerical.py → question_numerical.py} +5 -5
  155. edsl/questions/{QuestionRank.py → question_rank.py} +6 -6
  156. edsl/questions/question_registry.py +10 -16
  157. edsl/questions/register_questions_meta.py +8 -4
  158. edsl/questions/response_validator_abc.py +17 -16
  159. edsl/results/__init__.py +4 -1
  160. edsl/{exceptions/results.py → results/exceptions.py} +1 -1
  161. edsl/results/report.py +197 -0
  162. edsl/results/{Result.py → result.py} +131 -45
  163. edsl/results/{Results.py → results.py} +420 -216
  164. edsl/results/results_selector.py +344 -25
  165. edsl/scenarios/__init__.py +30 -3
  166. edsl/scenarios/{ConstructDownloadLink.py → construct_download_link.py} +7 -0
  167. edsl/scenarios/directory_scanner.py +156 -13
  168. edsl/scenarios/document_chunker.py +186 -0
  169. edsl/scenarios/exceptions.py +101 -0
  170. edsl/scenarios/file_methods.py +2 -3
  171. edsl/scenarios/file_store.py +755 -0
  172. edsl/scenarios/handlers/__init__.py +14 -14
  173. edsl/scenarios/handlers/{csv.py → csv_file_store.py} +1 -2
  174. edsl/scenarios/handlers/{docx.py → docx_file_store.py} +8 -7
  175. edsl/scenarios/handlers/{html.py → html_file_store.py} +1 -2
  176. edsl/scenarios/handlers/{jpeg.py → jpeg_file_store.py} +1 -1
  177. edsl/scenarios/handlers/{json.py → json_file_store.py} +1 -1
  178. edsl/scenarios/handlers/latex_file_store.py +5 -0
  179. edsl/scenarios/handlers/{md.py → md_file_store.py} +1 -1
  180. edsl/scenarios/handlers/{pdf.py → pdf_file_store.py} +2 -2
  181. edsl/scenarios/handlers/{png.py → png_file_store.py} +1 -1
  182. edsl/scenarios/handlers/{pptx.py → pptx_file_store.py} +8 -7
  183. edsl/scenarios/handlers/{py.py → py_file_store.py} +1 -3
  184. edsl/scenarios/handlers/{sql.py → sql_file_store.py} +2 -1
  185. edsl/scenarios/handlers/{sqlite.py → sqlite_file_store.py} +2 -3
  186. edsl/scenarios/handlers/{txt.py → txt_file_store.py} +1 -1
  187. edsl/scenarios/scenario.py +928 -0
  188. edsl/scenarios/scenario_join.py +18 -5
  189. edsl/scenarios/{ScenarioList.py → scenario_list.py} +424 -106
  190. edsl/scenarios/{ScenarioListPdfMixin.py → scenario_list_pdf_tools.py} +16 -15
  191. edsl/scenarios/scenario_selector.py +5 -1
  192. edsl/study/ObjectEntry.py +2 -2
  193. edsl/study/SnapShot.py +5 -5
  194. edsl/study/Study.py +20 -21
  195. edsl/study/__init__.py +6 -4
  196. edsl/surveys/__init__.py +7 -4
  197. edsl/surveys/dag/__init__.py +2 -0
  198. edsl/surveys/{ConstructDAG.py → dag/construct_dag.py} +3 -3
  199. edsl/surveys/{DAG.py → dag/dag.py} +13 -10
  200. edsl/surveys/descriptors.py +1 -1
  201. edsl/surveys/{EditSurvey.py → edit_survey.py} +9 -9
  202. edsl/{exceptions/surveys.py → surveys/exceptions.py} +1 -2
  203. edsl/surveys/memory/__init__.py +3 -0
  204. edsl/surveys/{MemoryPlan.py → memory/memory_plan.py} +10 -9
  205. edsl/surveys/rules/__init__.py +3 -0
  206. edsl/surveys/{Rule.py → rules/rule.py} +103 -43
  207. edsl/surveys/{RuleCollection.py → rules/rule_collection.py} +21 -30
  208. edsl/surveys/{RuleManager.py → rules/rule_manager.py} +19 -13
  209. edsl/surveys/survey.py +1743 -0
  210. edsl/surveys/{SurveyExportMixin.py → survey_export.py} +22 -27
  211. edsl/surveys/{SurveyFlowVisualization.py → survey_flow_visualization.py} +11 -2
  212. edsl/surveys/{Simulator.py → survey_simulator.py} +10 -3
  213. edsl/tasks/__init__.py +32 -0
  214. edsl/{jobs/tasks/QuestionTaskCreator.py → tasks/question_task_creator.py} +115 -57
  215. edsl/tasks/task_creators.py +135 -0
  216. edsl/{jobs/tasks/TaskHistory.py → tasks/task_history.py} +86 -47
  217. edsl/{jobs/tasks → tasks}/task_status_enum.py +91 -7
  218. edsl/tasks/task_status_log.py +85 -0
  219. edsl/tokens/__init__.py +2 -0
  220. edsl/tokens/interview_token_usage.py +53 -0
  221. edsl/utilities/PrettyList.py +1 -1
  222. edsl/utilities/SystemInfo.py +25 -22
  223. edsl/utilities/__init__.py +29 -21
  224. edsl/utilities/gcp_bucket/__init__.py +2 -0
  225. edsl/utilities/gcp_bucket/cloud_storage.py +99 -96
  226. edsl/utilities/interface.py +44 -536
  227. edsl/{results/MarkdownToPDF.py → utilities/markdown_to_pdf.py} +13 -5
  228. edsl/utilities/repair_functions.py +1 -1
  229. {edsl-0.1.46.dist-info → edsl-0.1.48.dist-info}/METADATA +3 -2
  230. edsl-0.1.48.dist-info/RECORD +347 -0
  231. edsl/Base.py +0 -426
  232. edsl/BaseDiff.py +0 -260
  233. edsl/agents/InvigilatorBase.py +0 -260
  234. edsl/agents/PromptConstructor.py +0 -318
  235. edsl/auto/AutoStudy.py +0 -130
  236. edsl/auto/StageBase.py +0 -243
  237. edsl/auto/StageGenerateSurvey.py +0 -178
  238. edsl/auto/StageLabelQuestions.py +0 -125
  239. edsl/auto/StagePersona.py +0 -61
  240. edsl/auto/StagePersonaDimensionValueRanges.py +0 -88
  241. edsl/auto/StagePersonaDimensionValues.py +0 -74
  242. edsl/auto/StagePersonaDimensions.py +0 -69
  243. edsl/auto/StageQuestions.py +0 -74
  244. edsl/auto/SurveyCreatorPipeline.py +0 -21
  245. edsl/auto/utilities.py +0 -218
  246. edsl/base/Base.py +0 -279
  247. edsl/coop/PriceFetcher.py +0 -54
  248. edsl/data/Cache.py +0 -580
  249. edsl/data/CacheEntry.py +0 -230
  250. edsl/data/SQLiteDict.py +0 -292
  251. edsl/data/__init__.py +0 -5
  252. edsl/data/orm.py +0 -10
  253. edsl/exceptions/cache.py +0 -5
  254. edsl/exceptions/coop.py +0 -14
  255. edsl/exceptions/data.py +0 -14
  256. edsl/exceptions/scenarios.py +0 -29
  257. edsl/jobs/Answers.py +0 -43
  258. edsl/jobs/JobsPrompts.py +0 -354
  259. edsl/jobs/buckets/BucketCollection.py +0 -134
  260. edsl/jobs/buckets/ModelBuckets.py +0 -65
  261. edsl/jobs/buckets/TokenBucket.py +0 -283
  262. edsl/jobs/buckets/TokenBucketClient.py +0 -191
  263. edsl/jobs/interviews/Interview.py +0 -395
  264. edsl/jobs/interviews/InterviewExceptionCollection.py +0 -99
  265. edsl/jobs/interviews/InterviewStatisticsCollection.py +0 -25
  266. edsl/jobs/runners/JobsRunnerAsyncio.py +0 -163
  267. edsl/jobs/runners/JobsRunnerStatusData.py +0 -0
  268. edsl/jobs/tasks/TaskCreators.py +0 -64
  269. edsl/jobs/tasks/TaskStatusLog.py +0 -23
  270. edsl/jobs/tokens/InterviewTokenUsage.py +0 -27
  271. edsl/language_models/LanguageModel.py +0 -635
  272. edsl/language_models/ServiceDataSources.py +0 -0
  273. edsl/language_models/key_management/KeyLookup.py +0 -63
  274. edsl/language_models/key_management/KeyLookupCollection.py +0 -38
  275. edsl/language_models/key_management/models.py +0 -137
  276. edsl/questions/QuestionBase.py +0 -539
  277. edsl/questions/QuestionFreeText.py +0 -130
  278. edsl/questions/derived/QuestionLikertFive.py +0 -76
  279. edsl/results/DatasetExportMixin.py +0 -911
  280. edsl/results/ResultsExportMixin.py +0 -45
  281. edsl/results/TextEditor.py +0 -50
  282. edsl/results/results_fetch_mixin.py +0 -33
  283. edsl/results/results_tools_mixin.py +0 -98
  284. edsl/scenarios/DocumentChunker.py +0 -104
  285. edsl/scenarios/FileStore.py +0 -564
  286. edsl/scenarios/Scenario.py +0 -548
  287. edsl/scenarios/ScenarioHtmlMixin.py +0 -65
  288. edsl/scenarios/ScenarioListExportMixin.py +0 -45
  289. edsl/scenarios/handlers/latex.py +0 -5
  290. edsl/shared.py +0 -1
  291. edsl/surveys/Survey.py +0 -1306
  292. edsl/surveys/SurveyQualtricsImport.py +0 -284
  293. edsl/surveys/SurveyToApp.py +0 -141
  294. edsl/surveys/instructions/__init__.py +0 -0
  295. edsl/tools/__init__.py +0 -1
  296. edsl/tools/clusters.py +0 -192
  297. edsl/tools/embeddings.py +0 -27
  298. edsl/tools/embeddings_plotting.py +0 -118
  299. edsl/tools/plotting.py +0 -112
  300. edsl/tools/summarize.py +0 -18
  301. edsl/utilities/data/Registry.py +0 -6
  302. edsl/utilities/data/__init__.py +0 -1
  303. edsl/utilities/data/scooter_results.json +0 -1
  304. edsl-0.1.46.dist-info/RECORD +0 -366
  305. /edsl/coop/{CoopFunctionsMixin.py → coop_functions.py} +0 -0
  306. /edsl/{results → dataset/display}/CSSParameterizer.py +0 -0
  307. /edsl/{language_models/key_management → dataset/display}/__init__.py +0 -0
  308. /edsl/{results → dataset/display}/table_data_class.py +0 -0
  309. /edsl/{results → dataset/display}/table_display.css +0 -0
  310. /edsl/{results/ResultsGGMixin.py → dataset/r/ggplot.py} +0 -0
  311. /edsl/{results → dataset}/tree_explore.py +0 -0
  312. /edsl/{surveys/instructions/ChangeInstruction.py → instructions/change_instruction.py} +0 -0
  313. /edsl/{jobs/interviews → interviews}/interview_status_enum.py +0 -0
  314. /edsl/jobs/{runners/JobsRunnerStatus.py → jobs_runner_status.py} +0 -0
  315. /edsl/language_models/{PriceManager.py → price_manager.py} +0 -0
  316. /edsl/language_models/{fake_openai_call.py → unused/fake_openai_call.py} +0 -0
  317. /edsl/language_models/{fake_openai_service.py → unused/fake_openai_service.py} +0 -0
  318. /edsl/notebooks/{NotebookToLaTeX.py → notebook_to_latex.py} +0 -0
  319. /edsl/{exceptions/questions.py → questions/exceptions.py} +0 -0
  320. /edsl/questions/{SimpleAskMixin.py → simple_ask_mixin.py} +0 -0
  321. /edsl/surveys/{Memory.py → memory/memory.py} +0 -0
  322. /edsl/surveys/{MemoryManagement.py → memory/memory_management.py} +0 -0
  323. /edsl/surveys/{SurveyCSS.py → survey_css.py} +0 -0
  324. /edsl/{jobs/tokens/TokenUsage.py → tokens/token_usage.py} +0 -0
  325. /edsl/{results/MarkdownToDocx.py → utilities/markdown_to_docx.py} +0 -0
  326. /edsl/{TemplateLoader.py → utilities/template_loader.py} +0 -0
  327. {edsl-0.1.46.dist-info → edsl-0.1.48.dist-info}/LICENSE +0 -0
  328. {edsl-0.1.46.dist-info → edsl-0.1.48.dist-info}/WHEEL +0 -0
edsl/dataset/dataset_operations_mixin.py (new file)
@@ -0,0 +1,1492 @@
"""
This module provides mixin classes that enable powerful data manipulation operations
across various EDSL list-like objects.

The DataOperationsBase class defines common operations for working with structured data,
including data transformation, visualization, export, querying, and analysis. These
operations are inherited by different specialized mixins (DatasetOperationsMixin,
ResultsOperationsMixin, etc.) which implement class-specific behaviors.

The design pattern used here allows different container types (Results, Dataset,
ScenarioList, AgentList) to share the same data manipulation interface, enabling
fluid operations across different parts of the EDSL ecosystem.
"""

from abc import ABC, abstractmethod
import io
import warnings
import textwrap
from typing import Optional, Tuple, Union, List, TYPE_CHECKING
from .r.ggplot import GGPlotMethod

if TYPE_CHECKING:
    from docx import Document
    from .dataset import Dataset

class DataOperationsBase:
    """
    Base class providing common data operations for EDSL container objects.

    This class serves as the foundation for various data manipulation mixins,
    providing a consistent interface for operations like filtering, aggregation,
    transformation, visualization, and export across different types of EDSL
    containers (Results, Dataset, ScenarioList, AgentList).

    Key functionality categories:

    1. Data Transformation:
       - Filtering with `filter()`
       - Creating new columns with `mutate()`
       - Reshaping with `long()`, `wide()`, `flatten()`, etc.
       - Selecting specific columns with `select()`

    2. Visualization and Display:
       - Tabular display with `table()`
       - Plotting with `ggplot2()`
       - Generating reports with `report()`

    3. Data Export:
       - To various formats with `to_csv()`, `to_excel()`, etc.
       - To other data structures with `to_pandas()`, `to_dicts()`, etc.

    4. Analysis:
       - SQL-based querying with `sql()`
       - Aggregation with `tally()`
       - Tree-based exploration

    These operations are designed to be applied fluently in sequence, enabling
    expressive data manipulation pipelines.
    """
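
A minimal sketch of the mixin pattern this docstring describes, with hypothetical class names (the real EDSL containers define much more than this): a container is list-like, each element a {column_name: [values]} mapping, and it picks up the operations below by inheritance.

    # Illustrative sketch only -- names and structure here are assumptions, not EDSL source.
    class DatasetOperationsMixin(DataOperationsBase):
        pass  # class-specific overrides would live here

    class MiniDataset(list, DatasetOperationsMixin):
        """A list of {column_name: [values]} entries gains sql(), tally(), etc."""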

    def ggplot2(
        self,
        ggplot_code: str,
        shape: str = "wide",
        sql: Optional[str] = None,
        remove_prefix: bool = True,
        debug: bool = False,
        height: float = 4,
        width: float = 6,
        factor_orders: Optional[dict] = None,
    ):
        """
        Create visualizations using R's ggplot2 library.

        This method provides a bridge to R's powerful ggplot2 visualization library,
        allowing you to create sophisticated plots directly from EDSL data structures.

        Parameters:
            ggplot_code: R code string containing ggplot2 commands
            shape: Data shape to use ("wide" or "long")
            sql: Optional SQL query to transform data before visualization
            remove_prefix: Whether to remove prefixes (like "answer.") from column names
            debug: Whether to display debugging information
            height: Plot height in inches
            width: Plot width in inches
            factor_orders: Dictionary mapping factor variables to their desired order

        Returns:
            A plot object that renders in Jupyter notebooks

        Notes:
            - Requires R and the ggplot2 package to be installed
            - Data is automatically converted to a format suitable for ggplot2
            - The ggplot2 code should reference column names as they appear after
              any transformations from the shape and remove_prefix parameters

        Examples:
            >>> from edsl.results import Results
            >>> r = Results.example()
            >>> # The following would create a plot if R is installed (not shown in doctest):
            >>> # r.ggplot2('''
            >>> #     ggplot(df, aes(x=how_feeling)) +
            >>> #     geom_bar() +
            >>> #     labs(title="Distribution of Feelings")
            >>> # ''')
        """
        return GGPlotMethod(self).ggplot2(
            ggplot_code, shape, sql, remove_prefix, debug, height, width, factor_orders
        )
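
The doctest above leaves the actual call commented out; as a sketch, the same example would run roughly like this on a machine with R and ggplot2 installed (not verified here):

    from edsl.results import Results

    r = Results.example()
    plot = r.ggplot2(
        """
        ggplot(df, aes(x = how_feeling)) +
          geom_bar() +
          labs(title = "Distribution of Feelings")
        """,
        shape="wide",        # one column per field
        remove_prefix=True,  # so the R code sees `how_feeling`, not `answer.how_feeling`
    )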

    def relevant_columns(
        self, data_type: Optional[str] = None, remove_prefix: bool = False
    ) -> list:
        """Return the set of keys that are present in the dataset.

        :param data_type: The data type to filter by.
        :param remove_prefix: Whether to remove the prefix from the column names.

        >>> from ..dataset import Dataset
        >>> d = Dataset([{'a.b':[1,2,3,4]}])
        >>> d.relevant_columns()
        ['a.b']

        >>> d.relevant_columns(remove_prefix=True)
        ['b']

        >>> d = Dataset([{'a':[1,2,3,4]}, {'b':[5,6,7,8]}])
        >>> d.relevant_columns()
        ['a', 'b']

        >>> from edsl.results import Results; Results.example().select('how_feeling', 'how_feeling_yesterday').relevant_columns()
        ['answer.how_feeling', 'answer.how_feeling_yesterday']

        >>> from edsl.results import Results
        >>> sorted(Results.example().select().relevant_columns(data_type = "model"))
        ['model.frequency_penalty', ...]

        >>> Results.example().relevant_columns(data_type = "flimflam")
        Traceback (most recent call last):
        ...
        ValueError: No columns found for data type: flimflam. Available data types are: ...
        """
        columns = [list(x.keys())[0] for x in self]
        if remove_prefix:
            columns = [column.split(".")[-1] for column in columns]

        def get_data_type(column):
            if "." in column:
                return column.split(".")[0]
            else:
                return None

        if data_type:
            all_columns = columns[:]
            columns = [
                column for column in columns if get_data_type(column) == data_type
            ]
            if len(columns) == 0:
                all_data_types = sorted(
                    list(set(get_data_type(column) for column in all_columns))
                )
                raise ValueError(
                    f"No columns found for data type: {data_type}. Available data types are: {all_data_types}."
                )

        return columns

    def num_observations(self):
        """Return the number of observations in the dataset.

        >>> from edsl.results import Results
        >>> Results.example().num_observations()
        4
        """
        _num_observations = None
        for entry in self:
            key, values = list(entry.items())[0]
            if _num_observations is None:
                _num_observations = len(values)
            else:
                if len(values) != _num_observations:
                    raise ValueError(
                        f"The number of observations is not consistent across columns. "
                        f"Column '{key}' has {len(values)} observations, but previous columns had {_num_observations} observations."
                    )

        return _num_observations

    def make_tabular(
        self, remove_prefix: bool, pretty_labels: Optional[dict] = None
    ) -> tuple[list, List[list]]:
        """Turn the results into a tabular format.

        :param remove_prefix: Whether to remove the prefix from the column names.

        >>> from edsl.results import Results
        >>> r = Results.example()
        >>> r.select('how_feeling').make_tabular(remove_prefix = True)
        (['how_feeling'], [['OK'], ['Great'], ['Terrible'], ['OK']])

        >>> r.select('how_feeling').make_tabular(remove_prefix = True, pretty_labels = {'how_feeling': "How are you feeling"})
        (['How are you feeling'], [['OK'], ['Great'], ['Terrible'], ['OK']])
        """

        def create_dict_from_list_of_dicts(list_of_dicts):
            for entry in list_of_dicts:
                key, list_of_values = list(entry.items())[0]
                yield key, list_of_values

        tabular_repr = dict(create_dict_from_list_of_dicts(self.data))

        full_header = [list(x.keys())[0] for x in self]

        rows = []
        for i in range(self.num_observations()):
            row = [tabular_repr[h][i] for h in full_header]
            rows.append(row)

        if remove_prefix:
            header = [h.split(".")[-1] for h in full_header]
        else:
            header = full_header

        if pretty_labels is not None:
            header = [pretty_labels.get(h, h) for h in header]

        return header, rows

    def print_long(self):
        """Print the results in a long format.

        >>> from edsl.results import Results
        >>> r = Results.example()
        >>> r.select('how_feeling').print_long()
        answer.how_feeling: OK
        answer.how_feeling: Great
        answer.how_feeling: Terrible
        answer.how_feeling: OK
        """
        for entry in self:
            key, list_of_values = list(entry.items())[0]
            for value in list_of_values:
                print(f"{key}: {value}")

    def get_tabular_data(
        self,
        remove_prefix: bool = False,
        pretty_labels: Optional[dict] = None,
    ) -> Tuple[List[str], List[List]]:
        """Internal method to get tabular data in a standard format.

        Args:
            remove_prefix: Whether to remove the prefix from column names
            pretty_labels: Dictionary mapping original column names to pretty labels

        Returns:
            Tuple containing (header_row, data_rows)
        """
        if pretty_labels is None:
            pretty_labels = {}

        return self.make_tabular(
            remove_prefix=remove_prefix, pretty_labels=pretty_labels
        )

    def to_jsonl(self, filename: Optional[str] = None) -> Optional["FileStore"]:
        """Export the results to a FileStore instance containing JSONL data."""
        from .file_exports import JSONLExport

        exporter = JSONLExport(data=self, filename=filename)
        return exporter.export()

    def to_sqlite(
        self,
        filename: Optional[str] = None,
        remove_prefix: bool = False,
        pretty_labels: Optional[dict] = None,
        table_name: str = "results",
        if_exists: str = "replace",
    ) -> Optional["FileStore"]:
        """Export the results to a SQLite database file."""
        from .file_exports import SQLiteExport

        exporter = SQLiteExport(
            data=self,
            filename=filename,
            remove_prefix=remove_prefix,
            pretty_labels=pretty_labels,
            table_name=table_name,
            if_exists=if_exists,
        )
        return exporter.export()
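
A hypothetical round-trip for `to_sqlite`, reading back through the standard-library `sqlite3` module; the filename is illustrative, the table name is the `results` default from the signature above, and `results` stands in for any EDSL container:

    import sqlite3

    results.select("how_feeling").to_sqlite("results.db", remove_prefix=True)
    con = sqlite3.connect("results.db")
    rows = con.execute("SELECT how_feeling FROM results").fetchall()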

    def to_csv(
        self,
        filename: Optional[str] = None,
        remove_prefix: bool = False,
        pretty_labels: Optional[dict] = None,
    ) -> Optional["FileStore"]:
        """Export the results to a FileStore instance containing CSV data."""
        from .file_exports import CSVExport

        exporter = CSVExport(
            data=self,
            filename=filename,
            remove_prefix=remove_prefix,
            pretty_labels=pretty_labels,
        )
        return exporter.export()

    def to_excel(
        self,
        filename: Optional[str] = None,
        remove_prefix: bool = False,
        pretty_labels: Optional[dict] = None,
        sheet_name: Optional[str] = None,
    ) -> Optional["FileStore"]:
        """Export the results to a FileStore instance containing Excel data."""
        from .file_exports import ExcelExport

        exporter = ExcelExport(
            data=self,
            filename=filename,
            remove_prefix=remove_prefix,
            pretty_labels=pretty_labels,
            sheet_name=sheet_name,
        )
        return exporter.export()

    def _db(
        self, remove_prefix: bool = True, shape: str = "wide"
    ) -> "sqlalchemy.engine.Engine":
        """Create a SQLite database in memory and return the connection.

        Args:
            remove_prefix: Whether to remove the prefix from the column names
            shape: The shape of the data in the database ("wide" or "long")

        Returns:
            A database connection

        >>> from sqlalchemy import text
        >>> from edsl import Results
        >>> engine = Results.example()._db()
        >>> len(engine.execute(text("SELECT * FROM self")).fetchall())
        4
        >>> engine = Results.example()._db(shape = "long")
        >>> len(engine.execute(text("SELECT * FROM self")).fetchall())
        172
        """
        from sqlalchemy import create_engine

        engine = create_engine("sqlite:///:memory:")
        if remove_prefix and shape == "wide":
            df = self.remove_prefix().to_pandas(lists_as_strings=True)
        else:
            df = self.to_pandas(lists_as_strings=True)

        if shape == "long":
            # Melt the dataframe to convert it to long format
            df = df.melt(var_name="key", value_name="value")
            # Add a row number column for reference
            df.insert(0, "row_number", range(1, len(df) + 1))

            # Split the key into data_type and key
            df["data_type"] = df["key"].apply(
                lambda x: x.split(".")[0] if "." in x else None
            )
            df["key"] = df["key"].apply(
                lambda x: ".".join(x.split(".")[1:]) if "." in x else x
            )

        df.to_sql(
            "self",
            engine,
            index=False,
            if_exists="replace",
        )
        return engine.connect()
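
The `shape="long"` branch above is a plain pandas melt; the same transformation in isolation, using a column name taken from `Results.example()`:

    import pandas as pd

    df = pd.DataFrame({"answer.how_feeling": ["OK", "Great"]})
    long_df = df.melt(var_name="key", value_name="value")
    long_df.insert(0, "row_number", range(1, len(long_df) + 1))
    long_df["data_type"] = long_df["key"].apply(lambda x: x.split(".")[0] if "." in x else None)
    long_df["key"] = long_df["key"].apply(lambda x: ".".join(x.split(".")[1:]) if "." in x else x)
    # long_df columns: row_number, key, value, data_type -- one row per (observation, field)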

    def sql(
        self,
        query: str,
        transpose: Optional[bool] = None,
        transpose_by: Optional[str] = None,
        remove_prefix: bool = True,
        shape: str = "wide",
    ) -> "Dataset":
        """
        Execute SQL queries on the dataset.

        This powerful method allows you to use SQL to query and transform your data,
        combining the expressiveness of SQL with EDSL's data structures. It works by
        creating an in-memory SQLite database from your data and executing the query
        against it.

        Parameters:
            query: SQL query string to execute
            transpose: Whether to transpose the resulting table (rows become columns)
            transpose_by: Column to use as the new index when transposing
            remove_prefix: Whether to remove type prefixes (e.g., "answer.") from column names
            shape: Data shape to use ("wide" or "long")
                - "wide": Default tabular format with columns for each field
                - "long": Melted format with key-value pairs, useful for certain queries

        Returns:
            A Dataset object containing the query results

        Notes:
            - The data is stored in a table named "self" in the SQLite database
            - In wide format, column names include their type prefix unless remove_prefix=True
            - In long format, the data is melted into columns: row_number, key, value, data_type
            - Complex objects like lists and dictionaries are converted to strings

        Examples:
            >>> from edsl import Results
            >>> r = Results.example()

            # Basic selection
            >>> len(r.sql("SELECT * FROM self", shape="wide"))
            4

            # Filtering with WHERE clause
            >>> r.sql("SELECT * FROM self WHERE how_feeling = 'Great'").num_observations()
            1

            # Aggregation
            >>> r.sql("SELECT how_feeling, COUNT(*) as count FROM self GROUP BY how_feeling").keys()
            ['how_feeling', 'count']

            # Using long format
            >>> len(r.sql("SELECT * FROM self", shape="long"))
            172
        """
        import pandas as pd

        conn = self._db(remove_prefix=remove_prefix, shape=shape)
        df = pd.read_sql_query(query, conn)

        # Transpose the DataFrame if requested
        if transpose or transpose_by:
            df = pd.DataFrame(df)
            if transpose_by:
                df = df.set_index(transpose_by)
            else:
                df = df.set_index(df.columns[0])
            df = df.transpose()

        from .dataset import Dataset

        return Dataset.from_pandas_dataframe(df)
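
The transpose options are not exercised by the doctests; a usage sketch (untested here) in which an aggregate is pivoted so each `how_feeling` value becomes a column:

    r = Results.example()
    ds = r.sql(
        "SELECT how_feeling, COUNT(*) AS n FROM self GROUP BY how_feeling",
        transpose_by="how_feeling",
    )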

    def to_pandas(
        self, remove_prefix: bool = False, lists_as_strings=False
    ) -> "DataFrame":
        """Convert the results to a pandas DataFrame, ensuring that lists remain as lists.

        :param remove_prefix: Whether to remove the prefix from the column names.
        """
        return self._to_pandas_strings(remove_prefix)

    def _to_pandas_strings(self, remove_prefix: bool = False) -> "pd.DataFrame":
        """Convert the results to a pandas DataFrame.

        :param remove_prefix: Whether to remove the prefix from the column names.

        >>> from edsl.results import Results
        >>> r = Results.example()
        >>> r.select('how_feeling').to_pandas()
          answer.how_feeling
        0                 OK
        1              Great
        2           Terrible
        3                 OK
        """
        import pandas as pd

        csv_string = self.to_csv(remove_prefix=remove_prefix).text
        csv_buffer = io.StringIO(csv_string)
        df = pd.read_csv(csv_buffer)
        return df

    def to_polars(
        self, remove_prefix: bool = False, lists_as_strings=False
    ) -> "pl.DataFrame":
        """Convert the results to a Polars DataFrame.

        :param remove_prefix: Whether to remove the prefix from the column names.
        """
        return self._to_polars_strings(remove_prefix)

    def _to_polars_strings(self, remove_prefix: bool = False) -> "pl.DataFrame":
        """Convert the results to a Polars DataFrame.

        :param remove_prefix: Whether to remove the prefix from the column names.
        """
        import polars as pl

        csv_string = self.to_csv(remove_prefix=remove_prefix).text
        df = pl.read_csv(io.StringIO(csv_string))
        return df

    def tree(self, node_order: Optional[List[str]] = None) -> "Tree":
        """Convert the results to a Tree.

        :param node_order: The order of the nodes.
        """
        from .dataset_tree import Tree

        return Tree(self, node_order=node_order)

    def to_scenario_list(self, remove_prefix: bool = True) -> list[dict]:
        """Convert the results to a list of dictionaries, one per scenario.

        :param remove_prefix: Whether to remove the prefix from the column names.

        >>> from edsl.results import Results
        >>> r = Results.example()
        >>> r.select('how_feeling').to_scenario_list()
        ScenarioList([Scenario({'how_feeling': 'OK'}), Scenario({'how_feeling': 'Great'}), Scenario({'how_feeling': 'Terrible'}), Scenario({'how_feeling': 'OK'})])
        """
        from edsl.scenarios import ScenarioList, Scenario

        list_of_dicts = self.to_dicts(remove_prefix=remove_prefix)
        scenarios = []
        for d in list_of_dicts:
            scenarios.append(Scenario(d))
        return ScenarioList(scenarios)

    def to_agent_list(self, remove_prefix: bool = True):
        """Convert the results to a list of dictionaries, one per agent.

        :param remove_prefix: Whether to remove the prefix from the column names.

        >>> from edsl.results import Results
        >>> r = Results.example()
        >>> r.select('how_feeling').to_agent_list()
        AgentList([Agent(traits = {'how_feeling': 'OK'}), Agent(traits = {'how_feeling': 'Great'}), Agent(traits = {'how_feeling': 'Terrible'}), Agent(traits = {'how_feeling': 'OK'})])
        """
        from edsl.agents import Agent, AgentList

        list_of_dicts = self.to_dicts(remove_prefix=remove_prefix)
        agents = []
        for d in list_of_dicts:
            # The branches are mutually exclusive so that each dictionary yields
            # exactly one agent (a dict with a "name" key is not appended twice).
            if "name" in d:
                d["agent_name"] = d.pop("name")
                agents.append(Agent(d, name=d["agent_name"]))
            elif "agent_parameters" in d:
                agent_parameters = d.pop("agent_parameters")
                agent_name = agent_parameters.get("name", None)
                instruction = agent_parameters.get("instruction", None)
                agents.append(Agent(d, name=agent_name, instruction=instruction))
            else:
                agents.append(Agent(d))
        return AgentList(agents)

    def to_dicts(self, remove_prefix: bool = True) -> list[dict]:
        """Convert the results to a list of dictionaries.

        :param remove_prefix: Whether to remove the prefix from the column names.

        >>> from edsl.results import Results
        >>> r = Results.example()
        >>> r.select('how_feeling').to_dicts()
        [{'how_feeling': 'OK'}, {'how_feeling': 'Great'}, {'how_feeling': 'Terrible'}, {'how_feeling': 'OK'}]
        """
        list_of_keys = []
        list_of_values = []
        for entry in self:
            key, values = list(entry.items())[0]
            list_of_keys.append(key)
            list_of_values.append(values)

        if remove_prefix:
            list_of_keys = [key.split(".")[-1] for key in list_of_keys]

        list_of_dicts = []
        for entries in zip(*list_of_values):
            list_of_dicts.append(dict(zip(list_of_keys, entries)))

        return list_of_dicts

    def to_list(self, flatten=False, remove_none=False, unzipped=False) -> list[list]:
        """Convert the results to a list of lists.

        :param flatten: Whether to flatten the list of lists.
        :param remove_none: Whether to remove None values from the list.

        >>> from edsl.results import Results
        >>> Results.example().select('how_feeling', 'how_feeling_yesterday')
        Dataset([{'answer.how_feeling': ['OK', 'Great', 'Terrible', 'OK']}, {'answer.how_feeling_yesterday': ['Great', 'Good', 'OK', 'Terrible']}])

        >>> Results.example().select('how_feeling', 'how_feeling_yesterday').to_list()
        [('OK', 'Great'), ('Great', 'Good'), ('Terrible', 'OK'), ('OK', 'Terrible')]

        >>> r = Results.example()
        >>> r.select('how_feeling').to_list()
        ['OK', 'Great', 'Terrible', 'OK']

        >>> from edsl.dataset import Dataset
        >>> Dataset([{'a.b': [[1, 9], 2, 3, 4]}]).select('a.b').to_list(flatten = True)
        [1, 9, 2, 3, 4]

        >>> Dataset([{'a.b': [[1, 9], 2, 3, 4]}, {'c': [6, 2, 3, 4]}]).select('a.b', 'c').to_list(flatten = True)
        Traceback (most recent call last):
        ...
        ValueError: Cannot flatten a list of lists when there are multiple columns selected.
        """
        if len(self.relevant_columns()) > 1 and flatten:
            raise ValueError(
                "Cannot flatten a list of lists when there are multiple columns selected."
            )

        if len(self.relevant_columns()) == 1:
            # Only one 'column' is selected, which is typical for this method.
            list_to_return = list(self[0].values())[0]
        else:
            keys = self.relevant_columns()
            data = self.to_dicts(remove_prefix=False)
            list_to_return = []
            for d in data:
                list_to_return.append(tuple([d[key] for key in keys]))

        if remove_none:
            list_to_return = [item for item in list_to_return if item is not None]

        if flatten:
            new_list = []
            for item in list_to_return:
                if isinstance(item, list):
                    new_list.extend(item)
                else:
                    new_list.append(item)
            list_to_return = new_list

        return list_to_return

    def html(
        self,
        filename: Optional[str] = None,
        cta: str = "Open in browser",
        return_link: bool = False,
    ):
        """Write the results to an HTML file and display or open a link to it."""
        import os
        import tempfile
        import webbrowser
        from edsl.utilities.utilities import is_notebook
        from IPython.display import HTML, display

        df = self.to_pandas()

        if filename is None:
            current_directory = os.getcwd()
            filename = tempfile.NamedTemporaryFile(
                "w", delete=False, suffix=".html", dir=current_directory
            ).name

        with open(filename, "w") as f:
            f.write(df.to_html())

        if is_notebook():
            html_url = f"/files/{filename}"
            html_link = f'<a href="{html_url}" target="_blank">{cta}</a>'
            display(HTML(html_link))
        else:
            print(f"Saved to {filename}")
            webbrowser.open(f"file://{os.path.abspath(filename)}")

        if return_link:
            return filename

    def _prepare_report_data(
        self,
        *fields: Optional[str],
        top_n: Optional[int] = None,
        header_fields: Optional[List[str]] = None,
    ) -> tuple:
        """Prepares data for report generation in various formats.

        Args:
            *fields: The fields to include in the report. If none provided, all fields are used.
            top_n: Optional limit on the number of observations to include.
            header_fields: Optional list of fields to include in the main header instead of as sections.

        Returns:
            A tuple containing (field_data, num_obs, fields, header_fields)
        """
        # If no fields specified, use all columns
        if not fields:
            fields = self.relevant_columns()

        # Initialize header_fields if not provided
        if header_fields is None:
            header_fields = []

        # Validate all fields
        all_fields = list(fields) + [f for f in header_fields if f not in fields]
        for field in all_fields:
            if field not in self.relevant_columns():
                raise ValueError(f"Field '{field}' not found in dataset")

        # Get data for each field
        field_data = {}
        for field in all_fields:
            for entry in self:
                if field in entry:
                    field_data[field] = entry[field]
                    break

        # Number of observations to process
        num_obs = self.num_observations()
        if top_n is not None:
            num_obs = min(num_obs, top_n)

        return field_data, num_obs, fields, header_fields

    def _report_markdown(
        self, field_data, num_obs, fields, header_fields, divider: bool = True
    ) -> str:
        """Generates a markdown report from the prepared data.

        Args:
            field_data: Dictionary mapping field names to their values
            num_obs: Number of observations to include
            fields: Fields to include as sections
            header_fields: Fields to include in the observation header
            divider: If True, adds a horizontal rule between observations

        Returns:
            A string containing the markdown report
        """
        report_lines = []
        for i in range(num_obs):
            # Create header with observation number and any header fields
            header = f"# Observation: {i+1}"
            if header_fields:
                header_parts = []
                for field in header_fields:
                    value = field_data[field][i]
                    # Get the field name without prefix for cleaner display
                    display_name = field.split('.')[-1] if '.' in field else field
                    # Format with backticks for monospace
                    header_parts.append(f"`{display_name}`: {value}")
                if header_parts:
                    header += f" ({', '.join(header_parts)})"
            report_lines.append(header)

            # Add the remaining fields
            for field in fields:
                if field not in header_fields:
                    report_lines.append(f"## {field}")
                    value = field_data[field][i]
                    if isinstance(value, list) or isinstance(value, dict):
                        import json
                        report_lines.append(f"```\n{json.dumps(value, indent=2)}\n```")
                    else:
                        report_lines.append(str(value))

            # Add divider between observations if requested
            if divider and i < num_obs - 1:
                report_lines.append("\n---\n")
            else:
                report_lines.append("")  # Empty line between observations

        return "\n".join(report_lines)
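
Tracing the logic above over `Results.example()` with `header_fields=['answer.how_feeling']`, one observation would render roughly as the following markdown (illustrative; it assumes only `answer.how_feeling_yesterday` remains as a section):

    # Observation: 1 (`how_feeling`: OK)
    ## answer.how_feeling_yesterday
    Great

    ---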

    def _report_docx(self, field_data, num_obs, fields, header_fields) -> "Document":
        """Generates a Word document report from the prepared data.

        Args:
            field_data: Dictionary mapping field names to their values
            num_obs: Number of observations to include
            fields: Fields to include as sections
            header_fields: Fields to include in the observation header

        Returns:
            A docx.Document object containing the report
        """
        try:
            from docx import Document
            from docx.shared import Pt
            import json
        except ImportError:
            raise ImportError(
                "The python-docx package is required for DOCX export. "
                "Install it with 'pip install python-docx'."
            )

        doc = Document()

        for i in range(num_obs):
            # Create header with observation number and any header fields
            header_text = f"Observation: {i+1}"
            if header_fields:
                header_parts = []
                for field in header_fields:
                    value = field_data[field][i]
                    # Get the field name without prefix for cleaner display
                    display_name = field.split('.')[-1] if '.' in field else field
                    header_parts.append(f"{display_name}: {value}")
                if header_parts:
                    header_text += f" ({', '.join(header_parts)})"

            doc.add_heading(header_text, level=1)

            # Add the remaining fields
            for field in fields:
                if field not in header_fields:
                    doc.add_heading(field, level=2)
                    value = field_data[field][i]

                    if isinstance(value, (list, dict)):
                        # Format structured data with indentation
                        formatted_value = json.dumps(value, indent=2)
                        p = doc.add_paragraph()
                        p.add_run(formatted_value).font.name = 'Courier New'
                        p.add_run().font.size = Pt(10)
                    else:
                        doc.add_paragraph(str(value))

            # Add page break between observations except for the last one
            if i < num_obs - 1:
                doc.add_page_break()

        return doc

    def report(
        self,
        *fields: Optional[str],
        top_n: Optional[int] = None,
        header_fields: Optional[List[str]] = None,
        divider: bool = True,
        return_string: bool = False,
        format: str = "markdown",
        filename: Optional[str] = None,
    ) -> Optional[Union[str, "docx.Document"]]:
        """Generates a report of the results by iterating through rows.

        Args:
            *fields: The fields to include in the report. If none provided, all fields are used.
            top_n: Optional limit on the number of observations to include.
            header_fields: Optional list of fields to include in the main header instead of as sections.
            divider: If True, adds a horizontal rule between observations (markdown only).
            return_string: If True, returns the markdown string. If False (default in notebooks),
                only displays the markdown without returning.
            format: Output format - either "markdown" or "docx".
            filename: If provided and format is "docx", saves the document to this file.

        Returns:
            Depending on format and return_string:
            - For markdown: A string if return_string is True, otherwise None (displays in notebook)
            - For docx: A docx.Document object, or None if filename is provided (saves to file)

        Examples:
            >>> from edsl.results import Results
            >>> r = Results.example()
            >>> report = r.select('how_feeling').report(return_string=True)
            >>> "# Observation: 1" in report
            True
            >>> doc = r.select('how_feeling').report(format="docx")
            >>> isinstance(doc, object)
            True
        """
        from edsl.utilities.utilities import is_notebook

        # Prepare the data for the report
        field_data, num_obs, fields, header_fields = self._prepare_report_data(
            *fields, top_n=top_n, header_fields=header_fields
        )

        # Generate the report in the requested format
        if format.lower() == "markdown":
            report_text = self._report_markdown(
                field_data, num_obs, fields, header_fields, divider
            )

            # In notebooks, display as markdown
            is_nb = is_notebook()
            if is_nb and not return_string:
                from IPython.display import Markdown, display
                display(Markdown(report_text))
                return None

            # Return the string if requested or if not in a notebook
            return report_text

        elif format.lower() == "docx":
            doc = self._report_docx(field_data, num_obs, fields, header_fields)

            # Save to file if filename is provided
            if filename:
                doc.save(filename)
                print(f"Report saved to {filename}")
                return None

            return doc

        else:
            raise ValueError(f"Unsupported format: {format}. Use 'markdown' or 'docx'.")

    def tally(
        self, *fields: Optional[str], top_n: Optional[int] = None, output="Dataset"
    ) -> Union[dict, "Dataset"]:
        """
        Count frequency distributions of values in specified fields.

        This method tallies the occurrence of unique values within one or more fields,
        similar to a GROUP BY and COUNT in SQL. When multiple fields are provided, it
        performs cross-tabulation across those fields.

        Parameters:
            *fields: Field names to tally. If none provided, uses all available fields.
            top_n: Optional limit to return only the top N most frequent values.
            output: Format for results, either "Dataset" (recommended) or "dict".

        Returns:
            By default, returns a Dataset with columns for the field(s) and a 'count' column.
            If output="dict", returns a dictionary mapping values to counts.

        Notes:
            - For single fields, returns counts of each unique value
            - For multiple fields, returns counts of each unique combination of values
            - Results are sorted in descending order by count
            - Fields can be specified with or without their type prefix

        Examples:
            >>> from edsl import Results
            >>> r = Results.example()

            # Single field frequency count
            >>> r.select('how_feeling').tally('answer.how_feeling', output="dict")
            {'OK': 2, 'Great': 1, 'Terrible': 1}

            # Return as Dataset (default)
            >>> from edsl.dataset import Dataset
            >>> expected = Dataset([{'answer.how_feeling': ['OK', 'Great', 'Terrible']}, {'count': [2, 1, 1]}])
            >>> r.select('how_feeling').tally('answer.how_feeling', output="Dataset") == expected
            True

            # Multi-field cross-tabulation - exact output varies based on data
            >>> result = r.tally('how_feeling', 'how_feeling_yesterday')
            >>> 'how_feeling' in result.keys() and 'how_feeling_yesterday' in result.keys() and 'count' in result.keys()
            True
        """
        from collections import Counter

        if len(fields) == 0:
            fields = self.relevant_columns()

        relevant_columns_without_prefix = [
            column.split(".")[-1] for column in self.relevant_columns()
        ]

        if not all(
            f in self.relevant_columns() or f in relevant_columns_without_prefix
            for f in fields
        ):
            raise ValueError(
                "One or more specified fields are not in the dataset. "
                f"The available fields are: {self.relevant_columns()}"
            )

        if len(fields) == 1:
            field = fields[0]
            values = self._key_to_value(field)
        else:
            values = list(zip(*(self._key_to_value(field) for field in fields)))

        # Convert unhashable list values to tuples so Counter can hash them.
        values = [tuple(v) if isinstance(v, list) else v for v in values]
        try:
            tally = dict(Counter(values))
        except TypeError:
            tally = dict(Counter([str(v) for v in values]))
        except Exception as e:
            raise ValueError(f"Error tallying values: {e}")

        sorted_tally = dict(sorted(tally.items(), key=lambda item: -item[1]))
        if top_n is not None:
            sorted_tally = dict(list(sorted_tally.items())[:top_n])

        from ..dataset import Dataset

        if output == "dict":
            warnings.warn(
                textwrap.dedent(
                    """\
                The default output from tally will change to Dataset in the future.
                Use output='Dataset' to get the Dataset object for now.
                """
                )
            )
            return sorted_tally
        elif output == "Dataset":
            dataset = Dataset(
                [
                    {"value": list(sorted_tally.keys())},
                    {"count": list(sorted_tally.values())},
                ]
            )
            sl = dataset.to_scenario_list().unpack(
                "value",
                new_names=[fields] if isinstance(fields, str) else fields,
                keep_original=False,
            )
            keys = list(sl[0].keys())
            keys.remove("count")
            keys.append("count")
            return sl.reorder_keys(keys).to_dataset()
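
At its core, the multi-field branch of `tally` is a `Counter` over zipped columns; standalone, with the example data that appears in the doctests elsewhere in this module:

    from collections import Counter

    feelings = ["OK", "Great", "Terrible", "OK"]
    yesterday = ["Great", "Good", "OK", "Terrible"]
    counts = Counter(zip(feelings, yesterday))
    # Counter({('OK', 'Great'): 1, ('Great', 'Good'): 1, ('Terrible', 'OK'): 1, ('OK', 'Terrible'): 1})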

    def flatten(self, field: str, keep_original: bool = False) -> "Dataset":
        """
        Expand a field containing dictionaries into separate fields.

        This method takes a field that contains a list of dictionaries and expands
        it into multiple fields, one for each key in the dictionaries. This is useful
        when working with nested data structures or results from extraction operations.

        Parameters:
            field: The field containing dictionaries to flatten
            keep_original: Whether to retain the original field in the result

        Returns:
            A new Dataset with the dictionary keys expanded into separate fields

        Notes:
            - Each key in the dictionaries becomes a new field with name pattern "{field}.{key}"
            - All dictionaries in the field must have compatible structures
            - If a dictionary is missing a key, the corresponding value will be None
            - Non-dictionary values in the field will cause a warning

        Examples:
            >>> from edsl.dataset import Dataset

            # Basic flattening of nested dictionaries
            >>> Dataset([{'a': [{'a': 1, 'b': 2}]}, {'c': [5]}]).flatten('a')
            Dataset([{'c': [5]}, {'a.a': [1]}, {'a.b': [2]}])

            # Works with prefixed fields too
            >>> Dataset([{'answer.example': [{'a': 1, 'b': 2}]}, {'c': [5]}]).flatten('answer.example')
            Dataset([{'c': [5]}, {'answer.example.a': [1]}, {'answer.example.b': [2]}])

            # Keep the original field if needed
            >>> d = Dataset([{'a': [{'a': 1, 'b': 2}]}, {'c': [5]}])
            >>> d.flatten('a', keep_original=True)
            Dataset([{'a': [{'a': 1, 'b': 2}]}, {'c': [5]}, {'a.a': [1]}, {'a.b': [2]}])
        """
        from ..dataset import Dataset

        # Ensure the dataset isn't empty
        if not self.data:
            return self.copy()

        # Find all columns that contain the field
        matching_entries = []
        for entry in self.data:
            col_name = next(iter(entry.keys()))
            if field == col_name or (
                '.' in col_name
                and (col_name.endswith('.' + field) or col_name.startswith(field + '.'))
            ):
                matching_entries.append(entry)

        # Check if the field is ambiguous
        if len(matching_entries) > 1:
            matching_cols = [next(iter(entry.keys())) for entry in matching_entries]
            raise ValueError(
                f"Ambiguous field name '{field}'. It matches multiple columns: {matching_cols}. "
                f"Please specify the full column name to flatten."
            )

        # Get the number of observations
        num_observations = self.num_observations()

        # Find the column to flatten
        field_entry = None
        for entry in self.data:
            if field in entry:
                field_entry = entry
                break

        if field_entry is None:
            warnings.warn(
                f"Field '{field}' not found in dataset, returning original dataset"
            )
            return self.copy()

        # Create a new list of column entries for the flattened data
        flattened_data = []

        # Copy all existing columns except the one we're flattening (if keep_original is False)
        for entry in self.data:
            col_name = next(iter(entry.keys()))
            if col_name != field or keep_original:
                flattened_data.append(entry.copy())

        # Get field data and make sure it's valid
        field_values = field_entry[field]
        if not all(isinstance(item, dict) for item in field_values if item is not None):
            warnings.warn(
                f"Field '{field}' contains non-dictionary values that cannot be flattened"
            )
            return self.copy()

        # Collect all unique keys across all dictionaries
        all_keys = set()
        for item in field_values:
            if isinstance(item, dict):
                all_keys.update(item.keys())

        # Create new columns for each key
        for key in sorted(all_keys):  # Sort for consistent output
            new_values = []
            for i in range(num_observations):
                value = None
                if i < len(field_values) and isinstance(field_values[i], dict):
                    value = field_values[i].get(key, None)
                new_values.append(value)

            # Add this as a new column
            flattened_data.append({f"{field}.{key}": new_values})

        # Return a new Dataset with the flattened data
        return Dataset(flattened_data)
1117
+
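+     # Editor's note: an illustrative case (hypothetical data, not part of the
+     # released source) of the missing-key behavior noted in the docstring:
+     #
+     #     >>> Dataset([{'a': [{'x': 1}, {'y': 2}]}]).flatten('a')
+     #     Dataset([{'a.x': [1, None]}, {'a.y': [None, 2]}])
+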
+     def unpack_list(
+         self,
+         field: str,
+         new_names: Optional[List[str]] = None,
+         keep_original: bool = True,
+     ) -> "Dataset":
+         """Unpack list columns into separate columns with provided names or numeric suffixes.
+
+         For example, if a dataset contains:
+             [{'data': [[1, 2, 3], [4, 5, 6]], 'other': ['x', 'y']}]
+
+         After d.unpack_list('data'), it becomes:
+             [{'other': ['x', 'y'], 'data_1': [1, 4], 'data_2': [2, 5], 'data_3': [3, 6]}]
+
+         Args:
+             field: The field containing lists to unpack
+             new_names: Optional list of names for the unpacked fields. If None, uses numeric suffixes.
+             keep_original: If True, keeps the original field in the dataset
+
+         Returns:
+             A new Dataset with unpacked columns
+
+         Examples:
+             >>> from edsl.dataset import Dataset
+             >>> d = Dataset([{'data': [[1, 2, 3], [4, 5, 6]]}])
+             >>> d.unpack_list('data')
+             Dataset([{'data': [[1, 2, 3], [4, 5, 6]]}, {'data_1': [1, 4]}, {'data_2': [2, 5]}, {'data_3': [3, 6]}])
+
+             >>> d.unpack_list('data', new_names=['first', 'second', 'third'])
+             Dataset([{'data': [[1, 2, 3], [4, 5, 6]]}, {'first': [1, 4]}, {'second': [2, 5]}, {'third': [3, 6]}])
+         """
+         from .dataset import Dataset
+
+         # Create a copy of the dataset
+         result = Dataset(self.data.copy())
+
+         # Find the field in the dataset
+         field_index = None
+         for i, entry in enumerate(result.data):
+             if field in entry:
+                 field_index = i
+                 break
+
+         if field_index is None:
+             raise ValueError(f"Field '{field}' not found in dataset")
+
+         field_data = result.data[field_index][field]
+
+         # Check that every entry in the field is a list
+         if not all(isinstance(v, list) for v in field_data):
+             raise ValueError(f"Field '{field}' does not contain lists in all entries")
+
+         # Get the maximum length of the lists
+         max_len = max(len(v) for v in field_data)
+
+         # Create a new field for each index
+         for i in range(max_len):
+             if new_names and i < len(new_names):
+                 new_field = new_names[i]
+             else:
+                 new_field = f"{field}_{i+1}"
+
+             # Extract the i-th element from each list, padding short lists with None
+             new_values = []
+             for item in field_data:
+                 new_values.append(item[i] if i < len(item) else None)
+
+             result.data.append({new_field: new_values})
+
+         # Remove the original field if keep_original is False
+         if not keep_original:
+             result.data.pop(field_index)
+
+         return result
+
+     def drop(self, field_name):
+         """
+         Returns a new Dataset with the specified field removed.
+
+         Args:
+             field_name (str): The name of the field to remove.
+
+         Returns:
+             Dataset: A new Dataset instance without the specified field.
+
+         Raises:
+             KeyError: If the field_name doesn't exist in the dataset.
+
+         Examples:
+             >>> from edsl.dataset import Dataset
+             >>> d = Dataset([{'a': [1, 2, 3]}, {'b': [4, 5, 6]}])
+             >>> d.drop('a')
+             Dataset([{'b': [4, 5, 6]}])
+
+             >>> d.drop('c')
+             Traceback (most recent call last):
+             ...
+             KeyError: "Field 'c' not found in dataset"
+         """
+         from .dataset import Dataset
+
+         # Check that the field exists in the dataset
+         if field_name not in self.relevant_columns():
+             raise KeyError(f"Field '{field_name}' not found in dataset")
+
+         # Create a new dataset without the specified field
+         new_data = [entry for entry in self.data if field_name not in entry]
+         return Dataset(new_data)
+
+     def remove_prefix(self):
+         """Returns a new Dataset with the prefix removed from all column names.
+
+         The prefix is defined as everything before the first dot (.) in the column name.
+         If removing prefixes would result in duplicate column names, an exception is raised.
+
+         Returns:
+             Dataset: A new Dataset with prefixes removed from column names
+
+         Raises:
+             ValueError: If removing prefixes would result in duplicate column names
+
+         Examples:
+             >>> from edsl.results import Results
+             >>> r = Results.example()
+             >>> r.select('how_feeling', 'how_feeling_yesterday').relevant_columns()
+             ['answer.how_feeling', 'answer.how_feeling_yesterday']
+             >>> r.select('how_feeling', 'how_feeling_yesterday').remove_prefix().relevant_columns()
+             ['how_feeling', 'how_feeling_yesterday']
+
+             >>> from edsl.dataset import Dataset
+             >>> d = Dataset([{'a.x': [1, 2, 3]}, {'b.x': [4, 5, 6]}])
+             >>> d.remove_prefix()
+             Traceback (most recent call last):
+             ...
+             ValueError: Removing prefixes would result in duplicate column names: ['x']
+         """
+         from .dataset import Dataset
+
+         # Get all column names
+         columns = self.relevant_columns()
+
+         # Extract the unprefixed names, tracking any duplicates
+         unprefixed = {}
+         duplicates = set()
+
+         for col in columns:
+             if '.' in col:
+                 unprefixed_name = col.split('.', 1)[1]
+                 if unprefixed_name in unprefixed:
+                     duplicates.add(unprefixed_name)
+                 unprefixed[unprefixed_name] = col
+             else:
+                 # Columns without a prefix are kept as-is
+                 unprefixed[col] = col
+
+         # Refuse to proceed if stripping prefixes would create duplicates
+         if duplicates:
+             raise ValueError(
+                 f"Removing prefixes would result in duplicate column names: {sorted(list(duplicates))}"
+             )
+
+         # Create a new dataset with unprefixed column names
+         new_data = []
+         for entry in self.data:
+             key, values = list(entry.items())[0]
+             if '.' in key:
+                 new_key = key.split('.', 1)[1]
+             else:
+                 new_key = key
+             new_data.append({new_key: values})
+
+         return Dataset(new_data)
1289
+
1290
+
1291
+ from functools import wraps
1292
+
1293
+ def to_dataset(func):
1294
+ """
1295
+ Decorator that ensures functions receive a Dataset object as their first argument.
1296
+
1297
+ This decorator automatically converts various EDSL container objects (Results,
1298
+ AgentList, ScenarioList) to Dataset objects before passing them to the decorated
1299
+ function. This allows methods defined in DataOperationsBase to work seamlessly
1300
+ across different container types without duplicating conversion logic.
1301
+
1302
+ Parameters:
1303
+ func: The function to decorate
1304
+
1305
+ Returns:
1306
+ A wrapped function that ensures its first argument is a Dataset
1307
+
1308
+ Notes:
1309
+ - For Results objects, calls select() to convert to a Dataset
1310
+ - For AgentList and ScenarioList objects, calls their to_dataset() method
1311
+ - For Dataset objects, passes them through unchanged
1312
+ - This decorator is used internally by the mixin system to enable method sharing
1313
+ """
1314
+ @wraps(func)
1315
+ def wrapper(self, *args, **kwargs):
1316
+ """Execute the function with self converted to a Dataset if needed."""
1317
+ # Convert to Dataset based on the class type
1318
+ if self.__class__.__name__ == "Results":
1319
+ dataset_self = self.select()
1320
+ elif self.__class__.__name__ == "AgentList":
1321
+ dataset_self = self.to_dataset()
1322
+ elif self.__class__.__name__ == "ScenarioList":
1323
+ dataset_self = self.to_dataset()
1324
+ else:
1325
+ dataset_self = self
1326
+
1327
+ # Call the function with the converted self
1328
+ return func(dataset_self, *args, **kwargs)
1329
+
1330
+ # Mark the wrapper as being wrapped by to_dataset
1331
+ wrapper._is_wrapped = True
1332
+ return wrapper
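+
+ # Editor's note: a minimal sketch (hypothetical function, not part of the
+ # released source) of applying to_dataset by hand:
+ #
+ #     @to_dataset
+ #     def first_column(dataset):
+ #         # `dataset` is guaranteed to be a Dataset here, even if the caller
+ #         # invoked first_column on a Results, AgentList, or ScenarioList.
+ #         return dataset.data[0]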
+
+
+ def decorate_methods_from_mixin(cls, mixin_cls):
+     """
+     Apply the to_dataset decorator to methods inherited from a mixin class.
+
+     This function is part of EDSL's method inheritance system. It takes methods
+     from a source mixin class, applies the to_dataset decorator to them, and adds
+     them to a target class. This enables the sharing of data manipulation methods
+     across different container types while ensuring they receive the right data type.
+
+     The function is careful not to override methods that are already defined in
+     more specific parent classes, preserving the method resolution order (MRO).
+
+     Parameters:
+         cls: The target class to add decorated methods to
+         mixin_cls: The source mixin class providing the methods
+
+     Returns:
+         The modified target class with decorated methods added
+
+     Notes:
+         - Only public methods (not starting with "_") are decorated and added
+         - Methods already defined in more specific parent classes are not overridden
+         - Methods from DataOperationsBase are not skipped, ensuring all base methods are available
+     """
+     # Iterate over all attributes, including inherited ones
+     for attr_name in dir(mixin_cls):
+         # Skip magic methods and private methods
+         if not attr_name.startswith('_'):
+             attr_value = getattr(mixin_cls, attr_name)
+             if callable(attr_value):
+                 # Check whether the method is already defined in the class's MRO,
+                 # ignoring DataOperationsBase itself
+                 for base in cls.__mro__[1:]:  # Skip the class itself
+                     if (attr_name in base.__dict__ and
+                             base is not DataOperationsBase):
+                         # Method is overridden in a more specific class; skip decorating
+                         break
+                 else:
+                     # The for-else branch runs only when no break occurred:
+                     # the method is not overridden, so it is safe to decorate
+                     setattr(cls, attr_name, to_dataset(attr_value))
+     return cls
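+
+ # Editor's note: a minimal sketch (hypothetical target class, not part of the
+ # released source) of what decorate_methods_from_mixin does:
+ #
+ #     class MyContainer(DataOperationsBase):
+ #         pass
+ #
+ #     decorate_methods_from_mixin(MyContainer, DatasetOperationsMixin)
+ #     # Every public method of DatasetOperationsMixin is now attached to
+ #     # MyContainer, wrapped so `self` is first converted to a Dataset.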
+
+
+ class DatasetOperationsMixin(DataOperationsBase):
+     """
+     Mixin providing data manipulation operations for Dataset objects.
+
+     This mixin class is the cornerstone of EDSL's data manipulation system. It directly
+     inherits methods from DataOperationsBase without requiring conversion, as it is
+     designed specifically for the Dataset class. It serves as the primary implementation
+     of all data operations methods that other container types inherit and adapt
+     through the to_dataset decorator.
+
+     The design follows a standard mixin pattern where common functionality is defined
+     in a standalone class that can be "mixed in" to other classes. In EDSL's case,
+     this allows different container types (Results, AgentList, ScenarioList) to share
+     the same powerful data manipulation interface.
+
+     Key features:
+
+     1. Data Transformation:
+        - Filtering with `filter()`
+        - Creating new columns with `mutate()`
+        - Reshaping with `long()`, `wide()`, `flatten()`, etc.
+        - Selecting specific data with `select()`
+
+     2. Visualization:
+        - Table display with `table()`
+        - R integration with `ggplot2()`
+        - Report generation with `report()`
+
+     3. Data Export:
+        - To files with `to_csv()`, `to_excel()`, etc.
+        - To other formats with `to_pandas()`, `to_dicts()`, etc.
+
+     4. Analysis:
+        - SQL queries with `sql()`
+        - Aggregation with `tally()`
+        - Tree-based exploration with `tree()`
+
+     This mixin is designed for fluent method chaining, allowing complex data
+     manipulation pipelines to be built in an expressive and readable way; see
+     the sketch that follows this class.
+     """
+     pass
+
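+ # Editor's note: a sketch of the fluent chaining mentioned above. The method
+ # names are part of the documented interface, but the exact expressions and
+ # field names here are illustrative only:
+ #
+ #     (
+ #         results
+ #         .filter("answer.how_feeling == 'Great'")
+ #         .select("how_feeling", "how_feeling_yesterday")
+ #         .tally("how_feeling_yesterday", output="Dataset")
+ #     )
+
+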
+ class ResultsOperationsMixin(DataOperationsBase):
+     """
+     Mixin providing data operations for Results objects.
+
+     This mixin adapts DatasetOperationsMixin methods to work with Results objects.
+     When a method is called on a Results object, it is automatically converted to
+     a Dataset first via the to_dataset decorator applied in __init_subclass__.
+
+     This allows Results objects to have the same data manipulation capabilities
+     as Dataset objects without duplicating code.
+     """
+     def __init_subclass__(cls, **kwargs):
+         """
+         Automatically decorate all methods from DatasetOperationsMixin.
+
+         This hook runs when a class inherits from ResultsOperationsMixin,
+         applying the to_dataset decorator to all methods from DatasetOperationsMixin.
+         """
+         super().__init_subclass__(**kwargs)
+         decorate_methods_from_mixin(cls, DatasetOperationsMixin)
+
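+ # Editor's note: the __init_subclass__ hook above fires at class-creation
+ # time. A minimal sketch (hypothetical subclass, not part of the released
+ # source):
+ #
+ #     class Results(ResultsOperationsMixin):
+ #         ...
+ #
+ #     # Defining the subclass triggers __init_subclass__, which attaches all
+ #     # public DatasetOperationsMixin methods, each wrapped by to_dataset.
+
+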
+ class ScenarioListOperationsMixin(DataOperationsBase):
+     """
+     Mixin providing data operations for ScenarioList objects.
+
+     This mixin adapts DatasetOperationsMixin methods to work with ScenarioList objects.
+     ScenarioList objects are converted to Dataset objects before method execution
+     via the to_dataset decorator applied in __init_subclass__.
+     """
+     def __init_subclass__(cls, **kwargs):
+         """
+         Automatically decorate all methods from DatasetOperationsMixin.
+
+         This hook runs when a class inherits from ScenarioListOperationsMixin,
+         applying the to_dataset decorator to all methods from DatasetOperationsMixin.
+         """
+         super().__init_subclass__(**kwargs)
+         decorate_methods_from_mixin(cls, DatasetOperationsMixin)
+
+
+ class AgentListOperationsMixin(DataOperationsBase):
+     """
+     Mixin providing data operations for AgentList objects.
+
+     This mixin adapts DatasetOperationsMixin methods to work with AgentList objects.
+     AgentList objects are converted to Dataset objects before method execution
+     via the to_dataset decorator applied in __init_subclass__.
+     """
+     def __init_subclass__(cls, **kwargs):
+         """
+         Automatically decorate all methods from DatasetOperationsMixin.
+
+         This hook runs when a class inherits from AgentListOperationsMixin,
+         applying the to_dataset decorator to all methods from DatasetOperationsMixin.
+         """
+         super().__init_subclass__(**kwargs)
+         decorate_methods_from_mixin(cls, DatasetOperationsMixin)
+
+
+
+ if __name__ == "__main__":
+     import doctest
+
+     doctest.testmod(optionflags=doctest.ELLIPSIS)