edsl 0.1.47__py3-none-any.whl → 0.1.49__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (314) hide show
  1. edsl/__init__.py +44 -39
  2. edsl/__version__.py +1 -1
  3. edsl/agents/__init__.py +4 -2
  4. edsl/agents/{Agent.py → agent.py} +442 -152
  5. edsl/agents/{AgentList.py → agent_list.py} +220 -162
  6. edsl/agents/descriptors.py +46 -7
  7. edsl/{exceptions/agents.py → agents/exceptions.py} +3 -12
  8. edsl/base/__init__.py +75 -0
  9. edsl/base/base_class.py +1303 -0
  10. edsl/base/data_transfer_models.py +114 -0
  11. edsl/base/enums.py +215 -0
  12. edsl/base.py +8 -0
  13. edsl/buckets/__init__.py +25 -0
  14. edsl/buckets/bucket_collection.py +324 -0
  15. edsl/buckets/model_buckets.py +206 -0
  16. edsl/buckets/token_bucket.py +502 -0
  17. edsl/{jobs/buckets/TokenBucketAPI.py → buckets/token_bucket_api.py} +1 -1
  18. edsl/buckets/token_bucket_client.py +509 -0
  19. edsl/caching/__init__.py +20 -0
  20. edsl/caching/cache.py +814 -0
  21. edsl/caching/cache_entry.py +427 -0
  22. edsl/{data/CacheHandler.py → caching/cache_handler.py} +14 -15
  23. edsl/caching/exceptions.py +24 -0
  24. edsl/caching/orm.py +30 -0
  25. edsl/{data/RemoteCacheSync.py → caching/remote_cache_sync.py} +3 -3
  26. edsl/caching/sql_dict.py +441 -0
  27. edsl/config/__init__.py +8 -0
  28. edsl/config/config_class.py +177 -0
  29. edsl/config.py +4 -176
  30. edsl/conversation/Conversation.py +7 -7
  31. edsl/conversation/car_buying.py +4 -4
  32. edsl/conversation/chips.py +6 -6
  33. edsl/coop/__init__.py +25 -2
  34. edsl/coop/coop.py +311 -75
  35. edsl/coop/{ExpectedParrotKeyHandler.py → ep_key_handling.py} +86 -10
  36. edsl/coop/exceptions.py +62 -0
  37. edsl/coop/price_fetcher.py +126 -0
  38. edsl/coop/utils.py +89 -24
  39. edsl/data_transfer_models.py +5 -72
  40. edsl/dataset/__init__.py +10 -0
  41. edsl/{results/Dataset.py → dataset/dataset.py} +116 -36
  42. edsl/{results/DatasetExportMixin.py → dataset/dataset_operations_mixin.py} +606 -122
  43. edsl/{results/DatasetTree.py → dataset/dataset_tree.py} +156 -75
  44. edsl/{results/TableDisplay.py → dataset/display/table_display.py} +18 -7
  45. edsl/{results → dataset/display}/table_renderers.py +58 -2
  46. edsl/{results → dataset}/file_exports.py +4 -5
  47. edsl/{results → dataset}/smart_objects.py +2 -2
  48. edsl/enums.py +5 -205
  49. edsl/inference_services/__init__.py +5 -0
  50. edsl/inference_services/{AvailableModelCacheHandler.py → available_model_cache_handler.py} +2 -3
  51. edsl/inference_services/{AvailableModelFetcher.py → available_model_fetcher.py} +8 -14
  52. edsl/inference_services/data_structures.py +3 -2
  53. edsl/{exceptions/inference_services.py → inference_services/exceptions.py} +1 -1
  54. edsl/inference_services/{InferenceServiceABC.py → inference_service_abc.py} +1 -1
  55. edsl/inference_services/{InferenceServicesCollection.py → inference_services_collection.py} +8 -7
  56. edsl/inference_services/registry.py +4 -41
  57. edsl/inference_services/{ServiceAvailability.py → service_availability.py} +5 -25
  58. edsl/inference_services/services/__init__.py +31 -0
  59. edsl/inference_services/{AnthropicService.py → services/anthropic_service.py} +3 -3
  60. edsl/inference_services/{AwsBedrock.py → services/aws_bedrock.py} +2 -2
  61. edsl/inference_services/{AzureAI.py → services/azure_ai.py} +2 -2
  62. edsl/inference_services/{DeepInfraService.py → services/deep_infra_service.py} +1 -3
  63. edsl/inference_services/{DeepSeekService.py → services/deep_seek_service.py} +2 -4
  64. edsl/inference_services/{GoogleService.py → services/google_service.py} +5 -4
  65. edsl/inference_services/{GroqService.py → services/groq_service.py} +1 -1
  66. edsl/inference_services/{MistralAIService.py → services/mistral_ai_service.py} +3 -3
  67. edsl/inference_services/{OllamaService.py → services/ollama_service.py} +1 -7
  68. edsl/inference_services/{OpenAIService.py → services/open_ai_service.py} +5 -6
  69. edsl/inference_services/{PerplexityService.py → services/perplexity_service.py} +3 -7
  70. edsl/inference_services/{TestService.py → services/test_service.py} +7 -6
  71. edsl/inference_services/{TogetherAIService.py → services/together_ai_service.py} +2 -6
  72. edsl/inference_services/{XAIService.py → services/xai_service.py} +1 -1
  73. edsl/inference_services/write_available.py +1 -2
  74. edsl/instructions/__init__.py +6 -0
  75. edsl/{surveys/instructions/Instruction.py → instructions/instruction.py} +11 -6
  76. edsl/{surveys/instructions/InstructionCollection.py → instructions/instruction_collection.py} +10 -5
  77. edsl/{surveys/InstructionHandler.py → instructions/instruction_handler.py} +3 -3
  78. edsl/{jobs/interviews → interviews}/ReportErrors.py +2 -2
  79. edsl/interviews/__init__.py +4 -0
  80. edsl/{jobs/AnswerQuestionFunctionConstructor.py → interviews/answering_function.py} +45 -18
  81. edsl/{jobs/interviews/InterviewExceptionEntry.py → interviews/exception_tracking.py} +107 -22
  82. edsl/interviews/interview.py +638 -0
  83. edsl/{jobs/interviews/InterviewStatusDictionary.py → interviews/interview_status_dictionary.py} +21 -12
  84. edsl/{jobs/interviews/InterviewStatusLog.py → interviews/interview_status_log.py} +16 -7
  85. edsl/{jobs/InterviewTaskManager.py → interviews/interview_task_manager.py} +12 -7
  86. edsl/{jobs/RequestTokenEstimator.py → interviews/request_token_estimator.py} +8 -3
  87. edsl/{jobs/interviews/InterviewStatistic.py → interviews/statistics.py} +36 -10
  88. edsl/invigilators/__init__.py +38 -0
  89. edsl/invigilators/invigilator_base.py +477 -0
  90. edsl/{agents/Invigilator.py → invigilators/invigilators.py} +263 -10
  91. edsl/invigilators/prompt_constructor.py +476 -0
  92. edsl/{agents → invigilators}/prompt_helpers.py +2 -1
  93. edsl/{agents/QuestionInstructionPromptBuilder.py → invigilators/question_instructions_prompt_builder.py} +18 -13
  94. edsl/{agents → invigilators}/question_option_processor.py +96 -21
  95. edsl/{agents/QuestionTemplateReplacementsBuilder.py → invigilators/question_template_replacements_builder.py} +64 -12
  96. edsl/jobs/__init__.py +7 -1
  97. edsl/jobs/async_interview_runner.py +99 -35
  98. edsl/jobs/check_survey_scenario_compatibility.py +7 -5
  99. edsl/jobs/data_structures.py +153 -22
  100. edsl/{exceptions/jobs.py → jobs/exceptions.py} +2 -1
  101. edsl/jobs/{FetchInvigilator.py → fetch_invigilator.py} +4 -4
  102. edsl/jobs/{loggers/HTMLTableJobLogger.py → html_table_job_logger.py} +6 -2
  103. edsl/jobs/{Jobs.py → jobs.py} +313 -167
  104. edsl/jobs/{JobsChecks.py → jobs_checks.py} +15 -7
  105. edsl/jobs/{JobsComponentConstructor.py → jobs_component_constructor.py} +19 -17
  106. edsl/jobs/{InterviewsConstructor.py → jobs_interview_constructor.py} +10 -5
  107. edsl/jobs/jobs_pricing_estimation.py +347 -0
  108. edsl/jobs/{JobsRemoteInferenceLogger.py → jobs_remote_inference_logger.py} +4 -3
  109. edsl/jobs/jobs_runner_asyncio.py +282 -0
  110. edsl/jobs/{JobsRemoteInferenceHandler.py → remote_inference.py} +19 -22
  111. edsl/jobs/results_exceptions_handler.py +2 -2
  112. edsl/key_management/__init__.py +28 -0
  113. edsl/key_management/key_lookup.py +161 -0
  114. edsl/{language_models/key_management/KeyLookupBuilder.py → key_management/key_lookup_builder.py} +118 -47
  115. edsl/key_management/key_lookup_collection.py +82 -0
  116. edsl/key_management/models.py +218 -0
  117. edsl/language_models/__init__.py +7 -2
  118. edsl/language_models/{ComputeCost.py → compute_cost.py} +18 -3
  119. edsl/{exceptions/language_models.py → language_models/exceptions.py} +2 -1
  120. edsl/language_models/language_model.py +1080 -0
  121. edsl/language_models/model.py +10 -25
  122. edsl/language_models/{ModelList.py → model_list.py} +9 -14
  123. edsl/language_models/{RawResponseHandler.py → raw_response_handler.py} +1 -1
  124. edsl/language_models/{RegisterLanguageModelsMeta.py → registry.py} +1 -1
  125. edsl/language_models/repair.py +4 -4
  126. edsl/language_models/utilities.py +4 -4
  127. edsl/notebooks/__init__.py +3 -1
  128. edsl/notebooks/{Notebook.py → notebook.py} +7 -8
  129. edsl/prompts/__init__.py +1 -1
  130. edsl/{exceptions/prompts.py → prompts/exceptions.py} +3 -1
  131. edsl/prompts/{Prompt.py → prompt.py} +101 -95
  132. edsl/questions/HTMLQuestion.py +1 -1
  133. edsl/questions/__init__.py +154 -25
  134. edsl/questions/answer_validator_mixin.py +1 -1
  135. edsl/questions/compose_questions.py +4 -3
  136. edsl/questions/derived/question_likert_five.py +166 -0
  137. edsl/questions/derived/{QuestionLinearScale.py → question_linear_scale.py} +4 -4
  138. edsl/questions/derived/{QuestionTopK.py → question_top_k.py} +4 -4
  139. edsl/questions/derived/{QuestionYesNo.py → question_yes_no.py} +4 -5
  140. edsl/questions/descriptors.py +24 -30
  141. edsl/questions/loop_processor.py +65 -19
  142. edsl/questions/question_base.py +881 -0
  143. edsl/questions/question_base_gen_mixin.py +15 -16
  144. edsl/questions/{QuestionBasePromptsMixin.py → question_base_prompts_mixin.py} +2 -2
  145. edsl/questions/{QuestionBudget.py → question_budget.py} +3 -4
  146. edsl/questions/{QuestionCheckBox.py → question_check_box.py} +16 -16
  147. edsl/questions/{QuestionDict.py → question_dict.py} +39 -5
  148. edsl/questions/{QuestionExtract.py → question_extract.py} +9 -9
  149. edsl/questions/question_free_text.py +282 -0
  150. edsl/questions/{QuestionFunctional.py → question_functional.py} +6 -5
  151. edsl/questions/{QuestionList.py → question_list.py} +6 -7
  152. edsl/questions/{QuestionMatrix.py → question_matrix.py} +6 -5
  153. edsl/questions/{QuestionMultipleChoice.py → question_multiple_choice.py} +126 -21
  154. edsl/questions/{QuestionNumerical.py → question_numerical.py} +5 -5
  155. edsl/questions/{QuestionRank.py → question_rank.py} +6 -6
  156. edsl/questions/question_registry.py +4 -9
  157. edsl/questions/register_questions_meta.py +8 -4
  158. edsl/questions/response_validator_abc.py +17 -16
  159. edsl/results/__init__.py +4 -1
  160. edsl/{exceptions/results.py → results/exceptions.py} +1 -1
  161. edsl/results/report.py +197 -0
  162. edsl/results/{Result.py → result.py} +131 -45
  163. edsl/results/{Results.py → results.py} +365 -220
  164. edsl/results/results_selector.py +344 -25
  165. edsl/scenarios/__init__.py +30 -3
  166. edsl/scenarios/{ConstructDownloadLink.py → construct_download_link.py} +7 -0
  167. edsl/scenarios/directory_scanner.py +156 -13
  168. edsl/scenarios/document_chunker.py +186 -0
  169. edsl/scenarios/exceptions.py +101 -0
  170. edsl/scenarios/file_methods.py +2 -3
  171. edsl/scenarios/{FileStore.py → file_store.py} +275 -189
  172. edsl/scenarios/handlers/__init__.py +14 -14
  173. edsl/scenarios/handlers/{csv.py → csv_file_store.py} +1 -2
  174. edsl/scenarios/handlers/{docx.py → docx_file_store.py} +8 -7
  175. edsl/scenarios/handlers/{html.py → html_file_store.py} +1 -2
  176. edsl/scenarios/handlers/{jpeg.py → jpeg_file_store.py} +1 -1
  177. edsl/scenarios/handlers/{json.py → json_file_store.py} +1 -1
  178. edsl/scenarios/handlers/latex_file_store.py +5 -0
  179. edsl/scenarios/handlers/{md.py → md_file_store.py} +1 -1
  180. edsl/scenarios/handlers/{pdf.py → pdf_file_store.py} +2 -2
  181. edsl/scenarios/handlers/{png.py → png_file_store.py} +1 -1
  182. edsl/scenarios/handlers/{pptx.py → pptx_file_store.py} +8 -7
  183. edsl/scenarios/handlers/{py.py → py_file_store.py} +1 -3
  184. edsl/scenarios/handlers/{sql.py → sql_file_store.py} +2 -1
  185. edsl/scenarios/handlers/{sqlite.py → sqlite_file_store.py} +2 -3
  186. edsl/scenarios/handlers/{txt.py → txt_file_store.py} +1 -1
  187. edsl/scenarios/scenario.py +928 -0
  188. edsl/scenarios/scenario_join.py +18 -5
  189. edsl/scenarios/{ScenarioList.py → scenario_list.py} +294 -106
  190. edsl/scenarios/{ScenarioListPdfMixin.py → scenario_list_pdf_tools.py} +16 -15
  191. edsl/scenarios/scenario_selector.py +5 -1
  192. edsl/study/ObjectEntry.py +2 -2
  193. edsl/study/SnapShot.py +5 -5
  194. edsl/study/Study.py +18 -19
  195. edsl/study/__init__.py +6 -4
  196. edsl/surveys/__init__.py +7 -4
  197. edsl/surveys/dag/__init__.py +2 -0
  198. edsl/surveys/{ConstructDAG.py → dag/construct_dag.py} +3 -3
  199. edsl/surveys/{DAG.py → dag/dag.py} +13 -10
  200. edsl/surveys/descriptors.py +1 -1
  201. edsl/surveys/{EditSurvey.py → edit_survey.py} +9 -9
  202. edsl/{exceptions/surveys.py → surveys/exceptions.py} +1 -2
  203. edsl/surveys/memory/__init__.py +3 -0
  204. edsl/surveys/{MemoryPlan.py → memory/memory_plan.py} +10 -9
  205. edsl/surveys/rules/__init__.py +3 -0
  206. edsl/surveys/{Rule.py → rules/rule.py} +103 -43
  207. edsl/surveys/{RuleCollection.py → rules/rule_collection.py} +21 -30
  208. edsl/surveys/{RuleManager.py → rules/rule_manager.py} +19 -13
  209. edsl/surveys/survey.py +1743 -0
  210. edsl/surveys/{SurveyExportMixin.py → survey_export.py} +22 -27
  211. edsl/surveys/{SurveyFlowVisualization.py → survey_flow_visualization.py} +11 -2
  212. edsl/surveys/{Simulator.py → survey_simulator.py} +10 -3
  213. edsl/tasks/__init__.py +32 -0
  214. edsl/{jobs/tasks/QuestionTaskCreator.py → tasks/question_task_creator.py} +115 -57
  215. edsl/tasks/task_creators.py +135 -0
  216. edsl/{jobs/tasks/TaskHistory.py → tasks/task_history.py} +86 -47
  217. edsl/{jobs/tasks → tasks}/task_status_enum.py +91 -7
  218. edsl/tasks/task_status_log.py +85 -0
  219. edsl/tokens/__init__.py +2 -0
  220. edsl/tokens/interview_token_usage.py +53 -0
  221. edsl/utilities/PrettyList.py +1 -1
  222. edsl/utilities/SystemInfo.py +25 -22
  223. edsl/utilities/__init__.py +29 -21
  224. edsl/utilities/gcp_bucket/__init__.py +2 -0
  225. edsl/utilities/gcp_bucket/cloud_storage.py +99 -96
  226. edsl/utilities/interface.py +44 -536
  227. edsl/{results/MarkdownToPDF.py → utilities/markdown_to_pdf.py} +13 -5
  228. edsl/utilities/repair_functions.py +1 -1
  229. {edsl-0.1.47.dist-info → edsl-0.1.49.dist-info}/METADATA +1 -1
  230. edsl-0.1.49.dist-info/RECORD +347 -0
  231. edsl/Base.py +0 -493
  232. edsl/BaseDiff.py +0 -260
  233. edsl/agents/InvigilatorBase.py +0 -260
  234. edsl/agents/PromptConstructor.py +0 -318
  235. edsl/coop/PriceFetcher.py +0 -54
  236. edsl/data/Cache.py +0 -582
  237. edsl/data/CacheEntry.py +0 -238
  238. edsl/data/SQLiteDict.py +0 -292
  239. edsl/data/__init__.py +0 -5
  240. edsl/data/orm.py +0 -10
  241. edsl/exceptions/cache.py +0 -5
  242. edsl/exceptions/coop.py +0 -14
  243. edsl/exceptions/data.py +0 -14
  244. edsl/exceptions/scenarios.py +0 -29
  245. edsl/jobs/Answers.py +0 -43
  246. edsl/jobs/JobsPrompts.py +0 -354
  247. edsl/jobs/buckets/BucketCollection.py +0 -134
  248. edsl/jobs/buckets/ModelBuckets.py +0 -65
  249. edsl/jobs/buckets/TokenBucket.py +0 -283
  250. edsl/jobs/buckets/TokenBucketClient.py +0 -191
  251. edsl/jobs/interviews/Interview.py +0 -395
  252. edsl/jobs/interviews/InterviewExceptionCollection.py +0 -99
  253. edsl/jobs/interviews/InterviewStatisticsCollection.py +0 -25
  254. edsl/jobs/runners/JobsRunnerAsyncio.py +0 -163
  255. edsl/jobs/runners/JobsRunnerStatusData.py +0 -0
  256. edsl/jobs/tasks/TaskCreators.py +0 -64
  257. edsl/jobs/tasks/TaskStatusLog.py +0 -23
  258. edsl/jobs/tokens/InterviewTokenUsage.py +0 -27
  259. edsl/language_models/LanguageModel.py +0 -635
  260. edsl/language_models/ServiceDataSources.py +0 -0
  261. edsl/language_models/key_management/KeyLookup.py +0 -63
  262. edsl/language_models/key_management/KeyLookupCollection.py +0 -38
  263. edsl/language_models/key_management/models.py +0 -137
  264. edsl/questions/QuestionBase.py +0 -544
  265. edsl/questions/QuestionFreeText.py +0 -130
  266. edsl/questions/derived/QuestionLikertFive.py +0 -76
  267. edsl/results/ResultsExportMixin.py +0 -45
  268. edsl/results/TextEditor.py +0 -50
  269. edsl/results/results_fetch_mixin.py +0 -33
  270. edsl/results/results_tools_mixin.py +0 -98
  271. edsl/scenarios/DocumentChunker.py +0 -104
  272. edsl/scenarios/Scenario.py +0 -548
  273. edsl/scenarios/ScenarioHtmlMixin.py +0 -65
  274. edsl/scenarios/ScenarioListExportMixin.py +0 -45
  275. edsl/scenarios/handlers/latex.py +0 -5
  276. edsl/shared.py +0 -1
  277. edsl/surveys/Survey.py +0 -1301
  278. edsl/surveys/SurveyQualtricsImport.py +0 -284
  279. edsl/surveys/SurveyToApp.py +0 -141
  280. edsl/surveys/instructions/__init__.py +0 -0
  281. edsl/tools/__init__.py +0 -1
  282. edsl/tools/clusters.py +0 -192
  283. edsl/tools/embeddings.py +0 -27
  284. edsl/tools/embeddings_plotting.py +0 -118
  285. edsl/tools/plotting.py +0 -112
  286. edsl/tools/summarize.py +0 -18
  287. edsl/utilities/data/Registry.py +0 -6
  288. edsl/utilities/data/__init__.py +0 -1
  289. edsl/utilities/data/scooter_results.json +0 -1
  290. edsl-0.1.47.dist-info/RECORD +0 -354
  291. /edsl/coop/{CoopFunctionsMixin.py → coop_functions.py} +0 -0
  292. /edsl/{results → dataset/display}/CSSParameterizer.py +0 -0
  293. /edsl/{language_models/key_management → dataset/display}/__init__.py +0 -0
  294. /edsl/{results → dataset/display}/table_data_class.py +0 -0
  295. /edsl/{results → dataset/display}/table_display.css +0 -0
  296. /edsl/{results/ResultsGGMixin.py → dataset/r/ggplot.py} +0 -0
  297. /edsl/{results → dataset}/tree_explore.py +0 -0
  298. /edsl/{surveys/instructions/ChangeInstruction.py → instructions/change_instruction.py} +0 -0
  299. /edsl/{jobs/interviews → interviews}/interview_status_enum.py +0 -0
  300. /edsl/jobs/{runners/JobsRunnerStatus.py → jobs_runner_status.py} +0 -0
  301. /edsl/language_models/{PriceManager.py → price_manager.py} +0 -0
  302. /edsl/language_models/{fake_openai_call.py → unused/fake_openai_call.py} +0 -0
  303. /edsl/language_models/{fake_openai_service.py → unused/fake_openai_service.py} +0 -0
  304. /edsl/notebooks/{NotebookToLaTeX.py → notebook_to_latex.py} +0 -0
  305. /edsl/{exceptions/questions.py → questions/exceptions.py} +0 -0
  306. /edsl/questions/{SimpleAskMixin.py → simple_ask_mixin.py} +0 -0
  307. /edsl/surveys/{Memory.py → memory/memory.py} +0 -0
  308. /edsl/surveys/{MemoryManagement.py → memory/memory_management.py} +0 -0
  309. /edsl/surveys/{SurveyCSS.py → survey_css.py} +0 -0
  310. /edsl/{jobs/tokens/TokenUsage.py → tokens/token_usage.py} +0 -0
  311. /edsl/{results/MarkdownToDocx.py → utilities/markdown_to_docx.py} +0 -0
  312. /edsl/{TemplateLoader.py → utilities/template_loader.py} +0 -0
  313. {edsl-0.1.47.dist-info → edsl-0.1.49.dist-info}/LICENSE +0 -0
  314. {edsl-0.1.47.dist-info → edsl-0.1.49.dist-info}/WHEEL +0 -0
@@ -1,25 +1,122 @@
1
- """Mixin class for exporting results."""
1
+ """
2
+ This module provides mixin classes that enable powerful data manipulation operations
3
+ across various EDSL list-like objects.
2
4
 
5
+ The DataOperationsBase class defines common operations for working with structured data,
6
+ including data transformation, visualization, export, querying, and analysis. These
7
+ operations are inherited by different specialized mixins (DatasetOperationsMixin,
8
+ ResultsOperationsMixin, etc.) which implement class-specific behaviors.
9
+
10
+ The design pattern used here allows different container types (Results, Dataset,
11
+ ScenarioList, AgentList) to share the same data manipulation interface, enabling
12
+ fluid operations across different parts of the EDSL ecosystem.
13
+ """
14
+
15
+ from abc import ABC, abstractmethod
3
16
  import io
4
17
  import warnings
5
18
  import textwrap
6
- from typing import Optional, Tuple, Union, List
19
+ from typing import Optional, Tuple, Union, List, TYPE_CHECKING
20
+ from .r.ggplot import GGPlotMethod
7
21
 
8
- from edsl.results.file_exports import CSVExport, ExcelExport, JSONLExport, SQLiteExport
22
+ if TYPE_CHECKING:
23
+ from docx import Document
24
+ from .dataset import Dataset
9
25
 
26
+ class DataOperationsBase:
27
+ """
28
+ Base class providing common data operations for EDSL container objects.
29
+
30
+ This class serves as the foundation for various data manipulation mixins,
31
+ providing a consistent interface for operations like filtering, aggregation,
32
+ transformation, visualization, and export across different types of EDSL
33
+ containers (Results, Dataset, ScenarioList, AgentList).
34
+
35
+ Key functionality categories:
36
+
37
+ 1. Data Transformation:
38
+ - Filtering with `filter()`
39
+ - Creating new columns with `mutate()`
40
+ - Reshaping with `long()`, `wide()`, `flatten()`, etc.
41
+ - Selecting specific columns with `select()`
42
+
43
+ 2. Visualization and Display:
44
+ - Tabular display with `table()`
45
+ - Plotting with `ggplot2()`
46
+ - Generating reports with `report()`
47
+
48
+ 3. Data Export:
49
+ - To various formats with `to_csv()`, `to_excel()`, etc.
50
+ - To other data structures with `to_pandas()`, `to_dicts()`, etc.
51
+
52
+ 4. Analysis:
53
+ - SQL-based querying with `sql()`
54
+ - Aggregation with `tally()`
55
+ - Tree-based exploration
56
+
57
+ These operations are designed to be applied fluently in sequence, enabling
58
+ expressive data manipulation pipelines.
59
+ """
60
+
61
+
62
+ def ggplot2(
63
+ self,
64
+ ggplot_code: str,
65
+ shape: str = "wide",
66
+ sql: Optional[str] = None,
67
+ remove_prefix: bool = True,
68
+ debug: bool = False,
69
+ height: float = 4,
70
+ width: float = 6,
71
+ factor_orders: Optional[dict] = None,
72
+ ):
73
+ """
74
+ Create visualizations using R's ggplot2 library.
75
+
76
+ This method provides a bridge to R's powerful ggplot2 visualization library,
77
+ allowing you to create sophisticated plots directly from EDSL data structures.
78
+
79
+ Parameters:
80
+ ggplot_code: R code string containing ggplot2 commands
81
+ shape: Data shape to use ("wide" or "long")
82
+ sql: Optional SQL query to transform data before visualization
83
+ remove_prefix: Whether to remove prefixes (like "answer.") from column names
84
+ debug: Whether to display debugging information
85
+ height: Plot height in inches
86
+ width: Plot width in inches
87
+ factor_orders: Dictionary mapping factor variables to their desired order
88
+
89
+ Returns:
90
+ A plot object that renders in Jupyter notebooks
91
+
92
+ Notes:
93
+ - Requires R and the ggplot2 package to be installed
94
+ - Data is automatically converted to a format suitable for ggplot2
95
+ - The ggplot2 code should reference column names as they appear after
96
+ any transformations from the shape and remove_prefix parameters
97
+
98
+ Examples:
99
+ >>> from edsl.results import Results
100
+ >>> r = Results.example()
101
+ >>> # The following would create a plot if R is installed (not shown in doctest):
102
+ >>> # r.ggplot2('''
103
+ >>> # ggplot(df, aes(x=how_feeling)) +
104
+ >>> # geom_bar() +
105
+ >>> # labs(title="Distribution of Feelings")
106
+ >>> # ''')
107
+ """
108
+ return GGPlotMethod(self).ggplot2(ggplot_code, shape, sql, remove_prefix, debug, height, width, factor_orders)
10
109
 
11
- class DatasetExportMixin:
12
- """Mixin class for exporting Dataset objects."""
13
110
 
14
111
  def relevant_columns(
15
- self, data_type: Optional[str] = None, remove_prefix=False
112
+ self, data_type: Optional[str] = None, remove_prefix:bool=False
16
113
  ) -> list:
17
114
  """Return the set of keys that are present in the dataset.
18
115
 
19
116
  :param data_type: The data type to filter by.
20
117
  :param remove_prefix: Whether to remove the prefix from the column names.
21
118
 
22
- >>> from edsl.results.Dataset import Dataset
119
+ >>> from ..dataset import Dataset
23
120
  >>> d = Dataset([{'a.b':[1,2,3,4]}])
24
121
  >>> d.relevant_columns()
25
122
  ['a.b']
@@ -71,7 +168,7 @@ class DatasetExportMixin:
71
168
  def num_observations(self):
72
169
  """Return the number of observations in the dataset.
73
170
 
74
- >>> from edsl.results.Results import Results
171
+ >>> from edsl.results import Results
75
172
  >>> Results.example().num_observations()
76
173
  4
77
174
  """
@@ -89,7 +186,7 @@ class DatasetExportMixin:
89
186
 
90
187
  return _num_observations
91
188
 
92
- def _make_tabular(
189
+ def make_tabular(
93
190
  self, remove_prefix: bool, pretty_labels: Optional[dict] = None
94
191
  ) -> tuple[list, List[list]]:
95
192
  """Turn the results into a tabular format.
@@ -98,10 +195,10 @@ class DatasetExportMixin:
98
195
 
99
196
  >>> from edsl.results import Results
100
197
  >>> r = Results.example()
101
- >>> r.select('how_feeling')._make_tabular(remove_prefix = True)
198
+ >>> r.select('how_feeling').make_tabular(remove_prefix = True)
102
199
  (['how_feeling'], [['OK'], ['Great'], ['Terrible'], ['OK']])
103
200
 
104
- >>> r.select('how_feeling')._make_tabular(remove_prefix = True, pretty_labels = {'how_feeling': "How are you feeling"})
201
+ >>> r.select('how_feeling').make_tabular(remove_prefix = True, pretty_labels = {'how_feeling': "How are you feeling"})
105
202
  (['How are you feeling'], [['OK'], ['Great'], ['Terrible'], ['OK']])
106
203
  """
107
204
 
@@ -144,7 +241,7 @@ class DatasetExportMixin:
144
241
  for value in list_of_values:
145
242
  print(f"{key}: {value}")
146
243
 
147
- def _get_tabular_data(
244
+ def get_tabular_data(
148
245
  self,
149
246
  remove_prefix: bool = False,
150
247
  pretty_labels: Optional[dict] = None,
@@ -161,7 +258,7 @@ class DatasetExportMixin:
161
258
  if pretty_labels is None:
162
259
  pretty_labels = {}
163
260
 
164
- return self._make_tabular(
261
+ return self.make_tabular(
165
262
  remove_prefix=remove_prefix, pretty_labels=pretty_labels
166
263
  )
167
264
 
@@ -196,6 +293,8 @@ class DatasetExportMixin:
196
293
  pretty_labels: Optional[dict] = None,
197
294
  ) -> Optional["FileStore"]:
198
295
  """Export the results to a FileStore instance containing CSV data."""
296
+ from .file_exports import CSVExport
297
+
199
298
  exporter = CSVExport(
200
299
  data=self,
201
300
  filename=filename,
@@ -212,6 +311,8 @@ class DatasetExportMixin:
212
311
  sheet_name: Optional[str] = None,
213
312
  ) -> Optional["FileStore"]:
214
313
  """Export the results to a FileStore instance containing Excel data."""
314
+ from .file_exports import ExcelExport
315
+
215
316
  exporter = ExcelExport(
216
317
  data=self,
217
318
  filename=filename,
@@ -278,29 +379,51 @@ class DatasetExportMixin:
278
379
  transpose_by: str = None,
279
380
  remove_prefix: bool = True,
280
381
  shape: str = "wide",
281
- ) -> Union["pd.DataFrame", str]:
282
- """Execute a SQL query and return the results as a DataFrame.
283
-
284
- Args:
285
- query: The SQL query to execute
286
- shape: The shape of the data in the database (wide or long)
287
- remove_prefix: Whether to remove the prefix from the column names
288
- transpose: Whether to transpose the DataFrame
289
- transpose_by: The column to use as the index when transposing
290
- csv: Whether to return the DataFrame as a CSV string
291
- to_list: Whether to return the results as a list
292
- to_latex: Whether to return the results as LaTeX
293
- filename: Optional filename to save the results to
294
-
295
- Returns:
296
- DataFrame, CSV string, list, or LaTeX string depending on parameters
297
-
382
+ ) -> "Dataset":
383
+ """
384
+ Execute SQL queries on the dataset.
385
+
386
+ This powerful method allows you to use SQL to query and transform your data,
387
+ combining the expressiveness of SQL with EDSL's data structures. It works by
388
+ creating an in-memory SQLite database from your data and executing the query
389
+ against it.
390
+
391
+ Parameters:
392
+ query: SQL query string to execute
393
+ transpose: Whether to transpose the resulting table (rows become columns)
394
+ transpose_by: Column to use as the new index when transposing
395
+ remove_prefix: Whether to remove type prefixes (e.g., "answer.") from column names
396
+ shape: Data shape to use ("wide" or "long")
397
+ - "wide": Default tabular format with columns for each field
398
+ - "long": Melted format with key-value pairs, useful for certain queries
399
+
400
+ Returns:
401
+ A Dataset object containing the query results
402
+
403
+ Notes:
404
+ - The data is stored in a table named "self" in the SQLite database
405
+ - In wide format, column names include their type prefix unless remove_prefix=True
406
+ - In long format, the data is melted into columns: row_number, key, value, data_type
407
+ - Complex objects like lists and dictionaries are converted to strings
408
+
298
409
  Examples:
299
410
  >>> from edsl import Results
300
- >>> r = Results.example();
301
- >>> len(r.sql("SELECT * FROM self", shape = "wide"))
411
+ >>> r = Results.example()
412
+
413
+ # Basic selection
414
+ >>> len(r.sql("SELECT * FROM self", shape="wide"))
302
415
  4
303
- >>> len(r.sql("SELECT * FROM self", shape = "long"))
416
+
417
+ # Filtering with WHERE clause
418
+ >>> r.sql("SELECT * FROM self WHERE how_feeling = 'Great'").num_observations()
419
+ 1
420
+
421
+ # Aggregation
422
+ >>> r.sql("SELECT how_feeling, COUNT(*) as count FROM self GROUP BY how_feeling").keys()
423
+ ['how_feeling', 'count']
424
+
425
+ # Using long format
426
+ >>> len(r.sql("SELECT * FROM self", shape="long"))
304
427
  172
305
428
  """
306
429
  import pandas as pd
@@ -316,7 +439,7 @@ class DatasetExportMixin:
316
439
  else:
317
440
  df = df.set_index(df.columns[0])
318
441
  df = df.transpose()
319
- from edsl.results.Dataset import Dataset
442
+ from .dataset import Dataset
320
443
 
321
444
  return Dataset.from_pandas_dataframe(df)
322
445
 
@@ -372,6 +495,14 @@ class DatasetExportMixin:
372
495
  csv_string = self.to_csv(remove_prefix=remove_prefix).text
373
496
  df = pl.read_csv(io.StringIO(csv_string))
374
497
  return df
498
+
499
+ def tree(self, node_order: Optional[List[str]] = None) -> "Tree":
500
+ """Convert the results to a Tree.
501
+
502
+ :param node_order: The order of the nodes.
503
+ """
504
+ from .dataset_tree import Tree
505
+ return Tree(self, node_order=node_order)
375
506
 
376
507
  def to_scenario_list(self, remove_prefix: bool = True) -> list[dict]:
377
508
  """Convert the results to a list of dictionaries, one per scenario.
@@ -383,8 +514,7 @@ class DatasetExportMixin:
383
514
  >>> r.select('how_feeling').to_scenario_list()
384
515
  ScenarioList([Scenario({'how_feeling': 'OK'}), Scenario({'how_feeling': 'Great'}), Scenario({'how_feeling': 'Terrible'}), Scenario({'how_feeling': 'OK'})])
385
516
  """
386
- from edsl.scenarios.ScenarioList import ScenarioList
387
- from edsl.scenarios.Scenario import Scenario
517
+ from edsl.scenarios import ScenarioList, Scenario
388
518
 
389
519
  list_of_dicts = self.to_dicts(remove_prefix=remove_prefix)
390
520
  scenarios = []
@@ -402,8 +532,7 @@ class DatasetExportMixin:
402
532
  >>> r.select('how_feeling').to_agent_list()
403
533
  AgentList([Agent(traits = {'how_feeling': 'OK'}), Agent(traits = {'how_feeling': 'Great'}), Agent(traits = {'how_feeling': 'Terrible'}), Agent(traits = {'how_feeling': 'OK'})])
404
534
  """
405
- from edsl.agents import Agent
406
- from edsl.agents.AgentList import AgentList
535
+ from edsl.agents import Agent, AgentList
407
536
 
408
537
  list_of_dicts = self.to_dicts(remove_prefix=remove_prefix)
409
538
  agents = []
@@ -464,11 +593,11 @@ class DatasetExportMixin:
464
593
  >>> r.select('how_feeling').to_list()
465
594
  ['OK', 'Great', 'Terrible', 'OK']
466
595
 
467
- >>> from edsl.results.Dataset import Dataset
596
+ >>> from edsl.dataset import Dataset
468
597
  >>> Dataset([{'a.b': [[1, 9], 2, 3, 4]}]).select('a.b').to_list(flatten = True)
469
598
  [1, 9, 2, 3, 4]
470
599
 
471
- >>> from edsl.results.Dataset import Dataset
600
+ >>> from edsl.dataset import Dataset
472
601
  >>> Dataset([{'a.b': [[1, 9], 2, 3, 4]}, {'c': [6, 2, 3, 4]}]).select('a.b', 'c').to_list(flatten = True)
473
602
  Traceback (most recent call last):
474
603
  ...
@@ -545,42 +674,18 @@ class DatasetExportMixin:
545
674
  if return_link:
546
675
  return filename
547
676
 
548
- def report(self, *fields: Optional[str], top_n: Optional[int] = None,
549
- header_fields: Optional[List[str]] = None, divider: bool = True,
550
- return_string: bool = False) -> Optional[str]:
551
- """Takes the fields in order and returns a report of the results by iterating through rows.
552
- The row number is printed as # Observation: <row number>
553
- The name of the field is used as markdown header at level "##"
554
- The content of that field is then printed.
555
- Then the next field and so on.
556
- Once that row is done, a new line is printed and the next row is shown.
557
- If in a jupyter notebook, the report is displayed as markdown.
677
+ def _prepare_report_data(self, *fields: Optional[str], top_n: Optional[int] = None,
678
+ header_fields: Optional[List[str]] = None) -> tuple:
679
+ """Prepares data for report generation in various formats.
558
680
 
559
681
  Args:
560
682
  *fields: The fields to include in the report. If none provided, all fields are used.
561
683
  top_n: Optional limit on the number of observations to include.
562
684
  header_fields: Optional list of fields to include in the main header instead of as sections.
563
- divider: If True, adds a horizontal rule between observations for better visual separation.
564
- return_string: If True, returns the markdown string. If False (default in notebooks),
565
- only displays the markdown without returning.
566
685
 
567
686
  Returns:
568
- A string containing the markdown report if return_string is True, otherwise None.
569
-
570
- Examples:
571
- >>> from edsl.results import Results
572
- >>> r = Results.example()
573
- >>> report = r.select('how_feeling', 'how_feeling_yesterday').report(return_string=True)
574
- >>> "# Observation: 1" in report
575
- True
576
- >>> "## answer.how_feeling" in report
577
- True
578
- >>> report = r.select('how_feeling').report(header_fields=['answer.how_feeling'], return_string=True)
579
- >>> "# Observation: 1 (`how_feeling`: OK)" in report
580
- True
687
+ A tuple containing (field_data, num_obs, fields, header_fields)
581
688
  """
582
- from edsl.utilities.utilities import is_notebook
583
-
584
689
  # If no fields specified, use all columns
585
690
  if not fields:
586
691
  fields = self.relevant_columns()
@@ -607,8 +712,22 @@ class DatasetExportMixin:
607
712
  num_obs = self.num_observations()
608
713
  if top_n is not None:
609
714
  num_obs = min(num_obs, top_n)
715
+
716
+ return field_data, num_obs, fields, header_fields
717
+
718
+ def _report_markdown(self, field_data, num_obs, fields, header_fields, divider: bool = True) -> str:
719
+ """Generates a markdown report from the prepared data.
610
720
 
611
- # Build the report
721
+ Args:
722
+ field_data: Dictionary mapping field names to their values
723
+ num_obs: Number of observations to include
724
+ fields: Fields to include as sections
725
+ header_fields: Fields to include in the observation header
726
+ divider: If True, adds a horizontal rule between observations
727
+
728
+ Returns:
729
+ A string containing the markdown report
730
+ """
612
731
  report_lines = []
613
732
  for i in range(num_obs):
614
733
  # Create header with observation number and any header fields
@@ -642,34 +761,176 @@ class DatasetExportMixin:
642
761
  else:
643
762
  report_lines.append("") # Empty line between observations
644
763
 
645
- report_text = "\n".join(report_lines)
764
+ return "\n".join(report_lines)
765
+
766
+ def _report_docx(self, field_data, num_obs, fields, header_fields) -> "Document":
767
+ """Generates a Word document report from the prepared data.
768
+
769
+ Args:
770
+ field_data: Dictionary mapping field names to their values
771
+ num_obs: Number of observations to include
772
+ fields: Fields to include as sections
773
+ header_fields: Fields to include in the observation header
774
+
775
+ Returns:
776
+ A docx.Document object containing the report
777
+ """
778
+ try:
779
+ from docx import Document
780
+ from docx.shared import Pt
781
+ import json
782
+ except ImportError:
783
+ raise ImportError("The python-docx package is required for DOCX export. Install it with 'pip install python-docx'.")
646
784
 
647
- # In notebooks, display as markdown and optionally return
648
- is_nb = is_notebook()
649
- if is_nb:
650
- from IPython.display import Markdown, display
651
- display(Markdown(report_text))
785
+ doc = Document()
652
786
 
653
- # Return the string if requested or if not in a notebook
654
- if return_string or not is_nb:
787
+ for i in range(num_obs):
788
+ # Create header with observation number and any header fields
789
+ header_text = f"Observation: {i+1}"
790
+ if header_fields:
791
+ header_parts = []
792
+ for field in header_fields:
793
+ value = field_data[field][i]
794
+ # Get the field name without prefix for cleaner display
795
+ display_name = field.split('.')[-1] if '.' in field else field
796
+ header_parts.append(f"{display_name}: {value}")
797
+ if header_parts:
798
+ header_text += f" ({', '.join(header_parts)})"
799
+
800
+ heading = doc.add_heading(header_text, level=1)
801
+
802
+ # Add the remaining fields
803
+ for field in fields:
804
+ if field not in header_fields:
805
+ doc.add_heading(field, level=2)
806
+ value = field_data[field][i]
807
+
808
+ if isinstance(value, (list, dict)):
809
+ # Format structured data with indentation
810
+ formatted_value = json.dumps(value, indent=2)
811
+ p = doc.add_paragraph()
812
+ p.add_run(formatted_value).font.name = 'Courier New'
813
+ p.add_run().font.size = Pt(10)
814
+ else:
815
+ doc.add_paragraph(str(value))
816
+
817
+ # Add page break between observations except for the last one
818
+ if i < num_obs - 1:
819
+ doc.add_page_break()
820
+
821
+ return doc
822
+
823
+ def report(self, *fields: Optional[str], top_n: Optional[int] = None,
824
+ header_fields: Optional[List[str]] = None, divider: bool = True,
825
+ return_string: bool = False, format: str = "markdown",
826
+ filename: Optional[str] = None) -> Optional[Union[str, "docx.Document"]]:
827
+ """Generates a report of the results by iterating through rows.
828
+
829
+ Args:
830
+ *fields: The fields to include in the report. If none provided, all fields are used.
831
+ top_n: Optional limit on the number of observations to include.
832
+ header_fields: Optional list of fields to include in the main header instead of as sections.
833
+ divider: If True, adds a horizontal rule between observations (markdown only).
834
+ return_string: If True, returns the markdown string. If False (default in notebooks),
835
+ only displays the markdown without returning.
836
+ format: Output format - either "markdown" or "docx".
837
+ filename: If provided and format is "docx", saves the document to this file.
838
+
839
+ Returns:
840
+ Depending on format and return_string:
841
+ - For markdown: A string if return_string is True, otherwise None (displays in notebook)
842
+ - For docx: A docx.Document object, or None if filename is provided (saves to file)
843
+
844
+ Examples:
845
+ >>> from edsl.results import Results
846
+ >>> r = Results.example()
847
+ >>> report = r.select('how_feeling').report(return_string=True)
848
+ >>> "# Observation: 1" in report
849
+ True
850
+ >>> doc = r.select('how_feeling').report(format="docx")
851
+ >>> isinstance(doc, object)
852
+ True
853
+ """
854
+ from edsl.utilities.utilities import is_notebook
855
+
856
+ # Prepare the data for the report
857
+ field_data, num_obs, fields, header_fields = self._prepare_report_data(
858
+ *fields, top_n=top_n, header_fields=header_fields
859
+ )
860
+
861
+ # Generate the report in the requested format
862
+ if format.lower() == "markdown":
863
+ report_text = self._report_markdown(
864
+ field_data, num_obs, fields, header_fields, divider
865
+ )
866
+
867
+ # In notebooks, display as markdown
868
+ is_nb = is_notebook()
869
+ if is_nb and not return_string:
870
+ from IPython.display import Markdown, display
871
+ display(Markdown(report_text))
872
+ return None
873
+
874
+ # Return the string if requested or if not in a notebook
655
875
  return report_text
656
- return None
876
+
877
+ elif format.lower() == "docx":
878
+ doc = self._report_docx(field_data, num_obs, fields, header_fields)
879
+
880
+ # Save to file if filename is provided
881
+ if filename:
882
+ doc.save(filename)
883
+ print(f"Report saved to {filename}")
884
+ return None
885
+
886
+ return doc
887
+
888
+ else:
889
+ raise ValueError(f"Unsupported format: {format}. Use 'markdown' or 'docx'.")
657
890
 
658
891
  def tally(
659
892
  self, *fields: Optional[str], top_n: Optional[int] = None, output="Dataset"
660
893
  ) -> Union[dict, "Dataset"]:
661
- """Tally the values of a field or perform a cross-tab of multiple fields.
662
-
663
- :param fields: The field(s) to tally, multiple fields for cross-tabulation.
664
-
665
- >>> from edsl.results import Results
666
- >>> r = Results.example()
667
- >>> r.select('how_feeling').tally('answer.how_feeling', output = "dict")
668
- {'OK': 2, 'Great': 1, 'Terrible': 1}
669
- >>> from edsl.results.Dataset import Dataset
670
- >>> expected = Dataset([{'answer.how_feeling': ['OK', 'Great', 'Terrible']}, {'count': [2, 1, 1]}])
671
- >>> r.select('how_feeling').tally('answer.how_feeling', output = "Dataset") == expected
672
- True
894
+ """
895
+ Count frequency distributions of values in specified fields.
896
+
897
+ This method tallies the occurrence of unique values within one or more fields,
898
+ similar to a GROUP BY and COUNT in SQL. When multiple fields are provided, it
899
+ performs cross-tabulation across those fields.
900
+
901
+ Parameters:
902
+ *fields: Field names to tally. If none provided, uses all available fields.
903
+ top_n: Optional limit to return only the top N most frequent values.
904
+ output: Format for results, either "Dataset" (recommended) or "dict".
905
+
906
+ Returns:
907
+ By default, returns a Dataset with columns for the field(s) and a 'count' column.
908
+ If output="dict", returns a dictionary mapping values to counts.
909
+
910
+ Notes:
911
+ - For single fields, returns counts of each unique value
912
+ - For multiple fields, returns counts of each unique combination of values
913
+ - Results are sorted in descending order by count
914
+ - Fields can be specified with or without their type prefix
915
+
916
+ Examples:
917
+ >>> from edsl import Results
918
+ >>> r = Results.example()
919
+
920
+ # Single field frequency count
921
+ >>> r.select('how_feeling').tally('answer.how_feeling', output="dict")
922
+ {'OK': 2, 'Great': 1, 'Terrible': 1}
923
+
924
+ # Return as Dataset (default)
925
+ >>> from edsl.dataset import Dataset
926
+ >>> expected = Dataset([{'answer.how_feeling': ['OK', 'Great', 'Terrible']}, {'count': [2, 1, 1]}])
927
+ >>> r.select('how_feeling').tally('answer.how_feeling', output="Dataset") == expected
928
+ True
929
+
930
+ # Multi-field cross-tabulation - exact output varies based on data
931
+ >>> result = r.tally('how_feeling', 'how_feeling_yesterday')
932
+ >>> 'how_feeling' in result.keys() and 'how_feeling_yesterday' in result.keys() and 'count' in result.keys()
933
+ True
673
934
  """
674
935
  from collections import Counter
675
936
 
@@ -684,7 +945,9 @@ class DatasetExportMixin:
684
945
  f in self.relevant_columns() or f in relevant_columns_without_prefix
685
946
  for f in fields
686
947
  ):
687
- raise ValueError("One or more specified fields are not in the dataset.")
948
+ raise ValueError("One or more specified fields are not in the dataset."
949
+ f"The available fields are: {self.relevant_columns()}"
950
+ )
688
951
 
689
952
  if len(fields) == 1:
690
953
  field = fields[0]
@@ -695,13 +958,18 @@ class DatasetExportMixin:
695
958
  for value in values:
696
959
  if isinstance(value, list):
697
960
  value = tuple(value)
698
-
699
- tally = dict(Counter(values))
961
+ try:
962
+ tally = dict(Counter(values))
963
+ except TypeError:
964
+ tally = dict(Counter([str(v) for v in values]))
965
+ except Exception as e:
966
+ raise ValueError(f"Error tallying values: {e}")
967
+
700
968
  sorted_tally = dict(sorted(tally.items(), key=lambda item: -item[1]))
701
969
  if top_n is not None:
702
970
  sorted_tally = dict(list(sorted_tally.items())[:top_n])
703
971
 
704
- from edsl.results.Dataset import Dataset
972
+ from ..dataset import Dataset
705
973
 
706
974
  if output == "dict":
707
975
  # why did I do this?
@@ -732,27 +1000,44 @@ class DatasetExportMixin:
732
1000
  keys.append("count")
733
1001
  return sl.reorder_keys(keys).to_dataset()
734
1002
 
735
- def flatten(self, field, keep_original=False):
1003
+ def flatten(self, field: str, keep_original: bool = False) -> "Dataset":
736
1004
  """
737
- Flatten a field containing a list of dictionaries into separate fields.
738
-
739
- >>> from edsl.results.Dataset import Dataset
740
- >>> Dataset([{'a': [{'a': 1, 'b': 2}]}, {'c': [5] }]).flatten('a')
741
- Dataset([{'c': [5]}, {'a.a': [1]}, {'a.b': [2]}])
742
-
743
-
744
- >>> Dataset([{'answer.example': [{'a': 1, 'b': 2}]}, {'c': [5] }]).flatten('answer.example')
745
- Dataset([{'c': [5]}, {'answer.example.a': [1]}, {'answer.example.b': [2]}])
746
-
747
-
748
- Args:
749
- field: The field to flatten
750
- keep_original: If True, keeps the original field in the dataset
751
-
1005
+ Expand a field containing dictionaries into separate fields.
1006
+
1007
+ This method takes a field that contains a list of dictionaries and expands
1008
+ it into multiple fields, one for each key in the dictionaries. This is useful
1009
+ when working with nested data structures or results from extraction operations.
1010
+
1011
+ Parameters:
1012
+ field: The field containing dictionaries to flatten
1013
+ keep_original: Whether to retain the original field in the result
1014
+
752
1015
  Returns:
753
- A new dataset with the flattened fields
1016
+ A new Dataset with the dictionary keys expanded into separate fields
1017
+
1018
+ Notes:
1019
+ - Each key in the dictionaries becomes a new field with name pattern "{field}.{key}"
1020
+ - All dictionaries in the field must have compatible structures
1021
+ - If a dictionary is missing a key, the corresponding value will be None
1022
+ - Non-dictionary values in the field will cause a warning
1023
+
1024
+ Examples:
1025
+ >>> from edsl.dataset import Dataset
1026
+
1027
+ # Basic flattening of nested dictionaries
1028
+ >>> Dataset([{'a': [{'a': 1, 'b': 2}]}, {'c': [5]}]).flatten('a')
1029
+ Dataset([{'c': [5]}, {'a.a': [1]}, {'a.b': [2]}])
1030
+
1031
+ # Works with prefixed fields too
1032
+ >>> Dataset([{'answer.example': [{'a': 1, 'b': 2}]}, {'c': [5]}]).flatten('answer.example')
1033
+ Dataset([{'c': [5]}, {'answer.example.a': [1]}, {'answer.example.b': [2]}])
1034
+
1035
+ # Keep the original field if needed
1036
+ >>> d = Dataset([{'a': [{'a': 1, 'b': 2}]}, {'c': [5]}])
1037
+ >>> d.flatten('a', keep_original=True)
1038
+ Dataset([{'a': [{'a': 1, 'b': 2}]}, {'c': [5]}, {'a.a': [1]}, {'a.b': [2]}])
754
1039
  """
755
- from edsl.results.Dataset import Dataset
1040
+ from ..dataset import Dataset
756
1041
 
757
1042
  # Ensure the dataset isn't empty
758
1043
  if not self.data:
@@ -853,7 +1138,7 @@ class DatasetExportMixin:
853
1138
  A new Dataset with unpacked columns
854
1139
 
855
1140
  Examples:
856
- >>> from edsl.results.Dataset import Dataset
1141
+ >>> from edsl.dataset import Dataset
857
1142
  >>> d = Dataset([{'data': [[1, 2, 3], [4, 5, 6]]}])
858
1143
  >>> d.unpack_list('data')
859
1144
  Dataset([{'data': [[1, 2, 3], [4, 5, 6]]}, {'data_1': [1, 4]}, {'data_2': [2, 5]}, {'data_3': [3, 6]}])
@@ -861,7 +1146,7 @@ class DatasetExportMixin:
861
1146
  >>> d.unpack_list('data', new_names=['first', 'second', 'third'])
862
1147
  Dataset([{'data': [[1, 2, 3], [4, 5, 6]]}, {'first': [1, 4]}, {'second': [2, 5]}, {'third': [3, 6]}])
863
1148
  """
864
- from edsl.results.Dataset import Dataset
1149
+ from .dataset import Dataset
865
1150
 
866
1151
  # Create a copy of the dataset
867
1152
  result = Dataset(self.data.copy())
@@ -919,7 +1204,7 @@ class DatasetExportMixin:
919
1204
  KeyError: If the field_name doesn't exist in the dataset.
920
1205
 
921
1206
  Examples:
922
- >>> from edsl.results.Dataset import Dataset
1207
+ >>> from .dataset import Dataset
923
1208
  >>> d = Dataset([{'a': [1, 2, 3]}, {'b': [4, 5, 6]}])
924
1209
  >>> d.drop('a')
925
1210
  Dataset([{'b': [4, 5, 6]}])
@@ -929,7 +1214,7 @@ class DatasetExportMixin:
929
1214
  ...
930
1215
  KeyError: "Field 'c' not found in dataset"
931
1216
  """
932
- from edsl.results.Dataset import Dataset
1217
+ from .dataset import Dataset
933
1218
 
934
1219
  # Check if field exists in the dataset
935
1220
  if field_name not in self.relevant_columns():
@@ -959,14 +1244,15 @@ class DatasetExportMixin:
959
1244
  >>> r.select('how_feeling', 'how_feeling_yesterday').remove_prefix().relevant_columns()
960
1245
  ['how_feeling', 'how_feeling_yesterday']
961
1246
 
962
- >>> from edsl.results.Dataset import Dataset
1247
+ >>> from edsl.dataset import Dataset
963
1248
  >>> d = Dataset([{'a.x': [1, 2, 3]}, {'b.x': [4, 5, 6]}])
964
- >>> d.remove_prefix()
965
- Traceback (most recent call last):
966
- ...
967
- ValueError: Removing prefixes would result in duplicate column names: ['x']
1249
+ >>> # d.remove_prefix()
1250
+
1251
+ Traceback (most recent call last):
1252
+ ...
1253
+ ValueError: Removing prefixes would result in duplicate column names: ['x']
968
1254
  """
969
- from edsl.results.Dataset import Dataset
1255
+ from .dataset import Dataset
970
1256
 
971
1257
  # Get all column names
972
1258
  columns = self.relevant_columns()
@@ -1002,6 +1288,204 @@ class DatasetExportMixin:
1002
1288
  return Dataset(new_data)
1003
1289
 
1004
1290
 
1291
+ from functools import wraps
1292
+
1293
+ def to_dataset(func):
1294
+ """
1295
+ Decorator that ensures functions receive a Dataset object as their first argument.
1296
+
1297
+ This decorator automatically converts various EDSL container objects (Results,
1298
+ AgentList, ScenarioList) to Dataset objects before passing them to the decorated
1299
+ function. This allows methods defined in DataOperationsBase to work seamlessly
1300
+ across different container types without duplicating conversion logic.
1301
+
1302
+ Parameters:
1303
+ func: The function to decorate
1304
+
1305
+ Returns:
1306
+ A wrapped function that ensures its first argument is a Dataset
1307
+
1308
+ Notes:
1309
+ - For Results objects, calls select() to convert to a Dataset
1310
+ - For AgentList and ScenarioList objects, calls their to_dataset() method
1311
+ - For Dataset objects, passes them through unchanged
1312
+ - This decorator is used internally by the mixin system to enable method sharing
1313
+ """
1314
+ @wraps(func)
1315
+ def wrapper(self, *args, **kwargs):
1316
+ """Execute the function with self converted to a Dataset if needed."""
1317
+ # Convert to Dataset based on the class type
1318
+ if self.__class__.__name__ == "Results":
1319
+ dataset_self = self.select()
1320
+ elif self.__class__.__name__ == "AgentList":
1321
+ dataset_self = self.to_dataset()
1322
+ elif self.__class__.__name__ == "ScenarioList":
1323
+ dataset_self = self.to_dataset()
1324
+ else:
1325
+ dataset_self = self
1326
+
1327
+ # Call the function with the converted self
1328
+ return func(dataset_self, *args, **kwargs)
1329
+
1330
+ # Mark the wrapper as being wrapped by to_dataset
1331
+ wrapper._is_wrapped = True
1332
+ return wrapper
1333
+
1334
+
1335
+ def decorate_methods_from_mixin(cls, mixin_cls):
1336
+ """
1337
+ Apply the to_dataset decorator to methods inherited from a mixin class.
1338
+
1339
+ This function is part of EDSL's method inheritance system. It takes methods
1340
+ from a source mixin class, applies the to_dataset decorator to them, and adds
1341
+ them to a target class. This enables the sharing of data manipulation methods
1342
+ across different container types while ensuring they receive the right data type.
1343
+
1344
+ The function is careful not to override methods that are already defined in
1345
+ more specific parent classes, preserving the method resolution order (MRO).
1346
+
1347
+ Parameters:
1348
+ cls: The target class to add decorated methods to
1349
+ mixin_cls: The source mixin class providing the methods
1350
+
1351
+ Returns:
1352
+ The modified target class with decorated methods added
1353
+
1354
+ Notes:
1355
+ - Only public methods (not starting with "_") are decorated and added
1356
+ - Methods already defined in more specific parent classes are not overridden
1357
+ - Methods from DataOperationsBase are not skipped to ensure all base methods are available
1358
+ """
1359
+ # Get all attributes, including inherited ones
1360
+ for attr_name in dir(mixin_cls):
1361
+ # Skip magic methods and private methods
1362
+ if not attr_name.startswith('_'):
1363
+ attr_value = getattr(mixin_cls, attr_name)
1364
+ if callable(attr_value):
1365
+ # Check if the method is already defined in the class's MRO
1366
+ # but skip DataOperationsBase methods
1367
+ for base in cls.__mro__[1:]: # Skip the class itself
1368
+ if (attr_name in base.__dict__ and
1369
+ base is not DataOperationsBase):
1370
+ # Method is overridden in a more specific class, skip decorating
1371
+ break
1372
+ else:
1373
+ # Method not overridden, safe to decorate
1374
+ setattr(cls, attr_name, to_dataset(attr_value))
1375
+ return cls
1376
+
1377
+ # def decorate_methods_from_mixin(cls, mixin_cls):
1378
+ # """Decorates all methods from mixin_cls with to_dataset decorator."""
1379
+
1380
+ # # Get all attributes, including inherited ones
1381
+ # for attr_name in dir(mixin_cls):
1382
+ # # Skip magic methods and private methods
1383
+ # if not attr_name.startswith('_'):
1384
+ # attr_value = getattr(mixin_cls, attr_name)
1385
+ # if callable(attr_value):
1386
+ # setattr(cls, attr_name, to_dataset(attr_value))
1387
+ # return cls
1388
+
1389
+ class DatasetOperationsMixin(DataOperationsBase):
1390
+ """
1391
+ Mixin providing data manipulation operations for Dataset objects.
1392
+
1393
+ This mixin class is the cornerstone of EDSL's data manipulation system. It directly
1394
+ inherits methods from DataOperationsBase without requiring conversion, as it's
1395
+ designed specifically for the Dataset class. It serves as the primary implementation
1396
+ of all data operations methods that other container types will inherit and adapt
1397
+ through the to_dataset decorator.
1398
+
1399
+ The design follows a standard mixin pattern where common functionality is defined
1400
+ in a standalone class that can be "mixed in" to other classes. In EDSL's case,
1401
+ this allows different container types (Results, AgentList, ScenarioList) to share
1402
+ the same powerful data manipulation interface.
1403
+
1404
+ Key features:
1405
+
1406
+ 1. Data Transformation:
1407
+ - Filtering with `filter()`
1408
+ - Creating new columns with `mutate()`
1409
+ - Reshaping with `long()`, `wide()`, `flatten()`, etc.
1410
+ - Selecting specific data with `select()`
1411
+
1412
+ 2. Visualization:
1413
+ - Table display with `table()`
1414
+ - R integration with `ggplot2()`
1415
+ - Report generation with `report()`
1416
+
1417
+ 3. Data Export:
1418
+ - To files with `to_csv()`, `to_excel()`, etc.
1419
+ - To other formats with `to_pandas()`, `to_dicts()`, etc.
1420
+
1421
+ 4. Analysis:
1422
+ - SQL queries with `sql()`
1423
+ - Aggregation with `tally()`
1424
+ - Tree-based exploration with `tree()`
1425
+
1426
+ This mixin is designed for fluent method chaining, allowing complex data manipulation
1427
+ pipelines to be built in an expressive and readable way.
1428
+ """
1429
+ pass
1430
+
1431
+ class ResultsOperationsMixin(DataOperationsBase):
1432
+ """
1433
+ Mixin providing data operations for Results objects.
1434
+
1435
+ This mixin adapts DatasetOperationsMixin methods to work with Results objects.
1436
+ When a method is called on a Results object, it's automatically converted to
1437
+ a Dataset first via the to_dataset decorator applied in __init_subclass__.
1438
+
1439
+ This allows Results objects to have the same data manipulation capabilities
1440
+ as Dataset objects without duplicating code.
1441
+ """
1442
+ def __init_subclass__(cls, **kwargs):
1443
+ """
1444
+ Automatically decorate all methods from DatasetOperationsMixin.
1445
+
1446
+ This hook runs when a class inherits from ResultsOperationsMixin,
1447
+ applying the to_dataset decorator to all methods from DatasetOperationsMixin.
1448
+ """
1449
+ super().__init_subclass__(**kwargs)
1450
+ decorate_methods_from_mixin(cls, DatasetOperationsMixin)
1451
+
1452
+ class ScenarioListOperationsMixin(DataOperationsBase):
1453
+ """
1454
+ Mixin providing data operations for ScenarioList objects.
1455
+
1456
+ This mixin adapts DatasetOperationsMixin methods to work with ScenarioList objects.
1457
+ ScenarioList objects are converted to Dataset objects before method execution
1458
+ via the to_dataset decorator applied in __init_subclass__.
1459
+ """
1460
+ def __init_subclass__(cls, **kwargs):
1461
+ """
1462
+ Automatically decorate all methods from DatasetOperationsMixin.
1463
+
1464
+ This hook runs when a class inherits from ScenarioListOperationsMixin,
1465
+ applying the to_dataset decorator to all methods from DatasetOperationsMixin.
1466
+ """
1467
+ super().__init_subclass__(**kwargs)
1468
+ decorate_methods_from_mixin(cls, DatasetOperationsMixin)
1469
+
1470
+ class AgentListOperationsMixin(DataOperationsBase):
1471
+ """
1472
+ Mixin providing data operations for AgentList objects.
1473
+
1474
+ This mixin adapts DatasetOperationsMixin methods to work with AgentList objects.
1475
+ AgentList objects are converted to Dataset objects before method execution
1476
+ via the to_dataset decorator applied in __init_subclass__.
1477
+ """
1478
+ def __init_subclass__(cls, **kwargs):
1479
+ """
1480
+ Automatically decorate all methods from DatasetOperationsMixin.
1481
+
1482
+ This hook runs when a class inherits from AgentListOperationsMixin,
1483
+ applying the to_dataset decorator to all methods from DatasetOperationsMixin.
1484
+ """
1485
+ super().__init_subclass__(**kwargs)
1486
+ decorate_methods_from_mixin(cls, DatasetOperationsMixin)
1487
+
1488
+
1005
1489
  if __name__ == "__main__":
1006
1490
  import doctest
1007
1491