edsl 0.1.47__py3-none-any.whl → 0.1.48__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- edsl/__init__.py +44 -39
- edsl/__version__.py +1 -1
- edsl/agents/__init__.py +4 -2
- edsl/agents/{Agent.py → agent.py} +442 -152
- edsl/agents/{AgentList.py → agent_list.py} +220 -162
- edsl/agents/descriptors.py +46 -7
- edsl/{exceptions/agents.py → agents/exceptions.py} +3 -12
- edsl/base/__init__.py +75 -0
- edsl/base/base_class.py +1303 -0
- edsl/base/data_transfer_models.py +114 -0
- edsl/base/enums.py +215 -0
- edsl/base.py +8 -0
- edsl/buckets/__init__.py +25 -0
- edsl/buckets/bucket_collection.py +324 -0
- edsl/buckets/model_buckets.py +206 -0
- edsl/buckets/token_bucket.py +502 -0
- edsl/{jobs/buckets/TokenBucketAPI.py → buckets/token_bucket_api.py} +1 -1
- edsl/buckets/token_bucket_client.py +509 -0
- edsl/caching/__init__.py +20 -0
- edsl/caching/cache.py +814 -0
- edsl/caching/cache_entry.py +427 -0
- edsl/{data/CacheHandler.py → caching/cache_handler.py} +14 -15
- edsl/caching/exceptions.py +24 -0
- edsl/caching/orm.py +30 -0
- edsl/{data/RemoteCacheSync.py → caching/remote_cache_sync.py} +3 -3
- edsl/caching/sql_dict.py +441 -0
- edsl/config/__init__.py +8 -0
- edsl/config/config_class.py +177 -0
- edsl/config.py +4 -176
- edsl/conversation/Conversation.py +7 -7
- edsl/conversation/car_buying.py +4 -4
- edsl/conversation/chips.py +6 -6
- edsl/coop/__init__.py +25 -2
- edsl/coop/coop.py +303 -67
- edsl/coop/{ExpectedParrotKeyHandler.py → ep_key_handling.py} +86 -10
- edsl/coop/exceptions.py +62 -0
- edsl/coop/price_fetcher.py +126 -0
- edsl/coop/utils.py +89 -24
- edsl/data_transfer_models.py +5 -72
- edsl/dataset/__init__.py +10 -0
- edsl/{results/Dataset.py → dataset/dataset.py} +116 -36
- edsl/{results/DatasetExportMixin.py → dataset/dataset_operations_mixin.py} +606 -122
- edsl/{results/DatasetTree.py → dataset/dataset_tree.py} +156 -75
- edsl/{results/TableDisplay.py → dataset/display/table_display.py} +18 -7
- edsl/{results → dataset/display}/table_renderers.py +58 -2
- edsl/{results → dataset}/file_exports.py +4 -5
- edsl/{results → dataset}/smart_objects.py +2 -2
- edsl/enums.py +5 -205
- edsl/inference_services/__init__.py +5 -0
- edsl/inference_services/{AvailableModelCacheHandler.py → available_model_cache_handler.py} +2 -3
- edsl/inference_services/{AvailableModelFetcher.py → available_model_fetcher.py} +8 -14
- edsl/inference_services/data_structures.py +3 -2
- edsl/{exceptions/inference_services.py → inference_services/exceptions.py} +1 -1
- edsl/inference_services/{InferenceServiceABC.py → inference_service_abc.py} +1 -1
- edsl/inference_services/{InferenceServicesCollection.py → inference_services_collection.py} +8 -7
- edsl/inference_services/registry.py +4 -41
- edsl/inference_services/{ServiceAvailability.py → service_availability.py} +5 -25
- edsl/inference_services/services/__init__.py +31 -0
- edsl/inference_services/{AnthropicService.py → services/anthropic_service.py} +3 -3
- edsl/inference_services/{AwsBedrock.py → services/aws_bedrock.py} +2 -2
- edsl/inference_services/{AzureAI.py → services/azure_ai.py} +2 -2
- edsl/inference_services/{DeepInfraService.py → services/deep_infra_service.py} +1 -3
- edsl/inference_services/{DeepSeekService.py → services/deep_seek_service.py} +2 -4
- edsl/inference_services/{GoogleService.py → services/google_service.py} +5 -4
- edsl/inference_services/{GroqService.py → services/groq_service.py} +1 -1
- edsl/inference_services/{MistralAIService.py → services/mistral_ai_service.py} +3 -3
- edsl/inference_services/{OllamaService.py → services/ollama_service.py} +1 -7
- edsl/inference_services/{OpenAIService.py → services/open_ai_service.py} +5 -6
- edsl/inference_services/{PerplexityService.py → services/perplexity_service.py} +3 -7
- edsl/inference_services/{TestService.py → services/test_service.py} +7 -6
- edsl/inference_services/{TogetherAIService.py → services/together_ai_service.py} +2 -6
- edsl/inference_services/{XAIService.py → services/xai_service.py} +1 -1
- edsl/inference_services/write_available.py +1 -2
- edsl/instructions/__init__.py +6 -0
- edsl/{surveys/instructions/Instruction.py → instructions/instruction.py} +11 -6
- edsl/{surveys/instructions/InstructionCollection.py → instructions/instruction_collection.py} +10 -5
- edsl/{surveys/InstructionHandler.py → instructions/instruction_handler.py} +3 -3
- edsl/{jobs/interviews → interviews}/ReportErrors.py +2 -2
- edsl/interviews/__init__.py +4 -0
- edsl/{jobs/AnswerQuestionFunctionConstructor.py → interviews/answering_function.py} +45 -18
- edsl/{jobs/interviews/InterviewExceptionEntry.py → interviews/exception_tracking.py} +107 -22
- edsl/interviews/interview.py +638 -0
- edsl/{jobs/interviews/InterviewStatusDictionary.py → interviews/interview_status_dictionary.py} +21 -12
- edsl/{jobs/interviews/InterviewStatusLog.py → interviews/interview_status_log.py} +16 -7
- edsl/{jobs/InterviewTaskManager.py → interviews/interview_task_manager.py} +12 -7
- edsl/{jobs/RequestTokenEstimator.py → interviews/request_token_estimator.py} +8 -3
- edsl/{jobs/interviews/InterviewStatistic.py → interviews/statistics.py} +36 -10
- edsl/invigilators/__init__.py +38 -0
- edsl/invigilators/invigilator_base.py +477 -0
- edsl/{agents/Invigilator.py → invigilators/invigilators.py} +263 -10
- edsl/invigilators/prompt_constructor.py +476 -0
- edsl/{agents → invigilators}/prompt_helpers.py +2 -1
- edsl/{agents/QuestionInstructionPromptBuilder.py → invigilators/question_instructions_prompt_builder.py} +18 -13
- edsl/{agents → invigilators}/question_option_processor.py +96 -21
- edsl/{agents/QuestionTemplateReplacementsBuilder.py → invigilators/question_template_replacements_builder.py} +64 -12
- edsl/jobs/__init__.py +7 -1
- edsl/jobs/async_interview_runner.py +99 -35
- edsl/jobs/check_survey_scenario_compatibility.py +7 -5
- edsl/jobs/data_structures.py +153 -22
- edsl/{exceptions/jobs.py → jobs/exceptions.py} +2 -1
- edsl/jobs/{FetchInvigilator.py → fetch_invigilator.py} +4 -4
- edsl/jobs/{loggers/HTMLTableJobLogger.py → html_table_job_logger.py} +6 -2
- edsl/jobs/{Jobs.py → jobs.py} +313 -167
- edsl/jobs/{JobsChecks.py → jobs_checks.py} +15 -7
- edsl/jobs/{JobsComponentConstructor.py → jobs_component_constructor.py} +19 -17
- edsl/jobs/{InterviewsConstructor.py → jobs_interview_constructor.py} +10 -5
- edsl/jobs/jobs_pricing_estimation.py +347 -0
- edsl/jobs/{JobsRemoteInferenceLogger.py → jobs_remote_inference_logger.py} +4 -3
- edsl/jobs/jobs_runner_asyncio.py +282 -0
- edsl/jobs/{JobsRemoteInferenceHandler.py → remote_inference.py} +19 -22
- edsl/jobs/results_exceptions_handler.py +2 -2
- edsl/key_management/__init__.py +28 -0
- edsl/key_management/key_lookup.py +161 -0
- edsl/{language_models/key_management/KeyLookupBuilder.py → key_management/key_lookup_builder.py} +118 -47
- edsl/key_management/key_lookup_collection.py +82 -0
- edsl/key_management/models.py +218 -0
- edsl/language_models/__init__.py +7 -2
- edsl/language_models/{ComputeCost.py → compute_cost.py} +18 -3
- edsl/{exceptions/language_models.py → language_models/exceptions.py} +2 -1
- edsl/language_models/language_model.py +1080 -0
- edsl/language_models/model.py +10 -25
- edsl/language_models/{ModelList.py → model_list.py} +9 -14
- edsl/language_models/{RawResponseHandler.py → raw_response_handler.py} +1 -1
- edsl/language_models/{RegisterLanguageModelsMeta.py → registry.py} +1 -1
- edsl/language_models/repair.py +4 -4
- edsl/language_models/utilities.py +4 -4
- edsl/notebooks/__init__.py +3 -1
- edsl/notebooks/{Notebook.py → notebook.py} +7 -8
- edsl/prompts/__init__.py +1 -1
- edsl/{exceptions/prompts.py → prompts/exceptions.py} +3 -1
- edsl/prompts/{Prompt.py → prompt.py} +101 -95
- edsl/questions/HTMLQuestion.py +1 -1
- edsl/questions/__init__.py +154 -25
- edsl/questions/answer_validator_mixin.py +1 -1
- edsl/questions/compose_questions.py +4 -3
- edsl/questions/derived/question_likert_five.py +166 -0
- edsl/questions/derived/{QuestionLinearScale.py → question_linear_scale.py} +4 -4
- edsl/questions/derived/{QuestionTopK.py → question_top_k.py} +4 -4
- edsl/questions/derived/{QuestionYesNo.py → question_yes_no.py} +4 -5
- edsl/questions/descriptors.py +24 -30
- edsl/questions/loop_processor.py +65 -19
- edsl/questions/question_base.py +881 -0
- edsl/questions/question_base_gen_mixin.py +15 -16
- edsl/questions/{QuestionBasePromptsMixin.py → question_base_prompts_mixin.py} +2 -2
- edsl/questions/{QuestionBudget.py → question_budget.py} +3 -4
- edsl/questions/{QuestionCheckBox.py → question_check_box.py} +16 -16
- edsl/questions/{QuestionDict.py → question_dict.py} +39 -5
- edsl/questions/{QuestionExtract.py → question_extract.py} +9 -9
- edsl/questions/question_free_text.py +282 -0
- edsl/questions/{QuestionFunctional.py → question_functional.py} +6 -5
- edsl/questions/{QuestionList.py → question_list.py} +6 -7
- edsl/questions/{QuestionMatrix.py → question_matrix.py} +6 -5
- edsl/questions/{QuestionMultipleChoice.py → question_multiple_choice.py} +126 -21
- edsl/questions/{QuestionNumerical.py → question_numerical.py} +5 -5
- edsl/questions/{QuestionRank.py → question_rank.py} +6 -6
- edsl/questions/question_registry.py +4 -9
- edsl/questions/register_questions_meta.py +8 -4
- edsl/questions/response_validator_abc.py +17 -16
- edsl/results/__init__.py +4 -1
- edsl/{exceptions/results.py → results/exceptions.py} +1 -1
- edsl/results/report.py +197 -0
- edsl/results/{Result.py → result.py} +131 -45
- edsl/results/{Results.py → results.py} +365 -220
- edsl/results/results_selector.py +344 -25
- edsl/scenarios/__init__.py +30 -3
- edsl/scenarios/{ConstructDownloadLink.py → construct_download_link.py} +7 -0
- edsl/scenarios/directory_scanner.py +156 -13
- edsl/scenarios/document_chunker.py +186 -0
- edsl/scenarios/exceptions.py +101 -0
- edsl/scenarios/file_methods.py +2 -3
- edsl/scenarios/{FileStore.py → file_store.py} +275 -189
- edsl/scenarios/handlers/__init__.py +14 -14
- edsl/scenarios/handlers/{csv.py → csv_file_store.py} +1 -2
- edsl/scenarios/handlers/{docx.py → docx_file_store.py} +8 -7
- edsl/scenarios/handlers/{html.py → html_file_store.py} +1 -2
- edsl/scenarios/handlers/{jpeg.py → jpeg_file_store.py} +1 -1
- edsl/scenarios/handlers/{json.py → json_file_store.py} +1 -1
- edsl/scenarios/handlers/latex_file_store.py +5 -0
- edsl/scenarios/handlers/{md.py → md_file_store.py} +1 -1
- edsl/scenarios/handlers/{pdf.py → pdf_file_store.py} +2 -2
- edsl/scenarios/handlers/{png.py → png_file_store.py} +1 -1
- edsl/scenarios/handlers/{pptx.py → pptx_file_store.py} +8 -7
- edsl/scenarios/handlers/{py.py → py_file_store.py} +1 -3
- edsl/scenarios/handlers/{sql.py → sql_file_store.py} +2 -1
- edsl/scenarios/handlers/{sqlite.py → sqlite_file_store.py} +2 -3
- edsl/scenarios/handlers/{txt.py → txt_file_store.py} +1 -1
- edsl/scenarios/scenario.py +928 -0
- edsl/scenarios/scenario_join.py +18 -5
- edsl/scenarios/{ScenarioList.py → scenario_list.py} +294 -106
- edsl/scenarios/{ScenarioListPdfMixin.py → scenario_list_pdf_tools.py} +16 -15
- edsl/scenarios/scenario_selector.py +5 -1
- edsl/study/ObjectEntry.py +2 -2
- edsl/study/SnapShot.py +5 -5
- edsl/study/Study.py +18 -19
- edsl/study/__init__.py +6 -4
- edsl/surveys/__init__.py +7 -4
- edsl/surveys/dag/__init__.py +2 -0
- edsl/surveys/{ConstructDAG.py → dag/construct_dag.py} +3 -3
- edsl/surveys/{DAG.py → dag/dag.py} +13 -10
- edsl/surveys/descriptors.py +1 -1
- edsl/surveys/{EditSurvey.py → edit_survey.py} +9 -9
- edsl/{exceptions/surveys.py → surveys/exceptions.py} +1 -2
- edsl/surveys/memory/__init__.py +3 -0
- edsl/surveys/{MemoryPlan.py → memory/memory_plan.py} +10 -9
- edsl/surveys/rules/__init__.py +3 -0
- edsl/surveys/{Rule.py → rules/rule.py} +103 -43
- edsl/surveys/{RuleCollection.py → rules/rule_collection.py} +21 -30
- edsl/surveys/{RuleManager.py → rules/rule_manager.py} +19 -13
- edsl/surveys/survey.py +1743 -0
- edsl/surveys/{SurveyExportMixin.py → survey_export.py} +22 -27
- edsl/surveys/{SurveyFlowVisualization.py → survey_flow_visualization.py} +11 -2
- edsl/surveys/{Simulator.py → survey_simulator.py} +10 -3
- edsl/tasks/__init__.py +32 -0
- edsl/{jobs/tasks/QuestionTaskCreator.py → tasks/question_task_creator.py} +115 -57
- edsl/tasks/task_creators.py +135 -0
- edsl/{jobs/tasks/TaskHistory.py → tasks/task_history.py} +86 -47
- edsl/{jobs/tasks → tasks}/task_status_enum.py +91 -7
- edsl/tasks/task_status_log.py +85 -0
- edsl/tokens/__init__.py +2 -0
- edsl/tokens/interview_token_usage.py +53 -0
- edsl/utilities/PrettyList.py +1 -1
- edsl/utilities/SystemInfo.py +25 -22
- edsl/utilities/__init__.py +29 -21
- edsl/utilities/gcp_bucket/__init__.py +2 -0
- edsl/utilities/gcp_bucket/cloud_storage.py +99 -96
- edsl/utilities/interface.py +44 -536
- edsl/{results/MarkdownToPDF.py → utilities/markdown_to_pdf.py} +13 -5
- edsl/utilities/repair_functions.py +1 -1
- {edsl-0.1.47.dist-info → edsl-0.1.48.dist-info}/METADATA +1 -1
- edsl-0.1.48.dist-info/RECORD +347 -0
- edsl/Base.py +0 -493
- edsl/BaseDiff.py +0 -260
- edsl/agents/InvigilatorBase.py +0 -260
- edsl/agents/PromptConstructor.py +0 -318
- edsl/coop/PriceFetcher.py +0 -54
- edsl/data/Cache.py +0 -582
- edsl/data/CacheEntry.py +0 -238
- edsl/data/SQLiteDict.py +0 -292
- edsl/data/__init__.py +0 -5
- edsl/data/orm.py +0 -10
- edsl/exceptions/cache.py +0 -5
- edsl/exceptions/coop.py +0 -14
- edsl/exceptions/data.py +0 -14
- edsl/exceptions/scenarios.py +0 -29
- edsl/jobs/Answers.py +0 -43
- edsl/jobs/JobsPrompts.py +0 -354
- edsl/jobs/buckets/BucketCollection.py +0 -134
- edsl/jobs/buckets/ModelBuckets.py +0 -65
- edsl/jobs/buckets/TokenBucket.py +0 -283
- edsl/jobs/buckets/TokenBucketClient.py +0 -191
- edsl/jobs/interviews/Interview.py +0 -395
- edsl/jobs/interviews/InterviewExceptionCollection.py +0 -99
- edsl/jobs/interviews/InterviewStatisticsCollection.py +0 -25
- edsl/jobs/runners/JobsRunnerAsyncio.py +0 -163
- edsl/jobs/runners/JobsRunnerStatusData.py +0 -0
- edsl/jobs/tasks/TaskCreators.py +0 -64
- edsl/jobs/tasks/TaskStatusLog.py +0 -23
- edsl/jobs/tokens/InterviewTokenUsage.py +0 -27
- edsl/language_models/LanguageModel.py +0 -635
- edsl/language_models/ServiceDataSources.py +0 -0
- edsl/language_models/key_management/KeyLookup.py +0 -63
- edsl/language_models/key_management/KeyLookupCollection.py +0 -38
- edsl/language_models/key_management/models.py +0 -137
- edsl/questions/QuestionBase.py +0 -544
- edsl/questions/QuestionFreeText.py +0 -130
- edsl/questions/derived/QuestionLikertFive.py +0 -76
- edsl/results/ResultsExportMixin.py +0 -45
- edsl/results/TextEditor.py +0 -50
- edsl/results/results_fetch_mixin.py +0 -33
- edsl/results/results_tools_mixin.py +0 -98
- edsl/scenarios/DocumentChunker.py +0 -104
- edsl/scenarios/Scenario.py +0 -548
- edsl/scenarios/ScenarioHtmlMixin.py +0 -65
- edsl/scenarios/ScenarioListExportMixin.py +0 -45
- edsl/scenarios/handlers/latex.py +0 -5
- edsl/shared.py +0 -1
- edsl/surveys/Survey.py +0 -1301
- edsl/surveys/SurveyQualtricsImport.py +0 -284
- edsl/surveys/SurveyToApp.py +0 -141
- edsl/surveys/instructions/__init__.py +0 -0
- edsl/tools/__init__.py +0 -1
- edsl/tools/clusters.py +0 -192
- edsl/tools/embeddings.py +0 -27
- edsl/tools/embeddings_plotting.py +0 -118
- edsl/tools/plotting.py +0 -112
- edsl/tools/summarize.py +0 -18
- edsl/utilities/data/Registry.py +0 -6
- edsl/utilities/data/__init__.py +0 -1
- edsl/utilities/data/scooter_results.json +0 -1
- edsl-0.1.47.dist-info/RECORD +0 -354
- /edsl/coop/{CoopFunctionsMixin.py → coop_functions.py} +0 -0
- /edsl/{results → dataset/display}/CSSParameterizer.py +0 -0
- /edsl/{language_models/key_management → dataset/display}/__init__.py +0 -0
- /edsl/{results → dataset/display}/table_data_class.py +0 -0
- /edsl/{results → dataset/display}/table_display.css +0 -0
- /edsl/{results/ResultsGGMixin.py → dataset/r/ggplot.py} +0 -0
- /edsl/{results → dataset}/tree_explore.py +0 -0
- /edsl/{surveys/instructions/ChangeInstruction.py → instructions/change_instruction.py} +0 -0
- /edsl/{jobs/interviews → interviews}/interview_status_enum.py +0 -0
- /edsl/jobs/{runners/JobsRunnerStatus.py → jobs_runner_status.py} +0 -0
- /edsl/language_models/{PriceManager.py → price_manager.py} +0 -0
- /edsl/language_models/{fake_openai_call.py → unused/fake_openai_call.py} +0 -0
- /edsl/language_models/{fake_openai_service.py → unused/fake_openai_service.py} +0 -0
- /edsl/notebooks/{NotebookToLaTeX.py → notebook_to_latex.py} +0 -0
- /edsl/{exceptions/questions.py → questions/exceptions.py} +0 -0
- /edsl/questions/{SimpleAskMixin.py → simple_ask_mixin.py} +0 -0
- /edsl/surveys/{Memory.py → memory/memory.py} +0 -0
- /edsl/surveys/{MemoryManagement.py → memory/memory_management.py} +0 -0
- /edsl/surveys/{SurveyCSS.py → survey_css.py} +0 -0
- /edsl/{jobs/tokens/TokenUsage.py → tokens/token_usage.py} +0 -0
- /edsl/{results/MarkdownToDocx.py → utilities/markdown_to_docx.py} +0 -0
- /edsl/{TemplateLoader.py → utilities/template_loader.py} +0 -0
- {edsl-0.1.47.dist-info → edsl-0.1.48.dist-info}/LICENSE +0 -0
- {edsl-0.1.47.dist-info → edsl-0.1.48.dist-info}/WHEEL +0 -0
@@ -1,25 +1,122 @@
|
|
1
|
-
"""
|
1
|
+
"""
|
2
|
+
This module provides mixin classes that enable powerful data manipulation operations
|
3
|
+
across various EDSL list-like objects.
|
2
4
|
|
5
|
+
The DataOperationsBase class defines common operations for working with structured data,
|
6
|
+
including data transformation, visualization, export, querying, and analysis. These
|
7
|
+
operations are inherited by different specialized mixins (DatasetOperationsMixin,
|
8
|
+
ResultsOperationsMixin, etc.) which implement class-specific behaviors.
|
9
|
+
|
10
|
+
The design pattern used here allows different container types (Results, Dataset,
|
11
|
+
ScenarioList, AgentList) to share the same data manipulation interface, enabling
|
12
|
+
fluid operations across different parts of the EDSL ecosystem.
|
13
|
+
"""
|
14
|
+
|
15
|
+
from abc import ABC, abstractmethod
|
3
16
|
import io
|
4
17
|
import warnings
|
5
18
|
import textwrap
|
6
|
-
from typing import Optional, Tuple, Union, List
|
19
|
+
from typing import Optional, Tuple, Union, List, TYPE_CHECKING
|
20
|
+
from .r.ggplot import GGPlotMethod
|
7
21
|
|
8
|
-
|
22
|
+
if TYPE_CHECKING:
|
23
|
+
from docx import Document
|
24
|
+
from .dataset import Dataset
|
9
25
|
|
26
|
+
class DataOperationsBase:
|
27
|
+
"""
|
28
|
+
Base class providing common data operations for EDSL container objects.
|
29
|
+
|
30
|
+
This class serves as the foundation for various data manipulation mixins,
|
31
|
+
providing a consistent interface for operations like filtering, aggregation,
|
32
|
+
transformation, visualization, and export across different types of EDSL
|
33
|
+
containers (Results, Dataset, ScenarioList, AgentList).
|
34
|
+
|
35
|
+
Key functionality categories:
|
36
|
+
|
37
|
+
1. Data Transformation:
|
38
|
+
- Filtering with `filter()`
|
39
|
+
- Creating new columns with `mutate()`
|
40
|
+
- Reshaping with `long()`, `wide()`, `flatten()`, etc.
|
41
|
+
- Selecting specific columns with `select()`
|
42
|
+
|
43
|
+
2. Visualization and Display:
|
44
|
+
- Tabular display with `table()`
|
45
|
+
- Plotting with `ggplot2()`
|
46
|
+
- Generating reports with `report()`
|
47
|
+
|
48
|
+
3. Data Export:
|
49
|
+
- To various formats with `to_csv()`, `to_excel()`, etc.
|
50
|
+
- To other data structures with `to_pandas()`, `to_dicts()`, etc.
|
51
|
+
|
52
|
+
4. Analysis:
|
53
|
+
- SQL-based querying with `sql()`
|
54
|
+
- Aggregation with `tally()`
|
55
|
+
- Tree-based exploration
|
56
|
+
|
57
|
+
These operations are designed to be applied fluently in sequence, enabling
|
58
|
+
expressive data manipulation pipelines.
|
59
|
+
"""
|
60
|
+
|
61
|
+
|
62
|
+
def ggplot2(
|
63
|
+
self,
|
64
|
+
ggplot_code: str,
|
65
|
+
shape: str = "wide",
|
66
|
+
sql: Optional[str] = None,
|
67
|
+
remove_prefix: bool = True,
|
68
|
+
debug: bool = False,
|
69
|
+
height: float = 4,
|
70
|
+
width: float = 6,
|
71
|
+
factor_orders: Optional[dict] = None,
|
72
|
+
):
|
73
|
+
"""
|
74
|
+
Create visualizations using R's ggplot2 library.
|
75
|
+
|
76
|
+
This method provides a bridge to R's powerful ggplot2 visualization library,
|
77
|
+
allowing you to create sophisticated plots directly from EDSL data structures.
|
78
|
+
|
79
|
+
Parameters:
|
80
|
+
ggplot_code: R code string containing ggplot2 commands
|
81
|
+
shape: Data shape to use ("wide" or "long")
|
82
|
+
sql: Optional SQL query to transform data before visualization
|
83
|
+
remove_prefix: Whether to remove prefixes (like "answer.") from column names
|
84
|
+
debug: Whether to display debugging information
|
85
|
+
height: Plot height in inches
|
86
|
+
width: Plot width in inches
|
87
|
+
factor_orders: Dictionary mapping factor variables to their desired order
|
88
|
+
|
89
|
+
Returns:
|
90
|
+
A plot object that renders in Jupyter notebooks
|
91
|
+
|
92
|
+
Notes:
|
93
|
+
- Requires R and the ggplot2 package to be installed
|
94
|
+
- Data is automatically converted to a format suitable for ggplot2
|
95
|
+
- The ggplot2 code should reference column names as they appear after
|
96
|
+
any transformations from the shape and remove_prefix parameters
|
97
|
+
|
98
|
+
Examples:
|
99
|
+
>>> from edsl.results import Results
|
100
|
+
>>> r = Results.example()
|
101
|
+
>>> # The following would create a plot if R is installed (not shown in doctest):
|
102
|
+
>>> # r.ggplot2('''
|
103
|
+
>>> # ggplot(df, aes(x=how_feeling)) +
|
104
|
+
>>> # geom_bar() +
|
105
|
+
>>> # labs(title="Distribution of Feelings")
|
106
|
+
>>> # ''')
|
107
|
+
"""
|
108
|
+
return GGPlotMethod(self).ggplot2(ggplot_code, shape, sql, remove_prefix, debug, height, width, factor_orders)
|
10
109
|
|
11
|
-
class DatasetExportMixin:
|
12
|
-
"""Mixin class for exporting Dataset objects."""
|
13
110
|
|
14
111
|
def relevant_columns(
|
15
|
-
self, data_type: Optional[str] = None, remove_prefix=False
|
112
|
+
self, data_type: Optional[str] = None, remove_prefix:bool=False
|
16
113
|
) -> list:
|
17
114
|
"""Return the set of keys that are present in the dataset.
|
18
115
|
|
19
116
|
:param data_type: The data type to filter by.
|
20
117
|
:param remove_prefix: Whether to remove the prefix from the column names.
|
21
118
|
|
22
|
-
>>> from
|
119
|
+
>>> from ..dataset import Dataset
|
23
120
|
>>> d = Dataset([{'a.b':[1,2,3,4]}])
|
24
121
|
>>> d.relevant_columns()
|
25
122
|
['a.b']
|
@@ -71,7 +168,7 @@ class DatasetExportMixin:
|
|
71
168
|
def num_observations(self):
|
72
169
|
"""Return the number of observations in the dataset.
|
73
170
|
|
74
|
-
>>> from edsl.results
|
171
|
+
>>> from edsl.results import Results
|
75
172
|
>>> Results.example().num_observations()
|
76
173
|
4
|
77
174
|
"""
|
@@ -89,7 +186,7 @@ class DatasetExportMixin:
|
|
89
186
|
|
90
187
|
return _num_observations
|
91
188
|
|
92
|
-
def
|
189
|
+
def make_tabular(
|
93
190
|
self, remove_prefix: bool, pretty_labels: Optional[dict] = None
|
94
191
|
) -> tuple[list, List[list]]:
|
95
192
|
"""Turn the results into a tabular format.
|
@@ -98,10 +195,10 @@ class DatasetExportMixin:
|
|
98
195
|
|
99
196
|
>>> from edsl.results import Results
|
100
197
|
>>> r = Results.example()
|
101
|
-
>>> r.select('how_feeling').
|
198
|
+
>>> r.select('how_feeling').make_tabular(remove_prefix = True)
|
102
199
|
(['how_feeling'], [['OK'], ['Great'], ['Terrible'], ['OK']])
|
103
200
|
|
104
|
-
>>> r.select('how_feeling').
|
201
|
+
>>> r.select('how_feeling').make_tabular(remove_prefix = True, pretty_labels = {'how_feeling': "How are you feeling"})
|
105
202
|
(['How are you feeling'], [['OK'], ['Great'], ['Terrible'], ['OK']])
|
106
203
|
"""
|
107
204
|
|
@@ -144,7 +241,7 @@ class DatasetExportMixin:
|
|
144
241
|
for value in list_of_values:
|
145
242
|
print(f"{key}: {value}")
|
146
243
|
|
147
|
-
def
|
244
|
+
def get_tabular_data(
|
148
245
|
self,
|
149
246
|
remove_prefix: bool = False,
|
150
247
|
pretty_labels: Optional[dict] = None,
|
@@ -161,7 +258,7 @@ class DatasetExportMixin:
|
|
161
258
|
if pretty_labels is None:
|
162
259
|
pretty_labels = {}
|
163
260
|
|
164
|
-
return self.
|
261
|
+
return self.make_tabular(
|
165
262
|
remove_prefix=remove_prefix, pretty_labels=pretty_labels
|
166
263
|
)
|
167
264
|
|
@@ -196,6 +293,8 @@ class DatasetExportMixin:
|
|
196
293
|
pretty_labels: Optional[dict] = None,
|
197
294
|
) -> Optional["FileStore"]:
|
198
295
|
"""Export the results to a FileStore instance containing CSV data."""
|
296
|
+
from .file_exports import CSVExport
|
297
|
+
|
199
298
|
exporter = CSVExport(
|
200
299
|
data=self,
|
201
300
|
filename=filename,
|
@@ -212,6 +311,8 @@ class DatasetExportMixin:
|
|
212
311
|
sheet_name: Optional[str] = None,
|
213
312
|
) -> Optional["FileStore"]:
|
214
313
|
"""Export the results to a FileStore instance containing Excel data."""
|
314
|
+
from .file_exports import ExcelExport
|
315
|
+
|
215
316
|
exporter = ExcelExport(
|
216
317
|
data=self,
|
217
318
|
filename=filename,
|
@@ -278,29 +379,51 @@ class DatasetExportMixin:
|
|
278
379
|
transpose_by: str = None,
|
279
380
|
remove_prefix: bool = True,
|
280
381
|
shape: str = "wide",
|
281
|
-
) ->
|
282
|
-
"""
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
|
289
|
-
|
290
|
-
|
291
|
-
|
292
|
-
|
293
|
-
|
294
|
-
|
295
|
-
|
296
|
-
|
297
|
-
|
382
|
+
) -> "Dataset":
|
383
|
+
"""
|
384
|
+
Execute SQL queries on the dataset.
|
385
|
+
|
386
|
+
This powerful method allows you to use SQL to query and transform your data,
|
387
|
+
combining the expressiveness of SQL with EDSL's data structures. It works by
|
388
|
+
creating an in-memory SQLite database from your data and executing the query
|
389
|
+
against it.
|
390
|
+
|
391
|
+
Parameters:
|
392
|
+
query: SQL query string to execute
|
393
|
+
transpose: Whether to transpose the resulting table (rows become columns)
|
394
|
+
transpose_by: Column to use as the new index when transposing
|
395
|
+
remove_prefix: Whether to remove type prefixes (e.g., "answer.") from column names
|
396
|
+
shape: Data shape to use ("wide" or "long")
|
397
|
+
- "wide": Default tabular format with columns for each field
|
398
|
+
- "long": Melted format with key-value pairs, useful for certain queries
|
399
|
+
|
400
|
+
Returns:
|
401
|
+
A Dataset object containing the query results
|
402
|
+
|
403
|
+
Notes:
|
404
|
+
- The data is stored in a table named "self" in the SQLite database
|
405
|
+
- In wide format, column names include their type prefix unless remove_prefix=True
|
406
|
+
- In long format, the data is melted into columns: row_number, key, value, data_type
|
407
|
+
- Complex objects like lists and dictionaries are converted to strings
|
408
|
+
|
298
409
|
Examples:
|
299
410
|
>>> from edsl import Results
|
300
|
-
>>> r = Results.example()
|
301
|
-
|
411
|
+
>>> r = Results.example()
|
412
|
+
|
413
|
+
# Basic selection
|
414
|
+
>>> len(r.sql("SELECT * FROM self", shape="wide"))
|
302
415
|
4
|
303
|
-
|
416
|
+
|
417
|
+
# Filtering with WHERE clause
|
418
|
+
>>> r.sql("SELECT * FROM self WHERE how_feeling = 'Great'").num_observations()
|
419
|
+
1
|
420
|
+
|
421
|
+
# Aggregation
|
422
|
+
>>> r.sql("SELECT how_feeling, COUNT(*) as count FROM self GROUP BY how_feeling").keys()
|
423
|
+
['how_feeling', 'count']
|
424
|
+
|
425
|
+
# Using long format
|
426
|
+
>>> len(r.sql("SELECT * FROM self", shape="long"))
|
304
427
|
172
|
305
428
|
"""
|
306
429
|
import pandas as pd
|
@@ -316,7 +439,7 @@ class DatasetExportMixin:
|
|
316
439
|
else:
|
317
440
|
df = df.set_index(df.columns[0])
|
318
441
|
df = df.transpose()
|
319
|
-
from
|
442
|
+
from .dataset import Dataset
|
320
443
|
|
321
444
|
return Dataset.from_pandas_dataframe(df)
|
322
445
|
|
@@ -372,6 +495,14 @@ class DatasetExportMixin:
|
|
372
495
|
csv_string = self.to_csv(remove_prefix=remove_prefix).text
|
373
496
|
df = pl.read_csv(io.StringIO(csv_string))
|
374
497
|
return df
|
498
|
+
|
499
|
+
def tree(self, node_order: Optional[List[str]] = None) -> "Tree":
|
500
|
+
"""Convert the results to a Tree.
|
501
|
+
|
502
|
+
:param node_order: The order of the nodes.
|
503
|
+
"""
|
504
|
+
from .dataset_tree import Tree
|
505
|
+
return Tree(self, node_order=node_order)
|
375
506
|
|
376
507
|
def to_scenario_list(self, remove_prefix: bool = True) -> list[dict]:
|
377
508
|
"""Convert the results to a list of dictionaries, one per scenario.
|
@@ -383,8 +514,7 @@ class DatasetExportMixin:
|
|
383
514
|
>>> r.select('how_feeling').to_scenario_list()
|
384
515
|
ScenarioList([Scenario({'how_feeling': 'OK'}), Scenario({'how_feeling': 'Great'}), Scenario({'how_feeling': 'Terrible'}), Scenario({'how_feeling': 'OK'})])
|
385
516
|
"""
|
386
|
-
from edsl.scenarios
|
387
|
-
from edsl.scenarios.Scenario import Scenario
|
517
|
+
from edsl.scenarios import ScenarioList, Scenario
|
388
518
|
|
389
519
|
list_of_dicts = self.to_dicts(remove_prefix=remove_prefix)
|
390
520
|
scenarios = []
|
@@ -402,8 +532,7 @@ class DatasetExportMixin:
|
|
402
532
|
>>> r.select('how_feeling').to_agent_list()
|
403
533
|
AgentList([Agent(traits = {'how_feeling': 'OK'}), Agent(traits = {'how_feeling': 'Great'}), Agent(traits = {'how_feeling': 'Terrible'}), Agent(traits = {'how_feeling': 'OK'})])
|
404
534
|
"""
|
405
|
-
from edsl.agents import Agent
|
406
|
-
from edsl.agents.AgentList import AgentList
|
535
|
+
from edsl.agents import Agent, AgentList
|
407
536
|
|
408
537
|
list_of_dicts = self.to_dicts(remove_prefix=remove_prefix)
|
409
538
|
agents = []
|
@@ -464,11 +593,11 @@ class DatasetExportMixin:
|
|
464
593
|
>>> r.select('how_feeling').to_list()
|
465
594
|
['OK', 'Great', 'Terrible', 'OK']
|
466
595
|
|
467
|
-
>>> from edsl.
|
596
|
+
>>> from edsl.dataset import Dataset
|
468
597
|
>>> Dataset([{'a.b': [[1, 9], 2, 3, 4]}]).select('a.b').to_list(flatten = True)
|
469
598
|
[1, 9, 2, 3, 4]
|
470
599
|
|
471
|
-
>>> from edsl.
|
600
|
+
>>> from edsl.dataset import Dataset
|
472
601
|
>>> Dataset([{'a.b': [[1, 9], 2, 3, 4]}, {'c': [6, 2, 3, 4]}]).select('a.b', 'c').to_list(flatten = True)
|
473
602
|
Traceback (most recent call last):
|
474
603
|
...
|
@@ -545,42 +674,18 @@ class DatasetExportMixin:
|
|
545
674
|
if return_link:
|
546
675
|
return filename
|
547
676
|
|
548
|
-
def
|
549
|
-
|
550
|
-
|
551
|
-
"""Takes the fields in order and returns a report of the results by iterating through rows.
|
552
|
-
The row number is printed as # Observation: <row number>
|
553
|
-
The name of the field is used as markdown header at level "##"
|
554
|
-
The content of that field is then printed.
|
555
|
-
Then the next field and so on.
|
556
|
-
Once that row is done, a new line is printed and the next row is shown.
|
557
|
-
If in a jupyter notebook, the report is displayed as markdown.
|
677
|
+
def _prepare_report_data(self, *fields: Optional[str], top_n: Optional[int] = None,
|
678
|
+
header_fields: Optional[List[str]] = None) -> tuple:
|
679
|
+
"""Prepares data for report generation in various formats.
|
558
680
|
|
559
681
|
Args:
|
560
682
|
*fields: The fields to include in the report. If none provided, all fields are used.
|
561
683
|
top_n: Optional limit on the number of observations to include.
|
562
684
|
header_fields: Optional list of fields to include in the main header instead of as sections.
|
563
|
-
divider: If True, adds a horizontal rule between observations for better visual separation.
|
564
|
-
return_string: If True, returns the markdown string. If False (default in notebooks),
|
565
|
-
only displays the markdown without returning.
|
566
685
|
|
567
686
|
Returns:
|
568
|
-
A
|
569
|
-
|
570
|
-
Examples:
|
571
|
-
>>> from edsl.results import Results
|
572
|
-
>>> r = Results.example()
|
573
|
-
>>> report = r.select('how_feeling', 'how_feeling_yesterday').report(return_string=True)
|
574
|
-
>>> "# Observation: 1" in report
|
575
|
-
True
|
576
|
-
>>> "## answer.how_feeling" in report
|
577
|
-
True
|
578
|
-
>>> report = r.select('how_feeling').report(header_fields=['answer.how_feeling'], return_string=True)
|
579
|
-
>>> "# Observation: 1 (`how_feeling`: OK)" in report
|
580
|
-
True
|
687
|
+
A tuple containing (field_data, num_obs, fields, header_fields)
|
581
688
|
"""
|
582
|
-
from edsl.utilities.utilities import is_notebook
|
583
|
-
|
584
689
|
# If no fields specified, use all columns
|
585
690
|
if not fields:
|
586
691
|
fields = self.relevant_columns()
|
@@ -607,8 +712,22 @@ class DatasetExportMixin:
|
|
607
712
|
num_obs = self.num_observations()
|
608
713
|
if top_n is not None:
|
609
714
|
num_obs = min(num_obs, top_n)
|
715
|
+
|
716
|
+
return field_data, num_obs, fields, header_fields
|
717
|
+
|
718
|
+
def _report_markdown(self, field_data, num_obs, fields, header_fields, divider: bool = True) -> str:
|
719
|
+
"""Generates a markdown report from the prepared data.
|
610
720
|
|
611
|
-
|
721
|
+
Args:
|
722
|
+
field_data: Dictionary mapping field names to their values
|
723
|
+
num_obs: Number of observations to include
|
724
|
+
fields: Fields to include as sections
|
725
|
+
header_fields: Fields to include in the observation header
|
726
|
+
divider: If True, adds a horizontal rule between observations
|
727
|
+
|
728
|
+
Returns:
|
729
|
+
A string containing the markdown report
|
730
|
+
"""
|
612
731
|
report_lines = []
|
613
732
|
for i in range(num_obs):
|
614
733
|
# Create header with observation number and any header fields
|
@@ -642,34 +761,176 @@ class DatasetExportMixin:
|
|
642
761
|
else:
|
643
762
|
report_lines.append("") # Empty line between observations
|
644
763
|
|
645
|
-
|
764
|
+
return "\n".join(report_lines)
|
765
|
+
|
766
|
+
def _report_docx(self, field_data, num_obs, fields, header_fields) -> "Document":
    """Generate a Word document report from the prepared data.

    One level-1 heading is written per observation, followed by a level-2
    heading and a paragraph for every field that is not already shown in
    the observation header. Observations are separated by page breaks.

    Args:
        field_data: Mapping from field name to a sequence of values,
            indexed by observation number.
        num_obs: Number of observations to include.
        fields: Fields to include as sections.
        header_fields: Fields to show in the observation heading instead
            of as sections. ``None`` is treated as "no header fields".

    Returns:
        A docx.Document object containing the report.

    Raises:
        ImportError: If the python-docx package is not installed.
    """
    # json is stdlib, so keep it outside the optional-dependency guard.
    import json

    try:
        from docx import Document
        from docx.shared import Pt
    except ImportError as err:
        raise ImportError("The python-docx package is required for DOCX export. Install it with 'pip install python-docx'.") from err

    # Tolerate a missing header list so the membership test below is safe.
    header_fields = header_fields or []

    doc = Document()

    for i in range(num_obs):
        # Heading: observation number plus any "name: value" header fields.
        header_text = f"Observation: {i+1}"
        header_parts = []
        for field in header_fields:
            value = field_data[field][i]
            # Show the field name without its prefix for cleaner display.
            display_name = field.split('.')[-1]
            header_parts.append(f"{display_name}: {value}")
        if header_parts:
            header_text += f" ({', '.join(header_parts)})"

        doc.add_heading(header_text, level=1)

        # Remaining fields become their own sections.
        for field in fields:
            if field in header_fields:
                continue
            doc.add_heading(field, level=2)
            value = field_data[field][i]

            if isinstance(value, (list, dict)):
                # Structured data is pretty-printed in a monospaced font.
                # Both font attributes must be set on the SAME run; adding a
                # second empty run would leave the text at the default size.
                run = doc.add_paragraph().add_run(json.dumps(value, indent=2))
                run.font.name = 'Courier New'
                run.font.size = Pt(10)
            else:
                doc.add_paragraph(str(value))

        # Page break between observations, but not after the last one.
        if i < num_obs - 1:
            doc.add_page_break()

    return doc
|
822
|
+
|
823
|
+
def report(self, *fields: Optional[str], top_n: Optional[int] = None,
           header_fields: Optional[List[str]] = None, divider: bool = True,
           return_string: bool = False, format: str = "markdown",
           filename: Optional[str] = None) -> Optional[Union[str, "docx.Document"]]:
    """Build a per-observation report as markdown text or a Word document.

    The data is first prepared by ``_prepare_report_data`` and then rendered
    by ``_report_markdown`` or ``_report_docx`` depending on ``format``
    (compared case-insensitively).

    Args:
        *fields: The fields to include in the report. Passed through to
            ``_prepare_report_data``.
        top_n: Optional limit on the number of observations to include.
        header_fields: Optional list of fields to show in each observation
            header instead of as sections.
        divider: If True, adds a horizontal rule between observations
            (markdown only).
        return_string: Markdown only. When running in a notebook and this is
            False, the report is displayed and nothing is returned; in every
            other case the markdown string is returned.
        format: Output format - either "markdown" or "docx".
        filename: Docx only. If given, the document is saved to this path
            and nothing is returned.

    Returns:
        - markdown: the report string, or None when it was displayed in a
          notebook.
        - docx: the docx.Document object, or None when it was saved to
          ``filename``.

    Raises:
        ValueError: If ``format`` is neither "markdown" nor "docx".

    Examples:
        >>> from edsl.results import Results
        >>> r = Results.example()
        >>> report = r.select('how_feeling').report(return_string=True)
        >>> "# Observation: 1" in report
        True
        >>> doc = r.select('how_feeling').report(format="docx")
        >>> isinstance(doc, object)
        True
    """
    from edsl.utilities.utilities import is_notebook

    # Resolve fields / header fields / observation count once, up front.
    prepared = self._prepare_report_data(
        *fields, top_n=top_n, header_fields=header_fields
    )
    field_data, num_obs, fields, header_fields = prepared

    requested = format.lower()

    if requested == "markdown":
        report_text = self._report_markdown(
            field_data, num_obs, fields, header_fields, divider
        )
        # Inside a notebook the default is to render rather than return.
        if is_notebook() and not return_string:
            from IPython.display import Markdown, display
            display(Markdown(report_text))
            return None
        return report_text

    if requested == "docx":
        doc = self._report_docx(field_data, num_obs, fields, header_fields)
        if not filename:
            return doc
        # A filename means "write to disk" and hand nothing back.
        doc.save(filename)
        print(f"Report saved to {filename}")
        return None

    raise ValueError(f"Unsupported format: {format}. Use 'markdown' or 'docx'.")
|
657
890
|
|
658
891
|
def tally(
|
659
892
|
self, *fields: Optional[str], top_n: Optional[int] = None, output="Dataset"
|
660
893
|
) -> Union[dict, "Dataset"]:
|
661
|
-
"""
|
662
|
-
|
663
|
-
|
664
|
-
|
665
|
-
|
666
|
-
|
667
|
-
|
668
|
-
|
669
|
-
|
670
|
-
|
671
|
-
|
672
|
-
|
894
|
+
"""
|
895
|
+
Count frequency distributions of values in specified fields.
|
896
|
+
|
897
|
+
This method tallies the occurrence of unique values within one or more fields,
|
898
|
+
similar to a GROUP BY and COUNT in SQL. When multiple fields are provided, it
|
899
|
+
performs cross-tabulation across those fields.
|
900
|
+
|
901
|
+
Parameters:
|
902
|
+
*fields: Field names to tally. If none provided, uses all available fields.
|
903
|
+
top_n: Optional limit to return only the top N most frequent values.
|
904
|
+
output: Format for results, either "Dataset" (recommended) or "dict".
|
905
|
+
|
906
|
+
Returns:
|
907
|
+
By default, returns a Dataset with columns for the field(s) and a 'count' column.
|
908
|
+
If output="dict", returns a dictionary mapping values to counts.
|
909
|
+
|
910
|
+
Notes:
|
911
|
+
- For single fields, returns counts of each unique value
|
912
|
+
- For multiple fields, returns counts of each unique combination of values
|
913
|
+
- Results are sorted in descending order by count
|
914
|
+
- Fields can be specified with or without their type prefix
|
915
|
+
|
916
|
+
Examples:
|
917
|
+
>>> from edsl import Results
|
918
|
+
>>> r = Results.example()
|
919
|
+
|
920
|
+
# Single field frequency count
|
921
|
+
>>> r.select('how_feeling').tally('answer.how_feeling', output="dict")
|
922
|
+
{'OK': 2, 'Great': 1, 'Terrible': 1}
|
923
|
+
|
924
|
+
# Return as Dataset (default)
|
925
|
+
>>> from edsl.dataset import Dataset
|
926
|
+
>>> expected = Dataset([{'answer.how_feeling': ['OK', 'Great', 'Terrible']}, {'count': [2, 1, 1]}])
|
927
|
+
>>> r.select('how_feeling').tally('answer.how_feeling', output="Dataset") == expected
|
928
|
+
True
|
929
|
+
|
930
|
+
# Multi-field cross-tabulation - exact output varies based on data
|
931
|
+
>>> result = r.tally('how_feeling', 'how_feeling_yesterday')
|
932
|
+
>>> 'how_feeling' in result.keys() and 'how_feeling_yesterday' in result.keys() and 'count' in result.keys()
|
933
|
+
True
|
673
934
|
"""
|
674
935
|
from collections import Counter
|
675
936
|
|
@@ -684,7 +945,9 @@ class DatasetExportMixin:
|
|
684
945
|
f in self.relevant_columns() or f in relevant_columns_without_prefix
|
685
946
|
for f in fields
|
686
947
|
):
|
687
|
-
raise ValueError("One or more specified fields are not in the dataset."
|
948
|
+
raise ValueError("One or more specified fields are not in the dataset."
|
949
|
+
f"The available fields are: {self.relevant_columns()}"
|
950
|
+
)
|
688
951
|
|
689
952
|
if len(fields) == 1:
|
690
953
|
field = fields[0]
|
@@ -695,13 +958,18 @@ class DatasetExportMixin:
|
|
695
958
|
for value in values:
|
696
959
|
if isinstance(value, list):
|
697
960
|
value = tuple(value)
|
698
|
-
|
699
|
-
|
961
|
+
try:
|
962
|
+
tally = dict(Counter(values))
|
963
|
+
except TypeError:
|
964
|
+
tally = dict(Counter([str(v) for v in values]))
|
965
|
+
except Exception as e:
|
966
|
+
raise ValueError(f"Error tallying values: {e}")
|
967
|
+
|
700
968
|
sorted_tally = dict(sorted(tally.items(), key=lambda item: -item[1]))
|
701
969
|
if top_n is not None:
|
702
970
|
sorted_tally = dict(list(sorted_tally.items())[:top_n])
|
703
971
|
|
704
|
-
from
|
972
|
+
from ..dataset import Dataset
|
705
973
|
|
706
974
|
if output == "dict":
|
707
975
|
# why did I do this?
|
@@ -732,27 +1000,44 @@ class DatasetExportMixin:
|
|
732
1000
|
keys.append("count")
|
733
1001
|
return sl.reorder_keys(keys).to_dataset()
|
734
1002
|
|
735
|
-
def flatten(self, field, keep_original=False):
|
1003
|
+
def flatten(self, field: str, keep_original: bool = False) -> "Dataset":
|
736
1004
|
"""
|
737
|
-
|
738
|
-
|
739
|
-
|
740
|
-
|
741
|
-
|
742
|
-
|
743
|
-
|
744
|
-
|
745
|
-
|
746
|
-
|
747
|
-
|
748
|
-
Args:
|
749
|
-
field: The field to flatten
|
750
|
-
keep_original: If True, keeps the original field in the dataset
|
751
|
-
|
1005
|
+
Expand a field containing dictionaries into separate fields.
|
1006
|
+
|
1007
|
+
This method takes a field that contains a list of dictionaries and expands
|
1008
|
+
it into multiple fields, one for each key in the dictionaries. This is useful
|
1009
|
+
when working with nested data structures or results from extraction operations.
|
1010
|
+
|
1011
|
+
Parameters:
|
1012
|
+
field: The field containing dictionaries to flatten
|
1013
|
+
keep_original: Whether to retain the original field in the result
|
1014
|
+
|
752
1015
|
Returns:
|
753
|
-
A new
|
1016
|
+
A new Dataset with the dictionary keys expanded into separate fields
|
1017
|
+
|
1018
|
+
Notes:
|
1019
|
+
- Each key in the dictionaries becomes a new field with name pattern "{field}.{key}"
|
1020
|
+
- All dictionaries in the field must have compatible structures
|
1021
|
+
- If a dictionary is missing a key, the corresponding value will be None
|
1022
|
+
- Non-dictionary values in the field will cause a warning
|
1023
|
+
|
1024
|
+
Examples:
|
1025
|
+
>>> from edsl.dataset import Dataset
|
1026
|
+
|
1027
|
+
# Basic flattening of nested dictionaries
|
1028
|
+
>>> Dataset([{'a': [{'a': 1, 'b': 2}]}, {'c': [5]}]).flatten('a')
|
1029
|
+
Dataset([{'c': [5]}, {'a.a': [1]}, {'a.b': [2]}])
|
1030
|
+
|
1031
|
+
# Works with prefixed fields too
|
1032
|
+
>>> Dataset([{'answer.example': [{'a': 1, 'b': 2}]}, {'c': [5]}]).flatten('answer.example')
|
1033
|
+
Dataset([{'c': [5]}, {'answer.example.a': [1]}, {'answer.example.b': [2]}])
|
1034
|
+
|
1035
|
+
# Keep the original field if needed
|
1036
|
+
>>> d = Dataset([{'a': [{'a': 1, 'b': 2}]}, {'c': [5]}])
|
1037
|
+
>>> d.flatten('a', keep_original=True)
|
1038
|
+
Dataset([{'a': [{'a': 1, 'b': 2}]}, {'c': [5]}, {'a.a': [1]}, {'a.b': [2]}])
|
754
1039
|
"""
|
755
|
-
from
|
1040
|
+
from ..dataset import Dataset
|
756
1041
|
|
757
1042
|
# Ensure the dataset isn't empty
|
758
1043
|
if not self.data:
|
@@ -853,7 +1138,7 @@ class DatasetExportMixin:
|
|
853
1138
|
A new Dataset with unpacked columns
|
854
1139
|
|
855
1140
|
Examples:
|
856
|
-
>>> from edsl.
|
1141
|
+
>>> from edsl.dataset import Dataset
|
857
1142
|
>>> d = Dataset([{'data': [[1, 2, 3], [4, 5, 6]]}])
|
858
1143
|
>>> d.unpack_list('data')
|
859
1144
|
Dataset([{'data': [[1, 2, 3], [4, 5, 6]]}, {'data_1': [1, 4]}, {'data_2': [2, 5]}, {'data_3': [3, 6]}])
|
@@ -861,7 +1146,7 @@ class DatasetExportMixin:
|
|
861
1146
|
>>> d.unpack_list('data', new_names=['first', 'second', 'third'])
|
862
1147
|
Dataset([{'data': [[1, 2, 3], [4, 5, 6]]}, {'first': [1, 4]}, {'second': [2, 5]}, {'third': [3, 6]}])
|
863
1148
|
"""
|
864
|
-
from
|
1149
|
+
from .dataset import Dataset
|
865
1150
|
|
866
1151
|
# Create a copy of the dataset
|
867
1152
|
result = Dataset(self.data.copy())
|
@@ -919,7 +1204,7 @@ class DatasetExportMixin:
|
|
919
1204
|
KeyError: If the field_name doesn't exist in the dataset.
|
920
1205
|
|
921
1206
|
Examples:
|
922
|
-
>>> from
|
1207
|
+
>>> from .dataset import Dataset
|
923
1208
|
>>> d = Dataset([{'a': [1, 2, 3]}, {'b': [4, 5, 6]}])
|
924
1209
|
>>> d.drop('a')
|
925
1210
|
Dataset([{'b': [4, 5, 6]}])
|
@@ -929,7 +1214,7 @@ class DatasetExportMixin:
|
|
929
1214
|
...
|
930
1215
|
KeyError: "Field 'c' not found in dataset"
|
931
1216
|
"""
|
932
|
-
from
|
1217
|
+
from .dataset import Dataset
|
933
1218
|
|
934
1219
|
# Check if field exists in the dataset
|
935
1220
|
if field_name not in self.relevant_columns():
|
@@ -959,14 +1244,15 @@ class DatasetExportMixin:
|
|
959
1244
|
>>> r.select('how_feeling', 'how_feeling_yesterday').remove_prefix().relevant_columns()
|
960
1245
|
['how_feeling', 'how_feeling_yesterday']
|
961
1246
|
|
962
|
-
>>> from edsl.
|
1247
|
+
>>> from edsl.dataset import Dataset
|
963
1248
|
>>> d = Dataset([{'a.x': [1, 2, 3]}, {'b.x': [4, 5, 6]}])
|
964
|
-
>>> d.remove_prefix()
|
965
|
-
|
966
|
-
|
967
|
-
|
1249
|
+
>>> # d.remove_prefix()
|
1250
|
+
|
1251
|
+
Traceback (most recent call last):
|
1252
|
+
...
|
1253
|
+
ValueError: Removing prefixes would result in duplicate column names: ['x']
|
968
1254
|
"""
|
969
|
-
from
|
1255
|
+
from .dataset import Dataset
|
970
1256
|
|
971
1257
|
# Get all column names
|
972
1258
|
columns = self.relevant_columns()
|
@@ -1002,6 +1288,204 @@ class DatasetExportMixin:
|
|
1002
1288
|
return Dataset(new_data)
|
1003
1289
|
|
1004
1290
|
|
1291
|
+
from functools import wraps
|
1292
|
+
|
1293
|
+
def to_dataset(func):
    """Decorator that hands the wrapped function a Dataset-like ``self``.

    The receiver is converted according to the *name* of its class:

    - ``"Results"``: replaced by ``self.select()``
    - ``"AgentList"`` / ``"ScenarioList"``: replaced by ``self.to_dataset()``
    - anything else: passed through unchanged

    Parameters:
        func: The function to decorate; it is called with the converted
            receiver followed by the original positional and keyword
            arguments.

    Returns:
        The wrapping function. It carries ``_is_wrapped = True`` so callers
        can recognise functions produced by this decorator.
    """
    @wraps(func)
    def wrapper(self, *args, **kwargs):
        """Call ``func`` with ``self`` converted as described above."""
        # Dispatch on the class name rather than the type itself.
        class_name = self.__class__.__name__
        if class_name == "Results":
            receiver = self.select()
        elif class_name in ("AgentList", "ScenarioList"):
            receiver = self.to_dataset()
        else:
            receiver = self

        return func(receiver, *args, **kwargs)

    # Marker identifying wrappers created by to_dataset.
    wrapper._is_wrapped = True
    return wrapper
|
1333
|
+
|
1334
|
+
|
1335
|
+
def decorate_methods_from_mixin(cls, mixin_cls):
    """Copy public callables from ``mixin_cls`` onto ``cls``, wrapped by ``to_dataset``.

    Every attribute reported by ``dir(mixin_cls)`` whose name does not start
    with an underscore and whose value is callable is considered. It is
    installed on ``cls`` as ``to_dataset(attribute)`` unless some class in
    ``cls.__mro__[1:]`` other than ``DataOperationsBase`` defines an
    attribute of that name in its own ``__dict__``.

    Parameters:
        cls: The target class that receives the decorated methods.
        mixin_cls: The class whose public callables are wrapped.

    Returns:
        ``cls``, modified in place.

    Notes:
        - A definition in ``DataOperationsBase`` does not count as an
          override, so base-class methods are still wrapped.
        - The check starts at ``cls.__mro__[1:]``; ``cls``'s own ``__dict__``
          is not consulted, so a same-named attribute defined directly on
          ``cls`` is replaced unless an ancestor also defines it.
    """
    # Ancestors only: the class itself is deliberately left out of the check.
    ancestors = cls.__mro__[1:]

    for attr_name in dir(mixin_cls):
        # Private and dunder names are never copied.
        if attr_name.startswith('_'):
            continue

        attr_value = getattr(mixin_cls, attr_name)
        if not callable(attr_value):
            continue

        # True when a more specific ancestor supplies its own definition.
        overridden = any(
            attr_name in base.__dict__ and base is not DataOperationsBase
            for base in ancestors
        )
        if not overridden:
            setattr(cls, attr_name, to_dataset(attr_value))

    return cls
|
1376
|
+
|
1377
|
+
# def decorate_methods_from_mixin(cls, mixin_cls):
|
1378
|
+
# """Decorates all methods from mixin_cls with to_dataset decorator."""
|
1379
|
+
|
1380
|
+
# # Get all attributes, including inherited ones
|
1381
|
+
# for attr_name in dir(mixin_cls):
|
1382
|
+
# # Skip magic methods and private methods
|
1383
|
+
# if not attr_name.startswith('_'):
|
1384
|
+
# attr_value = getattr(mixin_cls, attr_name)
|
1385
|
+
# if callable(attr_value):
|
1386
|
+
# setattr(cls, attr_name, to_dataset(attr_value))
|
1387
|
+
# return cls
|
1388
|
+
|
1389
|
+
class DatasetOperationsMixin(DataOperationsBase):
    """
    Data-operations mixin intended for Dataset objects.

    The class adds nothing of its own: every operation is inherited directly
    from DataOperationsBase, with no conversion step. It is also the class
    whose public methods the Results, ScenarioList and AgentList mixins wrap
    with the to_dataset decorator, so those containers expose the same
    interface.

    Operations visible on the base class include report generation with
    `report()`, frequency counts with `tally()` and reshaping helpers such
    as `flatten()`, `unpack_list()`, `drop()` and `remove_prefix()`.
    Further transformation, visualization, export and query helpers are
    expected to come from DataOperationsBase as well; consult that class
    for the authoritative list.
    """
|
1430
|
+
|
1431
|
+
class ResultsOperationsMixin(DataOperationsBase):
    """
    Data-operations mixin for Results objects.

    Any class that inherits from this mixin has the public methods of
    DatasetOperationsMixin installed on it, each wrapped with the to_dataset
    decorator. The wrapper converts the receiving object to a Dataset before
    the underlying method runs, so Results objects share the Dataset
    operations without duplicating them.
    """

    def __init_subclass__(cls, **kwargs):
        """
        Install the wrapped DatasetOperationsMixin methods on the new subclass.

        Called automatically when a class is defined with
        ResultsOperationsMixin among its bases; keyword arguments are
        forwarded to the next ``__init_subclass__`` in the chain.
        """
        super().__init_subclass__(**kwargs)
        decorate_methods_from_mixin(cls, DatasetOperationsMixin)
|
1451
|
+
|
1452
|
+
class ScenarioListOperationsMixin(DataOperationsBase):
    """
    Data-operations mixin for ScenarioList objects.

    Any class that inherits from this mixin has the public methods of
    DatasetOperationsMixin installed on it, each wrapped with the to_dataset
    decorator, which converts the receiving object to a Dataset before the
    underlying method runs.
    """

    def __init_subclass__(cls, **kwargs):
        """
        Install the wrapped DatasetOperationsMixin methods on the new subclass.

        Called automatically when a class is defined with
        ScenarioListOperationsMixin among its bases; keyword arguments are
        forwarded to the next ``__init_subclass__`` in the chain.
        """
        super().__init_subclass__(**kwargs)
        decorate_methods_from_mixin(cls, DatasetOperationsMixin)
|
1469
|
+
|
1470
|
+
class AgentListOperationsMixin(DataOperationsBase):
    """
    Data-operations mixin for AgentList objects.

    Any class that inherits from this mixin has the public methods of
    DatasetOperationsMixin installed on it, each wrapped with the to_dataset
    decorator, which converts the receiving object to a Dataset before the
    underlying method runs.
    """

    def __init_subclass__(cls, **kwargs):
        """
        Install the wrapped DatasetOperationsMixin methods on the new subclass.

        Called automatically when a class is defined with
        AgentListOperationsMixin among its bases; keyword arguments are
        forwarded to the next ``__init_subclass__`` in the chain.
        """
        super().__init_subclass__(**kwargs)
        decorate_methods_from_mixin(cls, DatasetOperationsMixin)
|
1487
|
+
|
1488
|
+
|
1005
1489
|
if __name__ == "__main__":
|
1006
1490
|
import doctest
|
1007
1491
|
|