edsl 0.1.37.dev6__py3-none-any.whl → 0.1.38.dev1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (261) hide show
  1. edsl/Base.py +303 -303
  2. edsl/BaseDiff.py +260 -260
  3. edsl/TemplateLoader.py +24 -24
  4. edsl/__init__.py +48 -48
  5. edsl/__version__.py +1 -1
  6. edsl/agents/Agent.py +855 -855
  7. edsl/agents/AgentList.py +350 -350
  8. edsl/agents/Invigilator.py +222 -222
  9. edsl/agents/InvigilatorBase.py +284 -284
  10. edsl/agents/PromptConstructor.py +353 -353
  11. edsl/agents/__init__.py +3 -3
  12. edsl/agents/descriptors.py +99 -99
  13. edsl/agents/prompt_helpers.py +129 -129
  14. edsl/auto/AutoStudy.py +117 -117
  15. edsl/auto/StageBase.py +230 -230
  16. edsl/auto/StageGenerateSurvey.py +178 -178
  17. edsl/auto/StageLabelQuestions.py +125 -125
  18. edsl/auto/StagePersona.py +61 -61
  19. edsl/auto/StagePersonaDimensionValueRanges.py +88 -88
  20. edsl/auto/StagePersonaDimensionValues.py +74 -74
  21. edsl/auto/StagePersonaDimensions.py +69 -69
  22. edsl/auto/StageQuestions.py +73 -73
  23. edsl/auto/SurveyCreatorPipeline.py +21 -21
  24. edsl/auto/utilities.py +224 -224
  25. edsl/base/Base.py +289 -289
  26. edsl/config.py +149 -149
  27. edsl/conjure/AgentConstructionMixin.py +160 -160
  28. edsl/conjure/Conjure.py +62 -62
  29. edsl/conjure/InputData.py +659 -659
  30. edsl/conjure/InputDataCSV.py +48 -48
  31. edsl/conjure/InputDataMixinQuestionStats.py +182 -182
  32. edsl/conjure/InputDataPyRead.py +91 -91
  33. edsl/conjure/InputDataSPSS.py +8 -8
  34. edsl/conjure/InputDataStata.py +8 -8
  35. edsl/conjure/QuestionOptionMixin.py +76 -76
  36. edsl/conjure/QuestionTypeMixin.py +23 -23
  37. edsl/conjure/RawQuestion.py +65 -65
  38. edsl/conjure/SurveyResponses.py +7 -7
  39. edsl/conjure/__init__.py +9 -9
  40. edsl/conjure/naming_utilities.py +263 -263
  41. edsl/conjure/utilities.py +201 -201
  42. edsl/conversation/Conversation.py +290 -290
  43. edsl/conversation/car_buying.py +58 -58
  44. edsl/conversation/chips.py +95 -95
  45. edsl/conversation/mug_negotiation.py +81 -81
  46. edsl/conversation/next_speaker_utilities.py +93 -93
  47. edsl/coop/PriceFetcher.py +54 -54
  48. edsl/coop/__init__.py +2 -2
  49. edsl/coop/coop.py +958 -958
  50. edsl/coop/utils.py +131 -131
  51. edsl/data/Cache.py +527 -527
  52. edsl/data/CacheEntry.py +228 -228
  53. edsl/data/CacheHandler.py +149 -149
  54. edsl/data/RemoteCacheSync.py +97 -97
  55. edsl/data/SQLiteDict.py +292 -292
  56. edsl/data/__init__.py +4 -4
  57. edsl/data/orm.py +10 -10
  58. edsl/data_transfer_models.py +73 -73
  59. edsl/enums.py +173 -173
  60. edsl/exceptions/BaseException.py +21 -21
  61. edsl/exceptions/__init__.py +54 -54
  62. edsl/exceptions/agents.py +38 -38
  63. edsl/exceptions/configuration.py +16 -16
  64. edsl/exceptions/coop.py +10 -10
  65. edsl/exceptions/data.py +14 -14
  66. edsl/exceptions/general.py +34 -34
  67. edsl/exceptions/jobs.py +33 -33
  68. edsl/exceptions/language_models.py +63 -63
  69. edsl/exceptions/prompts.py +15 -15
  70. edsl/exceptions/questions.py +91 -91
  71. edsl/exceptions/results.py +29 -29
  72. edsl/exceptions/scenarios.py +22 -22
  73. edsl/exceptions/surveys.py +37 -37
  74. edsl/inference_services/AnthropicService.py +87 -87
  75. edsl/inference_services/AwsBedrock.py +120 -120
  76. edsl/inference_services/AzureAI.py +217 -217
  77. edsl/inference_services/DeepInfraService.py +18 -18
  78. edsl/inference_services/GoogleService.py +156 -156
  79. edsl/inference_services/GroqService.py +20 -20
  80. edsl/inference_services/InferenceServiceABC.py +147 -147
  81. edsl/inference_services/InferenceServicesCollection.py +97 -97
  82. edsl/inference_services/MistralAIService.py +123 -123
  83. edsl/inference_services/OllamaService.py +18 -18
  84. edsl/inference_services/OpenAIService.py +224 -224
  85. edsl/inference_services/TestService.py +89 -89
  86. edsl/inference_services/TogetherAIService.py +170 -170
  87. edsl/inference_services/models_available_cache.py +118 -118
  88. edsl/inference_services/rate_limits_cache.py +25 -25
  89. edsl/inference_services/registry.py +39 -39
  90. edsl/inference_services/write_available.py +10 -10
  91. edsl/jobs/Answers.py +56 -56
  92. edsl/jobs/Jobs.py +1347 -1347
  93. edsl/jobs/__init__.py +1 -1
  94. edsl/jobs/buckets/BucketCollection.py +63 -63
  95. edsl/jobs/buckets/ModelBuckets.py +65 -65
  96. edsl/jobs/buckets/TokenBucket.py +248 -248
  97. edsl/jobs/interviews/Interview.py +661 -661
  98. edsl/jobs/interviews/InterviewExceptionCollection.py +99 -99
  99. edsl/jobs/interviews/InterviewExceptionEntry.py +186 -186
  100. edsl/jobs/interviews/InterviewStatistic.py +63 -63
  101. edsl/jobs/interviews/InterviewStatisticsCollection.py +25 -25
  102. edsl/jobs/interviews/InterviewStatusDictionary.py +78 -78
  103. edsl/jobs/interviews/InterviewStatusLog.py +92 -92
  104. edsl/jobs/interviews/ReportErrors.py +66 -66
  105. edsl/jobs/interviews/interview_status_enum.py +9 -9
  106. edsl/jobs/runners/JobsRunnerAsyncio.py +338 -338
  107. edsl/jobs/runners/JobsRunnerStatus.py +332 -332
  108. edsl/jobs/tasks/QuestionTaskCreator.py +242 -242
  109. edsl/jobs/tasks/TaskCreators.py +64 -64
  110. edsl/jobs/tasks/TaskHistory.py +442 -442
  111. edsl/jobs/tasks/TaskStatusLog.py +23 -23
  112. edsl/jobs/tasks/task_status_enum.py +163 -163
  113. edsl/jobs/tokens/InterviewTokenUsage.py +27 -27
  114. edsl/jobs/tokens/TokenUsage.py +34 -34
  115. edsl/language_models/KeyLookup.py +30 -30
  116. edsl/language_models/LanguageModel.py +706 -706
  117. edsl/language_models/ModelList.py +102 -102
  118. edsl/language_models/RegisterLanguageModelsMeta.py +184 -184
  119. edsl/language_models/__init__.py +3 -3
  120. edsl/language_models/fake_openai_call.py +15 -15
  121. edsl/language_models/fake_openai_service.py +61 -61
  122. edsl/language_models/registry.py +137 -137
  123. edsl/language_models/repair.py +156 -156
  124. edsl/language_models/unused/ReplicateBase.py +83 -83
  125. edsl/language_models/utilities.py +64 -64
  126. edsl/notebooks/Notebook.py +259 -259
  127. edsl/notebooks/__init__.py +1 -1
  128. edsl/prompts/Prompt.py +357 -357
  129. edsl/prompts/__init__.py +2 -2
  130. edsl/questions/AnswerValidatorMixin.py +289 -289
  131. edsl/questions/QuestionBase.py +656 -656
  132. edsl/questions/QuestionBaseGenMixin.py +161 -161
  133. edsl/questions/QuestionBasePromptsMixin.py +234 -234
  134. edsl/questions/QuestionBudget.py +227 -227
  135. edsl/questions/QuestionCheckBox.py +359 -359
  136. edsl/questions/QuestionExtract.py +183 -183
  137. edsl/questions/QuestionFreeText.py +114 -114
  138. edsl/questions/QuestionFunctional.py +159 -159
  139. edsl/questions/QuestionList.py +231 -231
  140. edsl/questions/QuestionMultipleChoice.py +286 -286
  141. edsl/questions/QuestionNumerical.py +153 -153
  142. edsl/questions/QuestionRank.py +324 -324
  143. edsl/questions/Quick.py +41 -41
  144. edsl/questions/RegisterQuestionsMeta.py +71 -71
  145. edsl/questions/ResponseValidatorABC.py +174 -174
  146. edsl/questions/SimpleAskMixin.py +73 -73
  147. edsl/questions/__init__.py +26 -26
  148. edsl/questions/compose_questions.py +98 -98
  149. edsl/questions/decorators.py +21 -21
  150. edsl/questions/derived/QuestionLikertFive.py +76 -76
  151. edsl/questions/derived/QuestionLinearScale.py +87 -87
  152. edsl/questions/derived/QuestionTopK.py +91 -91
  153. edsl/questions/derived/QuestionYesNo.py +82 -82
  154. edsl/questions/descriptors.py +413 -413
  155. edsl/questions/prompt_templates/question_budget.jinja +13 -13
  156. edsl/questions/prompt_templates/question_checkbox.jinja +32 -32
  157. edsl/questions/prompt_templates/question_extract.jinja +11 -11
  158. edsl/questions/prompt_templates/question_free_text.jinja +3 -3
  159. edsl/questions/prompt_templates/question_linear_scale.jinja +11 -11
  160. edsl/questions/prompt_templates/question_list.jinja +17 -17
  161. edsl/questions/prompt_templates/question_multiple_choice.jinja +33 -33
  162. edsl/questions/prompt_templates/question_numerical.jinja +36 -36
  163. edsl/questions/question_registry.py +147 -147
  164. edsl/questions/settings.py +12 -12
  165. edsl/questions/templates/budget/answering_instructions.jinja +7 -7
  166. edsl/questions/templates/budget/question_presentation.jinja +7 -7
  167. edsl/questions/templates/checkbox/answering_instructions.jinja +10 -10
  168. edsl/questions/templates/checkbox/question_presentation.jinja +22 -22
  169. edsl/questions/templates/extract/answering_instructions.jinja +7 -7
  170. edsl/questions/templates/likert_five/answering_instructions.jinja +10 -10
  171. edsl/questions/templates/likert_five/question_presentation.jinja +11 -11
  172. edsl/questions/templates/linear_scale/answering_instructions.jinja +5 -5
  173. edsl/questions/templates/linear_scale/question_presentation.jinja +5 -5
  174. edsl/questions/templates/list/answering_instructions.jinja +3 -3
  175. edsl/questions/templates/list/question_presentation.jinja +5 -5
  176. edsl/questions/templates/multiple_choice/answering_instructions.jinja +9 -9
  177. edsl/questions/templates/multiple_choice/question_presentation.jinja +11 -11
  178. edsl/questions/templates/numerical/answering_instructions.jinja +6 -6
  179. edsl/questions/templates/numerical/question_presentation.jinja +6 -6
  180. edsl/questions/templates/rank/answering_instructions.jinja +11 -11
  181. edsl/questions/templates/rank/question_presentation.jinja +15 -15
  182. edsl/questions/templates/top_k/answering_instructions.jinja +8 -8
  183. edsl/questions/templates/top_k/question_presentation.jinja +22 -22
  184. edsl/questions/templates/yes_no/answering_instructions.jinja +6 -6
  185. edsl/questions/templates/yes_no/question_presentation.jinja +11 -11
  186. edsl/results/Dataset.py +293 -293
  187. edsl/results/DatasetExportMixin.py +717 -717
  188. edsl/results/DatasetTree.py +145 -145
  189. edsl/results/Result.py +450 -450
  190. edsl/results/Results.py +1071 -1071
  191. edsl/results/ResultsDBMixin.py +238 -238
  192. edsl/results/ResultsExportMixin.py +43 -43
  193. edsl/results/ResultsFetchMixin.py +33 -33
  194. edsl/results/ResultsGGMixin.py +121 -121
  195. edsl/results/ResultsToolsMixin.py +98 -98
  196. edsl/results/Selector.py +135 -135
  197. edsl/results/__init__.py +2 -2
  198. edsl/results/tree_explore.py +115 -115
  199. edsl/scenarios/FileStore.py +458 -458
  200. edsl/scenarios/Scenario.py +546 -546
  201. edsl/scenarios/ScenarioHtmlMixin.py +64 -64
  202. edsl/scenarios/ScenarioList.py +1112 -1112
  203. edsl/scenarios/ScenarioListExportMixin.py +52 -52
  204. edsl/scenarios/ScenarioListPdfMixin.py +261 -261
  205. edsl/scenarios/__init__.py +4 -4
  206. edsl/shared.py +1 -1
  207. edsl/study/ObjectEntry.py +173 -173
  208. edsl/study/ProofOfWork.py +113 -113
  209. edsl/study/SnapShot.py +80 -80
  210. edsl/study/Study.py +528 -528
  211. edsl/study/__init__.py +4 -4
  212. edsl/surveys/DAG.py +148 -148
  213. edsl/surveys/Memory.py +31 -31
  214. edsl/surveys/MemoryPlan.py +244 -244
  215. edsl/surveys/Rule.py +330 -330
  216. edsl/surveys/RuleCollection.py +387 -387
  217. edsl/surveys/Survey.py +1795 -1795
  218. edsl/surveys/SurveyCSS.py +261 -261
  219. edsl/surveys/SurveyExportMixin.py +259 -259
  220. edsl/surveys/SurveyFlowVisualizationMixin.py +121 -121
  221. edsl/surveys/SurveyQualtricsImport.py +284 -284
  222. edsl/surveys/__init__.py +3 -3
  223. edsl/surveys/base.py +53 -53
  224. edsl/surveys/descriptors.py +56 -56
  225. edsl/surveys/instructions/ChangeInstruction.py +47 -47
  226. edsl/surveys/instructions/Instruction.py +51 -51
  227. edsl/surveys/instructions/InstructionCollection.py +77 -77
  228. edsl/templates/error_reporting/base.html +23 -23
  229. edsl/templates/error_reporting/exceptions_by_model.html +34 -34
  230. edsl/templates/error_reporting/exceptions_by_question_name.html +16 -16
  231. edsl/templates/error_reporting/exceptions_by_type.html +16 -16
  232. edsl/templates/error_reporting/interview_details.html +115 -115
  233. edsl/templates/error_reporting/interviews.html +9 -9
  234. edsl/templates/error_reporting/overview.html +4 -4
  235. edsl/templates/error_reporting/performance_plot.html +1 -1
  236. edsl/templates/error_reporting/report.css +73 -73
  237. edsl/templates/error_reporting/report.html +117 -117
  238. edsl/templates/error_reporting/report.js +25 -25
  239. edsl/tools/__init__.py +1 -1
  240. edsl/tools/clusters.py +192 -192
  241. edsl/tools/embeddings.py +27 -27
  242. edsl/tools/embeddings_plotting.py +118 -118
  243. edsl/tools/plotting.py +112 -112
  244. edsl/tools/summarize.py +18 -18
  245. edsl/utilities/SystemInfo.py +28 -28
  246. edsl/utilities/__init__.py +22 -22
  247. edsl/utilities/ast_utilities.py +25 -25
  248. edsl/utilities/data/Registry.py +6 -6
  249. edsl/utilities/data/__init__.py +1 -1
  250. edsl/utilities/data/scooter_results.json +1 -1
  251. edsl/utilities/decorators.py +77 -77
  252. edsl/utilities/gcp_bucket/cloud_storage.py +96 -96
  253. edsl/utilities/interface.py +627 -627
  254. edsl/utilities/repair_functions.py +28 -28
  255. edsl/utilities/restricted_python.py +70 -70
  256. edsl/utilities/utilities.py +409 -409
  257. {edsl-0.1.37.dev6.dist-info → edsl-0.1.38.dev1.dist-info}/LICENSE +21 -21
  258. {edsl-0.1.37.dev6.dist-info → edsl-0.1.38.dev1.dist-info}/METADATA +1 -1
  259. edsl-0.1.38.dev1.dist-info/RECORD +283 -0
  260. edsl-0.1.37.dev6.dist-info/RECORD +0 -283
  261. {edsl-0.1.37.dev6.dist-info → edsl-0.1.38.dev1.dist-info}/WHEEL +0 -0
edsl/conjure/InputData.py CHANGED
@@ -1,659 +1,659 @@
1
- import base64
2
- from abc import ABC, abstractmethod
3
- from typing import Dict, Callable, Optional, List, Generator, Tuple, Union
4
- from collections import namedtuple
5
- from typing import List, Union
6
-
7
- from edsl.questions.QuestionBase import QuestionBase
8
-
9
- from edsl.scenarios.ScenarioList import ScenarioList
10
- from edsl.surveys.Survey import Survey
11
- from edsl.conjure.SurveyResponses import SurveyResponses
12
- from edsl.conjure.naming_utilities import sanitize_string
13
- from edsl.utilities.utilities import is_valid_variable_name
14
-
15
- from edsl.conjure.RawQuestion import RawQuestion
16
- from edsl.conjure.AgentConstructionMixin import AgentConstructionMixin
17
-
18
- from edsl.conjure.QuestionOptionMixin import QuestionOptionMixin
19
- from edsl.conjure.InputDataMixinQuestionStats import InputDataMixinQuestionStats
20
- from edsl.conjure.QuestionTypeMixin import QuestionTypeMixin
21
-
22
-
23
- class InputDataABC(
24
- ABC,
25
- InputDataMixinQuestionStats,
26
- AgentConstructionMixin,
27
- QuestionOptionMixin,
28
- QuestionTypeMixin,
29
- ):
30
- """A class to represent the input data for a survey."""
31
-
32
- NUM_UNIQUE_THRESHOLD = 15
33
- FRAC_NUMERICAL_THRESHOLD = 0.8
34
- MULTIPLE_CHOICE_OTHER_THRESHOLD = 0.5
35
- OTHER_STRING = "Other:"
36
-
37
- question_attributes = [
38
- "num_responses",
39
- "num_unique_responses",
40
- "missing",
41
- "unique_responses",
42
- "frac_numerical",
43
- "top_5",
44
- "frac_obs_from_top_5",
45
- ]
46
- QuestionStats = namedtuple("QuestionStats", question_attributes)
47
-
48
def __init__(
    self,
    datafile_name: str,
    config: Optional[dict] = None,
    naming_function: Optional[Callable] = sanitize_string,
    raw_data: Optional[List] = None,
    binary: Optional[str] = None,
    question_names: Optional[List[str]] = None,
    question_texts: Optional[List[str]] = None,
    answer_codebook: Optional[Dict] = None,
    question_types: Optional[List[str]] = None,
    question_options: Optional[List] = None,
    order_options: bool = False,
    question_name_repair_func: Optional[Callable] = None,
) -> None:
    """Initialize the InputData object.

    Any of ``raw_data``, ``question_names``, ``question_texts`` and
    ``answer_codebook`` left as ``None`` is filled in by the matching
    property setter, which calls ``get_raw_data()``,
    ``get_question_names()``, ``get_question_texts()`` or
    ``get_answer_codebook()`` respectively.

    :param datafile_name: The name of the file containing the data.
    :param config: The configuration parameters for reading the data.
    :param naming_function: Stored on the instance as ``naming_function``.
    :param raw_data: The raw responses, as a list with one list of
        responses per question (same order as ``question_names``).
    :param binary: Base64 text of the data file. When ``None`` the file is
        read and encoded here; if the file does not exist ``binary`` is
        set to ``None``.
    :param question_names: The names of the questions.
    :param question_texts: The text of the questions.
    :param answer_codebook: The codebook for the answers.
    :param question_types: The types of the questions.
    :param question_options: The options for the questions.
    :param order_options: When truthy, ``self.order_options()`` is called
        at the end of initialization.
    :param question_name_repair_func: Callable applied to a question name
        that is not a valid variable name; a built-in default is used
        when this is falsy.
    :raises Exception: If both are given and the ``answer_codebook`` keys
        differ from ``question_names``, or if ``question_names`` and
        ``question_texts`` have different lengths.

    >>> id = InputDataABC.example(question_names = ['a','b'], answer_codebook = {'a': {'1':'yes', '2':'no'}, 'b': {'1':'yes', '2':'no'}})

    >>> id = InputDataABC.example(question_names = ['a','b'], answer_codebook = {'a': {'1':'yes', '2':'no'}, 'c': {'1':'yes', '2':'no'}})
    Traceback (most recent call last):
    ...
    Exception: The keys of the answer_codebook must match the question_names.
    """

    self.datafile_name = datafile_name
    self.config = config
    self.naming_function = naming_function

    if binary is not None:
        self.binary = binary
    else:
        # No payload supplied: snapshot the file as base64 text, tolerating
        # a missing file (binary is then None).
        try:
            with open(self.datafile_name, "rb") as file:
                self.binary = base64.b64encode(file.read()).decode()
        except FileNotFoundError:
            self.binary = None

    def default_repair_func(x):
        # Plain substring replacements, applied in this order.
        return (
            x.replace("#", "_num")
            .replace("class", "social_class")
            .replace("name", "respondent_name")
        )

    # Must be set before ``question_names`` is assigned below: the
    # question_names setter calls this function on invalid names.
    self.question_name_repair_func = (
        question_name_repair_func or default_repair_func
    )

    if answer_codebook is not None and question_names is not None:
        if set(answer_codebook.keys()) != set(question_names):
            raise Exception(
                "The keys of the answer_codebook must match the question_names."
            )

    if question_names is not None and question_texts is not None:
        if len(question_names) != len(question_texts):
            raise Exception(
                "The question_names and question_texts must have the same length."
            )

    # Each assignment goes through a property setter that loads the value
    # from the subclass hooks when it is None.
    self.question_texts = question_texts
    self.question_names = question_names
    self.answer_codebook = answer_codebook
    self.raw_data = raw_data

    # Needs names, codebook and raw data, all assigned just above.
    self.apply_codebook()

    self.question_types = question_types
    self.question_options = question_options
    if order_options:
        self.order_options()
129
-
130
@property
def download_link(self):
    """Return an IPython ``HTML`` anchor for downloading the data file.

    The anchor embeds ``self.binary`` in a ``data:text/plain;base64`` URI
    and suggests the last ``/``-separated component of ``datafile_name``
    as the download file name.
    """
    from IPython.display import HTML

    file_name = self.datafile_name.rsplit("/", 1)[-1]
    anchor = (
        f'<a href="data:text/plain;base64,{self.binary}" '
        f'download="{file_name}">Download {self.datafile_name}</a>'
    )
    return HTML(anchor)
137
-
138
@abstractmethod
def get_question_texts(self) -> List[str]:
    """Return the text of every question; subclasses must implement this.

    The ``question_texts`` setter calls this hook when it is given ``None``.

    >>> id = InputDataABC.example()
    >>> id.get_question_texts()
    ['how are you doing this morning?', 'how are you feeling?']

    """
    raise NotImplementedError
148
-
149
@abstractmethod
def get_raw_data(self) -> List[List[str]]:
    """Return the responses read from ``datafile_name``; subclasses must implement this.

    The result holds one inner list per question. The ``raw_data`` setter
    calls this hook when it is given ``None``.

    >>> id = InputDataABC.example()
    >>> id.get_raw_data()
    [['1', '4'], ['3', '6']]

    """
    raise NotImplementedError
159
-
160
@abstractmethod
def get_question_names(self) -> List[str]:
    """Return the name of every question; subclasses must implement this.

    The ``question_names`` setter calls this hook when it is given ``None``.

    >>> id = InputDataABC.example()
    >>> id.get_question_names()
    ['morning', 'feeling']

    """
    raise NotImplementedError
170
-
171
def rename_questions(
    self, rename_dict: Dict[str, str], ignore_missing=False
) -> "InputData":
    """Rename several questions at once.

    Each ``old -> new`` entry of ``rename_dict`` is forwarded to
    ``rename`` together with ``ignore_missing``. Returns ``self``.

    >>> id = InputDataABC.example()
    >>> id.rename_questions({'morning': 'evening'}).question_names
    ['evening', 'feeling']

    """
    for current in rename_dict:
        self.rename(current, rename_dict[current], ignore_missing=ignore_missing)
    return self
184
-
185
def rename(self, old_name, new_name, ignore_missing=False) -> "InputData":
    """Rename a question in place and move its codebook entry.

    :param old_name: Current name of the question.
    :param new_name: Name to give the question.
    :param ignore_missing: When true, a missing ``old_name`` is a no-op
        instead of an error.
    :raises ValueError: If ``old_name`` is not found (and
        ``ignore_missing`` is false), or if ``new_name`` already belongs
        to a different question.
    :return: ``self``.

    >>> id = InputDataABC.example()
    >>> id.rename('morning', 'evening').question_names
    ['evening', 'feeling']

    """
    names = self.question_names
    if old_name not in names:
        if ignore_missing:
            return self
        raise ValueError(f"Question {old_name} not found.")

    # Names must stay unique (the question_names setter enforces this);
    # a duplicate would make every later ``.index()`` lookup ambiguous.
    if new_name != old_name and new_name in names:
        raise ValueError(f"Question name {new_name} is already in use.")

    names[names.index(old_name)] = new_name
    # A question without a codebook entry gets an empty mapping.
    self.answer_codebook[new_name] = self.answer_codebook.pop(old_name, {})
    return self
204
-
205
def _drop_question(self, question_name, ignore_missing=False):
    """Remove one question and all of its per-question data, in place.

    Raises ``ValueError`` when the question is unknown, unless
    ``ignore_missing`` is true, in which case nothing happens.
    Returns ``self``.

    >>> id = InputDataABC.example()
    >>> id._drop_question('morning').question_names
    ['feeling']

    """
    if question_name in self.question_names:
        position = self.question_names.index(question_name)
        # Each parallel list is looked up and trimmed in turn.
        for attribute in (
            "_question_names",
            "_question_texts",
            "question_types",
            "question_options",
            "raw_data",
        ):
            getattr(self, attribute).pop(position)
        self.answer_codebook.pop(question_name, None)
        return self
    if ignore_missing:
        return self
    raise ValueError(f"Question {question_name} not found.")
226
-
227
def drop(self, *question_names_to_drop, ignore_missing=False) -> "InputData":
    """Drop one or more questions, in place.

    :param question_names_to_drop: Names of the questions to remove.
    :param ignore_missing: Keyword-only. When true, names that are not
        present are skipped instead of raising; the default (false)
        keeps the previous behaviour.
    :raises ValueError: If a name is unknown and ``ignore_missing`` is
        false.
    :return: ``self``.

    >>> id = InputDataABC.example()
    >>> id.drop('morning').question_names
    ['feeling']

    """
    for qn in question_names_to_drop:
        self._drop_question(qn, ignore_missing=ignore_missing)
    return self
238
-
239
def keep(self, *question_names_to_keep, ignore_missing=False) -> "InputDataABC":
    """Keep only the named questions, dropping every other one in place.

    Returns ``self``.

    >>> id = InputDataABC.example()
    >>> id.keep('morning').question_names
    ['morning']

    """
    # Decide what goes before mutating, so the list is not edited while
    # it is being walked.
    unwanted = [
        name
        for name in self._question_names
        if name not in question_names_to_keep
    ]
    for name in unwanted:
        self._drop_question(name, ignore_missing=ignore_missing)
    return self
252
-
253
def modify_question_type(
    self,
    question_name: str,
    new_type: str,
    drop_options: bool = False,
    new_options: Optional[List[str]] = None,
) -> "InputData":
    """Modify the question type of a question. Checks to make sure the new type is valid.

    The change is applied in place and then verified by building the
    question; if building fails, the error is printed and the previous
    type and options are restored.

    :param question_name: Name of the question to change.
    :param new_type: New question type; must be in ``Question.available()``.
    :param drop_options: When true, the question's options are set to ``None``.
    :param new_options: When not ``None``, replaces the question's options
        (applied after ``drop_options``).
    :raises ValueError: If ``new_type`` is not available, or (from
        ``list.index``) if ``question_name`` is unknown.
    :return: ``self``.

    >>> id = InputDataABC.example()
    >>> id.modify_question_type('morning', 'numerical', drop_options = True).question_types
    ['numerical', 'multiple_choice']

    >>> id = InputDataABC.example()
    >>> id.modify_question_type('morning', 'poop')
    Traceback (most recent call last):
    ...
    ValueError: Question type poop is not available.
    """
    # Remember the current settings so a failed change can be undone.
    old_type = self.question_types[self.question_names.index(question_name)]
    old_options = self.question_options[self.question_names.index(question_name)]

    from edsl import Question

    if new_type not in Question.available():
        raise ValueError(f"Question type {new_type} is not available.")

    idx = self.question_names.index(question_name)
    self.question_types[idx] = new_type
    if drop_options:
        self.question_options[idx] = None
    if new_options is not None:
        self.question_options[idx] = new_options

    # Trial-build the question; any failure rolls the change back.
    try:
        idx = self.question_names.index(question_name)
        rq = self.raw_question(idx)
        q = rq.to_question()
    except Exception as e:
        print(f"Error with question {question_name} in {self.datafile_name}")
        print(e)
        print("Reverting changes")
        self.question_types[idx] = old_type
        self.question_options[idx] = old_options
    return self
298
-
299
@property
def num_observations(self):
    """Return the number of observations.

    This is the length of the first question's response list.

    NOTE: the class body defines ``num_observations`` a second time
    further down; the later definition is the one that stays bound.

    >>> id = InputDataABC.example()
    >>> id.num_observations
    2

    """
    first_question_responses = self.raw_data[0]
    return len(first_question_responses)
309
-
310
def to_dict(self):
    """Return a dictionary of the attributes used to rebuild this object.

    The keys match ``__init__`` keyword names so the result can be fed to
    ``from_dict``.
    """
    serialized_fields = (
        "datafile_name",
        "config",
        "raw_data",
        "question_names",
        "question_texts",
        "binary",
        "answer_codebook",
        "question_types",
    )
    return {field: getattr(self, field) for field in serialized_fields}
321
-
322
@classmethod
def from_dict(cls, d: Dict):
    """Build an instance by passing the entries of ``d`` as keyword arguments."""
    instance = cls(**d)
    return instance
325
-
326
@property
def question_names(self) -> List[str]:
    """
    Return a list of question names.

    On first access without a stored value, the setter is invoked with
    ``None`` so the names are loaded.

    >>> id = InputDataABC.example()
    >>> id.question_names
    ['morning', 'feeling']

    We can pass question names instead:

    >>> id = InputDataABC.example(question_names = ['a','b'])
    >>> id.question_names
    ['a', 'b']

    """
    try:
        return self._question_names
    except AttributeError:
        self.question_names = None
        return self._question_names
345
-
346
@question_names.setter
def question_names(self, value) -> None:
    """Validate, repair and store the question names.

    :param value: Sequence of names, or ``None`` to load them with
        ``get_question_names()``.
    :raises ValueError: If the names are not unique, or if a name is not
        a valid variable name even after ``question_name_repair_func``
        has been applied to it.
    """
    if value is None:
        value = self.get_question_names()
    # Work on a copy so the caller's sequence is never rewritten in place
    # (and so tuples and other non-list sequences are accepted).
    names = list(value)
    if len(set(names)) != len(names):
        raise ValueError("Question names must be unique.")
    for i, qn in enumerate(names):
        if is_valid_variable_name(qn, allow_name=False):
            continue
        repaired = self.question_name_repair_func(qn)
        if not is_valid_variable_name(repaired, allow_name=False):
            # One message string: two positional arguments would make the
            # exception text render as a tuple.
            raise ValueError(
                f"Question names must be valid Python identifiers. '{qn}' is not. "
                "You can pass an entry in question_name_repair_func to fix this."
            )
        names[i] = repaired
    self._question_names = names
365
-
366
@property
def question_texts(self) -> List[str]:
    """
    Return a list of question texts.

    On first access without a stored value, the setter is invoked with
    ``None`` so the texts are loaded.

    >>> id = InputDataABC.example()
    >>> id.question_texts
    ['how are you doing this morning?', 'how are you feeling?']
    """
    try:
        return self._question_texts
    except AttributeError:
        self.question_texts = None
        return self._question_texts
378
-
379
@question_texts.setter
def question_texts(self, value):
    """Store the question texts, loading them with ``get_question_texts()`` when ``value`` is None."""
    self._question_texts = self.get_question_texts() if value is None else value
384
-
385
@property
def raw_data(self):
    """Return the raw responses, one list per question.

    On first access without a stored value, the setter is invoked with
    ``None`` so the data is loaded.

    >>> id = InputDataABC.example()
    >>> id.raw_data
    [['1', '4'], ['3', '6']]

    """
    try:
        return self._raw_data
    except AttributeError:
        self.raw_data = None
        return self._raw_data
397
-
398
@raw_data.setter
def raw_data(self, value):
    """Store the raw responses, loading them with ``get_raw_data()`` when ``value`` is None.

    The answer codebook is not applied here.
    """
    self._raw_data = self.get_raw_data() if value is None else value
405
-
406
def to_dataset(self) -> "Dataset":
    """Return a ``Dataset`` with one ``{question_name: responses}`` entry per question."""
    from edsl.results.Dataset import Dataset

    columns = [
        {name: responses}
        for name, responses in zip(self.question_names, self.raw_data)
    ]
    return Dataset(columns)
413
-
414
def to_scenario_list(self) -> ScenarioList:
    """Return a ScenarioList object from the raw response data.

    Each question contributes one field, added with ``add_list`` from
    that question's list of responses.

    >>> id = InputDataABC.example()
    >>> s = id.to_scenario_list()
    >>> type(s) == ScenarioList
    True

    >>> s
    ScenarioList([Scenario({'morning': '1', 'feeling': '3'}), Scenario({'morning': '4', 'feeling': '6'})])

    """
    s = ScenarioList()
    # enumerate gives each name's position directly; looking it up with
    # ``.index()`` inside the loop made this quadratic.
    for idx, qn in enumerate(self.question_names):
        s = s.add_list(qn, self.raw_data[idx])
    return s
431
-
432
@property
def names_to_texts(self) -> dict:
    """
    Return a dictionary mapping each question name to its question text.

    >>> id = InputDataABC.example()
    >>> id.names_to_texts
    {'morning': 'how are you doing this morning?', 'feeling': 'how are you feeling?'}
    """
    return dict(zip(self.question_names, self.question_texts))
442
-
443
@property
def texts_to_names(self):
    """Return a dictionary mapping each question text to its question name.

    >>> id = InputDataABC.example()
    >>> id.texts_to_names
    {'how are you doing this morning?': 'morning', 'how are you feeling?': 'feeling'}

    """
    inverted = {}
    for name, text in self.names_to_texts.items():
        inverted[text] = name
    return inverted
453
-
454
def raw_question(self, index: int) -> RawQuestion:
    """Return a ``RawQuestion`` built from everything stored at position ``index``."""
    q_type = self.question_types[index]
    q_name = self.question_names[index]
    q_text = self.question_texts[index]
    q_responses = self.raw_data[index]
    q_options = self.question_options[index]
    return RawQuestion(
        question_type=q_type,
        question_name=q_name,
        question_text=q_text,
        responses=q_responses,
        question_options=q_options,
    )
462
-
463
def raw_questions(self) -> "Generator[RawQuestion, None, None]":
    """Return a generator of RawQuestion objects, one per question in order."""
    # Walk positions directly instead of re-finding each name with
    # ``.index()``, which made the loop quadratic.
    for idx in range(len(self.question_names)):
        yield self.raw_question(idx)
468
-
469
def questions(self) -> Generator[Union[QuestionBase, None], None, None]:
    """Return a generator of Question objects.

    One item is yielded per raw question. When ``to_question()`` raises,
    the error is printed and ``None`` is yielded in that position, so the
    output stays aligned with ``raw_questions()``.
    """
    for rq in self.raw_questions():
        try:
            yield rq.to_question()
        except Exception as e:
            # Best effort: report the failure and keep going.
            print(
                f"Error with question '{rq.question_name}' in '{self.datafile_name}'"
            )
            print(e)
            yield None
480
-
481
def select(self, *question_names: List[str]) -> "InputData":
    """Return a new object of the same class holding only the chosen questions.

    The questions appear in the order given. A selected question without
    a codebook entry gets an empty one.

    :param question_names: The names of the questions to select.

    >>> id = InputDataABC.example()
    >>> id.select('morning').question_names
    ['morning']

    """
    positions = [self.question_names.index(qn) for qn in question_names]

    def pick(attribute):
        # Same per-item attribute lookup for every parallel list.
        return [getattr(self, attribute)[i] for i in positions]

    chosen_data = pick("raw_data")
    chosen_texts = pick("question_texts")
    chosen_types = pick("question_types")
    chosen_options = pick("question_options")
    chosen_names = pick("question_names")
    chosen_codebook = {}
    for qn in question_names:
        chosen_codebook[qn] = self.answer_codebook.get(qn, {})

    constructor = self.__class__
    return constructor(
        self.datafile_name,
        self.config,
        raw_data=chosen_data,
        question_names=chosen_names,
        question_texts=chosen_texts,
        question_types=chosen_types,
        question_options=chosen_options,
        answer_codebook=chosen_codebook,
        question_name_repair_func=self.question_name_repair_func,
    )
512
-
513
def to_survey(self) -> Survey:
    """Return a ``Survey`` built from every question that could be constructed.

    Entries that ``questions()`` yields as ``None`` are skipped.

    >>> id = InputDataABC.example()
    >>> s = id.to_survey()
    >>> type(s) == Survey
    True

    """
    survey = Survey()
    for question in self.questions():
        if question is None:
            continue
        survey.add_question(question)
    return survey
526
-
527
def print(self):
    """Print a per-question summary table.

    The table has one column each for name, text, inferred type and the
    response statistics, and is rendered through ``ScenarioList.print``.
    """
    table = ScenarioList.from_list("question_name", self.question_names)
    table = table.add_list("question_text", self.question_texts)
    table = table.add_list("inferred_question_type", self.question_types)
    table = table.add_list("num_responses", self.num_responses)
    table = table.add_list("num_unique_responses", self.num_unique_responses)
    table = table.add_list("missing", self.missing)
    table = table.add_list("frac_numerical", self.frac_numerical)
    table = table.add_list("top_5_items", self.top_k(5))
    table = table.add_list("frac_obs_from_top_5", self.frac_obs_from_top_k(5))
    table.print()
540
-
541
@property
def answer_codebook(self) -> dict:
    """Return the answer codebook.

    When no codebook has been stored yet, it is loaded with
    ``get_answer_codebook()`` and cached, so the result is always a
    mapping, matching the return annotation and the other lazy getters.

    >>> id = InputDataABC.example(answer_codebook = {'morning':{'1':'hello'}})
    >>> id.answer_codebook
    {'morning': {'1': 'hello'}}

    """
    if not hasattr(self, "_answer_codebook"):
        # Previously this stored and returned None, which broke callers
        # that treat the codebook as a dict (e.g. ``.pop`` / ``in``).
        self._answer_codebook = self.get_answer_codebook()
    return self._answer_codebook
552
-
553
@answer_codebook.setter
def answer_codebook(self, value):
    """Store the codebook, loading it with ``get_answer_codebook()`` when ``value`` is None."""
    self._answer_codebook = self.get_answer_codebook() if value is None else value
558
-
559
def get_answer_codebook(self):
    """Return the default answer codebook: a new, empty dictionary."""
    return dict()
561
-
562
def _drop_rows(self, indices: List[int]):
    """Drop rows (observations) from the raw data.

    :param indices: Positions to remove from every question's response
        list.
    :return: ``self``.

    >>> id = InputDataABC.example()
    >>> id.num_observations
    2
    >>> _ = id._drop_rows([1])
    >>> id.num_observations
    1

    """
    # Build the lookup once; testing membership against the list for
    # every cell was needlessly slow.
    doomed = set(indices)
    self.raw_data = [
        [r for i, r in enumerate(row) if i not in doomed] for row in self.raw_data
    ]
    return self
578
-
579
- def _missing_indices(self, question_name):
580
- """Return the indices of missing values for a question.
581
- TODO: Could re-factor to use SimpleEval
582
-
583
- >>> id = InputDataABC.example()
584
- >>> id.raw_data[0][0] = 'missing'
585
- >>> id._missing_indices('morning')
586
- [0]
587
- """
588
- idx = self.question_names.index(question_name)
589
- return [i for i, r in enumerate(self.raw_data[idx]) if r == "missing"]
590
-
591
- def drop_missing(self, question_name):
592
- """Drop missing values for a question.
593
-
594
- >>> id = InputDataABC.example()
595
- >>> id.num_observations
596
- 2
597
- >>> id.raw_data[0][0] = 'missing'
598
- >>> id.drop_missing('morning')
599
- >>> id.num_observations
600
- 1
601
- """
602
- self._drop_rows(self._missing_indices(question_name))
603
-
604
- @property
605
- def num_observations(self):
606
- """
607
- Return the number of observations
608
-
609
- >>> id = InputDataABC.example()
610
- >>> id.num_observations
611
- 2
612
- """
613
- return len(self.raw_data[0])
614
-
615
- def apply_codebook(self) -> None:
616
- """Apply the codebook to the raw data.
617
-
618
- >>> id = InputDataABC.example()
619
- >>> id.raw_data
620
- [['1', '4'], ['3', '6']]
621
-
622
- >>> id = InputDataABC.example(answer_codebook = {'morning':{'1':'hello'}})
623
- >>> id.raw_data
624
- [['hello', '4'], ['3', '6']]
625
- """
626
- for index, qn in enumerate(self.question_names):
627
- if qn in self.answer_codebook:
628
- new_responses = [
629
- self.answer_codebook[qn].get(r, r) for r in self.raw_data[index]
630
- ]
631
- self.raw_data[index] = new_responses
632
-
633
- def __repr__(self):
634
- return f"{self.__class__.__name__}: datafile_name:'{self.datafile_name}' num_questions:{len(self.question_names)}, num_observations:{len(self.raw_data[0])}"
635
-
636
- @classmethod
637
- def example(cls, **kwargs) -> "InputDataABC":
638
- class InputDataExample(InputDataABC):
639
- def get_question_texts(self) -> List[str]:
640
- """Get the text of the questions"""
641
- return ["how are you doing this morning?", "how are you feeling?"]
642
-
643
- def get_raw_data(self) -> SurveyResponses:
644
- """Returns a dataframe of responses by reading the datafile_name."""
645
- return [["1", "4"], ["3", "6"]]
646
-
647
- def get_question_names(self):
648
- new_names = [self.naming_function(q) for q in self.question_texts]
649
- if len(new_names) != len(set(new_names)):
650
- new_names = [f"{q}_{i}" for i, q in enumerate(new_names)]
651
- return new_names
652
-
653
- return InputDataExample("notneeded", config={}, **kwargs)
654
-
655
-
656
- if __name__ == "__main__":
657
- import doctest
658
-
659
- doctest.testmod(optionflags=doctest.ELLIPSIS)
1
+ import base64
2
+ from abc import ABC, abstractmethod
3
+ from typing import Dict, Callable, Optional, List, Generator, Tuple, Union
4
+ from collections import namedtuple
5
+ from typing import List, Union
6
+
7
+ from edsl.questions.QuestionBase import QuestionBase
8
+
9
+ from edsl.scenarios.ScenarioList import ScenarioList
10
+ from edsl.surveys.Survey import Survey
11
+ from edsl.conjure.SurveyResponses import SurveyResponses
12
+ from edsl.conjure.naming_utilities import sanitize_string
13
+ from edsl.utilities.utilities import is_valid_variable_name
14
+
15
+ from edsl.conjure.RawQuestion import RawQuestion
16
+ from edsl.conjure.AgentConstructionMixin import AgentConstructionMixin
17
+
18
+ from edsl.conjure.QuestionOptionMixin import QuestionOptionMixin
19
+ from edsl.conjure.InputDataMixinQuestionStats import InputDataMixinQuestionStats
20
+ from edsl.conjure.QuestionTypeMixin import QuestionTypeMixin
21
+
22
+
23
+ class InputDataABC(
24
+ ABC,
25
+ InputDataMixinQuestionStats,
26
+ AgentConstructionMixin,
27
+ QuestionOptionMixin,
28
+ QuestionTypeMixin,
29
+ ):
30
+ """A class to represent the input data for a survey."""
31
+
32
+ NUM_UNIQUE_THRESHOLD = 15
33
+ FRAC_NUMERICAL_THRESHOLD = 0.8
34
+ MULTIPLE_CHOICE_OTHER_THRESHOLD = 0.5
35
+ OTHER_STRING = "Other:"
36
+
37
+ question_attributes = [
38
+ "num_responses",
39
+ "num_unique_responses",
40
+ "missing",
41
+ "unique_responses",
42
+ "frac_numerical",
43
+ "top_5",
44
+ "frac_obs_from_top_5",
45
+ ]
46
+ QuestionStats = namedtuple("QuestionStats", question_attributes)
47
+
48
def __init__(
    self,
    datafile_name: str,
    config: Optional[dict] = None,
    naming_function: Optional[Callable] = sanitize_string,
    raw_data: Optional[List] = None,
    binary: Optional[str] = None,
    question_names: Optional[List[str]] = None,
    question_texts: Optional[List[str]] = None,
    answer_codebook: Optional[Dict] = None,
    question_types: Optional[List[str]] = None,
    question_options: Optional[List] = None,
    order_options=False,
    question_name_repair_func: Optional[Callable] = None,
):
    """Initialize the InputData object.

    :param datafile_name: The name of the file containing the data.
    :param config: The configuration parameters for reading the data.
    :param naming_function: Callable stored on the instance for deriving question names from question texts.
    :param raw_data: The responses as a list with one inner list per question; read via ``get_raw_data()`` when None.
    :param binary: Base64 text of the data file; when None the file is read and encoded, or left as None if the file does not exist.
    :param question_names: The names of the questions.
    :param question_texts: The text of the questions.
    :param answer_codebook: The codebook for the answers, keyed by question name.
    :param question_types: The types of the questions.
    :param question_options: The options for the questions.
    :param order_options: If true, ``self.order_options()`` is called at the end of initialization.
    :param question_name_repair_func: Callable applied to a question name that is not a valid variable name; a default replacing "#", "class" and "name" is used when None.
    :raises Exception: If the codebook has a key that is not a question name, or if names and texts differ in length.

    >>> id = InputDataABC.example(question_names = ['a','b'], answer_codebook = {'a': {'1':'yes', '2':'no'}, 'b': {'1':'yes', '2':'no'}})

    >>> id = InputDataABC.example(question_names = ['a','b'], answer_codebook = {'a': {'1':'yes', '2':'no'}, 'c': {'1':'yes', '2':'no'}})
    Traceback (most recent call last):
    ...
    Exception: The keys of the answer_codebook must match the question_names.
    """

    self.datafile_name = datafile_name
    self.config = config
    self.naming_function = naming_function

    if binary is not None:
        self.binary = binary
    else:
        # Best effort: keep an encoded copy of the file when it exists on disk.
        try:
            with open(self.datafile_name, "rb") as file:
                self.binary = base64.b64encode(file.read()).decode()
        except FileNotFoundError:
            self.binary = None

    def default_repair_func(x):
        return (
            x.replace("#", "_num")
            .replace("class", "social_class")
            .replace("name", "respondent_name")
        )

    # Must be set before question_names is assigned: the setter uses it.
    self.question_name_repair_func = (
        question_name_repair_func or default_repair_func
    )

    if answer_codebook is not None and question_names is not None:
        # A codebook may cover only some questions (apply_codebook skips the
        # others), so only keys that name no question are rejected. This
        # matches what is accepted when question_names is inferred.
        unknown_keys = set(answer_codebook) - set(question_names)
        if unknown_keys:
            raise Exception(
                "The keys of the answer_codebook must match the question_names."
            )

    if question_names is not None and question_texts is not None:
        if len(question_names) != len(question_texts):
            raise Exception(
                "The question_names and question_texts must have the same length."
            )

    # Order matters: texts before names, and names/codebook/data before
    # apply_codebook().
    self.question_texts = question_texts
    self.question_names = question_names
    self.answer_codebook = answer_codebook
    self.raw_data = raw_data

    self.apply_codebook()

    self.question_types = question_types
    self.question_options = question_options
    if order_options:
        self.order_options()
129
+
130
@property
def download_link(self):
    """Return an IPython ``HTML`` object holding a download link for the data file.

    The file content comes from ``self.binary`` and is embedded as a base64
    ``data:`` URI.

    :raises ValueError: If ``self.binary`` is None, so there is nothing to embed.
    """
    import html
    import os

    from IPython.display import HTML

    if self.binary is None:
        # Without this guard the link would embed the literal text "None".
        raise ValueError(
            f"No binary content is available for '{self.datafile_name}'."
        )

    actual_file_name = os.path.basename(self.datafile_name)
    # Escape both uses: the name lands in an attribute and in the link text.
    download_name = html.escape(actual_file_name, quote=True)
    link_text = html.escape(self.datafile_name)
    download_link = f'<a href="data:text/plain;base64,{self.binary}" download="{download_name}">Download {link_text}</a>'
    return HTML(download_link)
137
+
138
@abstractmethod
def get_question_texts(self) -> List[str]:
    """Return the question texts; concrete subclasses must override this.

    >>> id = InputDataABC.example()
    >>> id.get_question_texts()
    ['how are you doing this morning?', 'how are you feeling?']

    """
    raise NotImplementedError()
148
+
149
@abstractmethod
def get_raw_data(self) -> List[List[str]]:
    """Return the responses read from ``datafile_name``; subclasses must override this.

    >>> id = InputDataABC.example()
    >>> id.get_raw_data()
    [['1', '4'], ['3', '6']]

    """
    raise NotImplementedError()
159
+
160
@abstractmethod
def get_question_names(self) -> List[str]:
    """Return the question names; concrete subclasses must override this.

    >>> id = InputDataABC.example()
    >>> id.get_question_names()
    ['morning', 'feeling']

    """
    raise NotImplementedError()
170
+
171
def rename_questions(
    self, rename_dict: Dict[str, str], ignore_missing=False
) -> "InputData":
    """Rename several questions, applying each old-name/new-name pair in order.

    :param rename_dict: Mapping from current question name to new question name.
    :param ignore_missing: Passed through to ``rename`` for each pair.

    >>> id = InputDataABC.example()
    >>> id.rename_questions({'morning': 'evening'}).question_names
    ['evening', 'feeling']

    """
    for current_name in rename_dict:
        self.rename(
            current_name, rename_dict[current_name], ignore_missing=ignore_missing
        )
    return self
184
+
185
def rename(self, old_name, new_name, ignore_missing=False) -> "InputData":
    """Rename a question in place and move its codebook entry, if it has one.

    :param old_name: The current question name.
    :param new_name: The name to give the question.
    :param ignore_missing: If true, do nothing when ``old_name`` is not a question.
    :raises ValueError: If ``old_name`` is missing (and not ignored), or if
        ``new_name`` is already used by a different question.

    >>> id = InputDataABC.example()
    >>> id.rename('morning', 'evening').question_names
    ['evening', 'feeling']

    """
    if old_name not in self.question_names:
        if ignore_missing:
            return self
        raise ValueError(f"Question {old_name} not found.")

    # Duplicate names would break every lookup done with question_names.index().
    if new_name != old_name and new_name in self.question_names:
        raise ValueError(f"Question {new_name} already exists.")

    idx = self.question_names.index(old_name)
    self.question_names[idx] = new_name
    # Move the entry only when one exists; do not invent an empty one.
    if old_name in self.answer_codebook:
        self.answer_codebook[new_name] = self.answer_codebook.pop(old_name)

    return self
204
+
205
+ def _drop_question(self, question_name, ignore_missing=False):
206
+ """Drop a question
207
+
208
+ >>> id = InputDataABC.example()
209
+ >>> id._drop_question('morning').question_names
210
+ ['feeling']
211
+
212
+ """
213
+ if question_name not in self.question_names:
214
+ if ignore_missing:
215
+ return self
216
+ else:
217
+ raise ValueError(f"Question {question_name} not found.")
218
+ idx = self.question_names.index(question_name)
219
+ self._question_names.pop(idx)
220
+ self._question_texts.pop(idx)
221
+ self.question_types.pop(idx)
222
+ self.question_options.pop(idx)
223
+ self.raw_data.pop(idx)
224
+ self.answer_codebook.pop(question_name, None)
225
+ return self
226
+
227
def drop(self, *question_names_to_drop) -> "InputData":
    """Drop each named question in turn and return this object.

    >>> id = InputDataABC.example()
    >>> id.drop('morning').question_names
    ['feeling']

    """
    for unwanted in question_names_to_drop:
        self._drop_question(unwanted)
    return self
238
+
239
def keep(self, *question_names_to_keep, ignore_missing=False) -> "InputDataABC":
    """Drop every question whose name is not among those given.

    :param question_names_to_keep: The names of the questions to retain.
    :param ignore_missing: Passed through to ``_drop_question``.

    >>> id = InputDataABC.example()
    >>> id.keep('morning').question_names
    ['morning']

    """
    # Iterate over a snapshot because dropping mutates the underlying list.
    for candidate in list(self._question_names):
        if candidate in question_names_to_keep:
            continue
        self._drop_question(candidate, ignore_missing=ignore_missing)
    return self
252
+
253
def modify_question_type(
    self,
    question_name: str,
    new_type: str,
    drop_options: bool = False,
    new_options: Optional[List[str]] = None,
) -> "InputData":
    """Change the type (and optionally the options) of one question.

    The new type must be listed by ``Question.available()``. After the
    change the question is rebuilt as a check; if that fails, the error is
    printed and the previous type and options are restored.

    :param question_name: The question to modify.
    :param new_type: The question type to assign.
    :param drop_options: If true, the question's options are set to None.
    :param new_options: If given, replaces the question's options.
    :raises ValueError: If ``new_type`` is not an available question type.

    >>> id = InputDataABC.example()
    >>> id.modify_question_type('morning', 'numerical', drop_options = True).question_types
    ['numerical', 'multiple_choice']

    >>> id = InputDataABC.example()
    >>> id.modify_question_type('morning', 'poop')
    Traceback (most recent call last):
    ...
    ValueError: Question type poop is not available.
    """
    previous_type = self.question_types[self.question_names.index(question_name)]
    previous_options = self.question_options[
        self.question_names.index(question_name)
    ]

    from edsl import Question

    if new_type not in Question.available():
        raise ValueError(f"Question type {new_type} is not available.")

    position = self.question_names.index(question_name)
    self.question_types[position] = new_type
    if drop_options:
        self.question_options[position] = None
    if new_options is not None:
        self.question_options[position] = new_options

    try:
        # Building the question is the validity check; the result is discarded.
        self.raw_question(position).to_question()
    except Exception as e:
        print(f"Error with question {question_name} in {self.datafile_name}")
        print(e)
        print("Reverting changes")
        self.question_types[position] = previous_type
        self.question_options[position] = previous_options
    return self
298
+
299
@property
def num_observations(self):
    """Return the number of observations, or 0 when there is no data.

    The count is the length of the first question's response list.

    NOTE(review): a second ``num_observations`` property is defined later in
    this class body; being later, it replaces this one. One of the two should
    be removed.

    >>> id = InputDataABC.example()
    >>> id.num_observations
    2

    """
    if not self.raw_data:
        # No questions at all: avoid IndexError on raw_data[0].
        return 0
    return len(self.raw_data[0])
309
+
310
def to_dict(self):
    """Return the constructor arguments needed to rebuild this object.

    NOTE(review): ``question_options`` is not included, so options changed
    with ``modify_question_type`` are presumably lost when round-tripping
    through ``from_dict`` -- confirm whether that is intended.
    """
    serialized_fields = (
        "datafile_name",
        "config",
        "raw_data",
        "question_names",
        "question_texts",
        "binary",
        "answer_codebook",
        "question_types",
    )
    return {field: getattr(self, field) for field in serialized_fields}
321
+
322
@classmethod
def from_dict(cls, d: Dict):
    """Build an instance by passing the entries of ``d`` as keyword arguments."""
    constructor_kwargs = d
    return cls(**constructor_kwargs)
325
+
326
@property
def question_names(self) -> List[str]:
    """
    Return a list of question names.

    >>> id = InputDataABC.example()
    >>> id.question_names
    ['morning', 'feeling']

    We can pass question names instead:

    >>> id = InputDataABC.example(question_names = ['a','b'])
    >>> id.question_names
    ['a', 'b']

    """
    if not hasattr(self, "_question_names"):
        # Assigning None makes the setter fall back to get_question_names().
        self.question_names = None
    return self._question_names


@question_names.setter
def question_names(self, value) -> None:
    """Validate, repair and store the question names.

    None means "use ``get_question_names()``". A name that is not a valid
    variable name is passed through ``question_name_repair_func``. The names
    are stored in a new list, so the caller's sequence is never modified.

    :raises ValueError: If names are not unique (before or after repair), or
        if a name is still invalid after repair.
    """
    if value is None:
        value = self.get_question_names()
    if len(set(value)) != len(value):
        raise ValueError("Question names must be unique.")

    repaired_names = []
    for qn in value:
        if not is_valid_variable_name(qn, allow_name=False):
            new_name = self.question_name_repair_func(qn)
            if not is_valid_variable_name(new_name, allow_name=False):
                # One message string: two arguments would render as a tuple.
                raise ValueError(
                    f"Question names must be valid Python identifiers. '{qn}' is not. "
                    "You can pass an entry in question_name_repair_func to fix this."
                )
            qn = new_name
        repaired_names.append(qn)

    # A repair can map two different names onto the same string.
    if len(set(repaired_names)) != len(repaired_names):
        raise ValueError("Question names must be unique after repair.")
    self._question_names = repaired_names
365
+
366
@property
def question_texts(self) -> List[str]:
    """
    Return a list of question texts, loading them on first access.

    >>> id = InputDataABC.example()
    >>> id.question_texts
    ['how are you doing this morning?', 'how are you feeling?']
    """
    already_loaded = hasattr(self, "_question_texts")
    if not already_loaded:
        # Assigning None makes the setter call get_question_texts().
        self.question_texts = None
    return self._question_texts


@question_texts.setter
def question_texts(self, value):
    """Store the question texts; None means "use ``get_question_texts()``"."""
    self._question_texts = self.get_question_texts() if value is None else value
384
+
385
@property
def raw_data(self):
    """Return the response data, loading it on first access.

    >>> id = InputDataABC.example()
    >>> id.raw_data
    [['1', '4'], ['3', '6']]

    """
    already_loaded = hasattr(self, "_raw_data")
    if not already_loaded:
        # Assigning None makes the setter call get_raw_data().
        self.raw_data = None
    return self._raw_data


@raw_data.setter
def raw_data(self, value):
    """Store the response data; None means "use ``get_raw_data()``".

    The codebook is not applied here.
    """
    self._raw_data = self.get_raw_data() if value is None else value
405
+
406
def to_dataset(self) -> "Dataset":
    """Return a ``Dataset`` with one ``{question_name: responses}`` entry per question."""
    from edsl.results.Dataset import Dataset

    columns = [
        {name: responses}
        for name, responses in zip(self.question_names, self.raw_data)
    ]
    return Dataset(columns)
413
+
414
def to_scenario_list(self) -> ScenarioList:
    """Return a ScenarioList object from the raw response data.

    Each question contributes one field, added with ``add_list`` under the
    question's name.

    >>> id = InputDataABC.example()
    >>> s = id.to_scenario_list()
    >>> type(s) == ScenarioList
    True

    >>> s
    ScenarioList([Scenario({'morning': '1', 'feeling': '3'}), Scenario({'morning': '4', 'feeling': '6'})])

    """
    s = ScenarioList()
    # enumerate gives the column position directly; no per-name index() search.
    for idx, qn in enumerate(self.question_names):
        s = s.add_list(qn, self.raw_data[idx])
    return s
431
+
432
@property
def names_to_texts(self) -> dict:
    """
    Return a dictionary mapping each question name to its question text.

    >>> id = InputDataABC.example()
    >>> id.names_to_texts
    {'morning': 'how are you doing this morning?', 'feeling': 'how are you feeling?'}
    """
    return dict(zip(self.question_names, self.question_texts))
442
+
443
@property
def texts_to_names(self):
    """Return a dictionary mapping each question text to its question name.

    This is ``names_to_texts`` inverted; if two questions share a text, the
    later one wins.

    >>> id = InputDataABC.example()
    >>> id.texts_to_names
    {'how are you doing this morning?': 'morning', 'how are you feeling?': 'feeling'}

    """
    inverted = {}
    for name, text in self.names_to_texts.items():
        inverted[text] = name
    return inverted
453
+
454
def raw_question(self, index: int) -> RawQuestion:
    """Return a ``RawQuestion`` built from everything stored at position ``index``."""
    fields = {
        "question_type": self.question_types[index],
        "question_name": self.question_names[index],
        "question_text": self.question_texts[index],
        "responses": self.raw_data[index],
        "question_options": self.question_options[index],
    }
    return RawQuestion(**fields)
462
+
463
def raw_questions(self) -> Generator[RawQuestion, None, None]:
    """Return a generator of RawQuestion objects, one per question in order."""
    # enumerate gives the position directly; no per-name index() search.
    for idx, _ in enumerate(self.question_names):
        yield self.raw_question(idx)
468
+
469
def questions(self) -> Generator[Union[QuestionBase, None], None, None]:
    """Return a generator of Question objects.

    A question that cannot be built is reported with ``print`` and yielded
    as None, so the output keeps one item per raw question.
    """
    for raw in self.raw_questions():
        try:
            yield raw.to_question()
        except Exception as e:
            failure_notice = (
                f"Error with question '{raw.question_name}' in '{self.datafile_name}'"
            )
            print(failure_notice)
            print(e)
            yield None
480
+
481
def select(self, *question_names: str) -> "InputData":
    """Select a subset of the questions and return them as a new object.

    The new object is created with ``self.__class__`` and receives this
    object's ``binary`` and ``naming_function`` as well as the selected data.

    :param question_names: The names of the questions to select.
    :raises ValueError: If a name is not one of this object's questions.

    NOTE(review): the already-translated ``raw_data`` is passed together with
    the codebook, and ``__init__`` applies the codebook again. That is only
    harmless when no codebook value is also a codebook key -- confirm.

    >>> id = InputDataABC.example()
    >>> id.select('morning').question_names
    ['morning']

    """

    idxs = [self.question_names.index(qn) for qn in question_names]
    new_data = [self.raw_data[i] for i in idxs]
    new_texts = [self.question_texts[i] for i in idxs]
    new_types = [self.question_types[i] for i in idxs]
    new_options = [self.question_options[i] for i in idxs]
    new_names = [self.question_names[i] for i in idxs]
    # Every selected name gets an entry (possibly empty).
    answer_codebook = {
        qn: self.answer_codebook.get(qn, {}) for qn in question_names
    }
    return self.__class__(
        self.datafile_name,
        self.config,
        naming_function=self.naming_function,
        raw_data=new_data,
        # Reuse the encoded file instead of reading it from disk again.
        binary=self.binary,
        question_names=new_names,
        question_texts=new_texts,
        question_types=new_types,
        question_options=new_options,
        answer_codebook=answer_codebook,
        question_name_repair_func=self.question_name_repair_func,
    )
512
+
513
def to_survey(self) -> Survey:
    """Return a Survey holding every question that could be built.

    Questions yielded as None by ``questions()`` are skipped.

    >>> id = InputDataABC.example()
    >>> s = id.to_survey()
    >>> type(s) == Survey
    True

    """
    survey = Survey()
    for question in self.questions():
        if question is None:
            continue
        survey.add_question(question)
    return survey
526
+
527
def print(self):
    """Print a per-question summary: name, text, inferred type and response statistics."""
    summary = ScenarioList.from_list("question_name", self.question_names)
    summary = summary.add_list("question_text", self.question_texts)
    summary = summary.add_list("inferred_question_type", self.question_types)
    summary = summary.add_list("num_responses", self.num_responses)
    summary = summary.add_list("num_unique_responses", self.num_unique_responses)
    summary = summary.add_list("missing", self.missing)
    summary = summary.add_list("frac_numerical", self.frac_numerical)
    summary = summary.add_list("top_5_items", self.top_k(5))
    summary = summary.add_list("frac_obs_from_top_5", self.frac_obs_from_top_k(5))
    summary.print()
540
+
541
@property
def answer_codebook(self) -> dict:
    """Return the answer codebook.

    If no codebook has been set yet, the default from
    ``get_answer_codebook()`` is stored and returned.

    >>> id = InputDataABC.example(answer_codebook = {'morning':{'1':'hello'}})
    >>> id.answer_codebook
    {'morning': {'1': 'hello'}}

    """
    if not hasattr(self, "_answer_codebook"):
        # Go through the setter so the default is a dict, never None.
        self.answer_codebook = None
    return self._answer_codebook


@answer_codebook.setter
def answer_codebook(self, value):
    """Store the codebook; None means "use ``get_answer_codebook()``"."""
    if value is None:
        value = self.get_answer_codebook()
    self._answer_codebook = value
558
+
559
def get_answer_codebook(self):
    """Return the codebook used when none is supplied: a new empty dict."""
    return dict()
561
+
562
+ def _drop_rows(self, indices: List[int]):
563
+ """Drop rows from the raw data.
564
+ :param indices
565
+
566
+ >>> id = InputDataABC.example()
567
+ >>> id.num_observations
568
+ 2
569
+ >>> _ = id._drop_rows([1])
570
+ >>> id.num_observations
571
+ 1
572
+
573
+ """
574
+ self.raw_data = [
575
+ [r for i, r in enumerate(row) if i not in indices] for row in self.raw_data
576
+ ]
577
+ return self
578
+
579
+ def _missing_indices(self, question_name):
580
+ """Return the indices of missing values for a question.
581
+ TODO: Could re-factor to use SimpleEval
582
+
583
+ >>> id = InputDataABC.example()
584
+ >>> id.raw_data[0][0] = 'missing'
585
+ >>> id._missing_indices('morning')
586
+ [0]
587
+ """
588
+ idx = self.question_names.index(question_name)
589
+ return [i for i, r in enumerate(self.raw_data[idx]) if r == "missing"]
590
+
591
def drop_missing(self, question_name):
    """Drop every observation whose response to this question is missing.

    The rows are removed from all questions. Returns None.

    >>> id = InputDataABC.example()
    >>> id.num_observations
    2
    >>> id.raw_data[0][0] = 'missing'
    >>> id.drop_missing('morning')
    >>> id.num_observations
    1
    """
    rows_to_drop = self._missing_indices(question_name)
    self._drop_rows(rows_to_drop)
603
+
604
@property
def num_observations(self):
    """
    Return the number of observations, or 0 when there is no data.

    The count is the length of the first question's response list.

    NOTE(review): an earlier ``num_observations`` property in this class body
    is replaced by this definition; one of the two should be removed.

    >>> id = InputDataABC.example()
    >>> id.num_observations
    2
    """
    if not self.raw_data:
        # No questions at all: avoid IndexError on raw_data[0].
        return 0
    return len(self.raw_data[0])
614
+
615
def apply_codebook(self) -> None:
    """Translate raw responses in place using the answer codebook.

    Only questions with a codebook entry are touched; a response with no
    translation is kept as it is.

    >>> id = InputDataABC.example()
    >>> id.raw_data
    [['1', '4'], ['3', '6']]

    >>> id = InputDataABC.example(answer_codebook = {'morning':{'1':'hello'}})
    >>> id.raw_data
    [['hello', '4'], ['3', '6']]
    """
    for position, name in enumerate(self.question_names):
        if name not in self.answer_codebook:
            continue
        translated = []
        for response in self.raw_data[position]:
            translated.append(self.answer_codebook[name].get(response, response))
        self.raw_data[position] = translated
632
+
633
+ def __repr__(self):
634
+ return f"{self.__class__.__name__}: datafile_name:'{self.datafile_name}' num_questions:{len(self.question_names)}, num_observations:{len(self.raw_data[0])}"
635
+
636
@classmethod
def example(cls, **kwargs) -> "InputDataABC":
    """Return a small in-memory instance with two questions and two observations.

    :param kwargs: Passed through to the constructor, e.g. ``question_names``
        or ``answer_codebook``.
    """

    class InputDataExample(InputDataABC):
        def get_question_texts(self) -> List[str]:
            """Return the two fixed example question texts."""
            return ["how are you doing this morning?", "how are you feeling?"]

        def get_raw_data(self) -> SurveyResponses:
            """Return the fixed example responses, one inner list per question."""
            return [["1", "4"], ["3", "6"]]

        def get_question_names(self):
            """Derive names from the texts; add an index suffix to all if any collide."""
            names = [self.naming_function(text) for text in self.question_texts]
            if len(set(names)) == len(names):
                return names
            return [f"{name}_{i}" for i, name in enumerate(names)]

    # "notneeded" is a placeholder file name: the data is supplied by get_raw_data().
    return InputDataExample("notneeded", config={}, **kwargs)
654
+
655
+
656
# Run this module's doctests when it is executed as a script.
if __name__ == "__main__":
    from doctest import ELLIPSIS, testmod

    testmod(optionflags=ELLIPSIS)