edsl 0.1.36.dev6__py3-none-any.whl → 0.1.37__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (261)
  1. edsl/Base.py +303 -303
  2. edsl/BaseDiff.py +260 -260
  3. edsl/TemplateLoader.py +24 -24
  4. edsl/__init__.py +48 -47
  5. edsl/__version__.py +1 -1
  6. edsl/agents/Agent.py +855 -804
  7. edsl/agents/AgentList.py +350 -337
  8. edsl/agents/Invigilator.py +222 -222
  9. edsl/agents/InvigilatorBase.py +284 -294
  10. edsl/agents/PromptConstructor.py +353 -312
  11. edsl/agents/__init__.py +3 -3
  12. edsl/agents/descriptors.py +99 -86
  13. edsl/agents/prompt_helpers.py +129 -129
  14. edsl/auto/AutoStudy.py +117 -117
  15. edsl/auto/StageBase.py +230 -230
  16. edsl/auto/StageGenerateSurvey.py +178 -178
  17. edsl/auto/StageLabelQuestions.py +125 -125
  18. edsl/auto/StagePersona.py +61 -61
  19. edsl/auto/StagePersonaDimensionValueRanges.py +88 -88
  20. edsl/auto/StagePersonaDimensionValues.py +74 -74
  21. edsl/auto/StagePersonaDimensions.py +69 -69
  22. edsl/auto/StageQuestions.py +73 -73
  23. edsl/auto/SurveyCreatorPipeline.py +21 -21
  24. edsl/auto/utilities.py +224 -224
  25. edsl/base/Base.py +289 -289
  26. edsl/config.py +149 -149
  27. edsl/conjure/AgentConstructionMixin.py +160 -152
  28. edsl/conjure/Conjure.py +62 -62
  29. edsl/conjure/InputData.py +659 -659
  30. edsl/conjure/InputDataCSV.py +48 -48
  31. edsl/conjure/InputDataMixinQuestionStats.py +182 -182
  32. edsl/conjure/InputDataPyRead.py +91 -91
  33. edsl/conjure/InputDataSPSS.py +8 -8
  34. edsl/conjure/InputDataStata.py +8 -8
  35. edsl/conjure/QuestionOptionMixin.py +76 -76
  36. edsl/conjure/QuestionTypeMixin.py +23 -23
  37. edsl/conjure/RawQuestion.py +65 -65
  38. edsl/conjure/SurveyResponses.py +7 -7
  39. edsl/conjure/__init__.py +9 -9
  40. edsl/conjure/naming_utilities.py +263 -263
  41. edsl/conjure/utilities.py +201 -201
  42. edsl/conversation/Conversation.py +290 -238
  43. edsl/conversation/car_buying.py +58 -58
  44. edsl/conversation/chips.py +95 -0
  45. edsl/conversation/mug_negotiation.py +81 -81
  46. edsl/conversation/next_speaker_utilities.py +93 -93
  47. edsl/coop/PriceFetcher.py +54 -54
  48. edsl/coop/__init__.py +2 -2
  49. edsl/coop/coop.py +958 -849
  50. edsl/coop/utils.py +131 -131
  51. edsl/data/Cache.py +527 -527
  52. edsl/data/CacheEntry.py +228 -228
  53. edsl/data/CacheHandler.py +149 -149
  54. edsl/data/RemoteCacheSync.py +97 -84
  55. edsl/data/SQLiteDict.py +292 -292
  56. edsl/data/__init__.py +4 -4
  57. edsl/data/orm.py +10 -10
  58. edsl/data_transfer_models.py +73 -73
  59. edsl/enums.py +173 -173
  60. edsl/exceptions/BaseException.py +21 -0
  61. edsl/exceptions/__init__.py +54 -50
  62. edsl/exceptions/agents.py +38 -40
  63. edsl/exceptions/configuration.py +16 -16
  64. edsl/exceptions/coop.py +10 -10
  65. edsl/exceptions/data.py +14 -14
  66. edsl/exceptions/general.py +34 -34
  67. edsl/exceptions/jobs.py +33 -33
  68. edsl/exceptions/language_models.py +63 -63
  69. edsl/exceptions/prompts.py +15 -15
  70. edsl/exceptions/questions.py +91 -91
  71. edsl/exceptions/results.py +29 -26
  72. edsl/exceptions/scenarios.py +22 -0
  73. edsl/exceptions/surveys.py +37 -34
  74. edsl/inference_services/AnthropicService.py +87 -87
  75. edsl/inference_services/AwsBedrock.py +120 -115
  76. edsl/inference_services/AzureAI.py +217 -217
  77. edsl/inference_services/DeepInfraService.py +18 -18
  78. edsl/inference_services/GoogleService.py +156 -156
  79. edsl/inference_services/GroqService.py +20 -20
  80. edsl/inference_services/InferenceServiceABC.py +147 -147
  81. edsl/inference_services/InferenceServicesCollection.py +97 -72
  82. edsl/inference_services/MistralAIService.py +123 -123
  83. edsl/inference_services/OllamaService.py +18 -18
  84. edsl/inference_services/OpenAIService.py +224 -224
  85. edsl/inference_services/TestService.py +89 -89
  86. edsl/inference_services/TogetherAIService.py +170 -170
  87. edsl/inference_services/models_available_cache.py +118 -118
  88. edsl/inference_services/rate_limits_cache.py +25 -25
  89. edsl/inference_services/registry.py +39 -39
  90. edsl/inference_services/write_available.py +10 -10
  91. edsl/jobs/Answers.py +56 -56
  92. edsl/jobs/Jobs.py +1347 -1112
  93. edsl/jobs/__init__.py +1 -1
  94. edsl/jobs/buckets/BucketCollection.py +63 -63
  95. edsl/jobs/buckets/ModelBuckets.py +65 -65
  96. edsl/jobs/buckets/TokenBucket.py +248 -248
  97. edsl/jobs/interviews/Interview.py +661 -651
  98. edsl/jobs/interviews/InterviewExceptionCollection.py +99 -99
  99. edsl/jobs/interviews/InterviewExceptionEntry.py +186 -182
  100. edsl/jobs/interviews/InterviewStatistic.py +63 -63
  101. edsl/jobs/interviews/InterviewStatisticsCollection.py +25 -25
  102. edsl/jobs/interviews/InterviewStatusDictionary.py +78 -78
  103. edsl/jobs/interviews/InterviewStatusLog.py +92 -92
  104. edsl/jobs/interviews/ReportErrors.py +66 -66
  105. edsl/jobs/interviews/interview_status_enum.py +9 -9
  106. edsl/jobs/runners/JobsRunnerAsyncio.py +338 -337
  107. edsl/jobs/runners/JobsRunnerStatus.py +332 -332
  108. edsl/jobs/tasks/QuestionTaskCreator.py +242 -242
  109. edsl/jobs/tasks/TaskCreators.py +64 -64
  110. edsl/jobs/tasks/TaskHistory.py +442 -441
  111. edsl/jobs/tasks/TaskStatusLog.py +23 -23
  112. edsl/jobs/tasks/task_status_enum.py +163 -163
  113. edsl/jobs/tokens/InterviewTokenUsage.py +27 -27
  114. edsl/jobs/tokens/TokenUsage.py +34 -34
  115. edsl/language_models/KeyLookup.py +30 -0
  116. edsl/language_models/LanguageModel.py +706 -718
  117. edsl/language_models/ModelList.py +102 -102
  118. edsl/language_models/RegisterLanguageModelsMeta.py +184 -184
  119. edsl/language_models/__init__.py +3 -2
  120. edsl/language_models/fake_openai_call.py +15 -15
  121. edsl/language_models/fake_openai_service.py +61 -61
  122. edsl/language_models/registry.py +137 -137
  123. edsl/language_models/repair.py +156 -156
  124. edsl/language_models/unused/ReplicateBase.py +83 -83
  125. edsl/language_models/utilities.py +64 -64
  126. edsl/notebooks/Notebook.py +259 -259
  127. edsl/notebooks/__init__.py +1 -1
  128. edsl/prompts/Prompt.py +357 -358
  129. edsl/prompts/__init__.py +2 -2
  130. edsl/questions/AnswerValidatorMixin.py +289 -289
  131. edsl/questions/QuestionBase.py +656 -616
  132. edsl/questions/QuestionBaseGenMixin.py +161 -161
  133. edsl/questions/QuestionBasePromptsMixin.py +234 -266
  134. edsl/questions/QuestionBudget.py +227 -227
  135. edsl/questions/QuestionCheckBox.py +359 -359
  136. edsl/questions/QuestionExtract.py +183 -183
  137. edsl/questions/QuestionFreeText.py +114 -113
  138. edsl/questions/QuestionFunctional.py +159 -159
  139. edsl/questions/QuestionList.py +231 -231
  140. edsl/questions/QuestionMultipleChoice.py +286 -286
  141. edsl/questions/QuestionNumerical.py +153 -153
  142. edsl/questions/QuestionRank.py +324 -324
  143. edsl/questions/Quick.py +41 -41
  144. edsl/questions/RegisterQuestionsMeta.py +71 -71
  145. edsl/questions/ResponseValidatorABC.py +174 -174
  146. edsl/questions/SimpleAskMixin.py +73 -73
  147. edsl/questions/__init__.py +26 -26
  148. edsl/questions/compose_questions.py +98 -98
  149. edsl/questions/decorators.py +21 -21
  150. edsl/questions/derived/QuestionLikertFive.py +76 -76
  151. edsl/questions/derived/QuestionLinearScale.py +87 -87
  152. edsl/questions/derived/QuestionTopK.py +91 -91
  153. edsl/questions/derived/QuestionYesNo.py +82 -82
  154. edsl/questions/descriptors.py +413 -418
  155. edsl/questions/prompt_templates/question_budget.jinja +13 -13
  156. edsl/questions/prompt_templates/question_checkbox.jinja +32 -32
  157. edsl/questions/prompt_templates/question_extract.jinja +11 -11
  158. edsl/questions/prompt_templates/question_free_text.jinja +3 -3
  159. edsl/questions/prompt_templates/question_linear_scale.jinja +11 -11
  160. edsl/questions/prompt_templates/question_list.jinja +17 -17
  161. edsl/questions/prompt_templates/question_multiple_choice.jinja +33 -33
  162. edsl/questions/prompt_templates/question_numerical.jinja +36 -36
  163. edsl/questions/question_registry.py +147 -147
  164. edsl/questions/settings.py +12 -12
  165. edsl/questions/templates/budget/answering_instructions.jinja +7 -7
  166. edsl/questions/templates/budget/question_presentation.jinja +7 -7
  167. edsl/questions/templates/checkbox/answering_instructions.jinja +10 -10
  168. edsl/questions/templates/checkbox/question_presentation.jinja +22 -22
  169. edsl/questions/templates/extract/answering_instructions.jinja +7 -7
  170. edsl/questions/templates/likert_five/answering_instructions.jinja +10 -10
  171. edsl/questions/templates/likert_five/question_presentation.jinja +11 -11
  172. edsl/questions/templates/linear_scale/answering_instructions.jinja +5 -5
  173. edsl/questions/templates/linear_scale/question_presentation.jinja +5 -5
  174. edsl/questions/templates/list/answering_instructions.jinja +3 -3
  175. edsl/questions/templates/list/question_presentation.jinja +5 -5
  176. edsl/questions/templates/multiple_choice/answering_instructions.jinja +9 -9
  177. edsl/questions/templates/multiple_choice/question_presentation.jinja +11 -11
  178. edsl/questions/templates/numerical/answering_instructions.jinja +6 -6
  179. edsl/questions/templates/numerical/question_presentation.jinja +6 -6
  180. edsl/questions/templates/rank/answering_instructions.jinja +11 -11
  181. edsl/questions/templates/rank/question_presentation.jinja +15 -15
  182. edsl/questions/templates/top_k/answering_instructions.jinja +8 -8
  183. edsl/questions/templates/top_k/question_presentation.jinja +22 -22
  184. edsl/questions/templates/yes_no/answering_instructions.jinja +6 -6
  185. edsl/questions/templates/yes_no/question_presentation.jinja +11 -11
  186. edsl/results/Dataset.py +293 -293
  187. edsl/results/DatasetExportMixin.py +717 -693
  188. edsl/results/DatasetTree.py +145 -145
  189. edsl/results/Result.py +450 -433
  190. edsl/results/Results.py +1071 -1158
  191. edsl/results/ResultsDBMixin.py +238 -238
  192. edsl/results/ResultsExportMixin.py +43 -43
  193. edsl/results/ResultsFetchMixin.py +33 -33
  194. edsl/results/ResultsGGMixin.py +121 -121
  195. edsl/results/ResultsToolsMixin.py +98 -98
  196. edsl/results/Selector.py +135 -118
  197. edsl/results/__init__.py +2 -2
  198. edsl/results/tree_explore.py +115 -115
  199. edsl/scenarios/FileStore.py +458 -443
  200. edsl/scenarios/Scenario.py +546 -507
  201. edsl/scenarios/ScenarioHtmlMixin.py +64 -59
  202. edsl/scenarios/ScenarioList.py +1112 -1101
  203. edsl/scenarios/ScenarioListExportMixin.py +52 -52
  204. edsl/scenarios/ScenarioListPdfMixin.py +261 -261
  205. edsl/scenarios/__init__.py +4 -2
  206. edsl/shared.py +1 -1
  207. edsl/study/ObjectEntry.py +173 -173
  208. edsl/study/ProofOfWork.py +113 -113
  209. edsl/study/SnapShot.py +80 -80
  210. edsl/study/Study.py +528 -528
  211. edsl/study/__init__.py +4 -4
  212. edsl/surveys/DAG.py +148 -148
  213. edsl/surveys/Memory.py +31 -31
  214. edsl/surveys/MemoryPlan.py +244 -244
  215. edsl/surveys/Rule.py +330 -324
  216. edsl/surveys/RuleCollection.py +387 -387
  217. edsl/surveys/Survey.py +1795 -1772
  218. edsl/surveys/SurveyCSS.py +261 -261
  219. edsl/surveys/SurveyExportMixin.py +259 -259
  220. edsl/surveys/SurveyFlowVisualizationMixin.py +121 -121
  221. edsl/surveys/SurveyQualtricsImport.py +284 -284
  222. edsl/surveys/__init__.py +3 -3
  223. edsl/surveys/base.py +53 -53
  224. edsl/surveys/descriptors.py +56 -56
  225. edsl/surveys/instructions/ChangeInstruction.py +47 -47
  226. edsl/surveys/instructions/Instruction.py +51 -51
  227. edsl/surveys/instructions/InstructionCollection.py +77 -77
  228. edsl/templates/error_reporting/base.html +23 -23
  229. edsl/templates/error_reporting/exceptions_by_model.html +34 -34
  230. edsl/templates/error_reporting/exceptions_by_question_name.html +16 -16
  231. edsl/templates/error_reporting/exceptions_by_type.html +16 -16
  232. edsl/templates/error_reporting/interview_details.html +115 -115
  233. edsl/templates/error_reporting/interviews.html +9 -9
  234. edsl/templates/error_reporting/overview.html +4 -4
  235. edsl/templates/error_reporting/performance_plot.html +1 -1
  236. edsl/templates/error_reporting/report.css +73 -73
  237. edsl/templates/error_reporting/report.html +117 -117
  238. edsl/templates/error_reporting/report.js +25 -25
  239. edsl/tools/__init__.py +1 -1
  240. edsl/tools/clusters.py +192 -192
  241. edsl/tools/embeddings.py +27 -27
  242. edsl/tools/embeddings_plotting.py +118 -118
  243. edsl/tools/plotting.py +112 -112
  244. edsl/tools/summarize.py +18 -18
  245. edsl/utilities/SystemInfo.py +28 -28
  246. edsl/utilities/__init__.py +22 -22
  247. edsl/utilities/ast_utilities.py +25 -25
  248. edsl/utilities/data/Registry.py +6 -6
  249. edsl/utilities/data/__init__.py +1 -1
  250. edsl/utilities/data/scooter_results.json +1 -1
  251. edsl/utilities/decorators.py +77 -77
  252. edsl/utilities/gcp_bucket/cloud_storage.py +96 -96
  253. edsl/utilities/interface.py +627 -627
  254. edsl/utilities/repair_functions.py +28 -28
  255. edsl/utilities/restricted_python.py +70 -70
  256. edsl/utilities/utilities.py +409 -391
  257. {edsl-0.1.36.dev6.dist-info → edsl-0.1.37.dist-info}/LICENSE +21 -21
  258. {edsl-0.1.36.dev6.dist-info → edsl-0.1.37.dist-info}/METADATA +1 -1
  259. edsl-0.1.37.dist-info/RECORD +283 -0
  260. edsl-0.1.36.dev6.dist-info/RECORD +0 -279
  261. {edsl-0.1.36.dev6.dist-info → edsl-0.1.37.dist-info}/WHEEL +0 -0
@@ -1,48 +1,48 @@
1
- from typing import List, Optional
2
- import pandas as pd
3
- from edsl.conjure.InputData import InputDataABC
4
- from edsl.conjure.utilities import convert_value
5
-
6
-
7
- class InputDataCSV(InputDataABC):
8
- def __init__(self, datafile_name: str, config: Optional[dict] = None, **kwargs):
9
- if config is None:
10
- config = {"skiprows": None, "delimiter": ","}
11
-
12
- super().__init__(datafile_name, config, **kwargs)
13
-
14
- def get_df(self) -> pd.DataFrame:
15
- if not hasattr(self, "_df"):
16
- self._df = pd.read_csv(
17
- self.datafile_name,
18
- skiprows=self.config["skiprows"],
19
- encoding_errors="ignore",
20
- )
21
- float_columns = self._df.select_dtypes(include=["float64"]).columns
22
- self._df[float_columns] = self._df[float_columns].astype(str)
23
- self._df.fillna("", inplace=True)
24
- self._df = self._df.astype(str)
25
- return self._df
26
-
27
- def get_raw_data(self) -> List[List[str]]:
28
- data = [
29
- [convert_value(obs) for obs in v]
30
- for k, v in self.get_df().to_dict(orient="list").items()
31
- ]
32
- return data
33
-
34
- def get_question_texts(self):
35
- return list(self.get_df().columns)
36
-
37
- def get_question_names(self):
38
- new_names = [self.naming_function(q) for q in self.question_texts]
39
-
40
- if len(new_names) > len(set(new_names)):
41
- from collections import Counter
42
-
43
- counter = Counter(new_names)
44
- for i, name in enumerate(new_names):
45
- if counter[name] > 1:
46
- new_names[i] = name + str(counter[name])
47
- counter[name] -= 1
48
- return new_names
1
+ from typing import List, Optional
2
+ import pandas as pd
3
+ from edsl.conjure.InputData import InputDataABC
4
+ from edsl.conjure.utilities import convert_value
5
+
6
+
7
class InputDataCSV(InputDataABC):
    """Input data read from a CSV file with pandas.

    Each CSV column is one question: the header is the question text and the
    column's cells are the responses to it.
    """

    def __init__(self, datafile_name: str, config: Optional[dict] = None, **kwargs):
        """Set up the reader.

        :param datafile_name: Path of the CSV file.
        :param config: Reading options. ``"skiprows"`` and ``"delimiter"`` are
            the keys this class uses; when omitted, no rows are skipped and the
            delimiter is a comma.
        :param kwargs: Passed through unchanged to the parent initializer.
        """
        if config is None:
            config = {"skiprows": None, "delimiter": ","}

        super().__init__(datafile_name, config, **kwargs)

    def get_df(self) -> pd.DataFrame:
        """Return the CSV content as an all-string DataFrame (loaded once).

        The frame is cached on ``self._df`` after the first successful load.
        Missing cells of non-float columns become empty strings; float columns
        are stringified first, so their missing cells keep pandas' string form
        of NaN.
        """
        if not hasattr(self, "_df"):
            df = pd.read_csv(
                self.datafile_name,
                skiprows=self.config["skiprows"],
                # The configured delimiter used to be ignored; honour it, and
                # fall back to a comma for configs that do not define one.
                delimiter=self.config.get("delimiter", ","),
                encoding_errors="ignore",
            )
            float_columns = df.select_dtypes(include=["float64"]).columns
            df[float_columns] = df[float_columns].astype(str)
            df.fillna("", inplace=True)
            # Cache only the fully normalised frame, so a failure part-way
            # through does not leave a half-converted frame behind.
            self._df = df.astype(str)
        return self._df

    def get_raw_data(self) -> List[List[str]]:
        """Return one list of converted responses per column, in column order."""
        columns = self.get_df().to_dict(orient="list")
        return [[convert_value(obs) for obs in column] for column in columns.values()]

    def get_question_texts(self):
        """Return the CSV column headers, used as the question texts."""
        return list(self.get_df().columns)

    def get_question_names(self):
        """Derive a question name from each question text.

        Names come from ``self.naming_function`` applied to each entry of
        ``self.question_texts`` (both provided outside this class). When the
        same name is produced more than once, the duplicates are told apart by
        a count-down suffix: with ``n`` occurrences the first becomes
        ``name + str(n)``, the next ``name + str(n - 1)``, and the last one
        keeps the bare name.
        """
        new_names = [self.naming_function(q) for q in self.question_texts]

        if len(new_names) > len(set(new_names)):
            from collections import Counter

            remaining = Counter(new_names)
            for i, name in enumerate(new_names):
                if remaining[name] > 1:
                    new_names[i] = name + str(remaining[name])
                    remaining[name] -= 1
        return new_names
@@ -1,182 +1,182 @@
1
- import functools
2
- from typing import List
3
- from edsl.conjure.utilities import Missing
4
- from collections import Counter
5
-
6
-
7
- class InputDataMixinQuestionStats:
8
- def question_statistics(self, question_name: str) -> "QuestionStats":
9
- """Return statistics for a question."""
10
- return self.QuestionStats(**self._compute_question_statistics(question_name))
11
-
12
- def _compute_question_statistics(self, question_name: str) -> dict:
13
- """
14
- Return a dictionary of statistics for a question.
15
-
16
- >>> from edsl.conjure.InputData import InputDataABC
17
- >>> id = InputDataABC.example()
18
- >>> id._compute_question_statistics('morning')
19
- {'num_responses': 2, 'num_unique_responses': 2, 'missing': 0, 'unique_responses': ..., 'frac_numerical': 0.0, 'top_5': [('1', 1), ('4', 1)], 'frac_obs_from_top_5': 1.0}
20
- """
21
- idx = self.question_names.index(question_name)
22
- return {attr: getattr(self, attr)[idx] for attr in self.question_attributes}
23
-
24
- @property
25
- def num_responses(self) -> List[int]:
26
- """
27
- Return the number of responses for each question.
28
-
29
- >>> from edsl.conjure.InputData import InputDataABC
30
- >>> id = InputDataABC.example()
31
- >>> id.num_responses
32
- [2, 2]
33
- """
34
- return self.compute_num_responses()
35
-
36
- @functools.lru_cache(maxsize=1)
37
- def compute_num_responses(self):
38
- return [len(responses) for responses in self.raw_data]
39
-
40
- @property
41
- def num_unique_responses(self) -> List[int]:
42
- """
43
- The number of unique responses for each question.
44
-
45
- >>> from edsl.conjure.InputData import InputDataABC
46
- >>> id = InputDataABC.example()
47
- >>> id.num_unique_responses
48
- [2, 2]
49
- """
50
- return self.compute_num_unique_responses()
51
-
52
- @functools.lru_cache(maxsize=1)
53
- def compute_num_unique_responses(self):
54
- return [len(set(responses)) for responses in self.raw_data]
55
-
56
- @property
57
- def missing(self) -> List[int]:
58
- """The number of observations that are missing.
59
-
60
- >>> from edsl.conjure.InputData import InputDataABC
61
- >>> input_data = InputDataABC.example(raw_data = [[1,2,Missing().value()]], question_texts = ['A question'])
62
- >>> input_data.missing
63
- [1]
64
-
65
- """
66
- return self.compute_missing()
67
-
68
- @functools.lru_cache(maxsize=1)
69
- def compute_missing(self):
70
- return [sum([1 for x in v if x == Missing().value()]) for v in self.raw_data]
71
-
72
- @property
73
- def frac_numerical(self) -> List[float]:
74
- """
75
- The fraction of responses that are numerical for each question.
76
-
77
- >>> from edsl.conjure.InputData import InputDataABC
78
- >>> input_data = InputDataABC.example(raw_data = [[1,2,"Poop", 3]], question_texts = ['A question'])
79
- >>> input_data.frac_numerical
80
- [0.75]
81
- """
82
- return self.compute_frac_numerical()
83
-
84
- @functools.lru_cache(maxsize=1)
85
- def compute_frac_numerical(self):
86
- return [
87
- sum([1 for x in v if isinstance(x, (int, float))]) / len(v)
88
- for v in self.raw_data
89
- ]
90
-
91
- @functools.lru_cache(maxsize=1)
92
- def top_k(self, k: int) -> List[List[tuple]]:
93
- """
94
- >>> from edsl.conjure.InputData import InputDataABC
95
- >>> input_data = InputDataABC.example(raw_data = [[1,1,1,1,1,2]], question_texts = ['A question'])
96
- >>> input_data.top_k(1)
97
- [[(1, 5)]]
98
- >>> input_data.top_k(2)
99
- [[(1, 5), (2, 1)]]
100
- """
101
- return [Counter(value).most_common(k) for value in self.raw_data]
102
-
103
- @functools.lru_cache(maxsize=1)
104
- def frac_obs_from_top_k(self, k):
105
- """
106
- Return the fraction of observations that are in the top k for each question.
107
-
108
- >>> from edsl.conjure.InputData import InputDataABC
109
- >>> input_data = InputDataABC.example(raw_data = [[1,1,1,1,1,1,1,1,2, 3]], question_names = ['a'])
110
- >>> input_data.frac_obs_from_top_k(1)
111
- [0.8]
112
- """
113
- return [
114
- round(
115
- sum([x[1] for x in Counter(value).most_common(k) if x[0] != "missing"])
116
- / len(value),
117
- 2,
118
- )
119
- for value in self.raw_data
120
- ]
121
-
122
- @property
123
- def frac_obs_from_top_5(self):
124
- """The fraction of observations that are in the top 5 for each question."""
125
- return self.frac_obs_from_top_k(5)
126
-
127
- @property
128
- def top_5(self):
129
- """The top 5 responses for each question."""
130
- return self.top_k(5)
131
-
132
- @property
133
- def unique_responses(self) -> List[List[str]]:
134
- """Return a list of unique responses for each question.
135
-
136
- >>> from edsl.conjure.InputData import InputDataABC
137
- >>> id = InputDataABC.example()
138
- >>> id.unique_responses
139
- [..., ...]
140
- """
141
- return self.compute_unique_responses()
142
-
143
- @functools.lru_cache(maxsize=1)
144
- def compute_unique_responses(self):
145
- return [
146
- list(set(self.filter_missing(responses))) for responses in self.raw_data
147
- ]
148
-
149
- @staticmethod
150
- def filter_missing(responses) -> List[str]:
151
- """Return a list of responses with missing values removed."""
152
- return [
153
- v
154
- for v in responses
155
- if v != Missing().value() and v != "missing" and v != ""
156
- ]
157
-
158
- def unique_responses_more_than_k(self, k, remove_missing=True) -> List[List[str]]:
159
- """Return a list of unique responses that occur more than k times for each question.
160
-
161
- >>> from edsl.conjure.InputData import InputDataABC
162
- >>> id = InputDataABC.example()
163
- >>> id.unique_responses_more_than_k(1)
164
- [[...], [...]]
165
-
166
- """
167
- counters = [Counter(responses) for responses in self.raw_data]
168
- new_counters = []
169
- for question in counters:
170
- top_options = []
171
- for option, count in question.items():
172
- if count > k and (option != "missing" or not remove_missing):
173
- top_options.append(option)
174
- new_counters.append(top_options)
175
- return new_counters
176
-
177
-
178
- if __name__ == "__main__":
179
- from edsl.conjure.InputData import InputDataABC
180
- import doctest
181
-
182
- doctest.testmod(optionflags=doctest.ELLIPSIS)
1
+ import functools
2
+ from typing import List
3
+ from edsl.conjure.utilities import Missing
4
+ from collections import Counter
5
+
6
+
7
+ class InputDataMixinQuestionStats:
8
+ def question_statistics(self, question_name: str) -> "QuestionStats":
9
+ """Return statistics for a question."""
10
+ return self.QuestionStats(**self._compute_question_statistics(question_name))
11
+
12
+ def _compute_question_statistics(self, question_name: str) -> dict:
13
+ """
14
+ Return a dictionary of statistics for a question.
15
+
16
+ >>> from edsl.conjure.InputData import InputDataABC
17
+ >>> id = InputDataABC.example()
18
+ >>> id._compute_question_statistics('morning')
19
+ {'num_responses': 2, 'num_unique_responses': 2, 'missing': 0, 'unique_responses': ..., 'frac_numerical': 0.0, 'top_5': [('1', 1), ('4', 1)], 'frac_obs_from_top_5': 1.0}
20
+ """
21
+ idx = self.question_names.index(question_name)
22
+ return {attr: getattr(self, attr)[idx] for attr in self.question_attributes}
23
+
24
+ @property
25
+ def num_responses(self) -> List[int]:
26
+ """
27
+ Return the number of responses for each question.
28
+
29
+ >>> from edsl.conjure.InputData import InputDataABC
30
+ >>> id = InputDataABC.example()
31
+ >>> id.num_responses
32
+ [2, 2]
33
+ """
34
+ return self.compute_num_responses()
35
+
36
+ @functools.lru_cache(maxsize=1)
37
+ def compute_num_responses(self):
38
+ return [len(responses) for responses in self.raw_data]
39
+
40
+ @property
41
+ def num_unique_responses(self) -> List[int]:
42
+ """
43
+ The number of unique responses for each question.
44
+
45
+ >>> from edsl.conjure.InputData import InputDataABC
46
+ >>> id = InputDataABC.example()
47
+ >>> id.num_unique_responses
48
+ [2, 2]
49
+ """
50
+ return self.compute_num_unique_responses()
51
+
52
+ @functools.lru_cache(maxsize=1)
53
+ def compute_num_unique_responses(self):
54
+ return [len(set(responses)) for responses in self.raw_data]
55
+
56
+ @property
57
+ def missing(self) -> List[int]:
58
+ """The number of observations that are missing.
59
+
60
+ >>> from edsl.conjure.InputData import InputDataABC
61
+ >>> input_data = InputDataABC.example(raw_data = [[1,2,Missing().value()]], question_texts = ['A question'])
62
+ >>> input_data.missing
63
+ [1]
64
+
65
+ """
66
+ return self.compute_missing()
67
+
68
+ @functools.lru_cache(maxsize=1)
69
+ def compute_missing(self):
70
+ return [sum([1 for x in v if x == Missing().value()]) for v in self.raw_data]
71
+
72
+ @property
73
+ def frac_numerical(self) -> List[float]:
74
+ """
75
+ The fraction of responses that are numerical for each question.
76
+
77
+ >>> from edsl.conjure.InputData import InputDataABC
78
+ >>> input_data = InputDataABC.example(raw_data = [[1,2,"Poop", 3]], question_texts = ['A question'])
79
+ >>> input_data.frac_numerical
80
+ [0.75]
81
+ """
82
+ return self.compute_frac_numerical()
83
+
84
+ @functools.lru_cache(maxsize=1)
85
+ def compute_frac_numerical(self):
86
+ return [
87
+ sum([1 for x in v if isinstance(x, (int, float))]) / len(v)
88
+ for v in self.raw_data
89
+ ]
90
+
91
+ @functools.lru_cache(maxsize=1)
92
+ def top_k(self, k: int) -> List[List[tuple]]:
93
+ """
94
+ >>> from edsl.conjure.InputData import InputDataABC
95
+ >>> input_data = InputDataABC.example(raw_data = [[1,1,1,1,1,2]], question_texts = ['A question'])
96
+ >>> input_data.top_k(1)
97
+ [[(1, 5)]]
98
+ >>> input_data.top_k(2)
99
+ [[(1, 5), (2, 1)]]
100
+ """
101
+ return [Counter(value).most_common(k) for value in self.raw_data]
102
+
103
+ @functools.lru_cache(maxsize=1)
104
+ def frac_obs_from_top_k(self, k):
105
+ """
106
+ Return the fraction of observations that are in the top k for each question.
107
+
108
+ >>> from edsl.conjure.InputData import InputDataABC
109
+ >>> input_data = InputDataABC.example(raw_data = [[1,1,1,1,1,1,1,1,2, 3]], question_names = ['a'])
110
+ >>> input_data.frac_obs_from_top_k(1)
111
+ [0.8]
112
+ """
113
+ return [
114
+ round(
115
+ sum([x[1] for x in Counter(value).most_common(k) if x[0] != "missing"])
116
+ / len(value),
117
+ 2,
118
+ )
119
+ for value in self.raw_data
120
+ ]
121
+
122
+ @property
123
+ def frac_obs_from_top_5(self):
124
+ """The fraction of observations that are in the top 5 for each question."""
125
+ return self.frac_obs_from_top_k(5)
126
+
127
+ @property
128
+ def top_5(self):
129
+ """The top 5 responses for each question."""
130
+ return self.top_k(5)
131
+
132
+ @property
133
+ def unique_responses(self) -> List[List[str]]:
134
+ """Return a list of unique responses for each question.
135
+
136
+ >>> from edsl.conjure.InputData import InputDataABC
137
+ >>> id = InputDataABC.example()
138
+ >>> id.unique_responses
139
+ [..., ...]
140
+ """
141
+ return self.compute_unique_responses()
142
+
143
+ @functools.lru_cache(maxsize=1)
144
+ def compute_unique_responses(self):
145
+ return [
146
+ list(set(self.filter_missing(responses))) for responses in self.raw_data
147
+ ]
148
+
149
+ @staticmethod
150
+ def filter_missing(responses) -> List[str]:
151
+ """Return a list of responses with missing values removed."""
152
+ return [
153
+ v
154
+ for v in responses
155
+ if v != Missing().value() and v != "missing" and v != ""
156
+ ]
157
+
158
+ def unique_responses_more_than_k(self, k, remove_missing=True) -> List[List[str]]:
159
+ """Return a list of unique responses that occur more than k times for each question.
160
+
161
+ >>> from edsl.conjure.InputData import InputDataABC
162
+ >>> id = InputDataABC.example()
163
+ >>> id.unique_responses_more_than_k(1)
164
+ [[...], [...]]
165
+
166
+ """
167
+ counters = [Counter(responses) for responses in self.raw_data]
168
+ new_counters = []
169
+ for question in counters:
170
+ top_options = []
171
+ for option, count in question.items():
172
+ if count > k and (option != "missing" or not remove_missing):
173
+ top_options.append(option)
174
+ new_counters.append(top_options)
175
+ return new_counters
176
+
177
+
178
+ if __name__ == "__main__":
179
+ from edsl.conjure.InputData import InputDataABC
180
+ import doctest
181
+
182
+ doctest.testmod(optionflags=doctest.ELLIPSIS)
@@ -1,91 +1,91 @@
1
- import pandas as pd
2
- from typing import List
3
-
4
- from edsl.conjure.InputData import InputDataABC
5
- from edsl.conjure.utilities import convert_value
6
- from edsl.utilities.utilities import is_valid_variable_name
7
-
8
- try:
9
- import pyreadstat
10
- except ImportError as e:
11
- raise ImportError(
12
- "The 'pyreadstat' package is required for this feature. Please install it by running:\n"
13
- "pip install pyreadstat\n"
14
- ) from e
15
-
16
-
17
- class InputDataPyRead(InputDataABC):
18
- def pyread_function(self, datafile_name):
19
- raise NotImplementedError
20
-
21
- def _parse(self) -> None:
22
- try:
23
- df, meta = self.pyread_function(self.datafile_name)
24
- except Exception as e:
25
- raise ValueError(
26
- f"An error occurred while reading the file {self.datafile_name}."
27
- ) from e
28
- float_columns = df.select_dtypes(include=["float64"]).columns
29
- df[float_columns] = df[float_columns].astype(str)
30
-
31
- df.fillna("", inplace=True)
32
- df = df.astype(str)
33
- self._df = df
34
- self._meta = meta
35
-
36
- def get_df(self) -> pd.DataFrame:
37
- if not hasattr(self, "_df"):
38
- self._parse()
39
- return self._df
40
-
41
- def get_answer_codebook(self):
42
- if not hasattr(self, "_meta"):
43
- self._parse()
44
-
45
- question_name_to_label_name = self._meta.variable_to_label
46
- label_name_to_labels = self._meta.value_labels
47
- return {
48
- qn: label_name_to_labels[label_name]
49
- for qn, label_name in question_name_to_label_name.items()
50
- }
51
-
52
- def get_raw_data(self) -> List[List[str]]:
53
- df = self.get_df()
54
- data = [
55
- [convert_value(obs) for obs in v]
56
- for k, v in df.to_dict(orient="list").items()
57
- ]
58
- return data
59
-
60
- @property
61
- def question_names_to_question_texts(self):
62
- """Return a dictionary of question names to question texts.
63
- This will repair the question names if they are not valid Python identifiers using the
64
- same question_name_repair_func that was passed in.
65
- """
66
- if not hasattr(self, "_meta"):
67
- self._parse()
68
- d = {}
69
- for qn, label in self._meta.column_names_to_labels.items():
70
- new_name = qn
71
- if not is_valid_variable_name(qn):
72
- new_name = self.question_name_repair_func(qn)
73
- if not is_valid_variable_name(new_name):
74
- raise ValueError(
75
- f"""Question names must be valid Python identifiers. '{qn}' is not.""",
76
- """You can pass an entry in question_name_repair_dict to fix this.""",
77
- )
78
- if label is not None:
79
- d[new_name] = label
80
- return d
81
-
82
- def get_question_texts(self):
83
- if not hasattr(self, "_meta"):
84
- self._parse()
85
- return [
86
- self.question_names_to_question_texts.get(qn, qn)
87
- for qn in self.question_names
88
- ]
89
-
90
- def get_question_names(self):
91
- return self.get_df().columns.tolist()
1
+ import pandas as pd
2
+ from typing import List
3
+
4
+ from edsl.conjure.InputData import InputDataABC
5
+ from edsl.conjure.utilities import convert_value
6
+ from edsl.utilities.utilities import is_valid_variable_name
7
+
8
+ try:
9
+ import pyreadstat
10
+ except ImportError as e:
11
+ raise ImportError(
12
+ "The 'pyreadstat' package is required for this feature. Please install it by running:\n"
13
+ "pip install pyreadstat\n"
14
+ ) from e
15
+
16
+
17
class InputDataPyRead(InputDataABC):
    """Base class for input data files read through pyreadstat.

    Subclasses implement ``pyread_function`` for one file format; this class
    turns its ``(DataFrame, metadata)`` result into question names, question
    texts, raw responses and an answer codebook.
    """

    def pyread_function(self, datafile_name):
        """Read ``datafile_name`` and return ``(df, meta)``; format-specific.

        :raises NotImplementedError: always, until overridden by a subclass.
        """
        raise NotImplementedError

    def _parse(self) -> None:
        """Read the data file and cache the result on ``_df`` and ``_meta``.

        The frame is normalised to strings: float columns are stringified
        first, then remaining missing cells become empty strings.

        :raises ValueError: if ``pyread_function`` fails for any reason; the
            original exception is chained as the cause.
        """
        try:
            df, meta = self.pyread_function(self.datafile_name)
        except Exception as e:
            raise ValueError(
                f"An error occurred while reading the file {self.datafile_name}."
            ) from e
        float_columns = df.select_dtypes(include=["float64"]).columns
        df[float_columns] = df[float_columns].astype(str)

        df.fillna("", inplace=True)
        df = df.astype(str)
        self._df = df
        self._meta = meta

    def get_df(self) -> pd.DataFrame:
        """Return the parsed all-string DataFrame, parsing the file on first use."""
        if not hasattr(self, "_df"):
            self._parse()
        return self._df

    def get_answer_codebook(self):
        """Return a mapping of question name to its value-label dictionary.

        Built from the metadata's ``variable_to_label`` (question name to
        label-set name) and ``value_labels`` (label-set name to labels).
        """
        if not hasattr(self, "_meta"):
            self._parse()

        question_name_to_label_name = self._meta.variable_to_label
        label_name_to_labels = self._meta.value_labels
        return {
            qn: label_name_to_labels[label_name]
            for qn, label_name in question_name_to_label_name.items()
        }

    def get_raw_data(self) -> List[List[str]]:
        """Return one list of converted responses per column, in column order."""
        columns = self.get_df().to_dict(orient="list")
        return [[convert_value(obs) for obs in column] for column in columns.values()]

    @property
    def question_names_to_question_texts(self):
        """Return a dictionary of question names to question texts.

        This will repair the question names if they are not valid Python identifiers using the
        same question_name_repair_func that was passed in. Columns whose
        label is ``None`` are left out of the result.

        :raises ValueError: if a name is still not a valid identifier after repair.
        """
        if not hasattr(self, "_meta"):
            self._parse()
        d = {}
        for qn, label in self._meta.column_names_to_labels.items():
            new_name = qn
            if not is_valid_variable_name(qn):
                new_name = self.question_name_repair_func(qn)
                if not is_valid_variable_name(new_name):
                    # One message string: passing two positional strings made
                    # the exception render as a tuple.
                    raise ValueError(
                        f"Question names must be valid Python identifiers. '{qn}' is not. "
                        "You can pass an entry in question_name_repair_dict to fix this."
                    )
            if label is not None:
                d[new_name] = label
        return d

    def get_question_texts(self):
        """Return the text for each question name, or the name itself if unlabeled."""
        if not hasattr(self, "_meta"):
            self._parse()
        # Build the name -> label mapping once; the property recomputes it on
        # every access, which previously happened once per question.
        names_to_texts = self.question_names_to_question_texts
        return [names_to_texts.get(qn, qn) for qn in self.question_names]

    def get_question_names(self):
        """Return the DataFrame's column names as the question names."""
        return self.get_df().columns.tolist()
@@ -1,8 +1,8 @@
1
- from edsl.conjure.InputDataPyRead import InputDataPyRead
2
-
3
-
4
- class InputDataSPSS(InputDataPyRead):
5
- def pyread_function(self, datafile_name):
6
- from pyreadstat import read_sav
7
-
8
- return read_sav(datafile_name)
1
+ from edsl.conjure.InputDataPyRead import InputDataPyRead
2
+
3
+
4
class InputDataSPSS(InputDataPyRead):
    """Input data read from an SPSS file via ``pyreadstat.read_sav``."""

    def pyread_function(self, datafile_name):
        """Read ``datafile_name`` with pyreadstat and return its result as-is."""
        # Imported at call time, as in the original implementation.
        import pyreadstat

        return pyreadstat.read_sav(datafile_name)