edsl 0.1.37.dev2__py3-none-any.whl → 0.1.37.dev3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- edsl/Base.py +303 -303
- edsl/BaseDiff.py +260 -260
- edsl/TemplateLoader.py +24 -24
- edsl/__init__.py +48 -48
- edsl/__version__.py +1 -1
- edsl/agents/Agent.py +804 -804
- edsl/agents/AgentList.py +345 -345
- edsl/agents/Invigilator.py +222 -222
- edsl/agents/InvigilatorBase.py +305 -305
- edsl/agents/PromptConstructor.py +312 -312
- edsl/agents/__init__.py +3 -3
- edsl/agents/descriptors.py +86 -86
- edsl/agents/prompt_helpers.py +129 -129
- edsl/auto/AutoStudy.py +117 -117
- edsl/auto/StageBase.py +230 -230
- edsl/auto/StageGenerateSurvey.py +178 -178
- edsl/auto/StageLabelQuestions.py +125 -125
- edsl/auto/StagePersona.py +61 -61
- edsl/auto/StagePersonaDimensionValueRanges.py +88 -88
- edsl/auto/StagePersonaDimensionValues.py +74 -74
- edsl/auto/StagePersonaDimensions.py +69 -69
- edsl/auto/StageQuestions.py +73 -73
- edsl/auto/SurveyCreatorPipeline.py +21 -21
- edsl/auto/utilities.py +224 -224
- edsl/base/Base.py +289 -289
- edsl/config.py +149 -149
- edsl/conjure/AgentConstructionMixin.py +152 -152
- edsl/conjure/Conjure.py +62 -62
- edsl/conjure/InputData.py +659 -659
- edsl/conjure/InputDataCSV.py +48 -48
- edsl/conjure/InputDataMixinQuestionStats.py +182 -182
- edsl/conjure/InputDataPyRead.py +91 -91
- edsl/conjure/InputDataSPSS.py +8 -8
- edsl/conjure/InputDataStata.py +8 -8
- edsl/conjure/QuestionOptionMixin.py +76 -76
- edsl/conjure/QuestionTypeMixin.py +23 -23
- edsl/conjure/RawQuestion.py +65 -65
- edsl/conjure/SurveyResponses.py +7 -7
- edsl/conjure/__init__.py +9 -9
- edsl/conjure/naming_utilities.py +263 -263
- edsl/conjure/utilities.py +201 -201
- edsl/conversation/Conversation.py +238 -238
- edsl/conversation/car_buying.py +58 -58
- edsl/conversation/mug_negotiation.py +81 -81
- edsl/conversation/next_speaker_utilities.py +93 -93
- edsl/coop/PriceFetcher.py +54 -54
- edsl/coop/__init__.py +2 -2
- edsl/coop/coop.py +824 -824
- edsl/coop/utils.py +131 -131
- edsl/data/Cache.py +527 -527
- edsl/data/CacheEntry.py +228 -228
- edsl/data/CacheHandler.py +149 -149
- edsl/data/RemoteCacheSync.py +97 -97
- edsl/data/SQLiteDict.py +292 -292
- edsl/data/__init__.py +4 -4
- edsl/data/orm.py +10 -10
- edsl/data_transfer_models.py +73 -73
- edsl/enums.py +173 -173
- edsl/exceptions/__init__.py +50 -50
- edsl/exceptions/agents.py +40 -40
- edsl/exceptions/configuration.py +16 -16
- edsl/exceptions/coop.py +10 -10
- edsl/exceptions/data.py +14 -14
- edsl/exceptions/general.py +34 -34
- edsl/exceptions/jobs.py +33 -33
- edsl/exceptions/language_models.py +63 -63
- edsl/exceptions/prompts.py +15 -15
- edsl/exceptions/questions.py +91 -91
- edsl/exceptions/results.py +26 -26
- edsl/exceptions/surveys.py +34 -34
- edsl/inference_services/AnthropicService.py +87 -87
- edsl/inference_services/AwsBedrock.py +115 -115
- edsl/inference_services/AzureAI.py +217 -217
- edsl/inference_services/DeepInfraService.py +18 -18
- edsl/inference_services/GoogleService.py +156 -156
- edsl/inference_services/GroqService.py +20 -20
- edsl/inference_services/InferenceServiceABC.py +147 -147
- edsl/inference_services/InferenceServicesCollection.py +74 -74
- edsl/inference_services/MistralAIService.py +123 -123
- edsl/inference_services/OllamaService.py +18 -18
- edsl/inference_services/OpenAIService.py +224 -224
- edsl/inference_services/TestService.py +89 -89
- edsl/inference_services/TogetherAIService.py +170 -170
- edsl/inference_services/models_available_cache.py +118 -118
- edsl/inference_services/rate_limits_cache.py +25 -25
- edsl/inference_services/registry.py +39 -39
- edsl/inference_services/write_available.py +10 -10
- edsl/jobs/Answers.py +56 -56
- edsl/jobs/Jobs.py +1121 -1112
- edsl/jobs/__init__.py +1 -1
- edsl/jobs/buckets/BucketCollection.py +63 -63
- edsl/jobs/buckets/ModelBuckets.py +65 -65
- edsl/jobs/buckets/TokenBucket.py +248 -248
- edsl/jobs/interviews/Interview.py +661 -661
- edsl/jobs/interviews/InterviewExceptionCollection.py +99 -99
- edsl/jobs/interviews/InterviewExceptionEntry.py +182 -182
- edsl/jobs/interviews/InterviewStatistic.py +63 -63
- edsl/jobs/interviews/InterviewStatisticsCollection.py +25 -25
- edsl/jobs/interviews/InterviewStatusDictionary.py +78 -78
- edsl/jobs/interviews/InterviewStatusLog.py +92 -92
- edsl/jobs/interviews/ReportErrors.py +66 -66
- edsl/jobs/interviews/interview_status_enum.py +9 -9
- edsl/jobs/runners/JobsRunnerAsyncio.py +338 -338
- edsl/jobs/runners/JobsRunnerStatus.py +332 -332
- edsl/jobs/tasks/QuestionTaskCreator.py +242 -242
- edsl/jobs/tasks/TaskCreators.py +64 -64
- edsl/jobs/tasks/TaskHistory.py +441 -441
- edsl/jobs/tasks/TaskStatusLog.py +23 -23
- edsl/jobs/tasks/task_status_enum.py +163 -163
- edsl/jobs/tokens/InterviewTokenUsage.py +27 -27
- edsl/jobs/tokens/TokenUsage.py +34 -34
- edsl/language_models/LanguageModel.py +718 -718
- edsl/language_models/ModelList.py +102 -102
- edsl/language_models/RegisterLanguageModelsMeta.py +184 -184
- edsl/language_models/__init__.py +2 -2
- edsl/language_models/fake_openai_call.py +15 -15
- edsl/language_models/fake_openai_service.py +61 -61
- edsl/language_models/registry.py +137 -137
- edsl/language_models/repair.py +156 -156
- edsl/language_models/unused/ReplicateBase.py +83 -83
- edsl/language_models/utilities.py +64 -64
- edsl/notebooks/Notebook.py +259 -259
- edsl/notebooks/__init__.py +1 -1
- edsl/prompts/Prompt.py +353 -353
- edsl/prompts/__init__.py +2 -2
- edsl/questions/AnswerValidatorMixin.py +289 -289
- edsl/questions/QuestionBase.py +616 -616
- edsl/questions/QuestionBaseGenMixin.py +161 -161
- edsl/questions/QuestionBasePromptsMixin.py +266 -266
- edsl/questions/QuestionBudget.py +227 -227
- edsl/questions/QuestionCheckBox.py +359 -359
- edsl/questions/QuestionExtract.py +183 -183
- edsl/questions/QuestionFreeText.py +114 -114
- edsl/questions/QuestionFunctional.py +159 -159
- edsl/questions/QuestionList.py +231 -231
- edsl/questions/QuestionMultipleChoice.py +286 -286
- edsl/questions/QuestionNumerical.py +153 -153
- edsl/questions/QuestionRank.py +324 -324
- edsl/questions/Quick.py +41 -41
- edsl/questions/RegisterQuestionsMeta.py +71 -71
- edsl/questions/ResponseValidatorABC.py +174 -174
- edsl/questions/SimpleAskMixin.py +73 -73
- edsl/questions/__init__.py +26 -26
- edsl/questions/compose_questions.py +98 -98
- edsl/questions/decorators.py +21 -21
- edsl/questions/derived/QuestionLikertFive.py +76 -76
- edsl/questions/derived/QuestionLinearScale.py +87 -87
- edsl/questions/derived/QuestionTopK.py +91 -91
- edsl/questions/derived/QuestionYesNo.py +82 -82
- edsl/questions/descriptors.py +418 -418
- edsl/questions/prompt_templates/question_budget.jinja +13 -13
- edsl/questions/prompt_templates/question_checkbox.jinja +32 -32
- edsl/questions/prompt_templates/question_extract.jinja +11 -11
- edsl/questions/prompt_templates/question_free_text.jinja +3 -3
- edsl/questions/prompt_templates/question_linear_scale.jinja +11 -11
- edsl/questions/prompt_templates/question_list.jinja +17 -17
- edsl/questions/prompt_templates/question_multiple_choice.jinja +33 -33
- edsl/questions/prompt_templates/question_numerical.jinja +36 -36
- edsl/questions/question_registry.py +147 -147
- edsl/questions/settings.py +12 -12
- edsl/questions/templates/budget/answering_instructions.jinja +7 -7
- edsl/questions/templates/budget/question_presentation.jinja +7 -7
- edsl/questions/templates/checkbox/answering_instructions.jinja +10 -10
- edsl/questions/templates/checkbox/question_presentation.jinja +22 -22
- edsl/questions/templates/extract/answering_instructions.jinja +7 -7
- edsl/questions/templates/likert_five/answering_instructions.jinja +10 -10
- edsl/questions/templates/likert_five/question_presentation.jinja +11 -11
- edsl/questions/templates/linear_scale/answering_instructions.jinja +5 -5
- edsl/questions/templates/linear_scale/question_presentation.jinja +5 -5
- edsl/questions/templates/list/answering_instructions.jinja +3 -3
- edsl/questions/templates/list/question_presentation.jinja +5 -5
- edsl/questions/templates/multiple_choice/answering_instructions.jinja +9 -9
- edsl/questions/templates/multiple_choice/question_presentation.jinja +11 -11
- edsl/questions/templates/numerical/answering_instructions.jinja +6 -6
- edsl/questions/templates/numerical/question_presentation.jinja +6 -6
- edsl/questions/templates/rank/answering_instructions.jinja +11 -11
- edsl/questions/templates/rank/question_presentation.jinja +15 -15
- edsl/questions/templates/top_k/answering_instructions.jinja +8 -8
- edsl/questions/templates/top_k/question_presentation.jinja +22 -22
- edsl/questions/templates/yes_no/answering_instructions.jinja +6 -6
- edsl/questions/templates/yes_no/question_presentation.jinja +11 -11
- edsl/results/Dataset.py +293 -293
- edsl/results/DatasetExportMixin.py +693 -693
- edsl/results/DatasetTree.py +145 -145
- edsl/results/Result.py +435 -435
- edsl/results/Results.py +1160 -1160
- edsl/results/ResultsDBMixin.py +238 -238
- edsl/results/ResultsExportMixin.py +43 -43
- edsl/results/ResultsFetchMixin.py +33 -33
- edsl/results/ResultsGGMixin.py +121 -121
- edsl/results/ResultsToolsMixin.py +98 -98
- edsl/results/Selector.py +118 -118
- edsl/results/__init__.py +2 -2
- edsl/results/tree_explore.py +115 -115
- edsl/scenarios/FileStore.py +458 -458
- edsl/scenarios/Scenario.py +510 -510
- edsl/scenarios/ScenarioHtmlMixin.py +59 -59
- edsl/scenarios/ScenarioList.py +1101 -1101
- edsl/scenarios/ScenarioListExportMixin.py +52 -52
- edsl/scenarios/ScenarioListPdfMixin.py +261 -261
- edsl/scenarios/__init__.py +4 -4
- edsl/shared.py +1 -1
- edsl/study/ObjectEntry.py +173 -173
- edsl/study/ProofOfWork.py +113 -113
- edsl/study/SnapShot.py +80 -80
- edsl/study/Study.py +528 -528
- edsl/study/__init__.py +4 -4
- edsl/surveys/DAG.py +148 -148
- edsl/surveys/Memory.py +31 -31
- edsl/surveys/MemoryPlan.py +244 -244
- edsl/surveys/Rule.py +324 -324
- edsl/surveys/RuleCollection.py +387 -387
- edsl/surveys/Survey.py +1772 -1772
- edsl/surveys/SurveyCSS.py +261 -261
- edsl/surveys/SurveyExportMixin.py +259 -259
- edsl/surveys/SurveyFlowVisualizationMixin.py +121 -121
- edsl/surveys/SurveyQualtricsImport.py +284 -284
- edsl/surveys/__init__.py +3 -3
- edsl/surveys/base.py +53 -53
- edsl/surveys/descriptors.py +56 -56
- edsl/surveys/instructions/ChangeInstruction.py +47 -47
- edsl/surveys/instructions/Instruction.py +51 -51
- edsl/surveys/instructions/InstructionCollection.py +77 -77
- edsl/templates/error_reporting/base.html +23 -23
- edsl/templates/error_reporting/exceptions_by_model.html +34 -34
- edsl/templates/error_reporting/exceptions_by_question_name.html +16 -16
- edsl/templates/error_reporting/exceptions_by_type.html +16 -16
- edsl/templates/error_reporting/interview_details.html +115 -115
- edsl/templates/error_reporting/interviews.html +9 -9
- edsl/templates/error_reporting/overview.html +4 -4
- edsl/templates/error_reporting/performance_plot.html +1 -1
- edsl/templates/error_reporting/report.css +73 -73
- edsl/templates/error_reporting/report.html +117 -117
- edsl/templates/error_reporting/report.js +25 -25
- edsl/tools/__init__.py +1 -1
- edsl/tools/clusters.py +192 -192
- edsl/tools/embeddings.py +27 -27
- edsl/tools/embeddings_plotting.py +118 -118
- edsl/tools/plotting.py +112 -112
- edsl/tools/summarize.py +18 -18
- edsl/utilities/SystemInfo.py +28 -28
- edsl/utilities/__init__.py +22 -22
- edsl/utilities/ast_utilities.py +25 -25
- edsl/utilities/data/Registry.py +6 -6
- edsl/utilities/data/__init__.py +1 -1
- edsl/utilities/data/scooter_results.json +1 -1
- edsl/utilities/decorators.py +77 -77
- edsl/utilities/gcp_bucket/cloud_storage.py +96 -96
- edsl/utilities/interface.py +627 -627
- edsl/utilities/repair_functions.py +28 -28
- edsl/utilities/restricted_python.py +70 -70
- edsl/utilities/utilities.py +391 -391
- {edsl-0.1.37.dev2.dist-info → edsl-0.1.37.dev3.dist-info}/LICENSE +21 -21
- {edsl-0.1.37.dev2.dist-info → edsl-0.1.37.dev3.dist-info}/METADATA +1 -1
- edsl-0.1.37.dev3.dist-info/RECORD +279 -0
- edsl-0.1.37.dev2.dist-info/RECORD +0 -279
- {edsl-0.1.37.dev2.dist-info → edsl-0.1.37.dev3.dist-info}/WHEEL +0 -0
edsl/conjure/InputDataCSV.py
CHANGED
@@ -1,48 +1,48 @@
|
|
1
|
-
from typing import List, Optional
|
2
|
-
import pandas as pd
|
3
|
-
from edsl.conjure.InputData import InputDataABC
|
4
|
-
from edsl.conjure.utilities import convert_value
|
5
|
-
|
6
|
-
|
7
|
-
class InputDataCSV(InputDataABC):
|
8
|
-
def __init__(self, datafile_name: str, config: Optional[dict] = None, **kwargs):
|
9
|
-
if config is None:
|
10
|
-
config = {"skiprows": None, "delimiter": ","}
|
11
|
-
|
12
|
-
super().__init__(datafile_name, config, **kwargs)
|
13
|
-
|
14
|
-
def get_df(self) -> pd.DataFrame:
|
15
|
-
if not hasattr(self, "_df"):
|
16
|
-
self._df = pd.read_csv(
|
17
|
-
self.datafile_name,
|
18
|
-
skiprows=self.config["skiprows"],
|
19
|
-
encoding_errors="ignore",
|
20
|
-
)
|
21
|
-
float_columns = self._df.select_dtypes(include=["float64"]).columns
|
22
|
-
self._df[float_columns] = self._df[float_columns].astype(str)
|
23
|
-
self._df.fillna("", inplace=True)
|
24
|
-
self._df = self._df.astype(str)
|
25
|
-
return self._df
|
26
|
-
|
27
|
-
def get_raw_data(self) -> List[List[str]]:
|
28
|
-
data = [
|
29
|
-
[convert_value(obs) for obs in v]
|
30
|
-
for k, v in self.get_df().to_dict(orient="list").items()
|
31
|
-
]
|
32
|
-
return data
|
33
|
-
|
34
|
-
def get_question_texts(self):
|
35
|
-
return list(self.get_df().columns)
|
36
|
-
|
37
|
-
def get_question_names(self):
|
38
|
-
new_names = [self.naming_function(q) for q in self.question_texts]
|
39
|
-
|
40
|
-
if len(new_names) > len(set(new_names)):
|
41
|
-
from collections import Counter
|
42
|
-
|
43
|
-
counter = Counter(new_names)
|
44
|
-
for i, name in enumerate(new_names):
|
45
|
-
if counter[name] > 1:
|
46
|
-
new_names[i] = name + str(counter[name])
|
47
|
-
counter[name] -= 1
|
48
|
-
return new_names
|
1
|
+
from typing import List, Optional
|
2
|
+
import pandas as pd
|
3
|
+
from edsl.conjure.InputData import InputDataABC
|
4
|
+
from edsl.conjure.utilities import convert_value
|
5
|
+
|
6
|
+
|
7
|
+
class InputDataCSV(InputDataABC):
|
8
|
+
def __init__(self, datafile_name: str, config: Optional[dict] = None, **kwargs):
|
9
|
+
if config is None:
|
10
|
+
config = {"skiprows": None, "delimiter": ","}
|
11
|
+
|
12
|
+
super().__init__(datafile_name, config, **kwargs)
|
13
|
+
|
14
|
+
def get_df(self) -> pd.DataFrame:
|
15
|
+
if not hasattr(self, "_df"):
|
16
|
+
self._df = pd.read_csv(
|
17
|
+
self.datafile_name,
|
18
|
+
skiprows=self.config["skiprows"],
|
19
|
+
encoding_errors="ignore",
|
20
|
+
)
|
21
|
+
float_columns = self._df.select_dtypes(include=["float64"]).columns
|
22
|
+
self._df[float_columns] = self._df[float_columns].astype(str)
|
23
|
+
self._df.fillna("", inplace=True)
|
24
|
+
self._df = self._df.astype(str)
|
25
|
+
return self._df
|
26
|
+
|
27
|
+
def get_raw_data(self) -> List[List[str]]:
|
28
|
+
data = [
|
29
|
+
[convert_value(obs) for obs in v]
|
30
|
+
for k, v in self.get_df().to_dict(orient="list").items()
|
31
|
+
]
|
32
|
+
return data
|
33
|
+
|
34
|
+
def get_question_texts(self):
|
35
|
+
return list(self.get_df().columns)
|
36
|
+
|
37
|
+
def get_question_names(self):
|
38
|
+
new_names = [self.naming_function(q) for q in self.question_texts]
|
39
|
+
|
40
|
+
if len(new_names) > len(set(new_names)):
|
41
|
+
from collections import Counter
|
42
|
+
|
43
|
+
counter = Counter(new_names)
|
44
|
+
for i, name in enumerate(new_names):
|
45
|
+
if counter[name] > 1:
|
46
|
+
new_names[i] = name + str(counter[name])
|
47
|
+
counter[name] -= 1
|
48
|
+
return new_names
|
@@ -1,182 +1,182 @@
|
|
1
|
-
import functools
|
2
|
-
from typing import List
|
3
|
-
from edsl.conjure.utilities import Missing
|
4
|
-
from collections import Counter
|
5
|
-
|
6
|
-
|
7
|
-
class InputDataMixinQuestionStats:
|
8
|
-
def question_statistics(self, question_name: str) -> "QuestionStats":
|
9
|
-
"""Return statistics for a question."""
|
10
|
-
return self.QuestionStats(**self._compute_question_statistics(question_name))
|
11
|
-
|
12
|
-
def _compute_question_statistics(self, question_name: str) -> dict:
|
13
|
-
"""
|
14
|
-
Return a dictionary of statistics for a question.
|
15
|
-
|
16
|
-
>>> from edsl.conjure.InputData import InputDataABC
|
17
|
-
>>> id = InputDataABC.example()
|
18
|
-
>>> id._compute_question_statistics('morning')
|
19
|
-
{'num_responses': 2, 'num_unique_responses': 2, 'missing': 0, 'unique_responses': ..., 'frac_numerical': 0.0, 'top_5': [('1', 1), ('4', 1)], 'frac_obs_from_top_5': 1.0}
|
20
|
-
"""
|
21
|
-
idx = self.question_names.index(question_name)
|
22
|
-
return {attr: getattr(self, attr)[idx] for attr in self.question_attributes}
|
23
|
-
|
24
|
-
@property
|
25
|
-
def num_responses(self) -> List[int]:
|
26
|
-
"""
|
27
|
-
Return the number of responses for each question.
|
28
|
-
|
29
|
-
>>> from edsl.conjure.InputData import InputDataABC
|
30
|
-
>>> id = InputDataABC.example()
|
31
|
-
>>> id.num_responses
|
32
|
-
[2, 2]
|
33
|
-
"""
|
34
|
-
return self.compute_num_responses()
|
35
|
-
|
36
|
-
@functools.lru_cache(maxsize=1)
|
37
|
-
def compute_num_responses(self):
|
38
|
-
return [len(responses) for responses in self.raw_data]
|
39
|
-
|
40
|
-
@property
|
41
|
-
def num_unique_responses(self) -> List[int]:
|
42
|
-
"""
|
43
|
-
The number of unique responses for each question.
|
44
|
-
|
45
|
-
>>> from edsl.conjure.InputData import InputDataABC
|
46
|
-
>>> id = InputDataABC.example()
|
47
|
-
>>> id.num_unique_responses
|
48
|
-
[2, 2]
|
49
|
-
"""
|
50
|
-
return self.compute_num_unique_responses()
|
51
|
-
|
52
|
-
@functools.lru_cache(maxsize=1)
|
53
|
-
def compute_num_unique_responses(self):
|
54
|
-
return [len(set(responses)) for responses in self.raw_data]
|
55
|
-
|
56
|
-
@property
|
57
|
-
def missing(self) -> List[int]:
|
58
|
-
"""The number of observations that are missing.
|
59
|
-
|
60
|
-
>>> from edsl.conjure.InputData import InputDataABC
|
61
|
-
>>> input_data = InputDataABC.example(raw_data = [[1,2,Missing().value()]], question_texts = ['A question'])
|
62
|
-
>>> input_data.missing
|
63
|
-
[1]
|
64
|
-
|
65
|
-
"""
|
66
|
-
return self.compute_missing()
|
67
|
-
|
68
|
-
@functools.lru_cache(maxsize=1)
|
69
|
-
def compute_missing(self):
|
70
|
-
return [sum([1 for x in v if x == Missing().value()]) for v in self.raw_data]
|
71
|
-
|
72
|
-
@property
|
73
|
-
def frac_numerical(self) -> List[float]:
|
74
|
-
"""
|
75
|
-
The fraction of responses that are numerical for each question.
|
76
|
-
|
77
|
-
>>> from edsl.conjure.InputData import InputDataABC
|
78
|
-
>>> input_data = InputDataABC.example(raw_data = [[1,2,"Poop", 3]], question_texts = ['A question'])
|
79
|
-
>>> input_data.frac_numerical
|
80
|
-
[0.75]
|
81
|
-
"""
|
82
|
-
return self.compute_frac_numerical()
|
83
|
-
|
84
|
-
@functools.lru_cache(maxsize=1)
|
85
|
-
def compute_frac_numerical(self):
|
86
|
-
return [
|
87
|
-
sum([1 for x in v if isinstance(x, (int, float))]) / len(v)
|
88
|
-
for v in self.raw_data
|
89
|
-
]
|
90
|
-
|
91
|
-
@functools.lru_cache(maxsize=1)
|
92
|
-
def top_k(self, k: int) -> List[List[tuple]]:
|
93
|
-
"""
|
94
|
-
>>> from edsl.conjure.InputData import InputDataABC
|
95
|
-
>>> input_data = InputDataABC.example(raw_data = [[1,1,1,1,1,2]], question_texts = ['A question'])
|
96
|
-
>>> input_data.top_k(1)
|
97
|
-
[[(1, 5)]]
|
98
|
-
>>> input_data.top_k(2)
|
99
|
-
[[(1, 5), (2, 1)]]
|
100
|
-
"""
|
101
|
-
return [Counter(value).most_common(k) for value in self.raw_data]
|
102
|
-
|
103
|
-
@functools.lru_cache(maxsize=1)
|
104
|
-
def frac_obs_from_top_k(self, k):
|
105
|
-
"""
|
106
|
-
Return the fraction of observations that are in the top k for each question.
|
107
|
-
|
108
|
-
>>> from edsl.conjure.InputData import InputDataABC
|
109
|
-
>>> input_data = InputDataABC.example(raw_data = [[1,1,1,1,1,1,1,1,2, 3]], question_names = ['a'])
|
110
|
-
>>> input_data.frac_obs_from_top_k(1)
|
111
|
-
[0.8]
|
112
|
-
"""
|
113
|
-
return [
|
114
|
-
round(
|
115
|
-
sum([x[1] for x in Counter(value).most_common(k) if x[0] != "missing"])
|
116
|
-
/ len(value),
|
117
|
-
2,
|
118
|
-
)
|
119
|
-
for value in self.raw_data
|
120
|
-
]
|
121
|
-
|
122
|
-
@property
|
123
|
-
def frac_obs_from_top_5(self):
|
124
|
-
"""The fraction of observations that are in the top 5 for each question."""
|
125
|
-
return self.frac_obs_from_top_k(5)
|
126
|
-
|
127
|
-
@property
|
128
|
-
def top_5(self):
|
129
|
-
"""The top 5 responses for each question."""
|
130
|
-
return self.top_k(5)
|
131
|
-
|
132
|
-
@property
|
133
|
-
def unique_responses(self) -> List[List[str]]:
|
134
|
-
"""Return a list of unique responses for each question.
|
135
|
-
|
136
|
-
>>> from edsl.conjure.InputData import InputDataABC
|
137
|
-
>>> id = InputDataABC.example()
|
138
|
-
>>> id.unique_responses
|
139
|
-
[..., ...]
|
140
|
-
"""
|
141
|
-
return self.compute_unique_responses()
|
142
|
-
|
143
|
-
@functools.lru_cache(maxsize=1)
|
144
|
-
def compute_unique_responses(self):
|
145
|
-
return [
|
146
|
-
list(set(self.filter_missing(responses))) for responses in self.raw_data
|
147
|
-
]
|
148
|
-
|
149
|
-
@staticmethod
|
150
|
-
def filter_missing(responses) -> List[str]:
|
151
|
-
"""Return a list of responses with missing values removed."""
|
152
|
-
return [
|
153
|
-
v
|
154
|
-
for v in responses
|
155
|
-
if v != Missing().value() and v != "missing" and v != ""
|
156
|
-
]
|
157
|
-
|
158
|
-
def unique_responses_more_than_k(self, k, remove_missing=True) -> List[List[str]]:
|
159
|
-
"""Return a list of unique responses that occur more than k times for each question.
|
160
|
-
|
161
|
-
>>> from edsl.conjure.InputData import InputDataABC
|
162
|
-
>>> id = InputDataABC.example()
|
163
|
-
>>> id.unique_responses_more_than_k(1)
|
164
|
-
[[...], [...]]
|
165
|
-
|
166
|
-
"""
|
167
|
-
counters = [Counter(responses) for responses in self.raw_data]
|
168
|
-
new_counters = []
|
169
|
-
for question in counters:
|
170
|
-
top_options = []
|
171
|
-
for option, count in question.items():
|
172
|
-
if count > k and (option != "missing" or not remove_missing):
|
173
|
-
top_options.append(option)
|
174
|
-
new_counters.append(top_options)
|
175
|
-
return new_counters
|
176
|
-
|
177
|
-
|
178
|
-
if __name__ == "__main__":
|
179
|
-
from edsl.conjure.InputData import InputDataABC
|
180
|
-
import doctest
|
181
|
-
|
182
|
-
doctest.testmod(optionflags=doctest.ELLIPSIS)
|
1
|
+
import functools
|
2
|
+
from typing import List
|
3
|
+
from edsl.conjure.utilities import Missing
|
4
|
+
from collections import Counter
|
5
|
+
|
6
|
+
|
7
|
+
class InputDataMixinQuestionStats:
|
8
|
+
def question_statistics(self, question_name: str) -> "QuestionStats":
|
9
|
+
"""Return statistics for a question."""
|
10
|
+
return self.QuestionStats(**self._compute_question_statistics(question_name))
|
11
|
+
|
12
|
+
def _compute_question_statistics(self, question_name: str) -> dict:
|
13
|
+
"""
|
14
|
+
Return a dictionary of statistics for a question.
|
15
|
+
|
16
|
+
>>> from edsl.conjure.InputData import InputDataABC
|
17
|
+
>>> id = InputDataABC.example()
|
18
|
+
>>> id._compute_question_statistics('morning')
|
19
|
+
{'num_responses': 2, 'num_unique_responses': 2, 'missing': 0, 'unique_responses': ..., 'frac_numerical': 0.0, 'top_5': [('1', 1), ('4', 1)], 'frac_obs_from_top_5': 1.0}
|
20
|
+
"""
|
21
|
+
idx = self.question_names.index(question_name)
|
22
|
+
return {attr: getattr(self, attr)[idx] for attr in self.question_attributes}
|
23
|
+
|
24
|
+
@property
|
25
|
+
def num_responses(self) -> List[int]:
|
26
|
+
"""
|
27
|
+
Return the number of responses for each question.
|
28
|
+
|
29
|
+
>>> from edsl.conjure.InputData import InputDataABC
|
30
|
+
>>> id = InputDataABC.example()
|
31
|
+
>>> id.num_responses
|
32
|
+
[2, 2]
|
33
|
+
"""
|
34
|
+
return self.compute_num_responses()
|
35
|
+
|
36
|
+
@functools.lru_cache(maxsize=1)
|
37
|
+
def compute_num_responses(self):
|
38
|
+
return [len(responses) for responses in self.raw_data]
|
39
|
+
|
40
|
+
@property
|
41
|
+
def num_unique_responses(self) -> List[int]:
|
42
|
+
"""
|
43
|
+
The number of unique responses for each question.
|
44
|
+
|
45
|
+
>>> from edsl.conjure.InputData import InputDataABC
|
46
|
+
>>> id = InputDataABC.example()
|
47
|
+
>>> id.num_unique_responses
|
48
|
+
[2, 2]
|
49
|
+
"""
|
50
|
+
return self.compute_num_unique_responses()
|
51
|
+
|
52
|
+
@functools.lru_cache(maxsize=1)
|
53
|
+
def compute_num_unique_responses(self):
|
54
|
+
return [len(set(responses)) for responses in self.raw_data]
|
55
|
+
|
56
|
+
@property
|
57
|
+
def missing(self) -> List[int]:
|
58
|
+
"""The number of observations that are missing.
|
59
|
+
|
60
|
+
>>> from edsl.conjure.InputData import InputDataABC
|
61
|
+
>>> input_data = InputDataABC.example(raw_data = [[1,2,Missing().value()]], question_texts = ['A question'])
|
62
|
+
>>> input_data.missing
|
63
|
+
[1]
|
64
|
+
|
65
|
+
"""
|
66
|
+
return self.compute_missing()
|
67
|
+
|
68
|
+
@functools.lru_cache(maxsize=1)
|
69
|
+
def compute_missing(self):
|
70
|
+
return [sum([1 for x in v if x == Missing().value()]) for v in self.raw_data]
|
71
|
+
|
72
|
+
@property
|
73
|
+
def frac_numerical(self) -> List[float]:
|
74
|
+
"""
|
75
|
+
The fraction of responses that are numerical for each question.
|
76
|
+
|
77
|
+
>>> from edsl.conjure.InputData import InputDataABC
|
78
|
+
>>> input_data = InputDataABC.example(raw_data = [[1,2,"Poop", 3]], question_texts = ['A question'])
|
79
|
+
>>> input_data.frac_numerical
|
80
|
+
[0.75]
|
81
|
+
"""
|
82
|
+
return self.compute_frac_numerical()
|
83
|
+
|
84
|
+
@functools.lru_cache(maxsize=1)
|
85
|
+
def compute_frac_numerical(self):
|
86
|
+
return [
|
87
|
+
sum([1 for x in v if isinstance(x, (int, float))]) / len(v)
|
88
|
+
for v in self.raw_data
|
89
|
+
]
|
90
|
+
|
91
|
+
@functools.lru_cache(maxsize=1)
|
92
|
+
def top_k(self, k: int) -> List[List[tuple]]:
|
93
|
+
"""
|
94
|
+
>>> from edsl.conjure.InputData import InputDataABC
|
95
|
+
>>> input_data = InputDataABC.example(raw_data = [[1,1,1,1,1,2]], question_texts = ['A question'])
|
96
|
+
>>> input_data.top_k(1)
|
97
|
+
[[(1, 5)]]
|
98
|
+
>>> input_data.top_k(2)
|
99
|
+
[[(1, 5), (2, 1)]]
|
100
|
+
"""
|
101
|
+
return [Counter(value).most_common(k) for value in self.raw_data]
|
102
|
+
|
103
|
+
@functools.lru_cache(maxsize=1)
|
104
|
+
def frac_obs_from_top_k(self, k):
|
105
|
+
"""
|
106
|
+
Return the fraction of observations that are in the top k for each question.
|
107
|
+
|
108
|
+
>>> from edsl.conjure.InputData import InputDataABC
|
109
|
+
>>> input_data = InputDataABC.example(raw_data = [[1,1,1,1,1,1,1,1,2, 3]], question_names = ['a'])
|
110
|
+
>>> input_data.frac_obs_from_top_k(1)
|
111
|
+
[0.8]
|
112
|
+
"""
|
113
|
+
return [
|
114
|
+
round(
|
115
|
+
sum([x[1] for x in Counter(value).most_common(k) if x[0] != "missing"])
|
116
|
+
/ len(value),
|
117
|
+
2,
|
118
|
+
)
|
119
|
+
for value in self.raw_data
|
120
|
+
]
|
121
|
+
|
122
|
+
@property
|
123
|
+
def frac_obs_from_top_5(self):
|
124
|
+
"""The fraction of observations that are in the top 5 for each question."""
|
125
|
+
return self.frac_obs_from_top_k(5)
|
126
|
+
|
127
|
+
@property
|
128
|
+
def top_5(self):
|
129
|
+
"""The top 5 responses for each question."""
|
130
|
+
return self.top_k(5)
|
131
|
+
|
132
|
+
@property
|
133
|
+
def unique_responses(self) -> List[List[str]]:
|
134
|
+
"""Return a list of unique responses for each question.
|
135
|
+
|
136
|
+
>>> from edsl.conjure.InputData import InputDataABC
|
137
|
+
>>> id = InputDataABC.example()
|
138
|
+
>>> id.unique_responses
|
139
|
+
[..., ...]
|
140
|
+
"""
|
141
|
+
return self.compute_unique_responses()
|
142
|
+
|
143
|
+
@functools.lru_cache(maxsize=1)
|
144
|
+
def compute_unique_responses(self):
|
145
|
+
return [
|
146
|
+
list(set(self.filter_missing(responses))) for responses in self.raw_data
|
147
|
+
]
|
148
|
+
|
149
|
+
@staticmethod
|
150
|
+
def filter_missing(responses) -> List[str]:
|
151
|
+
"""Return a list of responses with missing values removed."""
|
152
|
+
return [
|
153
|
+
v
|
154
|
+
for v in responses
|
155
|
+
if v != Missing().value() and v != "missing" and v != ""
|
156
|
+
]
|
157
|
+
|
158
|
+
def unique_responses_more_than_k(self, k, remove_missing=True) -> List[List[str]]:
|
159
|
+
"""Return a list of unique responses that occur more than k times for each question.
|
160
|
+
|
161
|
+
>>> from edsl.conjure.InputData import InputDataABC
|
162
|
+
>>> id = InputDataABC.example()
|
163
|
+
>>> id.unique_responses_more_than_k(1)
|
164
|
+
[[...], [...]]
|
165
|
+
|
166
|
+
"""
|
167
|
+
counters = [Counter(responses) for responses in self.raw_data]
|
168
|
+
new_counters = []
|
169
|
+
for question in counters:
|
170
|
+
top_options = []
|
171
|
+
for option, count in question.items():
|
172
|
+
if count > k and (option != "missing" or not remove_missing):
|
173
|
+
top_options.append(option)
|
174
|
+
new_counters.append(top_options)
|
175
|
+
return new_counters
|
176
|
+
|
177
|
+
|
178
|
+
if __name__ == "__main__":
|
179
|
+
from edsl.conjure.InputData import InputDataABC
|
180
|
+
import doctest
|
181
|
+
|
182
|
+
doctest.testmod(optionflags=doctest.ELLIPSIS)
|
edsl/conjure/InputDataPyRead.py
CHANGED
@@ -1,91 +1,91 @@
|
|
1
|
-
import pandas as pd
|
2
|
-
from typing import List
|
3
|
-
|
4
|
-
from edsl.conjure.InputData import InputDataABC
|
5
|
-
from edsl.conjure.utilities import convert_value
|
6
|
-
from edsl.utilities.utilities import is_valid_variable_name
|
7
|
-
|
8
|
-
try:
|
9
|
-
import pyreadstat
|
10
|
-
except ImportError as e:
|
11
|
-
raise ImportError(
|
12
|
-
"The 'pyreadstat' package is required for this feature. Please install it by running:\n"
|
13
|
-
"pip install pyreadstat\n"
|
14
|
-
) from e
|
15
|
-
|
16
|
-
|
17
|
-
class InputDataPyRead(InputDataABC):
    """Base class for survey data files that are read through ``pyreadstat``.

    Subclasses override :meth:`pyread_function` with the reader for their
    file format. The reader's ``(df, meta)`` result is cached on the instance
    as ``_df`` and ``_meta`` the first time it is needed.
    """

    def pyread_function(self, datafile_name):
        """Read ``datafile_name`` and return a ``(df, meta)`` pair.

        Must be overridden by subclasses; the base implementation only raises.
        """
        raise NotImplementedError

    def _parse(self) -> None:
        """Read the data file and cache an all-string DataFrame and its metadata.

        Raises:
            ValueError: if ``pyread_function`` raises for any reason; the
                original exception is chained as the cause.
        """
        try:
            df, meta = self.pyread_function(self.datafile_name)
        except Exception as e:
            raise ValueError(
                f"An error occurred while reading the file {self.datafile_name}."
            ) from e
        # NOTE(review): float columns are stringified *before* fillna, so
        # missing floats presumably survive as the text "nan" rather than "".
        # Left unchanged because downstream code may rely on it; confirm.
        float_columns = df.select_dtypes(include=["float64"]).columns
        df[float_columns] = df[float_columns].astype(str)

        df.fillna("", inplace=True)
        df = df.astype(str)
        self._df = df
        self._meta = meta

    def get_df(self) -> pd.DataFrame:
        """Return the cached DataFrame, parsing the file on first use."""
        if not hasattr(self, "_df"):
            self._parse()
        return self._df

    def get_answer_codebook(self):
        """Return a dict mapping question names to their value labels.

        Each name in ``meta.variable_to_label`` is mapped to the entry of
        ``meta.value_labels`` stored under that variable's label name.
        """
        if not hasattr(self, "_meta"):
            self._parse()

        question_name_to_label_name = self._meta.variable_to_label
        label_name_to_labels = self._meta.value_labels
        return {
            qn: label_name_to_labels[label_name]
            for qn, label_name in question_name_to_label_name.items()
        }

    def get_raw_data(self) -> List[List[str]]:
        """Return the data column by column, each cell passed through ``convert_value``."""
        df = self.get_df()
        # Only the column contents are needed; the column names are not used.
        return [
            [convert_value(obs) for obs in column]
            for column in df.to_dict(orient="list").values()
        ]

    @property
    def question_names_to_question_texts(self):
        """Return a dictionary of question names to question texts.

        Names that are not valid Python identifiers are passed through
        ``self.question_name_repair_func``. Columns whose label is ``None``
        are left out of the result.

        Raises:
            ValueError: if a name is still not a valid identifier after repair.
        """
        if not hasattr(self, "_meta"):
            self._parse()
        d = {}
        for qn, label in self._meta.column_names_to_labels.items():
            new_name = qn
            if not is_valid_variable_name(qn):
                new_name = self.question_name_repair_func(qn)
                if not is_valid_variable_name(new_name):
                    # A single message string: passing two positional strings
                    # would make the exception render as a tuple.
                    raise ValueError(
                        f"""Question names must be valid Python identifiers. '{qn}' is not."""
                        " "
                        """You can pass an entry in question_name_repair_dict to fix this."""
                    )
            if label is not None:
                d[new_name] = label
        return d

    def get_question_texts(self):
        """Return the text for each question name, falling back to the name itself."""
        if not hasattr(self, "_meta"):
            self._parse()
        names = self.question_names
        # The property rebuilds its dict on every access, so read it once
        # instead of once per question name.
        texts = self.question_names_to_question_texts
        return [texts.get(qn, qn) for qn in names]

    def get_question_names(self):
        """Return the DataFrame's column names as a list."""
        return self.get_df().columns.tolist()
|
1
|
+
import pandas as pd
|
2
|
+
from typing import List
|
3
|
+
|
4
|
+
from edsl.conjure.InputData import InputDataABC
|
5
|
+
from edsl.conjure.utilities import convert_value
|
6
|
+
from edsl.utilities.utilities import is_valid_variable_name
|
7
|
+
|
8
|
+
try:
|
9
|
+
import pyreadstat
|
10
|
+
except ImportError as e:
|
11
|
+
raise ImportError(
|
12
|
+
"The 'pyreadstat' package is required for this feature. Please install it by running:\n"
|
13
|
+
"pip install pyreadstat\n"
|
14
|
+
) from e
|
15
|
+
|
16
|
+
|
17
|
+
class InputDataPyRead(InputDataABC):
    """Base class for survey data files that are read through ``pyreadstat``.

    Subclasses override :meth:`pyread_function` with the reader for their
    file format. The reader's ``(df, meta)`` result is cached on the instance
    as ``_df`` and ``_meta`` the first time it is needed.
    """

    def pyread_function(self, datafile_name):
        """Read ``datafile_name`` and return a ``(df, meta)`` pair.

        Must be overridden by subclasses; the base implementation only raises.
        """
        raise NotImplementedError

    def _parse(self) -> None:
        """Read the data file and cache an all-string DataFrame and its metadata.

        Raises:
            ValueError: if ``pyread_function`` raises for any reason; the
                original exception is chained as the cause.
        """
        try:
            df, meta = self.pyread_function(self.datafile_name)
        except Exception as e:
            raise ValueError(
                f"An error occurred while reading the file {self.datafile_name}."
            ) from e
        # NOTE(review): float columns are stringified *before* fillna, so
        # missing floats presumably survive as the text "nan" rather than "".
        # Left unchanged because downstream code may rely on it; confirm.
        float_columns = df.select_dtypes(include=["float64"]).columns
        df[float_columns] = df[float_columns].astype(str)

        df.fillna("", inplace=True)
        df = df.astype(str)
        self._df = df
        self._meta = meta

    def get_df(self) -> pd.DataFrame:
        """Return the cached DataFrame, parsing the file on first use."""
        if not hasattr(self, "_df"):
            self._parse()
        return self._df

    def get_answer_codebook(self):
        """Return a dict mapping question names to their value labels.

        Each name in ``meta.variable_to_label`` is mapped to the entry of
        ``meta.value_labels`` stored under that variable's label name.
        """
        if not hasattr(self, "_meta"):
            self._parse()

        question_name_to_label_name = self._meta.variable_to_label
        label_name_to_labels = self._meta.value_labels
        return {
            qn: label_name_to_labels[label_name]
            for qn, label_name in question_name_to_label_name.items()
        }

    def get_raw_data(self) -> List[List[str]]:
        """Return the data column by column, each cell passed through ``convert_value``."""
        df = self.get_df()
        # Only the column contents are needed; the column names are not used.
        return [
            [convert_value(obs) for obs in column]
            for column in df.to_dict(orient="list").values()
        ]

    @property
    def question_names_to_question_texts(self):
        """Return a dictionary of question names to question texts.

        Names that are not valid Python identifiers are passed through
        ``self.question_name_repair_func``. Columns whose label is ``None``
        are left out of the result.

        Raises:
            ValueError: if a name is still not a valid identifier after repair.
        """
        if not hasattr(self, "_meta"):
            self._parse()
        d = {}
        for qn, label in self._meta.column_names_to_labels.items():
            new_name = qn
            if not is_valid_variable_name(qn):
                new_name = self.question_name_repair_func(qn)
                if not is_valid_variable_name(new_name):
                    # A single message string: passing two positional strings
                    # would make the exception render as a tuple.
                    raise ValueError(
                        f"""Question names must be valid Python identifiers. '{qn}' is not."""
                        " "
                        """You can pass an entry in question_name_repair_dict to fix this."""
                    )
            if label is not None:
                d[new_name] = label
        return d

    def get_question_texts(self):
        """Return the text for each question name, falling back to the name itself."""
        if not hasattr(self, "_meta"):
            self._parse()
        names = self.question_names
        # The property rebuilds its dict on every access, so read it once
        # instead of once per question name.
        texts = self.question_names_to_question_texts
        return [texts.get(qn, qn) for qn in names]

    def get_question_names(self):
        """Return the DataFrame's column names as a list."""
        return self.get_df().columns.tolist()
|
edsl/conjure/InputDataSPSS.py
CHANGED
@@ -1,8 +1,8 @@
|
|
1
|
-
from edsl.conjure.InputDataPyRead import InputDataPyRead
|
2
|
-
|
3
|
-
|
4
|
-
class InputDataSPSS(InputDataPyRead):
    """Input data read with ``pyreadstat.read_sav``."""

    def pyread_function(self, datafile_name):
        """Return the result of ``pyreadstat.read_sav`` for ``datafile_name``."""
        # Imported at call time rather than at module import time.
        import pyreadstat

        return pyreadstat.read_sav(datafile_name)
|
1
|
+
from edsl.conjure.InputDataPyRead import InputDataPyRead
|
2
|
+
|
3
|
+
|
4
|
+
class InputDataSPSS(InputDataPyRead):
    """Input data read with ``pyreadstat.read_sav``."""

    def pyread_function(self, datafile_name):
        """Return the result of ``pyreadstat.read_sav`` for ``datafile_name``."""
        # Imported at call time rather than at module import time.
        import pyreadstat

        return pyreadstat.read_sav(datafile_name)
|