edsl 0.1.27.dev2__py3-none-any.whl → 0.1.28__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88) hide show
  1. edsl/Base.py +99 -22
  2. edsl/BaseDiff.py +260 -0
  3. edsl/__init__.py +4 -0
  4. edsl/__version__.py +1 -1
  5. edsl/agents/Agent.py +26 -5
  6. edsl/agents/AgentList.py +62 -7
  7. edsl/agents/Invigilator.py +4 -9
  8. edsl/agents/InvigilatorBase.py +5 -5
  9. edsl/agents/descriptors.py +3 -1
  10. edsl/conjure/AgentConstructionMixin.py +152 -0
  11. edsl/conjure/Conjure.py +56 -0
  12. edsl/conjure/InputData.py +628 -0
  13. edsl/conjure/InputDataCSV.py +48 -0
  14. edsl/conjure/InputDataMixinQuestionStats.py +182 -0
  15. edsl/conjure/InputDataPyRead.py +91 -0
  16. edsl/conjure/InputDataSPSS.py +8 -0
  17. edsl/conjure/InputDataStata.py +8 -0
  18. edsl/conjure/QuestionOptionMixin.py +76 -0
  19. edsl/conjure/QuestionTypeMixin.py +23 -0
  20. edsl/conjure/RawQuestion.py +65 -0
  21. edsl/conjure/SurveyResponses.py +7 -0
  22. edsl/conjure/__init__.py +9 -4
  23. edsl/conjure/examples/placeholder.txt +0 -0
  24. edsl/conjure/naming_utilities.py +263 -0
  25. edsl/conjure/utilities.py +165 -28
  26. edsl/conversation/Conversation.py +238 -0
  27. edsl/conversation/car_buying.py +58 -0
  28. edsl/conversation/mug_negotiation.py +81 -0
  29. edsl/conversation/next_speaker_utilities.py +93 -0
  30. edsl/coop/coop.py +191 -12
  31. edsl/coop/utils.py +20 -2
  32. edsl/data/Cache.py +55 -17
  33. edsl/data/CacheHandler.py +10 -9
  34. edsl/inference_services/AnthropicService.py +1 -0
  35. edsl/inference_services/DeepInfraService.py +20 -13
  36. edsl/inference_services/GoogleService.py +7 -1
  37. edsl/inference_services/InferenceServicesCollection.py +33 -7
  38. edsl/inference_services/OpenAIService.py +17 -10
  39. edsl/inference_services/models_available_cache.py +69 -0
  40. edsl/inference_services/rate_limits_cache.py +25 -0
  41. edsl/inference_services/write_available.py +10 -0
  42. edsl/jobs/Jobs.py +240 -36
  43. edsl/jobs/buckets/BucketCollection.py +9 -3
  44. edsl/jobs/interviews/Interview.py +4 -1
  45. edsl/jobs/interviews/InterviewTaskBuildingMixin.py +24 -10
  46. edsl/jobs/interviews/retry_management.py +4 -4
  47. edsl/jobs/runners/JobsRunnerAsyncio.py +87 -45
  48. edsl/jobs/runners/JobsRunnerStatusData.py +3 -3
  49. edsl/jobs/tasks/QuestionTaskCreator.py +4 -2
  50. edsl/language_models/LanguageModel.py +37 -44
  51. edsl/language_models/ModelList.py +96 -0
  52. edsl/language_models/registry.py +14 -0
  53. edsl/language_models/repair.py +95 -24
  54. edsl/notebooks/Notebook.py +119 -31
  55. edsl/questions/QuestionBase.py +109 -12
  56. edsl/questions/descriptors.py +5 -2
  57. edsl/questions/question_registry.py +7 -0
  58. edsl/results/Result.py +20 -8
  59. edsl/results/Results.py +85 -11
  60. edsl/results/ResultsDBMixin.py +3 -6
  61. edsl/results/ResultsExportMixin.py +47 -16
  62. edsl/results/ResultsToolsMixin.py +5 -5
  63. edsl/scenarios/Scenario.py +59 -5
  64. edsl/scenarios/ScenarioList.py +97 -40
  65. edsl/study/ObjectEntry.py +97 -0
  66. edsl/study/ProofOfWork.py +110 -0
  67. edsl/study/SnapShot.py +77 -0
  68. edsl/study/Study.py +491 -0
  69. edsl/study/__init__.py +2 -0
  70. edsl/surveys/Survey.py +79 -31
  71. edsl/surveys/SurveyExportMixin.py +21 -3
  72. edsl/utilities/__init__.py +1 -0
  73. edsl/utilities/gcp_bucket/__init__.py +0 -0
  74. edsl/utilities/gcp_bucket/cloud_storage.py +96 -0
  75. edsl/utilities/gcp_bucket/simple_example.py +9 -0
  76. edsl/utilities/interface.py +24 -28
  77. edsl/utilities/repair_functions.py +28 -0
  78. edsl/utilities/utilities.py +57 -2
  79. {edsl-0.1.27.dev2.dist-info → edsl-0.1.28.dist-info}/METADATA +43 -17
  80. {edsl-0.1.27.dev2.dist-info → edsl-0.1.28.dist-info}/RECORD +83 -55
  81. edsl-0.1.28.dist-info/entry_points.txt +3 -0
  82. edsl/conjure/RawResponseColumn.py +0 -327
  83. edsl/conjure/SurveyBuilder.py +0 -308
  84. edsl/conjure/SurveyBuilderCSV.py +0 -78
  85. edsl/conjure/SurveyBuilderSPSS.py +0 -118
  86. edsl/data/RemoteDict.py +0 -103
  87. {edsl-0.1.27.dev2.dist-info → edsl-0.1.28.dist-info}/LICENSE +0 -0
  88. {edsl-0.1.27.dev2.dist-info → edsl-0.1.28.dist-info}/WHEEL +0 -0
@@ -0,0 +1,628 @@
1
+ import functools
2
+
3
+ from abc import ABC, abstractmethod
4
+ from typing import Dict, Callable, Optional, List, Generator, Tuple, Union
5
+ from collections import namedtuple
6
+ from typing import List, Union
7
+
8
+ from edsl.questions.QuestionBase import QuestionBase
9
+
10
+ from edsl.scenarios.ScenarioList import ScenarioList
11
+ from edsl.surveys.Survey import Survey
12
+ from edsl.conjure.SurveyResponses import SurveyResponses
13
+ from edsl.conjure.naming_utilities import sanitize_string
14
+ from edsl.utilities.utilities import is_valid_variable_name
15
+
16
+ from edsl.conjure.RawQuestion import RawQuestion
17
+ from edsl.conjure.AgentConstructionMixin import AgentConstructionMixin
18
+
19
+ from edsl.conjure.QuestionOptionMixin import QuestionOptionMixin
20
+ from edsl.conjure.InputDataMixinQuestionStats import InputDataMixinQuestionStats
21
+ from edsl.conjure.QuestionTypeMixin import QuestionTypeMixin
22
+
23
+
24
class InputDataABC(
    ABC,
    InputDataMixinQuestionStats,
    AgentConstructionMixin,
    QuestionOptionMixin,
    QuestionTypeMixin,
):
    """A class to represent the input data for a survey.

    The data is held column-wise: ``raw_data[i]`` is the list of responses to
    the question whose name is ``question_names[i]`` and whose text is
    ``question_texts[i]``. ``question_types`` and ``question_options`` are
    indexed the same way.

    Subclasses supply the file-format specific readers by implementing
    :meth:`get_question_texts`, :meth:`get_raw_data` and
    :meth:`get_question_names`.
    """

    # Thresholds and marker string; not referenced in this class body --
    # presumably consumed by the mixins (TODO confirm).
    NUM_UNIQUE_THRESHOLD = 15
    FRAC_NUMERICAL_THRESHOLD = 0.8
    MULTIPLE_CHOICE_OTHER_THRESHOLD = 0.5
    OTHER_STRING = "Other:"

    question_attributes = [
        "num_responses",
        "num_unique_responses",
        "missing",
        "unique_responses",
        "frac_numerical",
        "top_5",
        "frac_obs_from_top_5",
    ]
    QuestionStats = namedtuple("QuestionStats", question_attributes)

    def __init__(
        self,
        datafile_name: str,
        config: Optional[dict] = None,
        naming_function: Optional[Callable] = sanitize_string,
        raw_data: Optional[List] = None,
        question_names: Optional[List[str]] = None,
        question_texts: Optional[List[str]] = None,
        answer_codebook: Optional[Dict] = None,
        question_types: Optional[List[str]] = None,
        question_options: Optional[List] = None,
        order_options=False,
        question_name_repair_func: Optional[Callable] = None,
    ):
        """Initialize the InputData object.

        :param datafile_name: The name of the file containing the data.
        :param config: The configuration parameters for reading the data.
        :param naming_function: Callable used by subclasses to turn a question text into a question name.
        :param raw_data: The raw data as a list with one list of responses per question.
        :param question_names: The names of the questions.
        :param question_texts: The text of the questions.
        :param answer_codebook: The codebook for the answers.
        :param question_types: The types of the questions.
        :param question_options: The options for the questions.
        :param order_options: If true, call ``order_options()`` once everything is set.
        :param question_name_repair_func: Callable applied to question names that are not valid variable names.

        Any of the optional data arguments left as ``None`` is filled in by the
        corresponding ``get_*`` method.

        >>> id = InputDataABC.example(question_names = ['a','b'], answer_codebook = {'a': {'1':'yes', '2':'no'}, 'b': {'1':'yes', '2':'no'}})

        >>> id = InputDataABC.example(question_names = ['a','b'], answer_codebook = {'a': {'1':'yes', '2':'no'}, 'c': {'1':'yes', '2':'no'}})
        Traceback (most recent call last):
        ...
        Exception: The keys of the answer_codebook must match the question_names.
        """
        self.datafile_name = datafile_name
        self.config = config
        self.naming_function = naming_function
        self.question_name_repair_func = (
            question_name_repair_func or self._default_question_name_repair
        )

        if answer_codebook is not None and question_names is not None:
            if set(answer_codebook.keys()) != set(question_names):
                raise Exception(
                    "The keys of the answer_codebook must match the question_names."
                )

        if question_names is not None and question_texts is not None:
            if len(question_names) != len(question_texts):
                raise Exception(
                    "The question_names and question_texts must have the same length."
                )

        # Order matters: the example and CSV readers derive names from the
        # texts, and apply_codebook() needs names, codebook and raw data.
        self.question_texts = question_texts
        self.question_names = question_names
        self.answer_codebook = answer_codebook
        self.raw_data = raw_data

        self.apply_codebook()

        self.question_types = question_types
        self.question_options = question_options
        if order_options:
            self.order_options()

    @staticmethod
    def _default_question_name_repair(x: str) -> str:
        """Default repair applied to question names that fail validation."""
        return (
            x.replace("#", "_num")
            .replace("class", "social_class")
            .replace("name", "respondent_name")
        )

    @abstractmethod
    def get_question_texts(self) -> List[str]:
        """Get the text of the questions

        >>> id = InputDataABC.example()
        >>> id.get_question_texts()
        ['how are you doing this morning?', 'how are you feeling?']

        """
        raise NotImplementedError

    @abstractmethod
    def get_raw_data(self) -> List[List[str]]:
        """Returns the responses by reading the datafile_name.

        >>> id = InputDataABC.example()
        >>> id.get_raw_data()
        [['1', '4'], ['3', '6']]

        """
        raise NotImplementedError

    @abstractmethod
    def get_question_names(self) -> List[str]:
        """Get the names of the questions.

        >>> id = InputDataABC.example()
        >>> id.get_question_names()
        ['morning', 'feeling']

        """
        raise NotImplementedError

    def rename_questions(self, rename_dict: Dict[str, str]) -> "InputDataABC":
        """Rename several questions; keys are old names, values are new names.

        >>> id = InputDataABC.example()
        >>> id.rename_questions({'morning': 'evening'}).question_names
        ['evening', 'feeling']

        """
        for old_name, new_name in rename_dict.items():
            self.rename(old_name, new_name)
        return self

    def rename(self, old_name, new_name) -> "InputDataABC":
        """Rename a question in place and move its codebook entry.

        :raises ValueError: If ``old_name`` is unknown or ``new_name`` is
            already used by another question.

        >>> id = InputDataABC.example()
        >>> id.rename('morning', 'evening').question_names
        ['evening', 'feeling']

        """
        idx = self.question_names.index(old_name)
        # The list is edited directly (not through the setter), so the
        # uniqueness invariant has to be enforced here.
        if new_name != old_name and new_name in self.question_names:
            raise ValueError("Question names must be unique.")
        self.question_names[idx] = new_name
        self.answer_codebook[new_name] = self.answer_codebook.pop(old_name, {})

        return self

    def _drop_question(self, question_name):
        """Drop a question and all data stored for it.

        >>> id = InputDataABC.example()
        >>> id._drop_question('morning').question_names
        ['feeling']

        """
        idx = self.question_names.index(question_name)
        # Go through the properties so lazily-populated lists exist before
        # they are popped.
        self.question_names.pop(idx)
        self.question_texts.pop(idx)
        self.question_types.pop(idx)
        self.question_options.pop(idx)
        self.raw_data.pop(idx)
        self.answer_codebook.pop(question_name, None)
        return self

    def drop(self, *question_names_to_drop) -> "InputDataABC":
        """Drop one or more questions.

        >>> id = InputDataABC.example()
        >>> id.drop('morning').question_names
        ['feeling']

        """
        for qn in question_names_to_drop:
            self._drop_question(qn)
        return self

    def keep(self, *question_names_to_keep) -> "InputDataABC":
        """Keep only the named questions, dropping all others.

        >>> id = InputDataABC.example()
        >>> id.keep('morning').question_names
        ['morning']

        """
        # Iterate over a copy: _drop_question mutates the underlying list.
        for qn in list(self.question_names):
            if qn not in question_names_to_keep:
                self._drop_question(qn)
        return self

    def modify_question_type(
        self,
        question_name: str,
        new_type: str,
        drop_options: bool = False,
        new_options: Optional[List[str]] = None,
    ) -> "InputDataABC":
        """Modify the question type of a question. Checks to make sure the new type is valid.

        After the change the question is rebuilt once; if that fails the error
        is printed and the previous type and options are restored.

        :param question_name: Name of the question to modify.
        :param new_type: New question type; must be in ``Question.available()``.
        :param drop_options: If true, set the question's options to ``None``.
        :param new_options: If given, replaces the question's options.
        :raises ValueError: If the question or the question type is unknown.

        >>> id = InputDataABC.example()
        >>> id.modify_question_type('morning', 'numerical', drop_options = True).question_types
        ['numerical', 'multiple_choice']

        >>> id = InputDataABC.example()
        >>> id.modify_question_type('morning', 'poop')
        Traceback (most recent call last):
        ...
        ValueError: Question type poop is not available.
        """
        idx = self.question_names.index(question_name)
        old_type = self.question_types[idx]
        old_options = self.question_options[idx]

        from edsl import Question

        if new_type not in Question.available():
            raise ValueError(f"Question type {new_type} is not available.")

        self.question_types[idx] = new_type
        if drop_options:
            self.question_options[idx] = None
        if new_options is not None:
            self.question_options[idx] = new_options

        try:
            # Built only to validate the new type/options combination.
            self.raw_question(idx).to_question()
        except Exception as e:
            # Deliberately best-effort: report and roll back instead of raising.
            print(f"Error with question {question_name} in {self.datafile_name}")
            print(e)
            print("Reverting changes")
            self.question_types[idx] = old_type
            self.question_options[idx] = old_options
        return self

    @property
    def num_observations(self):
        """Return the number of observations.

        Returns 0 when there are no questions left.

        >>> id = InputDataABC.example()
        >>> id.num_observations
        2

        """
        columns = self.raw_data
        if not columns:
            return 0
        return len(columns[0])

    def to_dict(self):
        """Return the constructor arguments needed to rebuild this object."""
        return {
            "datafile_name": self.datafile_name,
            "config": self.config,
            "raw_data": self.raw_data,
            "question_names": self.question_names,
            "question_texts": self.question_texts,
            "answer_codebook": self.answer_codebook,
            "question_types": self.question_types,
            # Included so a from_dict() round trip does not lose the options.
            "question_options": self.question_options,
        }

    @classmethod
    def from_dict(cls, d: Dict):
        """Build an instance from a dictionary of constructor arguments."""
        return cls(**d)

    @property
    def question_names(self) -> List[str]:
        """
        Return a list of question names.

        >>> id = InputDataABC.example()
        >>> id.question_names
        ['morning', 'feeling']

        We can pass question names instead:

        >>> id = InputDataABC.example(question_names = ['a','b'])
        >>> id.question_names
        ['a', 'b']

        """
        if not hasattr(self, "_question_names"):
            self.question_names = None
        return self._question_names

    @question_names.setter
    def question_names(self, value) -> None:
        """Validate, repair and store the question names.

        ``None`` means "ask :meth:`get_question_names`". Names that are not
        valid variable names are passed through ``question_name_repair_func``.

        :raises ValueError: If names are not unique or cannot be repaired.
        """
        if value is None:
            value = self.get_question_names()
        if len(set(value)) != len(value):
            raise ValueError("Question names must be unique.")

        # Build a new list rather than editing the caller's list in place.
        repaired = []
        for qn in value:
            if not is_valid_variable_name(qn, allow_name=False):
                new_name = self.question_name_repair_func(qn)
                if not is_valid_variable_name(new_name, allow_name=False):
                    raise ValueError(
                        f"Question names must be valid Python identifiers. '{qn}' is not. "
                        "You can pass an entry in question_name_repair_func to fix this."
                    )
                qn = new_name
            repaired.append(qn)

        # Repairing can map two distinct names onto the same one.
        if len(set(repaired)) != len(repaired):
            raise ValueError("Question names must be unique.")
        self._question_names = repaired

    @property
    def question_texts(self) -> List[str]:
        """
        Return a list of question texts.

        >>> id = InputDataABC.example()
        >>> id.question_texts
        ['how are you doing this morning?', 'how are you feeling?']
        """
        if not hasattr(self, "_question_texts"):
            self.question_texts = None
        return self._question_texts

    @question_texts.setter
    def question_texts(self, value):
        """Store the question texts; ``None`` means "ask get_question_texts()"."""
        if value is None:
            value = self.get_question_texts()
        self._question_texts = value

    @property
    def raw_data(self):
        """Return the responses, one list per question.

        >>> id = InputDataABC.example()
        >>> id.raw_data
        [['1', '4'], ['3', '6']]

        """
        if not hasattr(self, "_raw_data"):
            self.raw_data = None
        return self._raw_data

    @raw_data.setter
    def raw_data(self, value):
        """Store the raw data; ``None`` means "ask get_raw_data()"."""
        if value is None:
            value = self.get_raw_data()
        self._raw_data = value

    def to_dataset(self) -> "Dataset":
        """Return a Dataset with one ``{question_name: responses}`` entry per question."""
        from edsl.results.Dataset import Dataset

        return Dataset(
            [{name: column} for name, column in zip(self.question_names, self.raw_data)]
        )

    def to_scenario_list(self) -> ScenarioList:
        """Return a ScenarioList object from the raw response data.

        >>> id = InputDataABC.example()
        >>> s = id.to_scenario_list()
        >>> type(s) == ScenarioList
        True

        >>> s
        ScenarioList([Scenario({'morning': '1', 'feeling': '3'}), Scenario({'morning': '4', 'feeling': '6'})])

        """
        s = ScenarioList()
        for idx, qn in enumerate(self.question_names):
            s = s.add_list(qn, self.raw_data[idx])
        return s

    @property
    def names_to_texts(self) -> dict:
        """
        Return a dictionary of question names to question texts.

        >>> id = InputDataABC.example()
        >>> id.names_to_texts
        {'morning': 'how are you doing this morning?', 'feeling': 'how are you feeling?'}
        """
        return dict(zip(self.question_names, self.question_texts))

    @property
    def texts_to_names(self):
        """Return a dictionary of question texts to question names.

        >>> id = InputDataABC.example()
        >>> id.texts_to_names
        {'how are you doing this morning?': 'morning', 'how are you feeling?': 'feeling'}

        """
        return {t: n for n, t in self.names_to_texts.items()}

    def raw_question(self, index: int) -> RawQuestion:
        """Return the RawQuestion built from everything stored at ``index``."""
        return RawQuestion(
            question_type=self.question_types[index],
            question_name=self.question_names[index],
            question_text=self.question_texts[index],
            responses=self.raw_data[index],
            question_options=self.question_options[index],
        )

    def raw_questions(self) -> Generator[RawQuestion, None, None]:
        """Return a generator of RawQuestion objects."""
        for idx in range(len(self.question_names)):
            yield self.raw_question(idx)

    def questions(self) -> Generator[Union[QuestionBase, None], None, None]:
        """Return a generator of Question objects.

        A question that cannot be built is reported on stdout and yielded as
        ``None`` so positions stay aligned with ``question_names``.
        """
        for rq in self.raw_questions():
            try:
                yield rq.to_question()
            except Exception as e:
                print(
                    f"Error with question '{rq.question_name}' in '{self.datafile_name}'"
                )
                print(e)
                yield None

    def select(self, *question_names: str) -> "InputDataABC":
        """Select a subset of the questions.

        Returns a new object of the same class; this object is not modified.

        :param question_names: The names of the questions to select.

        >>> id = InputDataABC.example()
        >>> id.select('morning').question_names
        ['morning']

        """
        idxs = [self.question_names.index(qn) for qn in question_names]
        new_data = [self.raw_data[i] for i in idxs]
        new_texts = [self.question_texts[i] for i in idxs]
        new_types = [self.question_types[i] for i in idxs]
        new_options = [self.question_options[i] for i in idxs]
        new_names = [self.question_names[i] for i in idxs]
        # Every selected name gets an entry because the constructor requires
        # the codebook keys to match the question names exactly.
        answer_codebook = {
            qn: self.answer_codebook.get(qn, {}) for qn in question_names
        }
        return self.__class__(
            self.datafile_name,
            self.config,
            raw_data=new_data,
            question_names=new_names,
            question_texts=new_texts,
            question_types=new_types,
            question_options=new_options,
            answer_codebook=answer_codebook,
            question_name_repair_func=self.question_name_repair_func,
        )

    def to_survey(self) -> Survey:
        """Return a Survey of every question that could be built.

        >>> id = InputDataABC.example()
        >>> s = id.to_survey()
        >>> type(s) == Survey
        True

        """
        s = Survey()
        for q in self.questions():
            if q is not None:
                s.add_question(q)
        return s

    def print(self):
        """Print a per-question summary table (name, text, type and statistics)."""
        sl = (
            ScenarioList.from_list("question_name", self.question_names)
            .add_list("question_text", self.question_texts)
            .add_list("inferred_question_type", self.question_types)
            .add_list("num_responses", self.num_responses)
            .add_list("num_unique_responses", self.num_unique_responses)
            .add_list("missing", self.missing)
            .add_list("frac_numerical", self.frac_numerical)
            .add_list("top_5_items", self.top_k(5))
            .add_list("frac_obs_from_top_5", self.frac_obs_from_top_k(5))
        )
        sl.print()

    @property
    def answer_codebook(self) -> dict:
        """Return the answer codebook.

        >>> id = InputDataABC.example(answer_codebook = {'morning':{'1':'hello'}})
        >>> id.answer_codebook
        {'morning': {'1': 'hello'}}

        """
        if not hasattr(self, "_answer_codebook"):
            # Go through the setter, like the other lazy properties, so the
            # default comes from get_answer_codebook() and is never None.
            self.answer_codebook = None
        return self._answer_codebook

    @answer_codebook.setter
    def answer_codebook(self, value):
        """Store the codebook; ``None`` means "ask get_answer_codebook()"."""
        if value is None:
            value = self.get_answer_codebook()
        self._answer_codebook = value

    def get_answer_codebook(self):
        """Return the default (empty) answer codebook."""
        return {}

    def _drop_rows(self, indices: List[int]):
        """Drop rows (observations) from the raw data.

        :param indices: Positions of the observations to remove from every question.

        >>> id = InputDataABC.example()
        >>> id.num_observations
        2
        >>> _ = id._drop_rows([1])
        >>> id.num_observations
        1

        """
        to_drop = set(indices)
        self.raw_data = [
            [r for i, r in enumerate(column) if i not in to_drop]
            for column in self.raw_data
        ]
        return self

    def _missing_indices(self, question_name):
        """Return the indices of missing values for a question.
        TODO: Could re-factor to use SimpleEval

        >>> id = InputDataABC.example()
        >>> id.raw_data[0][0] = 'missing'
        >>> id._missing_indices('morning')
        [0]
        """
        idx = self.question_names.index(question_name)
        return [i for i, r in enumerate(self.raw_data[idx]) if r == "missing"]

    def drop_missing(self, question_name):
        """Drop the observations whose answer to ``question_name`` is missing.

        Modifies the data in place and returns ``None``.

        >>> id = InputDataABC.example()
        >>> id.num_observations
        2
        >>> id.raw_data[0][0] = 'missing'
        >>> id.drop_missing('morning')
        >>> id.num_observations
        1
        """
        self._drop_rows(self._missing_indices(question_name))

    def apply_codebook(self) -> None:
        """Apply the codebook to the raw data.

        Responses without a codebook entry are left unchanged.

        >>> id = InputDataABC.example()
        >>> id.raw_data
        [['1', '4'], ['3', '6']]

        >>> id = InputDataABC.example(answer_codebook = {'morning':{'1':'hello'}})
        >>> id.raw_data
        [['hello', '4'], ['3', '6']]
        """
        for index, qn in enumerate(self.question_names):
            if qn in self.answer_codebook:
                translation = self.answer_codebook[qn]
                self.raw_data[index] = [
                    translation.get(r, r) for r in self.raw_data[index]
                ]

    def __repr__(self):
        return f"{self.__class__.__name__}: datafile_name:'{self.datafile_name}' num_questions:{len(self.question_names)}, num_observations:{self.num_observations}"

    @classmethod
    def example(cls, **kwargs) -> "InputDataABC":
        """Return a small in-memory instance; ``kwargs`` go to the constructor."""

        class InputDataExample(InputDataABC):
            def get_question_texts(self) -> List[str]:
                """Get the text of the questions"""
                return ["how are you doing this morning?", "how are you feeling?"]

            def get_raw_data(self) -> List[List[str]]:
                """Return the hard-coded example responses, one list per question."""
                return [["1", "4"], ["3", "6"]]

            def get_question_names(self):
                """Derive the names from the texts, suffixing an index on collision."""
                new_names = [self.naming_function(q) for q in self.question_texts]
                if len(new_names) != len(set(new_names)):
                    new_names = [f"{q}_{i}" for i, q in enumerate(new_names)]
                return new_names

        return InputDataExample("notneeded", config={}, **kwargs)
623
+
624
+
625
if __name__ == "__main__":
    # Executed as a script: run the doctests embedded in the docstrings above.
    from doctest import ELLIPSIS, testmod

    testmod(optionflags=ELLIPSIS)
@@ -0,0 +1,48 @@
1
+ from typing import List, Optional
2
+ import pandas as pd
3
+ from edsl.conjure.InputData import InputDataABC
4
+ from edsl.conjure.utilities import convert_value
5
+
6
+
7
class InputDataCSV(InputDataABC):
    """Input data read from a CSV file with pandas.

    Column headers become the question texts and each column becomes one
    question's list of responses.
    """

    def __init__(self, datafile_name: str, config: Optional[dict] = None, **kwargs):
        """Initialize the CSV reader.

        :param datafile_name: Path of the CSV file.
        :param config: Reader options; the keys ``"skiprows"`` and
            ``"delimiter"`` are used. Defaults to no skipped rows and ``","``.
        :param kwargs: Passed through to ``InputDataABC.__init__``.
        """
        if config is None:
            config = {"skiprows": None, "delimiter": ","}

        super().__init__(datafile_name, config, **kwargs)

    def get_df(self) -> pd.DataFrame:
        """Return the file as a DataFrame of strings, reading it on first use."""
        if not hasattr(self, "_df"):
            df = pd.read_csv(
                self.datafile_name,
                # .get() so a user-supplied config may omit either key.
                skiprows=self.config.get("skiprows"),
                delimiter=self.config.get("delimiter", ","),
                encoding_errors="ignore",
            )
            # Float columns are stringified before fillna, so their NaNs become
            # the text "nan" rather than "" -- kept as in the original;
            # NOTE(review): confirm this is what convert_value expects.
            float_columns = df.select_dtypes(include=["float64"]).columns
            df[float_columns] = df[float_columns].astype(str)
            df = df.fillna("")
            # Cache only the fully processed frame.
            self._df = df.astype(str)
        return self._df

    def get_raw_data(self) -> List[List[str]]:
        """Return the responses, one list per column, passed through convert_value."""
        columns = self.get_df().to_dict(orient="list")
        return [[convert_value(obs) for obs in values] for values in columns.values()]

    def get_question_texts(self):
        """Return the column headers as the question texts."""
        return list(self.get_df().columns)

    def get_question_names(self):
        """Return question names derived from the texts via ``naming_function``.

        When several texts map to the same name, all but the last occurrence
        get a numeric suffix counting down (e.g. ``a3``, ``a2``, ``a``).
        """
        new_names = [self.naming_function(q) for q in self.question_texts]

        if len(new_names) > len(set(new_names)):
            from collections import Counter

            remaining = Counter(new_names)
            for i, name in enumerate(new_names):
                if remaining[name] > 1:
                    new_names[i] = name + str(remaining[name])
                    remaining[name] -= 1
        return new_names