edsl 0.1.48__py3-none-any.whl → 0.1.50__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- edsl/__init__.py +124 -53
- edsl/__version__.py +1 -1
- edsl/agents/agent.py +21 -21
- edsl/agents/agent_list.py +2 -5
- edsl/agents/exceptions.py +119 -5
- edsl/base/__init__.py +10 -35
- edsl/base/base_class.py +71 -36
- edsl/base/base_exception.py +204 -0
- edsl/base/data_transfer_models.py +1 -1
- edsl/base/exceptions.py +94 -0
- edsl/buckets/__init__.py +15 -1
- edsl/buckets/bucket_collection.py +3 -4
- edsl/buckets/exceptions.py +75 -0
- edsl/buckets/model_buckets.py +1 -2
- edsl/buckets/token_bucket.py +11 -6
- edsl/buckets/token_bucket_api.py +1 -2
- edsl/buckets/token_bucket_client.py +9 -7
- edsl/caching/cache.py +7 -2
- edsl/caching/cache_entry.py +10 -9
- edsl/caching/exceptions.py +113 -7
- edsl/caching/remote_cache_sync.py +1 -2
- edsl/caching/sql_dict.py +17 -12
- edsl/cli.py +43 -0
- edsl/config/config_class.py +30 -6
- edsl/conversation/Conversation.py +3 -2
- edsl/conversation/exceptions.py +58 -0
- edsl/conversation/mug_negotiation.py +0 -2
- edsl/coop/__init__.py +20 -1
- edsl/coop/coop.py +129 -38
- edsl/coop/exceptions.py +188 -9
- edsl/coop/price_fetcher.py +3 -6
- edsl/coop/utils.py +4 -6
- edsl/dataset/__init__.py +5 -4
- edsl/dataset/dataset.py +53 -43
- edsl/dataset/dataset_operations_mixin.py +86 -72
- edsl/dataset/dataset_tree.py +9 -5
- edsl/dataset/display/table_display.py +0 -2
- edsl/dataset/display/table_renderers.py +0 -1
- edsl/dataset/exceptions.py +125 -0
- edsl/dataset/file_exports.py +18 -11
- edsl/dataset/r/ggplot.py +13 -6
- edsl/display/__init__.py +27 -0
- edsl/display/core.py +147 -0
- edsl/display/plugin.py +189 -0
- edsl/display/utils.py +52 -0
- edsl/inference_services/__init__.py +9 -1
- edsl/inference_services/available_model_cache_handler.py +1 -1
- edsl/inference_services/available_model_fetcher.py +4 -5
- edsl/inference_services/data_structures.py +9 -6
- edsl/inference_services/exceptions.py +132 -1
- edsl/inference_services/inference_service_abc.py +2 -2
- edsl/inference_services/inference_services_collection.py +2 -6
- edsl/inference_services/registry.py +4 -3
- edsl/inference_services/service_availability.py +2 -1
- edsl/inference_services/services/anthropic_service.py +4 -1
- edsl/inference_services/services/aws_bedrock.py +13 -12
- edsl/inference_services/services/azure_ai.py +12 -10
- edsl/inference_services/services/deep_infra_service.py +1 -4
- edsl/inference_services/services/deep_seek_service.py +1 -5
- edsl/inference_services/services/google_service.py +6 -2
- edsl/inference_services/services/groq_service.py +1 -1
- edsl/inference_services/services/mistral_ai_service.py +4 -2
- edsl/inference_services/services/ollama_service.py +1 -1
- edsl/inference_services/services/open_ai_service.py +7 -5
- edsl/inference_services/services/perplexity_service.py +6 -2
- edsl/inference_services/services/test_service.py +8 -7
- edsl/inference_services/services/together_ai_service.py +2 -3
- edsl/inference_services/services/xai_service.py +1 -1
- edsl/instructions/__init__.py +1 -1
- edsl/instructions/change_instruction.py +3 -2
- edsl/instructions/exceptions.py +61 -0
- edsl/instructions/instruction.py +5 -2
- edsl/instructions/instruction_collection.py +2 -1
- edsl/instructions/instruction_handler.py +4 -9
- edsl/interviews/ReportErrors.py +0 -3
- edsl/interviews/__init__.py +9 -2
- edsl/interviews/answering_function.py +11 -13
- edsl/interviews/exception_tracking.py +14 -7
- edsl/interviews/exceptions.py +79 -0
- edsl/interviews/interview.py +32 -29
- edsl/interviews/interview_status_dictionary.py +4 -2
- edsl/interviews/interview_status_log.py +2 -1
- edsl/interviews/interview_task_manager.py +3 -3
- edsl/interviews/request_token_estimator.py +3 -1
- edsl/interviews/statistics.py +2 -3
- edsl/invigilators/__init__.py +7 -1
- edsl/invigilators/exceptions.py +79 -0
- edsl/invigilators/invigilator_base.py +0 -1
- edsl/invigilators/invigilators.py +8 -12
- edsl/invigilators/prompt_constructor.py +1 -5
- edsl/invigilators/prompt_helpers.py +8 -4
- edsl/invigilators/question_instructions_prompt_builder.py +1 -1
- edsl/invigilators/question_option_processor.py +9 -5
- edsl/invigilators/question_template_replacements_builder.py +3 -2
- edsl/jobs/__init__.py +3 -3
- edsl/jobs/async_interview_runner.py +24 -22
- edsl/jobs/check_survey_scenario_compatibility.py +7 -6
- edsl/jobs/data_structures.py +7 -4
- edsl/jobs/exceptions.py +177 -8
- edsl/jobs/fetch_invigilator.py +1 -1
- edsl/jobs/jobs.py +72 -67
- edsl/jobs/jobs_checks.py +2 -3
- edsl/jobs/jobs_component_constructor.py +2 -2
- edsl/jobs/jobs_pricing_estimation.py +3 -2
- edsl/jobs/jobs_remote_inference_logger.py +5 -4
- edsl/jobs/jobs_runner_asyncio.py +1 -2
- edsl/jobs/jobs_runner_status.py +8 -9
- edsl/jobs/remote_inference.py +26 -23
- edsl/jobs/results_exceptions_handler.py +8 -5
- edsl/key_management/__init__.py +3 -1
- edsl/key_management/exceptions.py +62 -0
- edsl/key_management/key_lookup.py +1 -1
- edsl/key_management/key_lookup_builder.py +37 -14
- edsl/key_management/key_lookup_collection.py +2 -0
- edsl/language_models/__init__.py +1 -1
- edsl/language_models/exceptions.py +302 -14
- edsl/language_models/language_model.py +4 -7
- edsl/language_models/model.py +4 -4
- edsl/language_models/model_list.py +1 -1
- edsl/language_models/price_manager.py +1 -1
- edsl/language_models/raw_response_handler.py +14 -9
- edsl/language_models/registry.py +17 -21
- edsl/language_models/repair.py +0 -6
- edsl/language_models/unused/fake_openai_service.py +0 -1
- edsl/load_plugins.py +69 -0
- edsl/logger.py +146 -0
- edsl/notebooks/notebook.py +1 -1
- edsl/notebooks/notebook_to_latex.py +0 -1
- edsl/plugins/__init__.py +63 -0
- edsl/plugins/built_in/export_example.py +50 -0
- edsl/plugins/built_in/pig_latin.py +67 -0
- edsl/plugins/cli.py +372 -0
- edsl/plugins/cli_typer.py +283 -0
- edsl/plugins/exceptions.py +31 -0
- edsl/plugins/hookspec.py +51 -0
- edsl/plugins/plugin_host.py +128 -0
- edsl/plugins/plugin_manager.py +633 -0
- edsl/plugins/plugins_registry.py +168 -0
- edsl/prompts/__init__.py +2 -0
- edsl/prompts/exceptions.py +107 -5
- edsl/prompts/prompt.py +14 -6
- edsl/questions/HTMLQuestion.py +5 -11
- edsl/questions/Quick.py +0 -1
- edsl/questions/__init__.py +2 -0
- edsl/questions/answer_validator_mixin.py +318 -318
- edsl/questions/compose_questions.py +2 -2
- edsl/questions/descriptors.py +10 -49
- edsl/questions/exceptions.py +278 -22
- edsl/questions/loop_processor.py +7 -5
- edsl/questions/prompt_templates/question_list.jinja +3 -0
- edsl/questions/question_base.py +14 -16
- edsl/questions/question_base_gen_mixin.py +2 -2
- edsl/questions/question_base_prompts_mixin.py +9 -3
- edsl/questions/question_budget.py +9 -5
- edsl/questions/question_check_box.py +3 -5
- edsl/questions/question_dict.py +171 -194
- edsl/questions/question_extract.py +1 -1
- edsl/questions/question_free_text.py +4 -6
- edsl/questions/question_functional.py +4 -3
- edsl/questions/question_list.py +36 -9
- edsl/questions/question_matrix.py +95 -61
- edsl/questions/question_multiple_choice.py +6 -4
- edsl/questions/question_numerical.py +2 -4
- edsl/questions/question_registry.py +4 -2
- edsl/questions/register_questions_meta.py +0 -1
- edsl/questions/response_validator_abc.py +7 -13
- edsl/questions/templates/dict/answering_instructions.jinja +1 -0
- edsl/questions/templates/rank/question_presentation.jinja +1 -1
- edsl/results/__init__.py +1 -1
- edsl/results/exceptions.py +141 -7
- edsl/results/report.py +0 -1
- edsl/results/result.py +4 -5
- edsl/results/results.py +10 -51
- edsl/results/results_selector.py +8 -4
- edsl/scenarios/PdfExtractor.py +2 -2
- edsl/scenarios/construct_download_link.py +69 -35
- edsl/scenarios/directory_scanner.py +33 -14
- edsl/scenarios/document_chunker.py +1 -1
- edsl/scenarios/exceptions.py +238 -14
- edsl/scenarios/file_methods.py +1 -1
- edsl/scenarios/file_store.py +7 -3
- edsl/scenarios/handlers/__init__.py +17 -0
- edsl/scenarios/handlers/docx_file_store.py +0 -5
- edsl/scenarios/handlers/pdf_file_store.py +0 -1
- edsl/scenarios/handlers/pptx_file_store.py +0 -5
- edsl/scenarios/handlers/py_file_store.py +0 -1
- edsl/scenarios/handlers/sql_file_store.py +1 -4
- edsl/scenarios/handlers/sqlite_file_store.py +0 -1
- edsl/scenarios/handlers/txt_file_store.py +1 -1
- edsl/scenarios/scenario.py +0 -1
- edsl/scenarios/scenario_list.py +152 -18
- edsl/scenarios/scenario_list_pdf_tools.py +1 -0
- edsl/scenarios/scenario_selector.py +0 -1
- edsl/surveys/__init__.py +3 -4
- edsl/surveys/dag/__init__.py +4 -2
- edsl/surveys/descriptors.py +1 -1
- edsl/surveys/edit_survey.py +1 -0
- edsl/surveys/exceptions.py +165 -9
- edsl/surveys/memory/__init__.py +5 -3
- edsl/surveys/memory/memory_management.py +1 -0
- edsl/surveys/memory/memory_plan.py +6 -15
- edsl/surveys/rules/__init__.py +5 -3
- edsl/surveys/rules/rule.py +1 -2
- edsl/surveys/rules/rule_collection.py +1 -1
- edsl/surveys/survey.py +12 -24
- edsl/surveys/survey_export.py +6 -3
- edsl/surveys/survey_flow_visualization.py +10 -1
- edsl/tasks/__init__.py +2 -0
- edsl/tasks/question_task_creator.py +3 -3
- edsl/tasks/task_creators.py +1 -3
- edsl/tasks/task_history.py +5 -7
- edsl/tasks/task_status_log.py +1 -2
- edsl/tokens/__init__.py +3 -1
- edsl/tokens/token_usage.py +1 -1
- edsl/utilities/__init__.py +21 -1
- edsl/utilities/decorators.py +1 -2
- edsl/utilities/markdown_to_docx.py +2 -2
- edsl/utilities/markdown_to_pdf.py +1 -1
- edsl/utilities/repair_functions.py +0 -1
- edsl/utilities/restricted_python.py +0 -1
- edsl/utilities/template_loader.py +2 -3
- edsl/utilities/utilities.py +8 -29
- {edsl-0.1.48.dist-info → edsl-0.1.50.dist-info}/METADATA +32 -2
- edsl-0.1.50.dist-info/RECORD +363 -0
- edsl-0.1.50.dist-info/entry_points.txt +3 -0
- edsl/dataset/smart_objects.py +0 -96
- edsl/exceptions/BaseException.py +0 -21
- edsl/exceptions/__init__.py +0 -54
- edsl/exceptions/configuration.py +0 -16
- edsl/exceptions/general.py +0 -34
- edsl/study/ObjectEntry.py +0 -173
- edsl/study/ProofOfWork.py +0 -113
- edsl/study/SnapShot.py +0 -80
- edsl/study/Study.py +0 -520
- edsl/study/__init__.py +0 -6
- edsl/utilities/interface.py +0 -135
- edsl-0.1.48.dist-info/RECORD +0 -347
- {edsl-0.1.48.dist-info → edsl-0.1.50.dist-info}/LICENSE +0 -0
- {edsl-0.1.48.dist-info → edsl-0.1.50.dist-info}/WHEEL +0 -0
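
The dominant theme of this release, visible in the file list above, is the replacement of bare `raise` statements and builtin exceptions with a typed exception hierarchy: one `exceptions.py` per subpackage, all descending from the shared `edsl.base` base exception. The hunks below cover the `edsl/dataset` subpackage. As a minimal sketch of what this means for calling code (assuming edsl 0.1.50 is installed; the class and module names are taken from the diff below, everything else is illustrative):

```python
# Minimal sketch: catching the typed dataset exceptions introduced in 0.1.50.
# Names come from edsl/dataset/exceptions.py in the diff below.
from edsl.dataset import Dataset
from edsl.dataset.exceptions import DatasetError, DatasetKeyError

d = Dataset([{"a": [1, 2, 3]}])
try:
    d.drop("missing_field")  # 0.1.50 raises DatasetKeyError instead of a bare KeyError
except DatasetKeyError as e:
    print(e)
except DatasetError:
    pass  # all dataset exceptions share this base class, so callers can catch broadly
```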
edsl/dataset/dataset_operations_mixin.py
CHANGED
@@ -12,16 +12,18 @@ ScenarioList, AgentList) to share the same data manipulation interface, enabling
 fluid operations across different parts of the EDSL ecosystem.
 """
 
-from abc import ABC, abstractmethod
 import io
 import warnings
 import textwrap
-from typing import Optional, Tuple, Union, List, TYPE_CHECKING
+from typing import Optional, Tuple, Union, List, TYPE_CHECKING  # Callable not used
+from functools import wraps
 from .r.ggplot import GGPlotMethod
+from .exceptions import DatasetKeyError, DatasetValueError, DatasetTypeError, DatasetExportError
 
 if TYPE_CHECKING:
     from docx import Document
     from .dataset import Dataset
+    from ..jobs import Job  # noqa: F401
 
 class DataOperationsBase:
     """
@@ -135,10 +137,7 @@ class DataOperationsBase:
         >>> sorted(Results.example().select().relevant_columns(data_type = "model"))
         ['model.frequency_penalty', ...]
 
-        >>> Results.example().select().relevant_columns(data_type = "flimflam")
-        Traceback (most recent call last):
-        ...
-        ValueError: No columns found for data type: flimflam. Available data types are: ...
+        >>> # Testing relevant_columns with invalid data_type raises DatasetValueError - tested in unit tests
         """
         columns = [list(x.keys())[0] for x in self]
         if remove_prefix:
@@ -159,7 +158,7 @@
             all_data_types = sorted(
                 list(set(get_data_type(column) for column in all_columns))
             )
-            raise ValueError(
+            raise DatasetValueError(
                 f"No columns found for data type: {data_type}. Available data types are: {all_data_types}."
             )
 
@@ -179,7 +178,7 @@
                 _num_observations = len(values)
             else:
                 if len(values) != _num_observations:
-                    raise ValueError(
+                    raise DatasetValueError(
                         f"The number of observations is not consistent across columns. "
                         f"Column '{key}' has {len(values)} observations, but previous columns had {_num_observations} observations."
                     )
@@ -262,8 +261,9 @@
             remove_prefix=remove_prefix, pretty_labels=pretty_labels
         )
 
-    def to_jsonl(self, filename: Optional[str] = None)
+    def to_jsonl(self, filename: Optional[str] = None):
         """Export the results to a FileStore instance containing JSONL data."""
+        from .file_exports import JSONLExport
         exporter = JSONLExport(data=self, filename=filename)
         return exporter.export()
 
@@ -274,8 +274,9 @@
         pretty_labels: Optional[dict] = None,
         table_name: str = "results",
         if_exists: str = "replace",
-    )
+    ):
         """Export the results to a SQLite database file."""
+        from .file_exports import SQLiteExport
         exporter = SQLiteExport(
             data=self,
             filename=filename,
@@ -291,7 +292,7 @@
         filename: Optional[str] = None,
         remove_prefix: bool = False,
         pretty_labels: Optional[dict] = None,
-    )
+    ):
         """Export the results to a FileStore instance containing CSV data."""
         from .file_exports import CSVExport
 
@@ -309,9 +310,9 @@
         remove_prefix: bool = False,
         pretty_labels: Optional[dict] = None,
         sheet_name: Optional[str] = None,
-    )
+    ):
         """Export the results to a FileStore instance containing Excel data."""
-        from .file_exports import
+        from .file_exports import ExcelExport
 
         exporter = ExcelExport(
             data=self,
@@ -324,25 +325,28 @@
 
     def _db(
         self, remove_prefix: bool = True, shape: str = "wide"
-    )
+    ):
         """Create a SQLite database in memory and return the connection.
 
         Args:
             remove_prefix: Whether to remove the prefix from the column names
             shape: The shape of the data in the database ("wide" or "long")
-
+
         Returns:
             A database connection
-
-
-
-
-
-
-
-
+
+        Examples:
+            >>> from sqlalchemy import text
+            >>> from edsl import Results
+            >>> engine = Results.example()._db()
+            >>> len(engine.execute(text("SELECT * FROM self")).fetchall())
+            4
+            >>> engine = Results.example()._db(shape = "long")
+            >>> len(engine.execute(text("SELECT * FROM self")).fetchall())
+            172
         """
-        from sqlalchemy import create_engine
+        # Import needed for database connection
+        from sqlalchemy import create_engine
 
         engine = create_engine("sqlite:///:memory:")
         if remove_prefix and shape == "wide":
@@ -445,29 +449,35 @@
 
     def to_pandas(
         self, remove_prefix: bool = False, lists_as_strings=False
-    )
+    ):
         """Convert the results to a pandas DataFrame, ensuring that lists remain as lists.
 
-        :
-
+        Args:
+            remove_prefix: Whether to remove the prefix from the column names.
+            lists_as_strings: Whether to convert lists to strings.
+
+        Returns:
+            A pandas DataFrame.
         """
+        # pandas is imported in _to_pandas_strings
        return self._to_pandas_strings(remove_prefix)
 
-    def _to_pandas_strings(self, remove_prefix: bool = False)
+    def _to_pandas_strings(self, remove_prefix: bool = False):
         """Convert the results to a pandas DataFrame.
 
-        :
+        Args:
+            remove_prefix: Whether to remove the prefix from the column names.
 
-
-
-
-
-
-
-
-
+        Examples:
+            >>> from edsl.results import Results
+            >>> r = Results.example()
+            >>> r.select('how_feeling').to_pandas()
+            answer.how_feeling
+            0 OK
+            1 Great
+            2 Terrible
+            3 OK
         """
-
         import pandas as pd
 
         csv_string = self.to_csv(remove_prefix=remove_prefix).text
@@ -478,17 +488,27 @@
 
     def to_polars(
         self, remove_prefix: bool = False, lists_as_strings=False
-    )
+    ):
         """Convert the results to a Polars DataFrame.
 
-        :
+        Args:
+            remove_prefix: Whether to remove the prefix from the column names.
+            lists_as_strings: Whether to convert lists to strings.
+
+        Returns:
+            A Polars DataFrame.
         """
+        # polars is imported in _to_polars_strings
         return self._to_polars_strings(remove_prefix)
 
-    def _to_polars_strings(self, remove_prefix: bool = False)
+    def _to_polars_strings(self, remove_prefix: bool = False):
         """Convert the results to a Polars DataFrame.
 
-        :
+        Args:
+            remove_prefix: Whether to remove the prefix from the column names.
+
+        Returns:
+            A Polars DataFrame.
         """
         import polars as pl
 
@@ -496,10 +516,14 @@
         df = pl.read_csv(io.StringIO(csv_string))
         return df
 
-    def tree(self, node_order: Optional[List[str]] = None)
+    def tree(self, node_order: Optional[List[str]] = None):
         """Convert the results to a Tree.
 
-        :
+        Args:
+            node_order: The order of the nodes.
+
+        Returns:
+            A Tree object.
         """
         from .dataset_tree import Tree
         return Tree(self, node_order=node_order)
@@ -598,15 +622,12 @@
         [1, 9, 2, 3, 4]
 
         >>> from edsl.dataset import Dataset
-        >>>
-        Traceback (most recent call last):
-        ...
-        ValueError: Cannot flatten a list of lists when there are multiple columns selected.
+        >>> # Testing to_list flatten with multiple columns raises DatasetValueError - tested in unit tests
 
 
         """
         if len(self.relevant_columns()) > 1 and flatten:
-            raise ValueError(
+            raise DatasetValueError(
                 "Cannot flatten a list of lists when there are multiple columns selected."
             )
 
@@ -632,7 +653,6 @@
                 new_list.append(item)
             list_to_return = new_list
 
-        from edsl.utilities.PrettyList import PrettyList
 
         #return PrettyList(list_to_return)
         return list_to_return
@@ -647,7 +667,6 @@
         import tempfile
         from edsl.utilities.utilities import is_notebook
         from IPython.display import HTML, display
-        from edsl.utilities.utilities import is_notebook
 
         df = self.to_pandas()
 
@@ -698,7 +717,7 @@
         all_fields = list(fields) + [f for f in header_fields if f not in fields]
         for field in all_fields:
             if field not in self.relevant_columns():
-                raise
+                raise DatasetKeyError(f"Field '{field}' not found in dataset")
 
         # Get data for each field
         field_data = {}
@@ -780,7 +799,8 @@
             from docx.shared import Pt
             import json
         except ImportError:
-
+            from edsl.dataset.exceptions import DatasetImportError
+            raise DatasetImportError("The python-docx package is required for DOCX export. Install it with 'pip install python-docx'.")
 
         doc = Document()
 
@@ -797,7 +817,7 @@
         if header_parts:
             header_text += f" ({', '.join(header_parts)})"
 
-
+        doc.add_heading(header_text, level=1)
 
         # Add the remaining fields
         for field in fields:
@@ -823,7 +843,7 @@
     def report(self, *fields: Optional[str], top_n: Optional[int] = None,
                header_fields: Optional[List[str]] = None, divider: bool = True,
                return_string: bool = False, format: str = "markdown",
-               filename: Optional[str] = None) -> Optional[Union[str, "
+               filename: Optional[str] = None) -> Optional[Union[str, "Document"]]:
         """Generates a report of the results by iterating through rows.
 
         Args:
@@ -886,7 +906,7 @@
             return doc
 
         else:
-            raise
+            raise DatasetExportError(f"Unsupported format: {format}. Use 'markdown' or 'docx'.")
 
     def tally(
         self, *fields: Optional[str], top_n: Optional[int] = None, output="Dataset"
@@ -945,7 +965,7 @@
             f in self.relevant_columns() or f in relevant_columns_without_prefix
             for f in fields
         ):
-            raise ValueError("One or more specified fields are not in the dataset."
+            raise DatasetKeyError("One or more specified fields are not in the dataset."
                              f"The available fields are: {self.relevant_columns()}"
             )
 
@@ -963,7 +983,7 @@
         except TypeError:
             tally = dict(Counter([str(v) for v in values]))
         except Exception as e:
-            raise
+            raise DatasetValueError(f"Error tallying values: {e}")
 
         sorted_tally = dict(sorted(tally.items(), key=lambda item: -item[1]))
         if top_n is not None:
@@ -1056,7 +1076,8 @@
         # Check if the field is ambiguous
         if len(matching_entries) > 1:
             matching_cols = [next(iter(entry.keys())) for entry in matching_entries]
-            raise ValueError(
+            from edsl.dataset.exceptions import DatasetValueError
+            raise DatasetValueError(
                 f"Ambiguous field name '{field}'. It matches multiple columns: {matching_cols}. "
                 f"Please specify the full column name to flatten."
             )
@@ -1159,13 +1180,13 @@
                 break
 
         if field_index is None:
-            raise
+            raise DatasetKeyError(f"Field '{field}' not found in dataset")
 
         field_data = result.data[field_index][field]
 
         # Check if values are lists
         if not all(isinstance(v, list) for v in field_data):
-            raise
+            raise DatasetTypeError(f"Field '{field}' does not contain lists in all entries")
 
         # Get the maximum length of lists
         max_len = max(len(v) for v in field_data)
@@ -1209,16 +1230,13 @@
         >>> d.drop('a')
         Dataset([{'b': [4, 5, 6]}])
 
-        >>> d.drop('c')
-        Traceback (most recent call last):
-        ...
-        KeyError: "Field 'c' not found in dataset"
+        >>> # Testing drop with nonexistent field raises DatasetKeyError - tested in unit tests
         """
         from .dataset import Dataset
 
         # Check if field exists in the dataset
         if field_name not in self.relevant_columns():
-            raise KeyError(f"Field '{field_name}' not found in dataset")
+            raise DatasetKeyError(f"Field '{field_name}' not found in dataset")
 
         # Create a new dataset without the specified field
         new_data = [entry for entry in self.data if field_name not in entry]
@@ -1248,9 +1266,7 @@
        >>> d = Dataset([{'a.x': [1, 2, 3]}, {'b.x': [4, 5, 6]}])
        >>> # d.remove_prefix()
 
-
-        ...
-        ValueError: Removing prefixes would result in duplicate column names: ['x']
+        # Testing remove_prefix with duplicate column names raises DatasetValueError - tested in unit tests
        """
        from .dataset import Dataset
 
@@ -1273,7 +1289,7 @@
 
         # Check for duplicates
         if duplicates:
-            raise ValueError(f"Removing prefixes would result in duplicate column names: {sorted(list(duplicates))}")
+            raise DatasetValueError(f"Removing prefixes would result in duplicate column names: {sorted(list(duplicates))}")
 
         # Create a new dataset with unprefixed column names
         new_data = []
@@ -1288,8 +1304,6 @@
         return Dataset(new_data)
 
 
-from functools import wraps
-
 def to_dataset(func):
     """
     Decorator that ensures functions receive a Dataset object as their first argument.
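
The hunks above also tighten `report()`: an unsupported format now raises `DatasetExportError` rather than an untyped error. A hedged sketch of the new surface (the `Results.example()` and `select()` calls appear in the doctests above; the specific field name and the `return_string` behavior are assumptions from the signature shown):

```python
# Sketch of the stricter report() behavior patched above.
from edsl import Results
from edsl.dataset.exceptions import DatasetExportError

r = Results.example().select("how_feeling")
print(r.report("answer.how_feeling", format="markdown", return_string=True))

try:
    r.report("answer.how_feeling", format="pdf")
except DatasetExportError as e:
    print(e)  # Unsupported format: pdf. Use 'markdown' or 'docx'.
```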
edsl/dataset/dataset_tree.py
CHANGED
@@ -1,4 +1,7 @@
-from typing import
+from typing import Optional, List, TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from .dataset import Dataset
 
 
 def is_hashable(v):
@@ -16,8 +19,10 @@ class TreeNode:
         self.children = {}
 
 
+
 class Tree:
     def __init__(self, data: "Dataset", node_order: Optional[List[str]] = None):
+        """Initialize the tree with a Dataset."""
         d = {}
         for entry in data:
             d.update(entry)
@@ -46,7 +51,8 @@
         else:
             if not set(node_order).issubset(set(self.data.keys())):
                 invalid_keys = set(node_order) - set(self.data.keys())
-
+                from edsl.dataset.exceptions import DatasetValueError
+                raise DatasetValueError(f"Invalid keys in node_order: {invalid_keys}")
 
         self.root = TreeNode()
 
@@ -95,8 +101,7 @@
             filename = "tree_structure.docx"
 
         from docx import Document
-        from docx.shared import
-        from docx.enum.text import WD_ALIGN_PARAGRAPH
+        from docx.shared import Pt
         from docx.enum.style import WD_STYLE_TYPE
 
         doc = Document()
@@ -118,7 +123,6 @@
         self._add_to_docx(doc, self.root, 0)
         import base64
         from io import BytesIO
-        import base64
 
         # Save document to bytes buffer
         doc_buffer = BytesIO()
edsl/dataset/exceptions.py
ADDED
@@ -0,0 +1,125 @@
+"""
+Exceptions module for dataset-related operations.
+
+This module defines custom exception classes for all dataset-related error conditions
+in the EDSL framework, ensuring consistent error handling for data manipulation,
+transformation, and analysis operations.
+"""
+
+from ..base import BaseException
+
+
+class DatasetError(BaseException):
+    """
+    Base exception class for all dataset-related errors.
+
+    This is the parent class for exceptions related to Dataset operations
+    in the EDSL framework, including data creation, manipulation, validation,
+    and serialization.
+
+    Examples:
+        ```python
+        # Usually not raised directly, but through subclasses:
+        dataset = Dataset([])
+        dataset["missing_key"]  # Would raise DatasetKeyError
+        ```
+    """
+    relevant_doc = "https://docs.expectedparrot.com/en/latest/dataset.html"
+
+
+class DatasetKeyError(DatasetError):
+    """
+    Exception raised when a key is not found in a dataset.
+
+    This exception occurs when attempting to access a field or column
+    that doesn't exist in the dataset.
+
+    Examples:
+        ```python
+        dataset = Dataset([{"a": 1}])
+        dataset["b"]  # Raises DatasetKeyError
+        ```
+    """
+    relevant_doc = "https://docs.expectedparrot.com/en/latest/dataset.html"
+
+
+class DatasetValueError(DatasetError):
+    """
+    Exception raised when there's an issue with dataset values.
+
+    This exception occurs when dataset values are invalid, incompatible
+    with an operation, or otherwise problematic.
+
+    Examples:
+        ```python
+        dataset = Dataset([{"a": 1}, {"b": 2}])
+        dataset.select(["c"])  # Raises DatasetValueError for missing field
+        ```
+    """
+    relevant_doc = "https://docs.expectedparrot.com/en/latest/dataset.html"
+
+
+class DatasetTypeError(DatasetError):
+    """
+    Exception raised when there's a type mismatch in dataset operations.
+
+    This exception occurs when trying to perform operations with
+    incompatible data types.
+
+    Examples:
+        ```python
+        dataset = Dataset([{"a": 1}])
+        dataset + "not a dataset"  # Raises DatasetTypeError
+        ```
+    """
+    relevant_doc = "https://docs.expectedparrot.com/en/latest/dataset.html"
+
+
+class DatasetExportError(DatasetError):
+    """
+    Exception raised when exporting a dataset to a different format fails.
+
+    This exception occurs when trying to export a dataset to a file format
+    (like CSV, SQLite, etc.) and the operation fails.
+
+    Examples:
+        ```python
+        dataset = Dataset([{"a": complex(1, 2)}])
+        dataset.to_csv("file.csv")  # Raises DatasetExportError (complex not serializable)
+        ```
+    """
+    relevant_doc = "https://docs.expectedparrot.com/en/latest/dataset.html"
+
+
+class DatasetImportError(DatasetError):
+    """
+    Exception raised when importing data from an external source fails.
+
+    This exception occurs when trying to import data from an external source or format
+    (like CSV, JSON, etc.) and the operation fails, often due to missing dependencies
+    or format issues.
+
+    Examples:
+        ```python
+        # Trying to export to DOCX without python-docx package
+        dataset.to_docx("file.docx")  # Raises DatasetImportError
+        ```
+    """
+    relevant_doc = "https://docs.expectedparrot.com/en/latest/dataset.html"
+
+
+class DatasetRuntimeError(DatasetError):
+    """
+    Exception raised when an operation fails during runtime.
+
+    This exception is used for runtime errors in dataset operations,
+    typically for operations that depend on external systems or libraries
+    like R integration.
+
+    Examples:
+        ```python
+        # Plotting with ggplot when R is not installed
+        dataset.ggplot()  # Raises DatasetRuntimeError
+        ```
+    """
+    relevant_doc = "https://docs.expectedparrot.com/en/latest/dataset.html"
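
Each class above carries a `relevant_doc` URL, presumably consumed by the shared base class added in this release (`edsl/base/base_exception.py`, +204 lines). A small sketch of extending the hierarchy; `MyPipelineError` is hypothetical:

```python
from edsl.dataset.exceptions import DatasetError

class MyPipelineError(DatasetError):
    """Hypothetical subclass: raised when a downstream step rejects a dataset."""
    relevant_doc = "https://docs.expectedparrot.com/en/latest/dataset.html"

try:
    raise MyPipelineError("rejected: empty dataset")
except DatasetError as e:
    print(type(e).__name__, e)  # MyPipelineError rejected: empty dataset
```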
edsl/dataset/file_exports.py
CHANGED
@@ -2,7 +2,8 @@ from abc import ABC, abstractmethod
 import io
 import csv
 import base64
-
+import sqlite3
+from typing import Optional, Union, Any, Dict
 
 
 class FileExport(ABC):
@@ -37,14 +38,15 @@ class FileExport(ABC):
         """Generate default filename for this format."""
         return f"results.{self.suffix}"
 
-    def _create_filestore(self, data: Union[str, bytes])
+    def _create_filestore(self, data: Union[str, bytes]):
         """Create a FileStore instance with encoded data."""
+        from ..scenarios import FileStore
         if isinstance(data, str):
             base64_string = base64.b64encode(data.encode()).decode()
         else:
             base64_string = base64.b64encode(data).decode()
 
-
+        # FileStore already imported
 
         path = self.filename or self._get_default_filename()
 
@@ -66,8 +68,12 @@
         """Convert the input data to the target format."""
         pass
 
-    def export(self) -> Optional
-        """Export the data to a FileStore instance.
+    def export(self) -> Optional:
+        """Export the data to a FileStore instance.
+
+        Returns:
+            A FileStore instance or None if the file was written directly.
+        """
         formatted_data = self.format_data()
         return self._create_filestore(formatted_data)
 
@@ -140,8 +146,6 @@ class ExcelExport(TabularExport):
         return buffer.getvalue()
 
 
-import sqlite3
-from typing import Any
 
 
 class SQLiteExport(TabularExport):
@@ -195,11 +199,12 @@
             cursor.execute(f"DROP TABLE IF EXISTS {self.table_name}")
         elif self.if_exists == "fail":
             cursor.execute(
-
+                "SELECT name FROM sqlite_master WHERE type='table' AND name=?",
                 (self.table_name,),
             )
             if cursor.fetchone():
-
+                from edsl.dataset.exceptions import DatasetValueError
+                raise DatasetValueError(f"Table {self.table_name} already exists")
 
         # Create table
         columns = ", ".join(f'"{col}" {dtype}' for col, dtype in column_types)
@@ -240,12 +245,14 @@
         """Validate initialization parameters."""
         valid_if_exists = {"fail", "replace", "append"}
         if self.if_exists not in valid_if_exists:
-            raise ValueError(
+            from edsl.dataset.exceptions import DatasetValueError
+            raise DatasetValueError(
                 f"if_exists must be one of {valid_if_exists}, got {self.if_exists}"
            )
 
         # Validate table name (basic SQLite identifier validation)
         if not self.table_name.isalnum() and not all(c in "_" for c in self.table_name):
-            raise ValueError(
+            from edsl.dataset.exceptions import DatasetValueError
+            raise DatasetValueError(
                 f"Invalid table name: {self.table_name}. Must contain only alphanumeric characters and underscores."
             )