edsl 0.1.54__py3-none-any.whl → 0.1.56__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- edsl/__init__.py +8 -1
- edsl/__init__original.py +134 -0
- edsl/__version__.py +1 -1
- edsl/agents/agent.py +29 -0
- edsl/agents/agent_list.py +36 -1
- edsl/base/base_class.py +281 -151
- edsl/base/data_transfer_models.py +15 -4
- edsl/buckets/__init__.py +8 -3
- edsl/buckets/bucket_collection.py +9 -3
- edsl/buckets/model_buckets.py +4 -2
- edsl/buckets/token_bucket.py +2 -2
- edsl/buckets/token_bucket_client.py +5 -3
- edsl/caching/cache.py +131 -62
- edsl/caching/cache_entry.py +70 -58
- edsl/caching/sql_dict.py +17 -0
- edsl/cli.py +99 -0
- edsl/config/config_class.py +16 -0
- edsl/conversation/__init__.py +31 -0
- edsl/coop/coop.py +276 -242
- edsl/coop/coop_jobs_objects.py +59 -0
- edsl/coop/coop_objects.py +29 -0
- edsl/coop/coop_regular_objects.py +26 -0
- edsl/coop/utils.py +24 -19
- edsl/dataset/dataset.py +338 -101
- edsl/dataset/dataset_operations_mixin.py +216 -180
- edsl/db_list/sqlite_list.py +349 -0
- edsl/inference_services/__init__.py +40 -5
- edsl/inference_services/exceptions.py +11 -0
- edsl/inference_services/services/anthropic_service.py +5 -2
- edsl/inference_services/services/aws_bedrock.py +6 -2
- edsl/inference_services/services/azure_ai.py +6 -2
- edsl/inference_services/services/google_service.py +7 -3
- edsl/inference_services/services/mistral_ai_service.py +6 -2
- edsl/inference_services/services/open_ai_service.py +6 -2
- edsl/inference_services/services/perplexity_service.py +6 -2
- edsl/inference_services/services/test_service.py +94 -5
- edsl/interviews/answering_function.py +167 -59
- edsl/interviews/interview.py +124 -72
- edsl/interviews/interview_task_manager.py +10 -0
- edsl/interviews/request_token_estimator.py +8 -0
- edsl/invigilators/invigilators.py +35 -13
- edsl/jobs/async_interview_runner.py +146 -104
- edsl/jobs/data_structures.py +6 -4
- edsl/jobs/decorators.py +61 -0
- edsl/jobs/fetch_invigilator.py +61 -18
- edsl/jobs/html_table_job_logger.py +14 -2
- edsl/jobs/jobs.py +180 -104
- edsl/jobs/jobs_component_constructor.py +2 -2
- edsl/jobs/jobs_interview_constructor.py +2 -0
- edsl/jobs/jobs_pricing_estimation.py +154 -113
- edsl/jobs/jobs_remote_inference_logger.py +4 -0
- edsl/jobs/jobs_runner_status.py +30 -25
- edsl/jobs/progress_bar_manager.py +79 -0
- edsl/jobs/remote_inference.py +35 -1
- edsl/key_management/key_lookup_builder.py +6 -1
- edsl/language_models/language_model.py +110 -12
- edsl/language_models/model.py +10 -3
- edsl/language_models/price_manager.py +176 -71
- edsl/language_models/registry.py +5 -0
- edsl/notebooks/notebook.py +77 -10
- edsl/questions/VALIDATION_README.md +134 -0
- edsl/questions/__init__.py +24 -1
- edsl/questions/exceptions.py +21 -0
- edsl/questions/question_dict.py +201 -16
- edsl/questions/question_multiple_choice_with_other.py +624 -0
- edsl/questions/question_registry.py +2 -1
- edsl/questions/templates/multiple_choice_with_other/__init__.py +0 -0
- edsl/questions/templates/multiple_choice_with_other/answering_instructions.jinja +15 -0
- edsl/questions/templates/multiple_choice_with_other/question_presentation.jinja +17 -0
- edsl/questions/validation_analysis.py +185 -0
- edsl/questions/validation_cli.py +131 -0
- edsl/questions/validation_html_report.py +404 -0
- edsl/questions/validation_logger.py +136 -0
- edsl/results/result.py +115 -46
- edsl/results/results.py +702 -171
- edsl/scenarios/construct_download_link.py +16 -3
- edsl/scenarios/directory_scanner.py +226 -226
- edsl/scenarios/file_methods.py +5 -0
- edsl/scenarios/file_store.py +150 -9
- edsl/scenarios/handlers/__init__.py +5 -1
- edsl/scenarios/handlers/mp4_file_store.py +104 -0
- edsl/scenarios/handlers/webm_file_store.py +104 -0
- edsl/scenarios/scenario.py +120 -101
- edsl/scenarios/scenario_list.py +800 -727
- edsl/scenarios/scenario_list_gc_test.py +146 -0
- edsl/scenarios/scenario_list_memory_test.py +214 -0
- edsl/scenarios/scenario_list_source_refactor.md +35 -0
- edsl/scenarios/scenario_selector.py +5 -4
- edsl/scenarios/scenario_source.py +1990 -0
- edsl/scenarios/tests/test_scenario_list_sources.py +52 -0
- edsl/surveys/survey.py +22 -0
- edsl/tasks/__init__.py +4 -2
- edsl/tasks/task_history.py +198 -36
- edsl/tests/scenarios/test_ScenarioSource.py +51 -0
- edsl/tests/scenarios/test_scenario_list_sources.py +51 -0
- edsl/utilities/__init__.py +2 -1
- edsl/utilities/decorators.py +121 -0
- edsl/utilities/memory_debugger.py +1010 -0
- {edsl-0.1.54.dist-info → edsl-0.1.56.dist-info}/METADATA +51 -76
- {edsl-0.1.54.dist-info → edsl-0.1.56.dist-info}/RECORD +103 -79
- edsl/jobs/jobs_runner_asyncio.py +0 -281
- edsl/language_models/unused/fake_openai_service.py +0 -60
- {edsl-0.1.54.dist-info → edsl-0.1.56.dist-info}/LICENSE +0 -0
- {edsl-0.1.54.dist-info → edsl-0.1.56.dist-info}/WHEEL +0 -0
- {edsl-0.1.54.dist-info → edsl-0.1.56.dist-info}/entry_points.txt +0 -0
The remainder of this diff shows edsl/dataset/dataset_operations_mixin.py (+216 -180). Most of the changes are a mechanical formatting pass (quote normalization, re-wrapped signatures and calls, blank-line and trailing-whitespace cleanup), plus updated doctest expectations; removed/added line pairs that look identical below differ only in whitespace or quote style, and bare "-" lines mark removals whose exact old text was not recoverable from the diff viewer.

--- a/edsl/dataset/dataset_operations_mixin.py
+++ b/edsl/dataset/dataset_operations_mixin.py
@@ -7,7 +7,7 @@ including data transformation, visualization, export, querying, and analysis. Th
 operations are inherited by different specialized mixins (DatasetOperationsMixin,
 ResultsOperationsMixin, etc.) which implement class-specific behaviors.
 
-The design pattern used here allows different container types (Results, Dataset,
+The design pattern used here allows different container types (Results, Dataset,
 ScenarioList, AgentList) to share the same data manipulation interface, enabling
 fluid operations across different parts of the EDSL ecosystem.
 """
@@ -18,49 +18,54 @@ import textwrap
 from typing import Optional, Tuple, Union, List, TYPE_CHECKING  # Callable not used
 from functools import wraps
 from .r.ggplot import GGPlotMethod
-from .exceptions import DatasetKeyError, DatasetValueError, DatasetTypeError, DatasetExportError
+from .exceptions import (
+    DatasetKeyError,
+    DatasetValueError,
+    DatasetTypeError,
+    DatasetExportError,
+)
 
 if TYPE_CHECKING:
     from docx import Document
     from .dataset import Dataset
     from ..jobs import Job  # noqa: F401
 
+
 class DataOperationsBase:
     """
     Base class providing common data operations for EDSL container objects.
-
+
     This class serves as the foundation for various data manipulation mixins,
     providing a consistent interface for operations like filtering, aggregation,
     transformation, visualization, and export across different types of EDSL
     containers (Results, Dataset, ScenarioList, AgentList).
-
+
     Key functionality categories:
-
+
     1. Data Transformation:
        - Filtering with `filter()`
        - Creating new columns with `mutate()`
        - Reshaping with `long()`, `wide()`, `flatten()`, etc.
        - Selecting specific columns with `select()`
-
+
     2. Visualization and Display:
        - Tabular display with `table()`
        - Plotting with `ggplot2()`
        - Generating reports with `report()`
-
+
     3. Data Export:
        - To various formats with `to_csv()`, `to_excel()`, etc.
        - To other data structures with `to_pandas()`, `to_dicts()`, etc.
-
+
     4. Analysis:
        - SQL-based querying with `sql()`
        - Aggregation with `tally()`
        - Tree-based exploration
-
+
     These operations are designed to be applied fluently in sequence, enabling
     expressive data manipulation pipelines.
     """
 
-
     def ggplot2(
         self,
         ggplot_code: str,
@@ -74,10 +79,10 @@ class DataOperationsBase:
     ):
         """
         Create visualizations using R's ggplot2 library.
-
+
         This method provides a bridge to R's powerful ggplot2 visualization library,
         allowing you to create sophisticated plots directly from EDSL data structures.
-
+
         Parameters:
             ggplot_code: R code string containing ggplot2 commands
             shape: Data shape to use ("wide" or "long")
@@ -87,31 +92,32 @@
             height: Plot height in inches
             width: Plot width in inches
             factor_orders: Dictionary mapping factor variables to their desired order
-
+
         Returns:
             A plot object that renders in Jupyter notebooks
-
+
         Notes:
             - Requires R and the ggplot2 package to be installed
             - Data is automatically converted to a format suitable for ggplot2
             - The ggplot2 code should reference column names as they appear after
               any transformations from the shape and remove_prefix parameters
-
+
         Examples:
             >>> from edsl.results import Results
             >>> r = Results.example()
             >>> # The following would create a plot if R is installed (not shown in doctest):
             >>> # r.ggplot2('''
-            >>> #   ggplot(df, aes(x=how_feeling)) +
+            >>> #   ggplot(df, aes(x=how_feeling)) +
             >>> #   geom_bar() +
             >>> #   labs(title="Distribution of Feelings")
             >>> # ''')
         """
-        return GGPlotMethod(self).ggplot2(
-            ggplot_code, shape, sql, remove_prefix, debug, height, width, factor_orders)
+        return GGPlotMethod(self).ggplot2(
+            ggplot_code, shape, sql, remove_prefix, debug, height, width, factor_orders
+        )
 
     def relevant_columns(
-        self, data_type: Optional[str] = None, remove_prefix:bool=False
+        self, data_type: Optional[str] = None, remove_prefix: bool = False
     ) -> list:
         """Return the set of keys that are present in the dataset.
 
@@ -184,12 +190,13 @@ class DataOperationsBase:
         )
 
         return _num_observations
-
+
     def chart(self):
         """
         Create a chart from the results.
         """
         import altair as alt
+
         return alt.Chart(self.to_pandas(remove_prefix=True))
 
     def make_tabular(
@@ -271,6 +278,7 @@
     def to_jsonl(self, filename: Optional[str] = None):
         """Export the results to a FileStore instance containing JSONL data."""
         from .file_exports import JSONLExport
+
         exporter = JSONLExport(data=self, filename=filename)
         return exporter.export()
 
@@ -284,6 +292,7 @@
     ):
         """Export the results to a SQLite database file."""
         from .file_exports import SQLiteExport
+
         exporter = SQLiteExport(
             data=self,
             filename=filename,
@@ -330,18 +339,16 @@
         )
         return exporter.export()
 
-    def _db(
-        self, remove_prefix: bool = True, shape: str = "wide"
-    ):
+    def _db(self, remove_prefix: bool = True, shape: str = "wide"):
         """Create a SQLite database in memory and return the connection.
 
         Args:
             remove_prefix: Whether to remove the prefix from the column names
             shape: The shape of the data in the database ("wide" or "long")
-
+
         Returns:
             A database connection
-
+
         Examples:
             >>> from sqlalchemy import text
             >>> from edsl import Results
@@ -350,7 +357,7 @@
             4
             >>> engine = Results.example()._db(shape = "long")
             >>> len(engine.execute(text("SELECT * FROM self")).fetchall())
-
+            204
         """
         # Import needed for database connection
         from sqlalchemy import create_engine
@@ -393,12 +400,12 @@
     ) -> "Dataset":
         """
         Execute SQL queries on the dataset.
-
-        This powerful method allows you to use SQL to query and transform your data,
-        combining the expressiveness of SQL with EDSL's data structures. It works by
+
+        This powerful method allows you to use SQL to query and transform your data,
+        combining the expressiveness of SQL with EDSL's data structures. It works by
         creating an in-memory SQLite database from your data and executing the query
         against it.
-
+
         Parameters:
             query: SQL query string to execute
             transpose: Whether to transpose the resulting table (rows become columns)
@@ -407,35 +414,35 @@
             shape: Data shape to use ("wide" or "long")
                 - "wide": Default tabular format with columns for each field
                 - "long": Melted format with key-value pairs, useful for certain queries
-
+
         Returns:
             A Dataset object containing the query results
-
+
         Notes:
             - The data is stored in a table named "self" in the SQLite database
             - In wide format, column names include their type prefix unless remove_prefix=True
             - In long format, the data is melted into columns: row_number, key, value, data_type
             - Complex objects like lists and dictionaries are converted to strings
-
+
         Examples:
             >>> from edsl import Results
             >>> r = Results.example()
-
+
             # Basic selection
             >>> len(r.sql("SELECT * FROM self", shape="wide"))
             4
-
+
             # Filtering with WHERE clause
             >>> r.sql("SELECT * FROM self WHERE how_feeling = 'Great'").num_observations()
             1
-
+
             # Aggregation
             >>> r.sql("SELECT how_feeling, COUNT(*) as count FROM self GROUP BY how_feeling").keys()
             ['how_feeling', 'count']
-
+
             # Using long format
             >>> len(r.sql("SELECT * FROM self", shape="long"))
-
+            204
         """
         import pandas as pd
 
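
The sql() doctests above reduce to this usage pattern (a sketch built only from those doctests; the in-memory table is always named "self", and shape="long" melts the data into row_number/key/value/data_type columns):

    >>> from edsl import Results
    >>> r = Results.example()
    >>> # Wide format: one row per observation, type-prefixed column names
    >>> r.sql("SELECT how_feeling, COUNT(*) as count FROM self GROUP BY how_feeling").keys()
    ['how_feeling', 'count']
    >>> # Long format: one row per (observation, field) pair
    >>> len(r.sql("SELECT * FROM self", shape="long"))
    204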
@@ -454,15 +461,13 @@
 
         return Dataset.from_pandas_dataframe(df)
 
-    def to_pandas(
-        self, remove_prefix: bool = False, lists_as_strings=False
-    ):
+    def to_pandas(self, remove_prefix: bool = False, lists_as_strings=False):
         """Convert the results to a pandas DataFrame, ensuring that lists remain as lists.
 
         Args:
             remove_prefix: Whether to remove the prefix from the column names.
             lists_as_strings: Whether to convert lists to strings.
-
+
         Returns:
             A pandas DataFrame.
         """
@@ -493,15 +498,13 @@
         # df_sorted = df.sort_index(axis=1)  # Sort columns alphabetically
         return df
 
-    def to_polars(
-        self, remove_prefix: bool = False, lists_as_strings=False
-    ):
+    def to_polars(self, remove_prefix: bool = False, lists_as_strings=False):
         """Convert the results to a Polars DataFrame.
 
         Args:
             remove_prefix: Whether to remove the prefix from the column names.
             lists_as_strings: Whether to convert lists to strings.
-
+
         Returns:
             A Polars DataFrame.
         """
@@ -513,7 +516,7 @@
 
         Args:
             remove_prefix: Whether to remove the prefix from the column names.
-
+
         Returns:
             A Polars DataFrame.
         """
@@ -522,17 +525,18 @@
         csv_string = self.to_csv(remove_prefix=remove_prefix).text
         df = pl.read_csv(io.StringIO(csv_string))
         return df
-
+
     def tree(self, node_order: Optional[List[str]] = None):
         """Convert the results to a Tree.
 
         Args:
             node_order: The order of the nodes.
-
+
         Returns:
             A Tree object.
         """
         from .dataset_tree import Tree
+
         return Tree(self, node_order=node_order)
 
     def to_scenario_list(self, remove_prefix: bool = True) -> list[dict]:
@@ -552,7 +556,6 @@
         for d in list_of_dicts:
             scenarios.append(Scenario(d))
         return ScenarioList(scenarios)
-
 
     def to_agent_list(self, remove_prefix: bool = True):
         """Convert the results to a list of dictionaries, one per agent.
@@ -661,10 +664,9 @@
                 new_list.append(item)
             list_to_return = new_list
 
-
-        #return PrettyList(list_to_return)
+        # return PrettyList(list_to_return)
         return list_to_return
-
+
     def html(
         self,
         filename: Optional[str] = None,
@@ -700,33 +702,37 @@
 
         if return_link:
             return filename
-
-    def _prepare_report_data(
-        self, *fields: Optional[str], top_n: Optional[int] = None, header_fields: Optional[List[str]] = None) -> tuple:
+
+    def _prepare_report_data(
+        self,
+        *fields: Optional[str],
+        top_n: Optional[int] = None,
+        header_fields: Optional[List[str]] = None,
+    ) -> tuple:
         """Prepares data for report generation in various formats.
-
+
         Args:
             *fields: The fields to include in the report. If none provided, all fields are used.
             top_n: Optional limit on the number of observations to include.
             header_fields: Optional list of fields to include in the main header instead of as sections.
-
+
         Returns:
             A tuple containing (field_data, num_obs, fields, header_fields)
         """
         # If no fields specified, use all columns
         if not fields:
             fields = self.relevant_columns()
-
+
         # Initialize header_fields if not provided
         if header_fields is None:
             header_fields = []
-
+
         # Validate all fields
         all_fields = list(fields) + [f for f in header_fields if f not in fields]
         for field in all_fields:
             if field not in self.relevant_columns():
                 raise DatasetKeyError(f"Field '{field}' not found in dataset")
-
+
         # Get data for each field
         field_data = {}
         for field in all_fields:
@@ -734,24 +740,26 @@
                 if field in entry:
                     field_data[field] = entry[field]
                     break
-
+
         # Number of observations to process
         num_obs = self.num_observations()
         if top_n is not None:
             num_obs = min(num_obs, top_n)
-
+
         return field_data, num_obs, fields, header_fields
 
-    def _report_markdown(self, field_data, num_obs, fields, header_fields, divider: bool = True) -> str:
+    def _report_markdown(
+        self, field_data, num_obs, fields, header_fields, divider: bool = True
+    ) -> str:
         """Generates a markdown report from the prepared data.
-
+
         Args:
             field_data: Dictionary mapping field names to their values
             num_obs: Number of observations to include
             fields: Fields to include as sections
             header_fields: Fields to include in the observation header
             divider: If True, adds a horizontal rule between observations
-
+
         Returns:
             A string containing the markdown report
         """
@@ -764,13 +772,13 @@ class DataOperationsBase:
             for field in header_fields:
                 value = field_data[field][i]
                 # Get the field name without prefix for cleaner display
-                display_name = field.split('.')[-1] if '.' in field else field
+                display_name = field.split(".")[-1] if "." in field else field
                 # Format with backticks for monospace
                 header_parts.append(f"`{display_name}`: {value}")
             if header_parts:
                 header += f" ({', '.join(header_parts)})"
             report_lines.append(header)
-
+
             # Add the remaining fields
             for field in fields:
                 if field not in header_fields:
@@ -778,27 +786,28 @@
                     value = field_data[field][i]
                     if isinstance(value, list) or isinstance(value, dict):
                         import json
+
                         report_lines.append(f"```\n{json.dumps(value, indent=2)}\n```")
                     else:
                         report_lines.append(str(value))
-
+
             # Add divider between observations if requested
             if divider and i < num_obs - 1:
                 report_lines.append("\n---\n")
             else:
                 report_lines.append("")  # Empty line between observations
-
+
         return "\n".join(report_lines)
 
     def _report_docx(self, field_data, num_obs, fields, header_fields) -> "Document":
         """Generates a Word document report from the prepared data.
-
+
         Args:
             field_data: Dictionary mapping field names to their values
             num_obs: Number of observations to include
             fields: Fields to include as sections
             header_fields: Fields to include in the observation header
-
+
         Returns:
             A docx.Document object containing the report
         """
@@ -808,10 +817,13 @@
             import json
         except ImportError:
             from .exceptions import DatasetImportError
-
-
+
+            raise DatasetImportError(
+                "The python-docx package is required for DOCX export. Install it with 'pip install python-docx'."
+            )
+
         doc = Document()
-
+
         for i in range(num_obs):
             # Create header with observation number and any header fields
             header_text = f"Observation: {i+1}"
@@ -820,40 +832,46 @@
             for field in header_fields:
                 value = field_data[field][i]
                 # Get the field name without prefix for cleaner display
-                display_name = field.split('.')[-1] if '.' in field else field
+                display_name = field.split(".")[-1] if "." in field else field
                 header_parts.append(f"{display_name}: {value}")
             if header_parts:
                 header_text += f" ({', '.join(header_parts)})"
-
+
             doc.add_heading(header_text, level=1)
-
+
             # Add the remaining fields
             for field in fields:
                 if field not in header_fields:
                     doc.add_heading(field, level=2)
                     value = field_data[field][i]
-
+
                     if isinstance(value, (list, dict)):
                         # Format structured data with indentation
                         formatted_value = json.dumps(value, indent=2)
                         p = doc.add_paragraph()
-                        p.add_run(formatted_value).font.name = 'Courier New'
+                        p.add_run(formatted_value).font.name = "Courier New"
                         p.add_run().font.size = Pt(10)
                     else:
                         doc.add_paragraph(str(value))
-
+
             # Add page break between observations except for the last one
             if i < num_obs - 1:
                 doc.add_page_break()
-
+
         return doc
-
-    def report(
-        self, *fields: Optional[str], top_n: Optional[int] = None,
-        header_fields: Optional[List[str]] = None, divider: bool = True, return_string: bool = False,
-        format: str = "markdown", filename: Optional[str] = None) -> Optional[Union[str, "Document"]]:
+
+    def report(
+        self,
+        *fields: Optional[str],
+        top_n: Optional[int] = None,
+        header_fields: Optional[List[str]] = None,
+        divider: bool = True,
+        return_string: bool = False,
+        format: str = "markdown",
+        filename: Optional[str] = None,
+    ) -> Optional[Union[str, "Document"]]:
         """Generates a report of the results by iterating through rows.
-
+
         Args:
             *fields: The fields to include in the report. If none provided, all fields are used.
             top_n: Optional limit on the number of observations to include.
@@ -863,12 +881,12 @@
                 only displays the markdown without returning.
             format: Output format - either "markdown" or "docx".
             filename: If provided and format is "docx", saves the document to this file.
-
+
         Returns:
             Depending on format and return_string:
             - For markdown: A string if return_string is True, otherwise None (displays in notebook)
             - For docx: A docx.Document object, or None if filename is provided (saves to file)
-
+
         Examples:
             >>> from edsl.results import Results
             >>> r = Results.example()
@@ -880,81 +898,84 @@
             True
         """
         from ..utilities.utilities import is_notebook
-
+
         # Prepare the data for the report
         field_data, num_obs, fields, header_fields = self._prepare_report_data(
             *fields, top_n=top_n, header_fields=header_fields
         )
-
+
         # Generate the report in the requested format
         if format.lower() == "markdown":
             report_text = self._report_markdown(
                 field_data, num_obs, fields, header_fields, divider
             )
-
+
             # In notebooks, display as markdown
             is_nb = is_notebook()
             if is_nb and not return_string:
                 from IPython.display import Markdown, display
+
                 display(Markdown(report_text))
                 return None
-
+
             # Return the string if requested or if not in a notebook
             return report_text
-
+
         elif format.lower() == "docx":
             doc = self._report_docx(field_data, num_obs, fields, header_fields)
-
+
             # Save to file if filename is provided
             if filename:
                 doc.save(filename)
                 print(f"Report saved to {filename}")
                 return None
-
+
             return doc
-
+
         else:
-            raise DatasetExportError(f"Unsupported format: {format}. Use 'markdown' or 'docx'.")
+            raise DatasetExportError(
+                f"Unsupported format: {format}. Use 'markdown' or 'docx'."
+            )
 
     def tally(
         self, *fields: Optional[str], top_n: Optional[int] = None, output="Dataset"
     ) -> Union[dict, "Dataset"]:
         """
         Count frequency distributions of values in specified fields.
-
+
         This method tallies the occurrence of unique values within one or more fields,
         similar to a GROUP BY and COUNT in SQL. When multiple fields are provided, it
         performs cross-tabulation across those fields.
-
+
         Parameters:
             *fields: Field names to tally. If none provided, uses all available fields.
             top_n: Optional limit to return only the top N most frequent values.
             output: Format for results, either "Dataset" (recommended) or "dict".
-
+
         Returns:
             By default, returns a Dataset with columns for the field(s) and a 'count' column.
             If output="dict", returns a dictionary mapping values to counts.
-
+
         Notes:
             - For single fields, returns counts of each unique value
             - For multiple fields, returns counts of each unique combination of values
             - Results are sorted in descending order by count
             - Fields can be specified with or without their type prefix
-
+
         Examples:
             >>> from edsl import Results
             >>> r = Results.example()
-
+
             # Single field frequency count
             >>> r.select('how_feeling').tally('answer.how_feeling', output="dict")
             {'OK': 2, 'Great': 1, 'Terrible': 1}
-
+
             # Return as Dataset (default)
             >>> from edsl.dataset import Dataset
             >>> expected = Dataset([{'answer.how_feeling': ['OK', 'Great', 'Terrible']}, {'count': [2, 1, 1]}])
             >>> r.select('how_feeling').tally('answer.how_feeling', output="Dataset") == expected
             True
-
+
             # Multi-field cross-tabulation - exact output varies based on data
             >>> result = r.tally('how_feeling', 'how_feeling_yesterday')
             >>> 'how_feeling' in result.keys() and 'how_feeling_yesterday' in result.keys() and 'count' in result.keys()
@@ -973,9 +994,10 @@
             f in self.relevant_columns() or f in relevant_columns_without_prefix
             for f in fields
         ):
-            raise DatasetKeyError(
-                "One or more specified fields are not in the dataset."
-                f"The available fields are: {self.relevant_columns()}")
+            raise DatasetKeyError(
+                "One or more specified fields are not in the dataset."
+                f"The available fields are: {self.relevant_columns()}"
+            )
 
         if len(fields) == 1:
             field = fields[0]
@@ -992,7 +1014,7 @@
             tally = dict(Counter([str(v) for v in values]))
         except Exception as e:
             raise DatasetValueError(f"Error tallying values: {e}")
-
+
         sorted_tally = dict(sorted(tally.items(), key=lambda item: -item[1]))
         if top_n is not None:
             sorted_tally = dict(list(sorted_tally.items())[:top_n])
@@ -1031,35 +1053,35 @@
     def flatten(self, field: str, keep_original: bool = False) -> "Dataset":
         """
         Expand a field containing dictionaries into separate fields.
-
+
         This method takes a field that contains a list of dictionaries and expands
         it into multiple fields, one for each key in the dictionaries. This is useful
         when working with nested data structures or results from extraction operations.
-
+
         Parameters:
             field: The field containing dictionaries to flatten
             keep_original: Whether to retain the original field in the result
-
+
         Returns:
             A new Dataset with the dictionary keys expanded into separate fields
-
+
         Notes:
             - Each key in the dictionaries becomes a new field with name pattern "{field}.{key}"
             - All dictionaries in the field must have compatible structures
             - If a dictionary is missing a key, the corresponding value will be None
            - Non-dictionary values in the field will cause a warning
-
+
         Examples:
             >>> from edsl.dataset import Dataset
-
+
             # Basic flattening of nested dictionaries
             >>> Dataset([{'a': [{'a': 1, 'b': 2}]}, {'c': [5]}]).flatten('a')
             Dataset([{'c': [5]}, {'a.a': [1]}, {'a.b': [2]}])
-
+
             # Works with prefixed fields too
             >>> Dataset([{'answer.example': [{'a': 1, 'b': 2}]}, {'c': [5]}]).flatten('answer.example')
             Dataset([{'c': [5]}, {'answer.example.a': [1]}, {'answer.example.b': [2]}])
-
+
             # Keep the original field if needed
             >>> d = Dataset([{'a': [{'a': 1, 'b': 2}]}, {'c': [5]}])
             >>> d.flatten('a', keep_original=True)
@@ -1070,21 +1092,22 @@
         # Ensure the dataset isn't empty
         if not self.data:
             return self.copy()
-
+
         # Find all columns that contain the field
         matching_entries = []
         for entry in self.data:
            col_name = next(iter(entry.keys()))
             if field == col_name or (
-                "." in col_name and
-                (col_name.endswith("." + field) or col_name.startswith(field + "."))
+                "." in col_name
+                and (col_name.endswith("." + field) or col_name.startswith(field + "."))
             ):
                 matching_entries.append(entry)
-
+
         # Check if the field is ambiguous
         if len(matching_entries) > 1:
             matching_cols = [next(iter(entry.keys())) for entry in matching_entries]
             from .exceptions import DatasetValueError
+
             raise DatasetValueError(
                 f"Ambiguous field name '{field}'. It matches multiple columns: {matching_cols}. "
                 f"Please specify the full column name to flatten."
@@ -1194,7 +1217,9 @@
 
         # Check if values are lists
         if not all(isinstance(v, list) for v in field_data):
-            raise DatasetTypeError(f"Field '{field}' does not contain lists in all entries")
+            raise DatasetTypeError(
+                f"Field '{field}' does not contain lists in all entries"
+            )
 
         # Get the maximum length of lists
         max_len = max(len(v) for v in field_data)
@@ -1218,50 +1243,50 @@
             result.data.pop(field_index)
 
         return result
-
+
     def drop(self, field_name):
         """
         Returns a new Dataset with the specified field removed.
-
+
         Args:
             field_name (str): The name of the field to remove.
-
+
         Returns:
             Dataset: A new Dataset instance without the specified field.
-
+
         Raises:
             KeyError: If the field_name doesn't exist in the dataset.
-
+
         Examples:
             >>> from .dataset import Dataset
             >>> d = Dataset([{'a': [1, 2, 3]}, {'b': [4, 5, 6]}])
             >>> d.drop('a')
             Dataset([{'b': [4, 5, 6]}])
-
+
             >>> # Testing drop with nonexistent field raises DatasetKeyError - tested in unit tests
         """
         from .dataset import Dataset
-
+
         # Check if field exists in the dataset
         if field_name not in self.relevant_columns():
             raise DatasetKeyError(f"Field '{field_name}' not found in dataset")
-
+
         # Create a new dataset without the specified field
         new_data = [entry for entry in self.data if field_name not in entry]
         return Dataset(new_data)
 
     def remove_prefix(self):
         """Returns a new Dataset with the prefix removed from all column names.
-
+
         The prefix is defined as everything before the first dot (.) in the column name.
         If removing prefixes would result in duplicate column names, an exception is raised.
-
+
         Returns:
             Dataset: A new Dataset with prefixes removed from column names
-
+
         Raises:
             ValueError: If removing prefixes would result in duplicate column names
-
+
         Examples:
             >>> from edsl.results import Results
             >>> r = Results.example()
@@ -1269,70 +1294,73 @@
             ['answer.how_feeling', 'answer.how_feeling_yesterday']
             >>> r.select('how_feeling', 'how_feeling_yesterday').remove_prefix().relevant_columns()
             ['how_feeling', 'how_feeling_yesterday']
-
+
             >>> from edsl.dataset import Dataset
             >>> d = Dataset([{'a.x': [1, 2, 3]}, {'b.x': [4, 5, 6]}])
             >>> # d.remove_prefix()
-
+
             # Testing remove_prefix with duplicate column names raises DatasetValueError - tested in unit tests
         """
         from .dataset import Dataset
-
+
         # Get all column names
         columns = self.relevant_columns()
-
+
         # Extract the unprefixed names
         unprefixed = {}
         duplicates = set()
-
+
         for col in columns:
-            if '.' in col:
-                unprefixed_name = col.split('.', 1)[1]
+            if "." in col:
+                unprefixed_name = col.split(".", 1)[1]
                 if unprefixed_name in unprefixed:
                     duplicates.add(unprefixed_name)
                 unprefixed[unprefixed_name] = col
             else:
                 # For columns without a prefix, keep them as is
                 unprefixed[col] = col
-
+
         # Check for duplicates
         if duplicates:
-            raise DatasetValueError(
-                f"Removing prefixes would result in duplicate column names: {sorted(list(duplicates))}")
+            raise DatasetValueError(
+                f"Removing prefixes would result in duplicate column names: {sorted(list(duplicates))}"
+            )
+
         # Create a new dataset with unprefixed column names
         new_data = []
         for entry in self.data:
             key, values = list(entry.items())[0]
-            if '.' in key:
-                new_key = key.split('.', 1)[1]
+            if "." in key:
+                new_key = key.split(".", 1)[1]
             else:
                 new_key = key
             new_data.append({new_key: values})
-
+
         return Dataset(new_data)
 
 
 def to_dataset(func):
     """
     Decorator that ensures functions receive a Dataset object as their first argument.
-
+
     This decorator automatically converts various EDSL container objects (Results,
     AgentList, ScenarioList) to Dataset objects before passing them to the decorated
     function. This allows methods defined in DataOperationsBase to work seamlessly
     across different container types without duplicating conversion logic.
-
+
     Parameters:
         func: The function to decorate
-
+
     Returns:
         A wrapped function that ensures its first argument is a Dataset
-
+
     Notes:
         - For Results objects, calls select() to convert to a Dataset
         - For AgentList and ScenarioList objects, calls their to_dataset() method
        - For Dataset objects, passes them through unchanged
        - This decorator is used internally by the mixin system to enable method sharing
     """
+
     @wraps(func)
     def wrapper(self, *args, **kwargs):
         """Execute the function with self converted to a Dataset if needed."""
@@ -1345,7 +1373,7 @@ def to_dataset(func):
             dataset_self = self.to_dataset()
         else:
             dataset_self = self
-
+
         # Call the function with the converted self
         return func(dataset_self, *args, **kwargs)
 
@@ -1357,22 +1385,22 @@ def to_dataset(func):
 def decorate_methods_from_mixin(cls, mixin_cls):
     """
     Apply the to_dataset decorator to methods inherited from a mixin class.
-
+
     This function is part of EDSL's method inheritance system. It takes methods
     from a source mixin class, applies the to_dataset decorator to them, and adds
     them to a target class. This enables the sharing of data manipulation methods
     across different container types while ensuring they receive the right data type.
-
+
     The function is careful not to override methods that are already defined in
     more specific parent classes, preserving the method resolution order (MRO).
-
+
     Parameters:
         cls: The target class to add decorated methods to
         mixin_cls: The source mixin class providing the methods
-
+
     Returns:
         The modified target class with decorated methods added
-
+
     Notes:
         - Only public methods (not starting with "_") are decorated and added
         - Methods already defined in more specific parent classes are not overridden
@@ -1381,14 +1409,13 @@ def decorate_methods_from_mixin(cls, mixin_cls):
     # Get all attributes, including inherited ones
     for attr_name in dir(mixin_cls):
         # Skip magic methods and private methods
-        if not attr_name.startswith('_'):
+        if not attr_name.startswith("_"):
             attr_value = getattr(mixin_cls, attr_name)
             if callable(attr_value):
                 # Check if the method is already defined in the class's MRO
                 # but skip DataOperationsBase methods
                 for base in cls.__mro__[1:]:  # Skip the class itself
-                    if (attr_name in base.__dict__ and
-                            base is not DataOperationsBase):
+                    if attr_name in base.__dict__ and base is not DataOperationsBase:
                         # Method is overridden in a more specific class, skip decorating
                         break
                 else:
@@ -1396,9 +1423,10 @@ def decorate_methods_from_mixin(cls, mixin_cls):
                     setattr(cls, attr_name, to_dataset(attr_value))
     return cls
 
+
 # def decorate_methods_from_mixin(cls, mixin_cls):
 #     """Decorates all methods from mixin_cls with to_dataset decorator."""
-
+
 #     # Get all attributes, including inherited ones
 #     for attr_name in dir(mixin_cls):
 #         # Skip magic methods and private methods
@@ -1408,99 +1436,107 @@ def decorate_methods_from_mixin(cls, mixin_cls):
 #                 setattr(cls, attr_name, to_dataset(attr_value))
 #     return cls
 
+
 class DatasetOperationsMixin(DataOperationsBase):
     """
     Mixin providing data manipulation operations for Dataset objects.
-
-    This mixin class is the cornerstone of EDSL's data manipulation system. It directly
+
+    This mixin class is the cornerstone of EDSL's data manipulation system. It directly
     inherits methods from DataOperationsBase without requiring conversion, as it's
     designed specifically for the Dataset class. It serves as the primary implementation
-    of all data operations methods that other container types will inherit and adapt
+    of all data operations methods that other container types will inherit and adapt
     through the to_dataset decorator.
-
+
     The design follows a standard mixin pattern where common functionality is defined
     in a standalone class that can be "mixed in" to other classes. In EDSL's case,
     this allows different container types (Results, AgentList, ScenarioList) to share
     the same powerful data manipulation interface.
-
+
     Key features:
-
+
     1. Data Transformation:
        - Filtering with `filter()`
        - Creating new columns with `mutate()`
       - Reshaping with `long()`, `wide()`, `flatten()`, etc.
       - Selecting specific data with `select()`
-
+
     2. Visualization:
       - Table display with `table()`
       - R integration with `ggplot2()`
       - Report generation with `report()`
-
+
     3. Data Export:
       - To files with `to_csv()`, `to_excel()`, etc.
       - To other formats with `to_pandas()`, `to_dicts()`, etc.
-
+
     4. Analysis:
       - SQL queries with `sql()`
      - Aggregation with `tally()`
      - Tree-based exploration with `tree()`
-
+
     This mixin is designed for fluent method chaining, allowing complex data manipulation
     pipelines to be built in an expressive and readable way.
     """
+
     pass
 
+
 class ResultsOperationsMixin(DataOperationsBase):
     """
     Mixin providing data operations for Results objects.
-
+
     This mixin adapts DatasetOperationsMixin methods to work with Results objects.
     When a method is called on a Results object, it's automatically converted to
     a Dataset first via the to_dataset decorator applied in __init_subclass__.
-
+
     This allows Results objects to have the same data manipulation capabilities
     as Dataset objects without duplicating code.
     """
+
     def __init_subclass__(cls, **kwargs):
         """
         Automatically decorate all methods from DatasetOperationsMixin.
-
+
         This hook runs when a class inherits from ResultsOperationsMixin,
         applying the to_dataset decorator to all methods from DatasetOperationsMixin.
         """
         super().__init_subclass__(**kwargs)
         decorate_methods_from_mixin(cls, DatasetOperationsMixin)
 
+
 class ScenarioListOperationsMixin(DataOperationsBase):
     """
     Mixin providing data operations for ScenarioList objects.
-
+
     This mixin adapts DatasetOperationsMixin methods to work with ScenarioList objects.
     ScenarioList objects are converted to Dataset objects before method execution
     via the to_dataset decorator applied in __init_subclass__.
     """
+
     def __init_subclass__(cls, **kwargs):
         """
         Automatically decorate all methods from DatasetOperationsMixin.
-
+
         This hook runs when a class inherits from ScenarioListOperationsMixin,
         applying the to_dataset decorator to all methods from DatasetOperationsMixin.
         """
         super().__init_subclass__(**kwargs)
         decorate_methods_from_mixin(cls, DatasetOperationsMixin)
 
+
 class AgentListOperationsMixin(DataOperationsBase):
     """
     Mixin providing data operations for AgentList objects.
-
+
     This mixin adapts DatasetOperationsMixin methods to work with AgentList objects.
     AgentList objects are converted to Dataset objects before method execution
     via the to_dataset decorator applied in __init_subclass__.
     """
+
     def __init_subclass__(cls, **kwargs):
         """
         Automatically decorate all methods from DatasetOperationsMixin.
-
+
         This hook runs when a class inherits from AgentListOperationsMixin,
         applying the to_dataset decorator to all methods from DatasetOperationsMixin.
         """