edsl 0.1.44__py3-none-any.whl → 0.1.45__py3-none-any.whl

This diff shows the changes between package versions that have been publicly released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
Files changed (61)
  1. edsl/__version__.py +1 -1
  2. edsl/agents/InvigilatorBase.py +3 -1
  3. edsl/agents/PromptConstructor.py +62 -34
  4. edsl/agents/QuestionInstructionPromptBuilder.py +111 -68
  5. edsl/agents/QuestionTemplateReplacementsBuilder.py +69 -16
  6. edsl/agents/question_option_processor.py +15 -6
  7. edsl/coop/CoopFunctionsMixin.py +3 -4
  8. edsl/coop/coop.py +23 -9
  9. edsl/enums.py +3 -3
  10. edsl/inference_services/AnthropicService.py +11 -9
  11. edsl/inference_services/AvailableModelFetcher.py +2 -0
  12. edsl/inference_services/AwsBedrock.py +1 -2
  13. edsl/inference_services/AzureAI.py +12 -9
  14. edsl/inference_services/GoogleService.py +9 -4
  15. edsl/inference_services/InferenceServicesCollection.py +2 -2
  16. edsl/inference_services/MistralAIService.py +1 -2
  17. edsl/inference_services/OpenAIService.py +9 -4
  18. edsl/inference_services/PerplexityService.py +2 -1
  19. edsl/inference_services/{GrokService.py → XAIService.py} +2 -2
  20. edsl/inference_services/registry.py +2 -2
  21. edsl/jobs/Jobs.py +9 -0
  22. edsl/jobs/JobsChecks.py +10 -13
  23. edsl/jobs/async_interview_runner.py +3 -1
  24. edsl/jobs/check_survey_scenario_compatibility.py +5 -5
  25. edsl/jobs/interviews/InterviewExceptionEntry.py +12 -0
  26. edsl/jobs/tasks/TaskHistory.py +1 -1
  27. edsl/language_models/LanguageModel.py +0 -3
  28. edsl/language_models/PriceManager.py +45 -5
  29. edsl/language_models/model.py +47 -26
  30. edsl/questions/QuestionBase.py +21 -0
  31. edsl/questions/QuestionBasePromptsMixin.py +103 -0
  32. edsl/questions/QuestionFreeText.py +22 -5
  33. edsl/questions/descriptors.py +4 -0
  34. edsl/questions/question_base_gen_mixin.py +94 -29
  35. edsl/results/Dataset.py +65 -0
  36. edsl/results/DatasetExportMixin.py +299 -32
  37. edsl/results/Result.py +27 -0
  38. edsl/results/Results.py +22 -2
  39. edsl/results/ResultsGGMixin.py +7 -3
  40. edsl/scenarios/DocumentChunker.py +2 -0
  41. edsl/scenarios/FileStore.py +10 -0
  42. edsl/scenarios/PdfExtractor.py +21 -1
  43. edsl/scenarios/Scenario.py +25 -9
  44. edsl/scenarios/ScenarioList.py +73 -3
  45. edsl/scenarios/handlers/__init__.py +1 -0
  46. edsl/scenarios/handlers/docx.py +5 -1
  47. edsl/scenarios/handlers/jpeg.py +39 -0
  48. edsl/surveys/Survey.py +5 -4
  49. edsl/surveys/SurveyFlowVisualization.py +91 -43
  50. edsl/templates/error_reporting/exceptions_table.html +7 -8
  51. edsl/templates/error_reporting/interview_details.html +1 -1
  52. edsl/templates/error_reporting/interviews.html +0 -1
  53. edsl/templates/error_reporting/overview.html +2 -7
  54. edsl/templates/error_reporting/performance_plot.html +1 -1
  55. edsl/templates/error_reporting/report.css +1 -1
  56. edsl/utilities/PrettyList.py +14 -0
  57. edsl-0.1.45.dist-info/METADATA +246 -0
  58. {edsl-0.1.44.dist-info → edsl-0.1.45.dist-info}/RECORD +60 -59
  59. edsl-0.1.44.dist-info/METADATA +0 -110
  60. {edsl-0.1.44.dist-info → edsl-0.1.45.dist-info}/LICENSE +0 -0
  61. {edsl-0.1.44.dist-info → edsl-0.1.45.dist-info}/WHEEL +0 -0
@@ -7,6 +7,7 @@ from typing import Optional, Tuple, Union, List
 
 from edsl.results.file_exports import CSVExport, ExcelExport, JSONLExport, SQLiteExport
 
+
 class DatasetExportMixin:
     """Mixin class for exporting Dataset objects."""
 
@@ -82,7 +83,8 @@ class DatasetExportMixin:
             else:
                 if len(values) != _num_observations:
                     raise ValueError(
-                        "The number of observations is not consistent across columns."
+                        f"The number of observations is not consistent across columns. "
+                        f"Column '{key}' has {len(values)} observations, but previous columns had {_num_observations} observations."
                     )
 
         return _num_observations
@@ -219,7 +221,9 @@ class DatasetExportMixin:
         )
         return exporter.export()
 
-    def _db(self, remove_prefix: bool = True, shape: str = "wide") -> "sqlalchemy.engine.Engine":
+    def _db(
+        self, remove_prefix: bool = True, shape: str = "wide"
+    ) -> "sqlalchemy.engine.Engine":
         """Create a SQLite database in memory and return the connection.
 
         Args:
@@ -229,7 +233,7 @@ class DatasetExportMixin:
         Returns:
             A database connection
             >>> from sqlalchemy import text
-        >>> from edsl import Results
+            >>> from edsl import Results
             >>> engine = Results.example()._db()
             >>> len(engine.execute(text("SELECT * FROM self")).fetchall())
             4
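
As a usage sketch of the wrapped `_db` signature (this just mirrors the doctest above; note that `engine.execute` implies SQLAlchemy 1.x-style usage):

```python
from sqlalchemy import text
from edsl import Results

# In-memory SQLite database exposing the results as a 'self' table
engine = Results.example()._db(remove_prefix=True, shape="wide")
rows = engine.execute(text("SELECT * FROM self")).fetchall()
print(len(rows))  # -> 4, per the doctest
```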
@@ -247,16 +251,17 @@ class DatasetExportMixin:
 
         if shape == "long":
             # Melt the dataframe to convert it to long format
-            df = df.melt(
-                var_name='key',
-                value_name='value'
-            )
+            df = df.melt(var_name="key", value_name="value")
             # Add a row number column for reference
-            df.insert(0, 'row_number', range(1, len(df) + 1))
-
+            df.insert(0, "row_number", range(1, len(df) + 1))
+
             # Split the key into data_type and key
-            df['data_type'] = df['key'].apply(lambda x: x.split('.')[0] if '.' in x else None)
-            df['key'] = df['key'].apply(lambda x: '.'.join(x.split('.')[1:]) if '.' in x else x)
+            df["data_type"] = df["key"].apply(
+                lambda x: x.split(".")[0] if "." in x else None
+            )
+            df["key"] = df["key"].apply(
+                lambda x: ".".join(x.split(".")[1:]) if "." in x else x
+            )
 
         df.to_sql(
             "self",
@@ -276,27 +281,27 @@ class DatasetExportMixin:
     ) -> Union["pd.DataFrame", str]:
         """Execute a SQL query and return the results as a DataFrame.
 
-            Args:
-                query: The SQL query to execute
-                shape: The shape of the data in the database (wide or long)
-                remove_prefix: Whether to remove the prefix from the column names
-                transpose: Whether to transpose the DataFrame
-                transpose_by: The column to use as the index when transposing
-                csv: Whether to return the DataFrame as a CSV string
-                to_list: Whether to return the results as a list
-                to_latex: Whether to return the results as LaTeX
-                filename: Optional filename to save the results to
-
-            Returns:
-                DataFrame, CSV string, list, or LaTeX string depending on parameters
-
-            Examples:
-                >>> from edsl import Results
-                >>> r = Results.example();
-                >>> len(r.sql("SELECT * FROM self", shape = "wide"))
-                4
-                >>> len(r.sql("SELECT * FROM self", shape = "long"))
-                172
+        Args:
+            query: The SQL query to execute
+            shape: The shape of the data in the database (wide or long)
+            remove_prefix: Whether to remove the prefix from the column names
+            transpose: Whether to transpose the DataFrame
+            transpose_by: The column to use as the index when transposing
+            csv: Whether to return the DataFrame as a CSV string
+            to_list: Whether to return the results as a list
+            to_latex: Whether to return the results as LaTeX
+            filename: Optional filename to save the results to
+
+        Returns:
+            DataFrame, CSV string, list, or LaTeX string depending on parameters
+
+        Examples:
+            >>> from edsl import Results
+            >>> r = Results.example();
+            >>> len(r.sql("SELECT * FROM self", shape = "wide"))
+            4
+            >>> len(r.sql("SELECT * FROM self", shape = "long"))
+            172
         """
         import pandas as pd
 
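
The long shape consumed by `sql(..., shape="long")` is produced by the melt logic in the earlier hunk; a minimal standalone pandas sketch of the same transformation (the sample DataFrame is illustrative, not from the package):

```python
import pandas as pd

# Illustrative wide data with edsl-style dotted column names
df = pd.DataFrame({"answer.how_feeling": ["OK", "Great"], "agent.persona": ["p1", "p2"]})

# Melt to long format: one (key, value) pair per original cell
df = df.melt(var_name="key", value_name="value")
df.insert(0, "row_number", range(1, len(df) + 1))

# Split the dotted key into a data_type prefix and the bare key
df["data_type"] = df["key"].apply(lambda x: x.split(".")[0] if "." in x else None)
df["key"] = df["key"].apply(lambda x: ".".join(x.split(".")[1:]) if "." in x else x)

print(df)  # columns: row_number, key, value, data_type
```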
@@ -538,6 +543,116 @@ class DatasetExportMixin:
 
         if return_link:
             return filename
+
+    def report(self, *fields: Optional[str], top_n: Optional[int] = None,
+               header_fields: Optional[List[str]] = None, divider: bool = True,
+               return_string: bool = False) -> Optional[str]:
+        """Takes the fields in order and returns a markdown report of the results, iterating through rows.
+
+        The row number is printed as "# Observation: <row number>". Each field name
+        becomes a level-"##" markdown header followed by that field's content, field
+        by field. Once a row is done, a blank line (or divider) is printed and the
+        next row is shown. In a Jupyter notebook, the report is displayed as
+        rendered markdown.
+
+        Args:
+            *fields: The fields to include in the report. If none provided, all fields are used.
+            top_n: Optional limit on the number of observations to include.
+            header_fields: Optional list of fields to include in the main header instead of as sections.
+            divider: If True, adds a horizontal rule between observations for better visual separation.
+            return_string: If True, returns the markdown string. If False (default in notebooks),
+                only displays the markdown without returning.
+
+        Returns:
+            A string containing the markdown report if return_string is True, otherwise None.
+
+        Examples:
+            >>> from edsl.results import Results
+            >>> r = Results.example()
+            >>> report = r.select('how_feeling', 'how_feeling_yesterday').report(return_string=True)
+            >>> "# Observation: 1" in report
+            True
+            >>> "## answer.how_feeling" in report
+            True
+            >>> report = r.select('how_feeling').report(header_fields=['answer.how_feeling'], return_string=True)
+            >>> "# Observation: 1 (`how_feeling`: OK)" in report
+            True
+        """
+        from edsl.utilities.utilities import is_notebook
+
+        # If no fields specified, use all columns
+        if not fields:
+            fields = self.relevant_columns()
+
+        # Initialize header_fields if not provided
+        if header_fields is None:
+            header_fields = []
+
+        # Validate all fields
+        all_fields = list(fields) + [f for f in header_fields if f not in fields]
+        for field in all_fields:
+            if field not in self.relevant_columns():
+                raise ValueError(f"Field '{field}' not found in dataset")
+
+        # Get data for each field
+        field_data = {}
+        for field in all_fields:
+            for entry in self:
+                if field in entry:
+                    field_data[field] = entry[field]
+                    break
+
+        # Number of observations to process
+        num_obs = self.num_observations()
+        if top_n is not None:
+            num_obs = min(num_obs, top_n)
+
+        # Build the report
+        report_lines = []
+        for i in range(num_obs):
+            # Create header with observation number and any header fields
+            header = f"# Observation: {i+1}"
+            if header_fields:
+                header_parts = []
+                for field in header_fields:
+                    value = field_data[field][i]
+                    # Get the field name without prefix for cleaner display
+                    display_name = field.split('.')[-1] if '.' in field else field
+                    # Format with backticks for monospace
+                    header_parts.append(f"`{display_name}`: {value}")
+                if header_parts:
+                    header += f" ({', '.join(header_parts)})"
+            report_lines.append(header)
+
+            # Add the remaining fields
+            for field in fields:
+                if field not in header_fields:
+                    report_lines.append(f"## {field}")
+                    value = field_data[field][i]
+                    if isinstance(value, (list, dict)):
+                        import json
+                        report_lines.append(f"```\n{json.dumps(value, indent=2)}\n```")
+                    else:
+                        report_lines.append(str(value))
+
+            # Add divider between observations if requested
+            if divider and i < num_obs - 1:
+                report_lines.append("\n---\n")
+            else:
+                report_lines.append("")  # Empty line between observations
+
+        report_text = "\n".join(report_lines)
+
+        # In notebooks, display as markdown and optionally return
+        is_nb = is_notebook()
+        if is_nb:
+            from IPython.display import Markdown, display
+            display(Markdown(report_text))
+
+        # Return the string if requested or if not in a notebook
+        if return_string or not is_nb:
+            return report_text
+        return None
 
     def tally(
         self, *fields: Optional[str], top_n: Optional[int] = None, output="Dataset"
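
As a usage sketch of the new `report()` (grounded in the doctests above; output abridged):

```python
from edsl.results import Results

r = Results.example()
md = r.select("how_feeling", "how_feeling_yesterday").report(return_string=True)

print(md.splitlines()[0])  # -> "# Observation: 1"
# Each selected field follows as a "## answer.how_feeling"-style section,
# with a horizontal rule between observations (divider=True by default).
```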
@@ -616,6 +731,158 @@ class DatasetExportMixin:
         keys.append("count")
         return sl.reorder_keys(keys).to_dataset()
 
+    def flatten(self, field, keep_original=False):
+        """
+        Flatten a field containing a list of dictionaries into separate fields.
+
+        For example, if a dataset contains:
+            [{'data': [{'a': 1}, {'b': 2}], 'other': ['x', 'y']}]
+
+        then after d.flatten('data') it becomes:
+            [{'other': ['x', 'y'], 'data.a': [1, None], 'data.b': [None, 2]}]
+
+        Args:
+            field: The field to flatten
+            keep_original: If True, keeps the original field in the dataset
+
+        Returns:
+            A new Dataset with the flattened fields
+        """
+        import warnings
+        from edsl.results.Dataset import Dataset
+
+        # Ensure the dataset isn't empty
+        if not self.data:
+            return self.copy()
+
+        # Get the number of observations
+        num_observations = self.num_observations()
+
+        # Find the column to flatten
+        field_entry = None
+        for entry in self.data:
+            if field in entry:
+                field_entry = entry
+                break
+
+        if field_entry is None:
+            warnings.warn(
+                f"Field '{field}' not found in dataset, returning original dataset"
+            )
+            return self.copy()
+
+        # Create a new list for the flattened data
+        flattened_data = []
+
+        # Copy all existing columns except the one we're flattening (unless keep_original)
+        for entry in self.data:
+            col_name = next(iter(entry.keys()))
+            if col_name != field or keep_original:
+                flattened_data.append(entry.copy())
+
+        # Get field data and make sure it's valid
+        field_values = field_entry[field]
+        if not all(isinstance(item, dict) for item in field_values if item is not None):
+            warnings.warn(
+                f"Field '{field}' contains non-dictionary values that cannot be flattened"
+            )
+            return self.copy()
+
+        # Collect all unique keys across all dictionaries
+        all_keys = set()
+        for item in field_values:
+            if isinstance(item, dict):
+                all_keys.update(item.keys())
+
+        # Create new columns for each key
+        for key in sorted(all_keys):  # Sort for consistent output
+            new_values = []
+            for i in range(num_observations):
+                value = None
+                if i < len(field_values) and isinstance(field_values[i], dict):
+                    value = field_values[i].get(key, None)
+                new_values.append(value)
+
+            # Add this as a new column
+            flattened_data.append({f"{field}.{key}": new_values})
+
+        # Return a new Dataset with the flattened data
+        return Dataset(flattened_data)
+
+    def unpack_list(
+        self,
+        field: str,
+        new_names: Optional[List[str]] = None,
+        keep_original: bool = True,
+    ) -> "Dataset":
+        """Unpack list columns into separate columns with provided names or numeric suffixes.
+
+        For example, if a dataset contains:
+            [{'data': [[1, 2, 3], [4, 5, 6]], 'other': ['x', 'y']}]
+
+        then after d.unpack_list('data') it becomes:
+            [{'other': ['x', 'y'], 'data_1': [1, 4], 'data_2': [2, 5], 'data_3': [3, 6]}]
+
+        Args:
+            field: The field containing lists to unpack
+            new_names: Optional list of names for the unpacked fields. If None, uses numeric suffixes.
+            keep_original: If True, keeps the original field in the dataset
+
+        Returns:
+            A new Dataset with unpacked columns
+
+        Examples:
+            >>> from edsl.results.Dataset import Dataset
+            >>> d = Dataset([{'data': [[1, 2, 3], [4, 5, 6]]}])
+            >>> d.unpack_list('data')
+            Dataset([{'data': [[1, 2, 3], [4, 5, 6]]}, {'data_1': [1, 4]}, {'data_2': [2, 5]}, {'data_3': [3, 6]}])
+
+            >>> d.unpack_list('data', new_names=['first', 'second', 'third'])
+            Dataset([{'data': [[1, 2, 3], [4, 5, 6]]}, {'first': [1, 4]}, {'second': [2, 5]}, {'third': [3, 6]}])
+        """
+        from edsl.results.Dataset import Dataset
+
+        # Create a copy of the dataset
+        result = Dataset(self.data.copy())
+
+        # Find the field in the dataset
+        field_index = None
+        for i, entry in enumerate(result.data):
+            if field in entry:
+                field_index = i
+                break
+
+        if field_index is None:
+            raise ValueError(f"Field '{field}' not found in dataset")
+
+        field_data = result.data[field_index][field]
+
+        # Check if values are lists
+        if not all(isinstance(v, list) for v in field_data):
+            raise ValueError(f"Field '{field}' does not contain lists in all entries")
+
+        # Get the maximum length of the lists
+        max_len = max(len(v) for v in field_data)
+
+        # Create a new field for each index
+        for i in range(max_len):
+            if new_names and i < len(new_names):
+                new_field = new_names[i]
+            else:
+                new_field = f"{field}_{i+1}"
+
+            # Extract the i-th element from each list
+            new_values = []
+            for item in field_data:
+                new_values.append(item[i] if i < len(item) else None)
+
+            result.data.append({new_field: new_values})
+
+        # Remove the original field if keep_original is False
+        if not keep_original:
+            result.data.pop(field_index)
+
+        return result
+
 
 if __name__ == "__main__":
     import doctest
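
The two new reshapers complement each other: `flatten()` spreads a column of dictionaries into dotted sub-columns, while `unpack_list()` spreads a column of lists into positionally named columns. A short sketch based on the docstrings above (Dataset entries are written one column per dict here, matching how the code iterates over `self.data`):

```python
from edsl.results.Dataset import Dataset

# flatten: a column of dicts becomes one 'data.<key>' column per key
d = Dataset([{"data": [{"a": 1}, {"b": 2}]}, {"other": ["x", "y"]}])
flat = d.flatten("data")
# -> columns: other, data.a = [1, None], data.b = [None, 2]

# unpack_list: a column of lists becomes one column per position
d2 = Dataset([{"data": [[1, 2, 3], [4, 5, 6]]}])
wide = d2.unpack_list("data", new_names=["first", "second", "third"])
# -> keeps 'data' (keep_original=True) and adds first=[1, 4], second=[2, 5], third=[3, 6]
```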
edsl/results/Result.py CHANGED
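The notable addition below is score_with_answer_key. One subtlety: each answer-key value may be a single correct answer or a list of acceptable ones, since equality is checked first and membership second. A usage sketch (import path inferred from the module layout):

```python
from edsl.results.Result import Result

result = Result.example()  # answers: how_feeling='OK', how_feeling_yesterday='Great'

# Scalar values: exact match required
result.score_with_answer_key({"how_feeling": "OK", "how_feeling_yesterday": "Great"})
# -> {'correct': 2, 'incorrect': 0, 'missing': 0}

# List values: any listed answer counts as correct
result.score_with_answer_key({"how_feeling": "OK", "how_feeling_yesterday": ["Great", "Good"]})
# -> {'correct': 2, 'incorrect': 0, 'missing': 0}
```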
@@ -439,6 +439,33 @@ class Result(Base, UserDict):
         from edsl.results.Results import Results
 
         return Results.example()[0]
+
+    def score_with_answer_key(self, answer_key: dict) -> dict:
+        """Score the result using an answer key.
+
+        :param answer_key: A dictionary that maps question_names to correct answers (single values or lists of acceptable answers)
+
+        >>> Result.example()['answer']
+        {'how_feeling': 'OK', 'how_feeling_yesterday': 'Great'}
+
+        >>> answer_key = {'how_feeling': 'OK', 'how_feeling_yesterday': 'Great'}
+        >>> Result.example().score_with_answer_key(answer_key)
+        {'correct': 2, 'incorrect': 0, 'missing': 0}
+        >>> answer_key = {'how_feeling': 'OK', 'how_feeling_yesterday': ['Great', 'Good']}
+        >>> Result.example().score_with_answer_key(answer_key)
+        {'correct': 2, 'incorrect': 0, 'missing': 0}
+        """
+        final_scores = {'correct': 0, 'incorrect': 0, 'missing': 0}
+        for question_name, answer in self.answer.items():
+            if question_name in answer_key:
+                if answer == answer_key[question_name] or answer in answer_key[question_name]:
+                    final_scores['correct'] += 1
+                else:
+                    final_scores['incorrect'] += 1
+            else:
+                final_scores['missing'] += 1
+
+        return final_scores
 
     def score(self, scoring_function: Callable) -> Union[int, float]:
         """Score the result using a passed-in scoring function.
edsl/results/Results.py CHANGED
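Two of the changes below surface at the Results level: ggplot2 now delegates to the new GGPlotMethod, and score_with_answer_key simply maps the Result-level scorer over rows, returning one score dict per row. A sketch of the latter (the aggregation is illustrative, not an edsl API):

```python
from edsl import Results

r = Results.example()
scores = r.score_with_answer_key({"how_feeling": "OK"})
# -> one {'correct': ..., 'incorrect': ..., 'missing': ...} dict per Result

# Illustrative aggregation across rows
total_correct = sum(s["correct"] for s in scores)
```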
@@ -34,7 +34,7 @@ if TYPE_CHECKING:
     from simpleeval import EvalWithCompoundTypes
 
 from edsl.results.ResultsExportMixin import ResultsExportMixin
-from edsl.results.ResultsGGMixin import ResultsGGMixin
+from edsl.results.ResultsGGMixin import GGPlotMethod
 from edsl.results.results_fetch_mixin import ResultsFetchMixin
 from edsl.utilities.remove_edsl_version import remove_edsl_version
 
@@ -100,7 +100,7 @@ class NotReadyObject:
 class Mixins(
     ResultsExportMixin,
     ResultsFetchMixin,
-    ResultsGGMixin,
+    # ResultsGGMixin,
 ):
     def long(self):
         return self.table().long()
@@ -151,6 +151,19 @@ class Results(UserList, Mixins, Base):
         "cache_keys",
     ]
 
+    def ggplot2(
+        self,
+        ggplot_code: str,
+        shape="wide",
+        sql: str = None,
+        remove_prefix: bool = True,
+        debug: bool = False,
+        height=4,
+        width=6,
+        factor_orders: Optional[dict] = None,
+    ):
+        return GGPlotMethod(self).ggplot2(ggplot_code, shape, sql, remove_prefix, debug, height, width, factor_orders)
+
     @classmethod
     def from_job_info(cls, job_info: dict) -> Results:
         """
@@ -1277,6 +1290,13 @@ class Results(UserList, Mixins, Base):
         """
         return [r.score(f) for r in self.data]
 
+    def score_with_answer_key(self, answer_key: dict) -> list:
+        """Score the results using an answer key.
+
+        :param answer_key: A dictionary that maps question_names to correct answers.
+        """
+        return [r.score_with_answer_key(answer_key) for r in self.data]
+
 
     def fetch_remote(self, job_info: "RemoteJobInfo") -> None:
         """
edsl/results/ResultsGGMixin.py CHANGED
@@ -75,7 +75,11 @@ class GGPlot:
 
         return self._svg_data
 
-class ResultsGGMixin:
+class GGPlotMethod:
+
+    def __init__(self, results: 'Results'):
+        self.results = results
+
     """Mixin class for ggplot2 plotting."""
 
     def ggplot2(
@@ -106,9 +110,9 @@ class ResultsGGMixin:
             sql = "select * from self"
 
         if shape == "long":
-            df = self.sql(sql, shape="long")
+            df = self.results.sql(sql, shape="long")
         elif shape == "wide":
-            df = self.sql(sql, remove_prefix=remove_prefix)
+            df = self.results.sql(sql, remove_prefix=remove_prefix)
 
         # Convert DataFrame to CSV format
         csv_data = df.to_csv().text
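
The refactor swaps inheritance for delegation: Results.ggplot2 wraps the instance in GGPlotMethod, which fetches its DataFrame via self.results.sql(...). The public call is unchanged; a sketch (the ggplot2 snippet is illustrative and a working R setup is assumed):

```python
from edsl import Results

r = Results.example()
# Same entry point as before the refactor; delegation happens internally.
plot = r.ggplot2(
    "ggplot(df, aes(x = how_feeling)) + geom_bar()",  # illustrative R code
    shape="wide",
)
```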
edsl/scenarios/DocumentChunker.py CHANGED
@@ -85,6 +85,8 @@ class DocumentChunker:
             new_scenario = copy.deepcopy(self.scenario)
             new_scenario[field] = chunk
             new_scenario[field + "_chunk"] = i
+            new_scenario[field + "_char_count"] = len(chunk)
+            new_scenario[field + "_word_count"] = len(chunk.split())
             if include_original:
                 if hash_original:
                     new_scenario[field + "_original"] = hashlib.md5(
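
Each chunk now carries its size next to the text. A sketch of the resulting fields (values follow directly from the Scenario.chunk doctests updated later in this diff):

```python
from edsl import Scenario

s = Scenario({"text": "Hello World"})
chunks = s.chunk("text", num_words=1)
# Each chunk Scenario now also includes:
#   text_chunk       -> chunk index (0, 1, ...)
#   text_char_count  -> len(chunk), e.g. 5 for 'Hello'
#   text_word_count  -> len(chunk.split()), e.g. 1
```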
edsl/scenarios/FileStore.py CHANGED
@@ -29,6 +29,12 @@ class FileStore(Scenario):
         if path is None and "filename" in kwargs:
             path = kwargs["filename"]
 
+        # Check if path is a URL and handle download
+        if path and (path.startswith('http://') or path.startswith('https://')):
+            temp_filestore = self.from_url(path, mime_type=mime_type)
+            path = temp_filestore._path
+            mime_type = temp_filestore.mime_type
+
         self._path = path  # Store the original path privately
         self._temp_path = None  # Track any generated temporary file
 
@@ -138,6 +144,10 @@ class FileStore(Scenario):
             base64_encoded_data = base64.b64encode(binary_data)
             self.binary = True
             # Convert the base64 bytes to a string
+        except FileNotFoundError:
+            print(f"File not found: {file_path}")
+            print("Current working directory:", os.getcwd())
+            raise
         base64_string = base64_encoded_data.decode("utf-8")
 
         return base64_string
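
With the new branch in the constructor, FileStore accepts a URL directly and defers to the existing from_url classmethod to download it first. A sketch (path and URL are placeholders):

```python
from edsl.scenarios.FileStore import FileStore

# Local path: behavior unchanged
fs_local = FileStore("data/survey.pdf")  # placeholder path

# URL: downloaded via FileStore.from_url, then handled like a local file
fs_remote = FileStore("https://example.com/survey.pdf")  # placeholder URL
```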
edsl/scenarios/PdfExtractor.py CHANGED
@@ -4,10 +4,30 @@ import os
 class PdfExtractor:
     def __init__(self, pdf_path: str):
         self.pdf_path = pdf_path
+        self._has_pymupdf = self._check_pymupdf()
         #self.constructor = parent_object.__class__
 
+    def _check_pymupdf(self):
+        """Check if PyMuPDF is installed."""
+        try:
+            import fitz
+            return True
+        except ImportError:
+            return False
+
     def get_pdf_dict(self) -> dict:
-        # Ensure the file exists
+        # First check if the file exists
+        if not os.path.exists(self.pdf_path):
+            raise FileNotFoundError(f"The file {self.pdf_path} does not exist.")
+
+        # Then check if PyMuPDF is available
+        if not self._has_pymupdf:
+            raise ImportError(
+                "The 'fitz' module (PyMuPDF) is required for PDF extraction. "
+                "Please install it with: pip install pymupdf"
+            )
+
+        # If we get here, we can safely import and use fitz
         import fitz
 
         if not os.path.exists(self.pdf_path):
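
The ordering matters here: the file-existence check runs before the dependency check, so a bad path raises FileNotFoundError rather than a misleading install prompt. Usage sketch (path is a placeholder):

```python
from edsl.scenarios.PdfExtractor import PdfExtractor

extractor = PdfExtractor("paper.pdf")  # placeholder path
# Raises FileNotFoundError for a missing file; only then raises
# ImportError (with install instructions) if PyMuPDF is absent.
pdf_dict = extractor.get_pdf_dict()
```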
edsl/scenarios/Scenario.py CHANGED
@@ -64,6 +64,15 @@ class Scenario(Base, UserDict, ScenarioHtmlMixin):
         self.data = data if data is not None else {}
         self.name = name
 
+    def __mul__(self, scenario_list_or_scenario: Union["ScenarioList", "Scenario"]) -> "ScenarioList":
+        from edsl.scenarios.ScenarioList import ScenarioList
+        if isinstance(scenario_list_or_scenario, ScenarioList):
+            return scenario_list_or_scenario * self
+        elif isinstance(scenario_list_or_scenario, Scenario):
+            return ScenarioList([self]) * scenario_list_or_scenario
+        else:
+            raise TypeError(f"Cannot multiply Scenario with {type(scenario_list_or_scenario)}")
+
     def replicate(self, n: int) -> "ScenarioList":
         """Replicate a scenario n times to return a ScenarioList.
 
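
The new __mul__ makes cross-products composable from either operand: Scenario * ScenarioList delegates to ScenarioList.__mul__, and Scenario * Scenario lifts the left operand into a one-element ScenarioList first. A sketch (field names illustrative):

```python
from edsl import Scenario
from edsl.scenarios.ScenarioList import ScenarioList

a = Scenario({"color": "red"})
b = Scenario({"size": "large"})

# Scenario * Scenario -> ScenarioList([a]) * b
combined = a * b

# Scenario * ScenarioList -> delegates to ScenarioList.__mul__
sl = ScenarioList([Scenario({"size": s}) for s in ("small", "large")])
cross = a * sl
```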
@@ -356,11 +365,18 @@ class Scenario(Base, UserDict, ScenarioHtmlMixin):
 
     @classmethod
     def from_pdf(cls, pdf_path: str):
-        from edsl.scenarios.PdfExtractor import PdfExtractor
-
-        extractor = PdfExtractor(pdf_path)
-        return Scenario(extractor.get_pdf_dict())
-
+        """Create a Scenario from a PDF file."""
+        try:
+            from edsl.scenarios.PdfExtractor import PdfExtractor
+            extractor = PdfExtractor(pdf_path)
+            return Scenario(extractor.get_pdf_dict())
+        except ImportError as e:
+            raise ImportError(
+                f"Could not extract text from PDF: {str(e)}. "
+                "PDF extraction requires the PyMuPDF library. "
+                "Install it with: pip install pymupdf"
+            )
+
     @classmethod
     def from_pdf_to_image(cls, pdf_path, image_format="jpeg"):
         """
@@ -442,18 +458,18 @@ class Scenario(Base, UserDict, ScenarioHtmlMixin):
 
         >>> s = Scenario({"text": "This is a test.\\nThis is a test.\\n\\nThis is a test."})
         >>> s.chunk("text", num_lines = 1)
-        ScenarioList([Scenario({'text': 'This is a test.', 'text_chunk': 0}), Scenario({'text': 'This is a test.', 'text_chunk': 1}), Scenario({'text': '', 'text_chunk': 2}), Scenario({'text': 'This is a test.', 'text_chunk': 3})])
+        ScenarioList([Scenario({'text': 'This is a test.', 'text_chunk': 0, 'text_char_count': 15, 'text_word_count': 4}), Scenario({'text': 'This is a test.', 'text_chunk': 1, 'text_char_count': 15, 'text_word_count': 4}), Scenario({'text': '', 'text_chunk': 2, 'text_char_count': 0, 'text_word_count': 0}), Scenario({'text': 'This is a test.', 'text_chunk': 3, 'text_char_count': 15, 'text_word_count': 4})])
 
         >>> s.chunk("text", num_words = 2)
-        ScenarioList([Scenario({'text': 'This is', 'text_chunk': 0}), Scenario({'text': 'a test.', 'text_chunk': 1}), Scenario({'text': 'This is', 'text_chunk': 2}), Scenario({'text': 'a test.', 'text_chunk': 3}), Scenario({'text': 'This is', 'text_chunk': 4}), Scenario({'text': 'a test.', 'text_chunk': 5})])
+        ScenarioList([Scenario({'text': 'This is', 'text_chunk': 0, 'text_char_count': 7, 'text_word_count': 2}), Scenario({'text': 'a test.', 'text_chunk': 1, 'text_char_count': 7, 'text_word_count': 2}), Scenario({'text': 'This is', 'text_chunk': 2, 'text_char_count': 7, 'text_word_count': 2}), Scenario({'text': 'a test.', 'text_chunk': 3, 'text_char_count': 7, 'text_word_count': 2}), Scenario({'text': 'This is', 'text_chunk': 4, 'text_char_count': 7, 'text_word_count': 2}), Scenario({'text': 'a test.', 'text_chunk': 5, 'text_char_count': 7, 'text_word_count': 2})])
 
         >>> s = Scenario({"text": "Hello World"})
         >>> s.chunk("text", num_words = 1, include_original = True)
-        ScenarioList([Scenario({'text': 'Hello', 'text_chunk': 0, 'text_original': 'Hello World'}), Scenario({'text': 'World', 'text_chunk': 1, 'text_original': 'Hello World'})])
+        ScenarioList([Scenario({'text': 'Hello', 'text_chunk': 0, 'text_char_count': 5, 'text_word_count': 1, 'text_original': 'Hello World'}), Scenario({'text': 'World', 'text_chunk': 1, 'text_char_count': 5, 'text_word_count': 1, 'text_original': 'Hello World'})])
 
         >>> s = Scenario({"text": "Hello World"})
         >>> s.chunk("text", num_words = 1, include_original = True, hash_original = True)
-        ScenarioList([Scenario({'text': 'Hello', 'text_chunk': 0, 'text_original': 'b10a8db164e0754105b7a99be72e3fe5'}), Scenario({'text': 'World', 'text_chunk': 1, 'text_original': 'b10a8db164e0754105b7a99be72e3fe5'})])
+        ScenarioList([Scenario({'text': 'Hello', 'text_chunk': 0, 'text_char_count': 5, 'text_word_count': 1, 'text_original': 'b10a8db164e0754105b7a99be72e3fe5'}), Scenario({'text': 'World', 'text_chunk': 1, 'text_char_count': 5, 'text_word_count': 1, 'text_original': 'b10a8db164e0754105b7a99be72e3fe5'})])
 
         >>> s.chunk("text")
         Traceback (most recent call last):