edsl 0.1.54__py3-none-any.whl → 0.1.56__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- edsl/__init__.py +8 -1
- edsl/__init__original.py +134 -0
- edsl/__version__.py +1 -1
- edsl/agents/agent.py +29 -0
- edsl/agents/agent_list.py +36 -1
- edsl/base/base_class.py +281 -151
- edsl/base/data_transfer_models.py +15 -4
- edsl/buckets/__init__.py +8 -3
- edsl/buckets/bucket_collection.py +9 -3
- edsl/buckets/model_buckets.py +4 -2
- edsl/buckets/token_bucket.py +2 -2
- edsl/buckets/token_bucket_client.py +5 -3
- edsl/caching/cache.py +131 -62
- edsl/caching/cache_entry.py +70 -58
- edsl/caching/sql_dict.py +17 -0
- edsl/cli.py +99 -0
- edsl/config/config_class.py +16 -0
- edsl/conversation/__init__.py +31 -0
- edsl/coop/coop.py +276 -242
- edsl/coop/coop_jobs_objects.py +59 -0
- edsl/coop/coop_objects.py +29 -0
- edsl/coop/coop_regular_objects.py +26 -0
- edsl/coop/utils.py +24 -19
- edsl/dataset/dataset.py +338 -101
- edsl/dataset/dataset_operations_mixin.py +216 -180
- edsl/db_list/sqlite_list.py +349 -0
- edsl/inference_services/__init__.py +40 -5
- edsl/inference_services/exceptions.py +11 -0
- edsl/inference_services/services/anthropic_service.py +5 -2
- edsl/inference_services/services/aws_bedrock.py +6 -2
- edsl/inference_services/services/azure_ai.py +6 -2
- edsl/inference_services/services/google_service.py +7 -3
- edsl/inference_services/services/mistral_ai_service.py +6 -2
- edsl/inference_services/services/open_ai_service.py +6 -2
- edsl/inference_services/services/perplexity_service.py +6 -2
- edsl/inference_services/services/test_service.py +94 -5
- edsl/interviews/answering_function.py +167 -59
- edsl/interviews/interview.py +124 -72
- edsl/interviews/interview_task_manager.py +10 -0
- edsl/interviews/request_token_estimator.py +8 -0
- edsl/invigilators/invigilators.py +35 -13
- edsl/jobs/async_interview_runner.py +146 -104
- edsl/jobs/data_structures.py +6 -4
- edsl/jobs/decorators.py +61 -0
- edsl/jobs/fetch_invigilator.py +61 -18
- edsl/jobs/html_table_job_logger.py +14 -2
- edsl/jobs/jobs.py +180 -104
- edsl/jobs/jobs_component_constructor.py +2 -2
- edsl/jobs/jobs_interview_constructor.py +2 -0
- edsl/jobs/jobs_pricing_estimation.py +154 -113
- edsl/jobs/jobs_remote_inference_logger.py +4 -0
- edsl/jobs/jobs_runner_status.py +30 -25
- edsl/jobs/progress_bar_manager.py +79 -0
- edsl/jobs/remote_inference.py +35 -1
- edsl/key_management/key_lookup_builder.py +6 -1
- edsl/language_models/language_model.py +110 -12
- edsl/language_models/model.py +10 -3
- edsl/language_models/price_manager.py +176 -71
- edsl/language_models/registry.py +5 -0
- edsl/notebooks/notebook.py +77 -10
- edsl/questions/VALIDATION_README.md +134 -0
- edsl/questions/__init__.py +24 -1
- edsl/questions/exceptions.py +21 -0
- edsl/questions/question_dict.py +201 -16
- edsl/questions/question_multiple_choice_with_other.py +624 -0
- edsl/questions/question_registry.py +2 -1
- edsl/questions/templates/multiple_choice_with_other/__init__.py +0 -0
- edsl/questions/templates/multiple_choice_with_other/answering_instructions.jinja +15 -0
- edsl/questions/templates/multiple_choice_with_other/question_presentation.jinja +17 -0
- edsl/questions/validation_analysis.py +185 -0
- edsl/questions/validation_cli.py +131 -0
- edsl/questions/validation_html_report.py +404 -0
- edsl/questions/validation_logger.py +136 -0
- edsl/results/result.py +115 -46
- edsl/results/results.py +702 -171
- edsl/scenarios/construct_download_link.py +16 -3
- edsl/scenarios/directory_scanner.py +226 -226
- edsl/scenarios/file_methods.py +5 -0
- edsl/scenarios/file_store.py +150 -9
- edsl/scenarios/handlers/__init__.py +5 -1
- edsl/scenarios/handlers/mp4_file_store.py +104 -0
- edsl/scenarios/handlers/webm_file_store.py +104 -0
- edsl/scenarios/scenario.py +120 -101
- edsl/scenarios/scenario_list.py +800 -727
- edsl/scenarios/scenario_list_gc_test.py +146 -0
- edsl/scenarios/scenario_list_memory_test.py +214 -0
- edsl/scenarios/scenario_list_source_refactor.md +35 -0
- edsl/scenarios/scenario_selector.py +5 -4
- edsl/scenarios/scenario_source.py +1990 -0
- edsl/scenarios/tests/test_scenario_list_sources.py +52 -0
- edsl/surveys/survey.py +22 -0
- edsl/tasks/__init__.py +4 -2
- edsl/tasks/task_history.py +198 -36
- edsl/tests/scenarios/test_ScenarioSource.py +51 -0
- edsl/tests/scenarios/test_scenario_list_sources.py +51 -0
- edsl/utilities/__init__.py +2 -1
- edsl/utilities/decorators.py +121 -0
- edsl/utilities/memory_debugger.py +1010 -0
- {edsl-0.1.54.dist-info → edsl-0.1.56.dist-info}/METADATA +51 -76
- {edsl-0.1.54.dist-info → edsl-0.1.56.dist-info}/RECORD +103 -79
- edsl/jobs/jobs_runner_asyncio.py +0 -281
- edsl/language_models/unused/fake_openai_service.py +0 -60
- {edsl-0.1.54.dist-info → edsl-0.1.56.dist-info}/LICENSE +0 -0
- {edsl-0.1.54.dist-info → edsl-0.1.56.dist-info}/WHEEL +0 -0
- {edsl-0.1.54.dist-info → edsl-0.1.56.dist-info}/entry_points.txt +0 -0
The remainder of this diff shows edsl/dataset/dataset_operations_mixin.py (+216 -180). Most of the changes are a mechanical formatting pass (quote normalization, re-wrapped signatures and calls, blank-line and trailing-whitespace cleanup), plus updated doctest expectations; removed/added line pairs that look identical below differ only in whitespace or quote style, and bare "-" lines mark removals whose exact old text was not recoverable from the diff viewer.

--- a/edsl/dataset/dataset_operations_mixin.py
+++ b/edsl/dataset/dataset_operations_mixin.py
@@ -7,7 +7,7 @@ including data transformation, visualization, export, querying, and analysis. Th
 operations are inherited by different specialized mixins (DatasetOperationsMixin,
 ResultsOperationsMixin, etc.) which implement class-specific behaviors.
 
-The design pattern used here allows different container types (Results, Dataset,
+The design pattern used here allows different container types (Results, Dataset,
 ScenarioList, AgentList) to share the same data manipulation interface, enabling
 fluid operations across different parts of the EDSL ecosystem.
 """
@@ -18,49 +18,54 @@ import textwrap
 from typing import Optional, Tuple, Union, List, TYPE_CHECKING  # Callable not used
 from functools import wraps
 from .r.ggplot import GGPlotMethod
-from .exceptions import DatasetKeyError, DatasetValueError, DatasetTypeError, DatasetExportError
+from .exceptions import (
+    DatasetKeyError,
+    DatasetValueError,
+    DatasetTypeError,
+    DatasetExportError,
+)
 
 if TYPE_CHECKING:
     from docx import Document
     from .dataset import Dataset
     from ..jobs import Job  # noqa: F401
 
+
 class DataOperationsBase:
     """
     Base class providing common data operations for EDSL container objects.
-
+
     This class serves as the foundation for various data manipulation mixins,
     providing a consistent interface for operations like filtering, aggregation,
     transformation, visualization, and export across different types of EDSL
     containers (Results, Dataset, ScenarioList, AgentList).
-
+
     Key functionality categories:
-
+
     1. Data Transformation:
        - Filtering with `filter()`
        - Creating new columns with `mutate()`
        - Reshaping with `long()`, `wide()`, `flatten()`, etc.
        - Selecting specific columns with `select()`
-
+
     2. Visualization and Display:
        - Tabular display with `table()`
        - Plotting with `ggplot2()`
        - Generating reports with `report()`
-
+
     3. Data Export:
        - To various formats with `to_csv()`, `to_excel()`, etc.
        - To other data structures with `to_pandas()`, `to_dicts()`, etc.
-
+
     4. Analysis:
        - SQL-based querying with `sql()`
        - Aggregation with `tally()`
        - Tree-based exploration
-
+
     These operations are designed to be applied fluently in sequence, enabling
     expressive data manipulation pipelines.
     """
 
-
     def ggplot2(
         self,
         ggplot_code: str,
@@ -74,10 +79,10 @@ class DataOperationsBase:
     ):
         """
         Create visualizations using R's ggplot2 library.
-
+
         This method provides a bridge to R's powerful ggplot2 visualization library,
         allowing you to create sophisticated plots directly from EDSL data structures.
-
+
         Parameters:
             ggplot_code: R code string containing ggplot2 commands
             shape: Data shape to use ("wide" or "long")
@@ -87,31 +92,32 @@
             height: Plot height in inches
             width: Plot width in inches
             factor_orders: Dictionary mapping factor variables to their desired order
-
+
         Returns:
             A plot object that renders in Jupyter notebooks
-
+
         Notes:
             - Requires R and the ggplot2 package to be installed
             - Data is automatically converted to a format suitable for ggplot2
             - The ggplot2 code should reference column names as they appear after
               any transformations from the shape and remove_prefix parameters
-
+
         Examples:
             >>> from edsl.results import Results
             >>> r = Results.example()
             >>> # The following would create a plot if R is installed (not shown in doctest):
             >>> # r.ggplot2('''
-            >>> #   ggplot(df, aes(x=how_feeling)) +
+            >>> #   ggplot(df, aes(x=how_feeling)) +
             >>> #   geom_bar() +
             >>> #   labs(title="Distribution of Feelings")
             >>> # ''')
         """
-        return GGPlotMethod(self).ggplot2(
-            ggplot_code, shape, sql, remove_prefix, debug, height, width, factor_orders)
+        return GGPlotMethod(self).ggplot2(
+            ggplot_code, shape, sql, remove_prefix, debug, height, width, factor_orders
+        )
 
     def relevant_columns(
-        self, data_type: Optional[str] = None, remove_prefix:bool=False
+        self, data_type: Optional[str] = None, remove_prefix: bool = False
     ) -> list:
         """Return the set of keys that are present in the dataset.
 
@@ -184,12 +190,13 @@ class DataOperationsBase:
         )
 
         return _num_observations
-
+
     def chart(self):
         """
         Create a chart from the results.
         """
         import altair as alt
+
         return alt.Chart(self.to_pandas(remove_prefix=True))
 
     def make_tabular(
@@ -271,6 +278,7 @@
     def to_jsonl(self, filename: Optional[str] = None):
         """Export the results to a FileStore instance containing JSONL data."""
         from .file_exports import JSONLExport
+
         exporter = JSONLExport(data=self, filename=filename)
         return exporter.export()
 
@@ -284,6 +292,7 @@
     ):
         """Export the results to a SQLite database file."""
         from .file_exports import SQLiteExport
+
         exporter = SQLiteExport(
             data=self,
             filename=filename,
@@ -330,18 +339,16 @@
         )
         return exporter.export()
 
-    def _db(
-        self, remove_prefix: bool = True, shape: str = "wide"
-    ):
+    def _db(self, remove_prefix: bool = True, shape: str = "wide"):
         """Create a SQLite database in memory and return the connection.
 
         Args:
             remove_prefix: Whether to remove the prefix from the column names
             shape: The shape of the data in the database ("wide" or "long")
-
+
         Returns:
             A database connection
-
+
         Examples:
             >>> from sqlalchemy import text
             >>> from edsl import Results
@@ -350,7 +357,7 @@
             4
             >>> engine = Results.example()._db(shape = "long")
             >>> len(engine.execute(text("SELECT * FROM self")).fetchall())
-
+            204
         """
         # Import needed for database connection
         from sqlalchemy import create_engine
@@ -393,12 +400,12 @@
     ) -> "Dataset":
         """
         Execute SQL queries on the dataset.
-
-        This powerful method allows you to use SQL to query and transform your data,
-        combining the expressiveness of SQL with EDSL's data structures. It works by
+
+        This powerful method allows you to use SQL to query and transform your data,
+        combining the expressiveness of SQL with EDSL's data structures. It works by
         creating an in-memory SQLite database from your data and executing the query
         against it.
-
+
         Parameters:
             query: SQL query string to execute
             transpose: Whether to transpose the resulting table (rows become columns)
@@ -407,35 +414,35 @@
             shape: Data shape to use ("wide" or "long")
                 - "wide": Default tabular format with columns for each field
                 - "long": Melted format with key-value pairs, useful for certain queries
-
+
         Returns:
             A Dataset object containing the query results
-
+
         Notes:
             - The data is stored in a table named "self" in the SQLite database
             - In wide format, column names include their type prefix unless remove_prefix=True
             - In long format, the data is melted into columns: row_number, key, value, data_type
             - Complex objects like lists and dictionaries are converted to strings
-
+
         Examples:
             >>> from edsl import Results
             >>> r = Results.example()
-
+
             # Basic selection
             >>> len(r.sql("SELECT * FROM self", shape="wide"))
             4
-
+
             # Filtering with WHERE clause
             >>> r.sql("SELECT * FROM self WHERE how_feeling = 'Great'").num_observations()
             1
-
+
             # Aggregation
             >>> r.sql("SELECT how_feeling, COUNT(*) as count FROM self GROUP BY how_feeling").keys()
             ['how_feeling', 'count']
-
+
             # Using long format
             >>> len(r.sql("SELECT * FROM self", shape="long"))
-
+            204
         """
         import pandas as pd
 
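
The sql() doctests above reduce to this usage pattern (a sketch built only from those doctests; the in-memory table is always named "self", and shape="long" melts the data into row_number/key/value/data_type columns):

    >>> from edsl import Results
    >>> r = Results.example()
    >>> # Wide format: one row per observation, type-prefixed column names
    >>> r.sql("SELECT how_feeling, COUNT(*) as count FROM self GROUP BY how_feeling").keys()
    ['how_feeling', 'count']
    >>> # Long format: one row per (observation, field) pair
    >>> len(r.sql("SELECT * FROM self", shape="long"))
    204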
@@ -454,15 +461,13 @@
 
         return Dataset.from_pandas_dataframe(df)
 
-    def to_pandas(
-        self, remove_prefix: bool = False, lists_as_strings=False
-    ):
+    def to_pandas(self, remove_prefix: bool = False, lists_as_strings=False):
         """Convert the results to a pandas DataFrame, ensuring that lists remain as lists.
 
         Args:
             remove_prefix: Whether to remove the prefix from the column names.
             lists_as_strings: Whether to convert lists to strings.
-
+
         Returns:
             A pandas DataFrame.
         """
@@ -493,15 +498,13 @@
         # df_sorted = df.sort_index(axis=1)  # Sort columns alphabetically
         return df
 
-    def to_polars(
-        self, remove_prefix: bool = False, lists_as_strings=False
-    ):
+    def to_polars(self, remove_prefix: bool = False, lists_as_strings=False):
         """Convert the results to a Polars DataFrame.
 
         Args:
             remove_prefix: Whether to remove the prefix from the column names.
             lists_as_strings: Whether to convert lists to strings.
-
+
         Returns:
             A Polars DataFrame.
         """
@@ -513,7 +516,7 @@
 
         Args:
             remove_prefix: Whether to remove the prefix from the column names.
-
+
         Returns:
             A Polars DataFrame.
         """
@@ -522,17 +525,18 @@
         csv_string = self.to_csv(remove_prefix=remove_prefix).text
         df = pl.read_csv(io.StringIO(csv_string))
         return df
-
+
     def tree(self, node_order: Optional[List[str]] = None):
         """Convert the results to a Tree.
 
         Args:
             node_order: The order of the nodes.
-
+
         Returns:
             A Tree object.
         """
         from .dataset_tree import Tree
+
         return Tree(self, node_order=node_order)
 
     def to_scenario_list(self, remove_prefix: bool = True) -> list[dict]:
@@ -552,7 +556,6 @@
         for d in list_of_dicts:
             scenarios.append(Scenario(d))
         return ScenarioList(scenarios)
-
 
     def to_agent_list(self, remove_prefix: bool = True):
         """Convert the results to a list of dictionaries, one per agent.
@@ -661,10 +664,9 @@
                 new_list.append(item)
             list_to_return = new_list
 
-
-        #return PrettyList(list_to_return)
+        # return PrettyList(list_to_return)
         return list_to_return
-
+
     def html(
         self,
         filename: Optional[str] = None,
@@ -700,33 +702,37 @@
 
         if return_link:
             return filename
-
-    def _prepare_report_data(
-        self, *fields: Optional[str], top_n: Optional[int] = None, header_fields: Optional[List[str]] = None) -> tuple:
+
+    def _prepare_report_data(
+        self,
+        *fields: Optional[str],
+        top_n: Optional[int] = None,
+        header_fields: Optional[List[str]] = None,
+    ) -> tuple:
         """Prepares data for report generation in various formats.
-
+
         Args:
             *fields: The fields to include in the report. If none provided, all fields are used.
             top_n: Optional limit on the number of observations to include.
             header_fields: Optional list of fields to include in the main header instead of as sections.
-
+
         Returns:
             A tuple containing (field_data, num_obs, fields, header_fields)
         """
         # If no fields specified, use all columns
         if not fields:
             fields = self.relevant_columns()
-
+
         # Initialize header_fields if not provided
         if header_fields is None:
             header_fields = []
-
+
         # Validate all fields
         all_fields = list(fields) + [f for f in header_fields if f not in fields]
         for field in all_fields:
             if field not in self.relevant_columns():
                 raise DatasetKeyError(f"Field '{field}' not found in dataset")
-
+
         # Get data for each field
         field_data = {}
         for field in all_fields:
@@ -734,24 +740,26 @@
                 if field in entry:
                     field_data[field] = entry[field]
                     break
-
+
         # Number of observations to process
         num_obs = self.num_observations()
         if top_n is not None:
             num_obs = min(num_obs, top_n)
-
+
         return field_data, num_obs, fields, header_fields
 
-    def _report_markdown(self, field_data, num_obs, fields, header_fields, divider: bool = True) -> str:
+    def _report_markdown(
+        self, field_data, num_obs, fields, header_fields, divider: bool = True
+    ) -> str:
         """Generates a markdown report from the prepared data.
-
+
         Args:
             field_data: Dictionary mapping field names to their values
             num_obs: Number of observations to include
             fields: Fields to include as sections
             header_fields: Fields to include in the observation header
             divider: If True, adds a horizontal rule between observations
-
+
         Returns:
             A string containing the markdown report
         """
@@ -764,13 +772,13 @@ class DataOperationsBase:
             for field in header_fields:
                 value = field_data[field][i]
                 # Get the field name without prefix for cleaner display
-                display_name = field.split('.')[-1] if '.' in field else field
+                display_name = field.split(".")[-1] if "." in field else field
                 # Format with backticks for monospace
                 header_parts.append(f"`{display_name}`: {value}")
             if header_parts:
                 header += f" ({', '.join(header_parts)})"
             report_lines.append(header)
-
+
             # Add the remaining fields
             for field in fields:
                 if field not in header_fields:
@@ -778,27 +786,28 @@
                     value = field_data[field][i]
                     if isinstance(value, list) or isinstance(value, dict):
                         import json
+
                         report_lines.append(f"```\n{json.dumps(value, indent=2)}\n```")
                     else:
                         report_lines.append(str(value))
-
+
             # Add divider between observations if requested
             if divider and i < num_obs - 1:
                 report_lines.append("\n---\n")
             else:
                 report_lines.append("")  # Empty line between observations
-
+
         return "\n".join(report_lines)
 
     def _report_docx(self, field_data, num_obs, fields, header_fields) -> "Document":
         """Generates a Word document report from the prepared data.
-
+
         Args:
             field_data: Dictionary mapping field names to their values
             num_obs: Number of observations to include
             fields: Fields to include as sections
             header_fields: Fields to include in the observation header
-
+
         Returns:
             A docx.Document object containing the report
         """
@@ -808,10 +817,13 @@
             import json
         except ImportError:
             from .exceptions import DatasetImportError
-
-
+
+            raise DatasetImportError(
+                "The python-docx package is required for DOCX export. Install it with 'pip install python-docx'."
+            )
+
         doc = Document()
-
+
         for i in range(num_obs):
             # Create header with observation number and any header fields
             header_text = f"Observation: {i+1}"
@@ -820,40 +832,46 @@
             for field in header_fields:
                 value = field_data[field][i]
                 # Get the field name without prefix for cleaner display
-                display_name = field.split('.')[-1] if '.' in field else field
+                display_name = field.split(".")[-1] if "." in field else field
                 header_parts.append(f"{display_name}: {value}")
             if header_parts:
                 header_text += f" ({', '.join(header_parts)})"
-
+
             doc.add_heading(header_text, level=1)
-
+
             # Add the remaining fields
             for field in fields:
                 if field not in header_fields:
                     doc.add_heading(field, level=2)
                     value = field_data[field][i]
-
+
                     if isinstance(value, (list, dict)):
                         # Format structured data with indentation
                         formatted_value = json.dumps(value, indent=2)
                         p = doc.add_paragraph()
-                        p.add_run(formatted_value).font.name = 'Courier New'
+                        p.add_run(formatted_value).font.name = "Courier New"
                         p.add_run().font.size = Pt(10)
                     else:
                         doc.add_paragraph(str(value))
-
+
             # Add page break between observations except for the last one
             if i < num_obs - 1:
                 doc.add_page_break()
-
+
         return doc
-
-    def report(
-        self, *fields: Optional[str], top_n: Optional[int] = None,
-        header_fields: Optional[List[str]] = None, divider: bool = True, return_string: bool = False,
-        format: str = "markdown", filename: Optional[str] = None) -> Optional[Union[str, "Document"]]:
+
+    def report(
+        self,
+        *fields: Optional[str],
+        top_n: Optional[int] = None,
+        header_fields: Optional[List[str]] = None,
+        divider: bool = True,
+        return_string: bool = False,
+        format: str = "markdown",
+        filename: Optional[str] = None,
+    ) -> Optional[Union[str, "Document"]]:
         """Generates a report of the results by iterating through rows.
-
+
         Args:
             *fields: The fields to include in the report. If none provided, all fields are used.
             top_n: Optional limit on the number of observations to include.
@@ -863,12 +881,12 @@
                 only displays the markdown without returning.
             format: Output format - either "markdown" or "docx".
             filename: If provided and format is "docx", saves the document to this file.
-
+
         Returns:
             Depending on format and return_string:
             - For markdown: A string if return_string is True, otherwise None (displays in notebook)
             - For docx: A docx.Document object, or None if filename is provided (saves to file)
-
+
         Examples:
             >>> from edsl.results import Results
             >>> r = Results.example()
@@ -880,81 +898,84 @@
             True
         """
         from ..utilities.utilities import is_notebook
-
+
         # Prepare the data for the report
         field_data, num_obs, fields, header_fields = self._prepare_report_data(
             *fields, top_n=top_n, header_fields=header_fields
         )
-
+
         # Generate the report in the requested format
         if format.lower() == "markdown":
             report_text = self._report_markdown(
                 field_data, num_obs, fields, header_fields, divider
             )
-
+
             # In notebooks, display as markdown
             is_nb = is_notebook()
             if is_nb and not return_string:
                 from IPython.display import Markdown, display
+
                 display(Markdown(report_text))
                 return None
-
+
             # Return the string if requested or if not in a notebook
             return report_text
-
+
         elif format.lower() == "docx":
             doc = self._report_docx(field_data, num_obs, fields, header_fields)
-
+
             # Save to file if filename is provided
             if filename:
                 doc.save(filename)
                 print(f"Report saved to {filename}")
                 return None
-
+
             return doc
-
+
         else:
-            raise DatasetExportError(f"Unsupported format: {format}. Use 'markdown' or 'docx'.")
+            raise DatasetExportError(
+                f"Unsupported format: {format}. Use 'markdown' or 'docx'."
+            )
 
     def tally(
         self, *fields: Optional[str], top_n: Optional[int] = None, output="Dataset"
     ) -> Union[dict, "Dataset"]:
         """
         Count frequency distributions of values in specified fields.
-
+
         This method tallies the occurrence of unique values within one or more fields,
         similar to a GROUP BY and COUNT in SQL. When multiple fields are provided, it
         performs cross-tabulation across those fields.
-
+
         Parameters:
             *fields: Field names to tally. If none provided, uses all available fields.
             top_n: Optional limit to return only the top N most frequent values.
             output: Format for results, either "Dataset" (recommended) or "dict".
-
+
         Returns:
             By default, returns a Dataset with columns for the field(s) and a 'count' column.
             If output="dict", returns a dictionary mapping values to counts.
-
+
         Notes:
             - For single fields, returns counts of each unique value
             - For multiple fields, returns counts of each unique combination of values
             - Results are sorted in descending order by count
             - Fields can be specified with or without their type prefix
-
+
         Examples:
             >>> from edsl import Results
             >>> r = Results.example()
-
+
             # Single field frequency count
             >>> r.select('how_feeling').tally('answer.how_feeling', output="dict")
             {'OK': 2, 'Great': 1, 'Terrible': 1}
-
+
             # Return as Dataset (default)
             >>> from edsl.dataset import Dataset
             >>> expected = Dataset([{'answer.how_feeling': ['OK', 'Great', 'Terrible']}, {'count': [2, 1, 1]}])
             >>> r.select('how_feeling').tally('answer.how_feeling', output="Dataset") == expected
             True
-
+
             # Multi-field cross-tabulation - exact output varies based on data
             >>> result = r.tally('how_feeling', 'how_feeling_yesterday')
             >>> 'how_feeling' in result.keys() and 'how_feeling_yesterday' in result.keys() and 'count' in result.keys()
@@ -973,9 +994,10 @@
             f in self.relevant_columns() or f in relevant_columns_without_prefix
             for f in fields
         ):
-            raise DatasetKeyError(
-                "One or more specified fields are not in the dataset."
-                f"The available fields are: {self.relevant_columns()}")
+            raise DatasetKeyError(
+                "One or more specified fields are not in the dataset."
+                f"The available fields are: {self.relevant_columns()}"
+            )
 
         if len(fields) == 1:
             field = fields[0]
@@ -992,7 +1014,7 @@
             tally = dict(Counter([str(v) for v in values]))
         except Exception as e:
             raise DatasetValueError(f"Error tallying values: {e}")
-
+
         sorted_tally = dict(sorted(tally.items(), key=lambda item: -item[1]))
         if top_n is not None:
             sorted_tally = dict(list(sorted_tally.items())[:top_n])
@@ -1031,35 +1053,35 @@
     def flatten(self, field: str, keep_original: bool = False) -> "Dataset":
         """
         Expand a field containing dictionaries into separate fields.
-
+
         This method takes a field that contains a list of dictionaries and expands
         it into multiple fields, one for each key in the dictionaries. This is useful
         when working with nested data structures or results from extraction operations.
-
+
         Parameters:
             field: The field containing dictionaries to flatten
             keep_original: Whether to retain the original field in the result
-
+
         Returns:
             A new Dataset with the dictionary keys expanded into separate fields
-
+
         Notes:
             - Each key in the dictionaries becomes a new field with name pattern "{field}.{key}"
             - All dictionaries in the field must have compatible structures
             - If a dictionary is missing a key, the corresponding value will be None
            - Non-dictionary values in the field will cause a warning
-
+
         Examples:
             >>> from edsl.dataset import Dataset
-
+
             # Basic flattening of nested dictionaries
             >>> Dataset([{'a': [{'a': 1, 'b': 2}]}, {'c': [5]}]).flatten('a')
             Dataset([{'c': [5]}, {'a.a': [1]}, {'a.b': [2]}])
-
+
             # Works with prefixed fields too
             >>> Dataset([{'answer.example': [{'a': 1, 'b': 2}]}, {'c': [5]}]).flatten('answer.example')
             Dataset([{'c': [5]}, {'answer.example.a': [1]}, {'answer.example.b': [2]}])
-
+
             # Keep the original field if needed
             >>> d = Dataset([{'a': [{'a': 1, 'b': 2}]}, {'c': [5]}])
             >>> d.flatten('a', keep_original=True)
@@ -1070,21 +1092,22 @@
         # Ensure the dataset isn't empty
         if not self.data:
             return self.copy()
-
+
         # Find all columns that contain the field
         matching_entries = []
         for entry in self.data:
            col_name = next(iter(entry.keys()))
             if field == col_name or (
-                "." in col_name and
-                (col_name.endswith("." + field) or col_name.startswith(field + "."))
+                "." in col_name
+                and (col_name.endswith("." + field) or col_name.startswith(field + "."))
             ):
                 matching_entries.append(entry)
-
+
         # Check if the field is ambiguous
         if len(matching_entries) > 1:
             matching_cols = [next(iter(entry.keys())) for entry in matching_entries]
             from .exceptions import DatasetValueError
+
             raise DatasetValueError(
                 f"Ambiguous field name '{field}'. It matches multiple columns: {matching_cols}. "
                 f"Please specify the full column name to flatten."
@@ -1194,7 +1217,9 @@
 
         # Check if values are lists
         if not all(isinstance(v, list) for v in field_data):
-            raise DatasetTypeError(f"Field '{field}' does not contain lists in all entries")
+            raise DatasetTypeError(
+                f"Field '{field}' does not contain lists in all entries"
+            )
 
         # Get the maximum length of lists
         max_len = max(len(v) for v in field_data)
@@ -1218,50 +1243,50 @@
             result.data.pop(field_index)
 
         return result
-
+
     def drop(self, field_name):
         """
         Returns a new Dataset with the specified field removed.
-
+
         Args:
             field_name (str): The name of the field to remove.
-
+
         Returns:
             Dataset: A new Dataset instance without the specified field.
-
+
         Raises:
             KeyError: If the field_name doesn't exist in the dataset.
-
+
         Examples:
             >>> from .dataset import Dataset
             >>> d = Dataset([{'a': [1, 2, 3]}, {'b': [4, 5, 6]}])
             >>> d.drop('a')
             Dataset([{'b': [4, 5, 6]}])
-
+
             >>> # Testing drop with nonexistent field raises DatasetKeyError - tested in unit tests
         """
         from .dataset import Dataset
-
+
         # Check if field exists in the dataset
         if field_name not in self.relevant_columns():
             raise DatasetKeyError(f"Field '{field_name}' not found in dataset")
-
+
         # Create a new dataset without the specified field
         new_data = [entry for entry in self.data if field_name not in entry]
         return Dataset(new_data)
 
     def remove_prefix(self):
         """Returns a new Dataset with the prefix removed from all column names.
-
+
         The prefix is defined as everything before the first dot (.) in the column name.
         If removing prefixes would result in duplicate column names, an exception is raised.
-
+
         Returns:
             Dataset: A new Dataset with prefixes removed from column names
-
+
         Raises:
             ValueError: If removing prefixes would result in duplicate column names
-
+
         Examples:
             >>> from edsl.results import Results
             >>> r = Results.example()
@@ -1269,70 +1294,73 @@
             ['answer.how_feeling', 'answer.how_feeling_yesterday']
             >>> r.select('how_feeling', 'how_feeling_yesterday').remove_prefix().relevant_columns()
             ['how_feeling', 'how_feeling_yesterday']
-
+
             >>> from edsl.dataset import Dataset
             >>> d = Dataset([{'a.x': [1, 2, 3]}, {'b.x': [4, 5, 6]}])
             >>> # d.remove_prefix()
-
+
             # Testing remove_prefix with duplicate column names raises DatasetValueError - tested in unit tests
         """
         from .dataset import Dataset
-
+
         # Get all column names
         columns = self.relevant_columns()
-
+
         # Extract the unprefixed names
         unprefixed = {}
         duplicates = set()
-
+
         for col in columns:
-            if '.' in col:
-                unprefixed_name = col.split('.', 1)[1]
+            if "." in col:
+                unprefixed_name = col.split(".", 1)[1]
                 if unprefixed_name in unprefixed:
                     duplicates.add(unprefixed_name)
                 unprefixed[unprefixed_name] = col
             else:
                 # For columns without a prefix, keep them as is
                 unprefixed[col] = col
-
+
         # Check for duplicates
         if duplicates:
-            raise DatasetValueError(
-                f"Removing prefixes would result in duplicate column names: {sorted(list(duplicates))}")
+            raise DatasetValueError(
+                f"Removing prefixes would result in duplicate column names: {sorted(list(duplicates))}"
+            )
+
         # Create a new dataset with unprefixed column names
         new_data = []
         for entry in self.data:
             key, values = list(entry.items())[0]
-            if '.' in key:
-                new_key = key.split('.', 1)[1]
+            if "." in key:
+                new_key = key.split(".", 1)[1]
             else:
                 new_key = key
             new_data.append({new_key: values})
-
+
         return Dataset(new_data)
 
 
 def to_dataset(func):
     """
     Decorator that ensures functions receive a Dataset object as their first argument.
-
+
     This decorator automatically converts various EDSL container objects (Results,
     AgentList, ScenarioList) to Dataset objects before passing them to the decorated
     function. This allows methods defined in DataOperationsBase to work seamlessly
     across different container types without duplicating conversion logic.
-
+
     Parameters:
         func: The function to decorate
-
+
     Returns:
         A wrapped function that ensures its first argument is a Dataset
-
+
     Notes:
         - For Results objects, calls select() to convert to a Dataset
         - For AgentList and ScenarioList objects, calls their to_dataset() method
        - For Dataset objects, passes them through unchanged
        - This decorator is used internally by the mixin system to enable method sharing
     """
+
     @wraps(func)
     def wrapper(self, *args, **kwargs):
         """Execute the function with self converted to a Dataset if needed."""
@@ -1345,7 +1373,7 @@ def to_dataset(func):
             dataset_self = self.to_dataset()
         else:
             dataset_self = self
-
+
         # Call the function with the converted self
         return func(dataset_self, *args, **kwargs)
 
@@ -1357,22 +1385,22 @@ def to_dataset(func):
 def decorate_methods_from_mixin(cls, mixin_cls):
     """
     Apply the to_dataset decorator to methods inherited from a mixin class.
-
+
     This function is part of EDSL's method inheritance system. It takes methods
     from a source mixin class, applies the to_dataset decorator to them, and adds
     them to a target class. This enables the sharing of data manipulation methods
     across different container types while ensuring they receive the right data type.
-
+
     The function is careful not to override methods that are already defined in
     more specific parent classes, preserving the method resolution order (MRO).
-
+
     Parameters:
         cls: The target class to add decorated methods to
         mixin_cls: The source mixin class providing the methods
-
+
     Returns:
         The modified target class with decorated methods added
-
+
     Notes:
         - Only public methods (not starting with "_") are decorated and added
         - Methods already defined in more specific parent classes are not overridden
@@ -1381,14 +1409,13 @@ def decorate_methods_from_mixin(cls, mixin_cls):
     # Get all attributes, including inherited ones
     for attr_name in dir(mixin_cls):
         # Skip magic methods and private methods
-        if not attr_name.startswith('_'):
+        if not attr_name.startswith("_"):
             attr_value = getattr(mixin_cls, attr_name)
             if callable(attr_value):
                 # Check if the method is already defined in the class's MRO
                 # but skip DataOperationsBase methods
                 for base in cls.__mro__[1:]:  # Skip the class itself
-                    if (attr_name in base.__dict__ and
-                            base is not DataOperationsBase):
+                    if attr_name in base.__dict__ and base is not DataOperationsBase:
                         # Method is overridden in a more specific class, skip decorating
                         break
                 else:
@@ -1396,9 +1423,10 @@ def decorate_methods_from_mixin(cls, mixin_cls):
                     setattr(cls, attr_name, to_dataset(attr_value))
     return cls
 
+
 # def decorate_methods_from_mixin(cls, mixin_cls):
 #     """Decorates all methods from mixin_cls with to_dataset decorator."""
-
+
 #     # Get all attributes, including inherited ones
 #     for attr_name in dir(mixin_cls):
 #         # Skip magic methods and private methods
@@ -1408,99 +1436,107 @@ def decorate_methods_from_mixin(cls, mixin_cls):
 #                 setattr(cls, attr_name, to_dataset(attr_value))
 #     return cls
 
+
 class DatasetOperationsMixin(DataOperationsBase):
     """
     Mixin providing data manipulation operations for Dataset objects.
-
-    This mixin class is the cornerstone of EDSL's data manipulation system. It directly
+
+    This mixin class is the cornerstone of EDSL's data manipulation system. It directly
     inherits methods from DataOperationsBase without requiring conversion, as it's
     designed specifically for the Dataset class. It serves as the primary implementation
-    of all data operations methods that other container types will inherit and adapt
+    of all data operations methods that other container types will inherit and adapt
     through the to_dataset decorator.
-
+
     The design follows a standard mixin pattern where common functionality is defined
     in a standalone class that can be "mixed in" to other classes. In EDSL's case,
     this allows different container types (Results, AgentList, ScenarioList) to share
     the same powerful data manipulation interface.
-
+
     Key features:
-
+
     1. Data Transformation:
        - Filtering with `filter()`
        - Creating new columns with `mutate()`
       - Reshaping with `long()`, `wide()`, `flatten()`, etc.
       - Selecting specific data with `select()`
-
+
     2. Visualization:
       - Table display with `table()`
       - R integration with `ggplot2()`
       - Report generation with `report()`
-
+
     3. Data Export:
       - To files with `to_csv()`, `to_excel()`, etc.
       - To other formats with `to_pandas()`, `to_dicts()`, etc.
-
+
     4. Analysis:
       - SQL queries with `sql()`
      - Aggregation with `tally()`
      - Tree-based exploration with `tree()`
-
+
     This mixin is designed for fluent method chaining, allowing complex data manipulation
     pipelines to be built in an expressive and readable way.
     """
+
     pass
 
+
 class ResultsOperationsMixin(DataOperationsBase):
     """
     Mixin providing data operations for Results objects.
-
+
     This mixin adapts DatasetOperationsMixin methods to work with Results objects.
     When a method is called on a Results object, it's automatically converted to
     a Dataset first via the to_dataset decorator applied in __init_subclass__.
-
+
     This allows Results objects to have the same data manipulation capabilities
     as Dataset objects without duplicating code.
     """
+
     def __init_subclass__(cls, **kwargs):
         """
         Automatically decorate all methods from DatasetOperationsMixin.
-
+
         This hook runs when a class inherits from ResultsOperationsMixin,
         applying the to_dataset decorator to all methods from DatasetOperationsMixin.
         """
         super().__init_subclass__(**kwargs)
         decorate_methods_from_mixin(cls, DatasetOperationsMixin)
 
+
 class ScenarioListOperationsMixin(DataOperationsBase):
     """
     Mixin providing data operations for ScenarioList objects.
-
+
     This mixin adapts DatasetOperationsMixin methods to work with ScenarioList objects.
     ScenarioList objects are converted to Dataset objects before method execution
     via the to_dataset decorator applied in __init_subclass__.
     """
+
     def __init_subclass__(cls, **kwargs):
         """
         Automatically decorate all methods from DatasetOperationsMixin.
-
+
         This hook runs when a class inherits from ScenarioListOperationsMixin,
         applying the to_dataset decorator to all methods from DatasetOperationsMixin.
         """
         super().__init_subclass__(**kwargs)
         decorate_methods_from_mixin(cls, DatasetOperationsMixin)
 
+
 class AgentListOperationsMixin(DataOperationsBase):
     """
     Mixin providing data operations for AgentList objects.
-
+
     This mixin adapts DatasetOperationsMixin methods to work with AgentList objects.
     AgentList objects are converted to Dataset objects before method execution
     via the to_dataset decorator applied in __init_subclass__.
     """
+
     def __init_subclass__(cls, **kwargs):
         """
         Automatically decorate all methods from DatasetOperationsMixin.
-
+
         This hook runs when a class inherits from AgentListOperationsMixin,
         applying the to_dataset decorator to all methods from DatasetOperationsMixin.
         """