edsl 0.1.54__py3-none-any.whl → 0.1.56__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- edsl/__init__.py +8 -1
- edsl/__init__original.py +134 -0
- edsl/__version__.py +1 -1
- edsl/agents/agent.py +29 -0
- edsl/agents/agent_list.py +36 -1
- edsl/base/base_class.py +281 -151
- edsl/base/data_transfer_models.py +15 -4
- edsl/buckets/__init__.py +8 -3
- edsl/buckets/bucket_collection.py +9 -3
- edsl/buckets/model_buckets.py +4 -2
- edsl/buckets/token_bucket.py +2 -2
- edsl/buckets/token_bucket_client.py +5 -3
- edsl/caching/cache.py +131 -62
- edsl/caching/cache_entry.py +70 -58
- edsl/caching/sql_dict.py +17 -0
- edsl/cli.py +99 -0
- edsl/config/config_class.py +16 -0
- edsl/conversation/__init__.py +31 -0
- edsl/coop/coop.py +276 -242
- edsl/coop/coop_jobs_objects.py +59 -0
- edsl/coop/coop_objects.py +29 -0
- edsl/coop/coop_regular_objects.py +26 -0
- edsl/coop/utils.py +24 -19
- edsl/dataset/dataset.py +338 -101
- edsl/dataset/dataset_operations_mixin.py +216 -180
- edsl/db_list/sqlite_list.py +349 -0
- edsl/inference_services/__init__.py +40 -5
- edsl/inference_services/exceptions.py +11 -0
- edsl/inference_services/services/anthropic_service.py +5 -2
- edsl/inference_services/services/aws_bedrock.py +6 -2
- edsl/inference_services/services/azure_ai.py +6 -2
- edsl/inference_services/services/google_service.py +7 -3
- edsl/inference_services/services/mistral_ai_service.py +6 -2
- edsl/inference_services/services/open_ai_service.py +6 -2
- edsl/inference_services/services/perplexity_service.py +6 -2
- edsl/inference_services/services/test_service.py +94 -5
- edsl/interviews/answering_function.py +167 -59
- edsl/interviews/interview.py +124 -72
- edsl/interviews/interview_task_manager.py +10 -0
- edsl/interviews/request_token_estimator.py +8 -0
- edsl/invigilators/invigilators.py +35 -13
- edsl/jobs/async_interview_runner.py +146 -104
- edsl/jobs/data_structures.py +6 -4
- edsl/jobs/decorators.py +61 -0
- edsl/jobs/fetch_invigilator.py +61 -18
- edsl/jobs/html_table_job_logger.py +14 -2
- edsl/jobs/jobs.py +180 -104
- edsl/jobs/jobs_component_constructor.py +2 -2
- edsl/jobs/jobs_interview_constructor.py +2 -0
- edsl/jobs/jobs_pricing_estimation.py +154 -113
- edsl/jobs/jobs_remote_inference_logger.py +4 -0
- edsl/jobs/jobs_runner_status.py +30 -25
- edsl/jobs/progress_bar_manager.py +79 -0
- edsl/jobs/remote_inference.py +35 -1
- edsl/key_management/key_lookup_builder.py +6 -1
- edsl/language_models/language_model.py +110 -12
- edsl/language_models/model.py +10 -3
- edsl/language_models/price_manager.py +176 -71
- edsl/language_models/registry.py +5 -0
- edsl/notebooks/notebook.py +77 -10
- edsl/questions/VALIDATION_README.md +134 -0
- edsl/questions/__init__.py +24 -1
- edsl/questions/exceptions.py +21 -0
- edsl/questions/question_dict.py +201 -16
- edsl/questions/question_multiple_choice_with_other.py +624 -0
- edsl/questions/question_registry.py +2 -1
- edsl/questions/templates/multiple_choice_with_other/__init__.py +0 -0
- edsl/questions/templates/multiple_choice_with_other/answering_instructions.jinja +15 -0
- edsl/questions/templates/multiple_choice_with_other/question_presentation.jinja +17 -0
- edsl/questions/validation_analysis.py +185 -0
- edsl/questions/validation_cli.py +131 -0
- edsl/questions/validation_html_report.py +404 -0
- edsl/questions/validation_logger.py +136 -0
- edsl/results/result.py +115 -46
- edsl/results/results.py +702 -171
- edsl/scenarios/construct_download_link.py +16 -3
- edsl/scenarios/directory_scanner.py +226 -226
- edsl/scenarios/file_methods.py +5 -0
- edsl/scenarios/file_store.py +150 -9
- edsl/scenarios/handlers/__init__.py +5 -1
- edsl/scenarios/handlers/mp4_file_store.py +104 -0
- edsl/scenarios/handlers/webm_file_store.py +104 -0
- edsl/scenarios/scenario.py +120 -101
- edsl/scenarios/scenario_list.py +800 -727
- edsl/scenarios/scenario_list_gc_test.py +146 -0
- edsl/scenarios/scenario_list_memory_test.py +214 -0
- edsl/scenarios/scenario_list_source_refactor.md +35 -0
- edsl/scenarios/scenario_selector.py +5 -4
- edsl/scenarios/scenario_source.py +1990 -0
- edsl/scenarios/tests/test_scenario_list_sources.py +52 -0
- edsl/surveys/survey.py +22 -0
- edsl/tasks/__init__.py +4 -2
- edsl/tasks/task_history.py +198 -36
- edsl/tests/scenarios/test_ScenarioSource.py +51 -0
- edsl/tests/scenarios/test_scenario_list_sources.py +51 -0
- edsl/utilities/__init__.py +2 -1
- edsl/utilities/decorators.py +121 -0
- edsl/utilities/memory_debugger.py +1010 -0
- {edsl-0.1.54.dist-info → edsl-0.1.56.dist-info}/METADATA +51 -76
- {edsl-0.1.54.dist-info → edsl-0.1.56.dist-info}/RECORD +103 -79
- edsl/jobs/jobs_runner_asyncio.py +0 -281
- edsl/language_models/unused/fake_openai_service.py +0 -60
- {edsl-0.1.54.dist-info → edsl-0.1.56.dist-info}/LICENSE +0 -0
- {edsl-0.1.54.dist-info → edsl-0.1.56.dist-info}/WHEEL +0 -0
- {edsl-0.1.54.dist-info → edsl-0.1.56.dist-info}/entry_points.txt +0 -0
edsl/scenarios/scenario_list_gc_test.py
@@ -0,0 +1,146 @@
+"""
+Test script to investigate garbage collection behavior with ScenarioList.
+"""
+
+import gc
+import os
+import psutil
+import sys
+import pickle
+import tracemalloc
+from typing import Dict, List, Any
+
+def get_memory_usage() -> float:
+    """Get current memory usage in MB"""
+    process = psutil.Process(os.getpid())
+    memory_info = process.memory_info()
+    memory_mb = memory_info.rss / (1024 * 1024)
+    return memory_mb
+
+def display_memory_usage(label: str) -> float:
+    """Display and return current memory usage."""
+    mem = get_memory_usage()
+    print(f"{label}: {mem:.2f} MB")
+    return mem
+
+def test_pickle_gc() -> None:
+    """Test if pickle.dumps objects are properly garbage collected."""
+    display_memory_usage("Initial memory")
+
+    # Force garbage collection
+    gc.collect()
+    start_mem = display_memory_usage("Memory after initial gc")
+
+    # Test with a string of 1MB
+    text = "x" * (1024 * 1024)  # 1MB string
+    display_memory_usage("Memory after creating 1MB string")
+
+    # Create 100 large dictionaries
+    data_list: List[Dict[str, Any]] = []
+    for i in range(100):
+        data = {"id": i, "text": text, "value": i * 100}
+        data_list.append(data)
+
+    gc.collect()
+    display_memory_usage("Memory after creating 100 dicts with shared string")
+
+    # Now pickle each object
+    print("\nPickling objects one at a time:")
+    for i, data in enumerate(data_list):
+        # Pickle the object and immediately delete the reference
+        serialized = pickle.dumps(data)
+        # Explicitly delete to help garbage collection
+        del serialized
+
+        # Every 10 items, force garbage collection and check memory
+        if i % 10 == 9:
+            gc.collect()
+            display_memory_usage(f"Memory after pickling {i+1} objects and gc")
+
+    # Final garbage collection
+    gc.collect()
+    end_mem = display_memory_usage("Memory after final gc")
+
+    print(f"\nTotal memory increase: {end_mem - start_mem:.2f} MB")
+
+    # Now test with tracemalloc to see exactly where memory is allocated
+    print("\nDetailed memory tracing with tracemalloc:")
+    tracemalloc.start()
+
+    # Get snapshot before
+    data = {"id": 1, "text": "x" * (1024 * 1024), "value": 100}
+    snapshot1 = tracemalloc.take_snapshot()
+
+    # Do 10 pickling operations
+    for i in range(10):
+        serialized = pickle.dumps(data)
+        del serialized
+
+    # Get snapshot after
+    snapshot2 = tracemalloc.take_snapshot()
+
+    # Compare and show top differences
+    top_stats = snapshot2.compare_to(snapshot1, 'lineno')
+    print("\nTop 10 memory allocations:")
+    for stat in top_stats[:10]:
+        print(f"{stat.traceback.format()[0]} - {stat.size_diff / 1024:.2f} KB")
+
+    tracemalloc.stop()
+
+def test_sqlite_insert() -> None:
+    """Test SQLite insertion memory behavior."""
+    import sqlite3
+    import tempfile
+
+    display_memory_usage("Initial memory")
+
+    # Force garbage collection
+    gc.collect()
+    start_mem = display_memory_usage("Memory after initial gc")
+
+    # Create a temporary database
+    with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as tmp:
+        db_path = tmp.name
+
+    # Connect to the database
+    conn = sqlite3.connect(db_path)
+    conn.execute("CREATE TABLE IF NOT EXISTS items (idx INTEGER, value BLOB)")
+
+    # Create a large string and dictionary
+    text = "x" * (1024 * 1024)  # 1MB string
+    display_memory_usage("Memory after creating 1MB string")
+
+    # Insert 100 large pickled objects
+    print("\nInserting objects one at a time:")
+    for i in range(100):
+        data = {"id": i, "text": text, "value": i * 100}
+        serialized = pickle.dumps(data)
+
+        # Insert into database
+        conn.execute("INSERT INTO items (idx, value) VALUES (?, ?)", (i, serialized))
+
+        # Explicitly delete serialized data
+        del serialized
+
+        # Every 10 items, force garbage collection and check memory
+        if i % 10 == 9:
+            conn.commit()  # Commit to ensure SQLite releases memory
+            gc.collect()
+            display_memory_usage(f"Memory after inserting {i+1} objects and gc")
+
+    # Close connection and clean up
+    conn.close()
+    os.unlink(db_path)
+
+    # Final garbage collection
+    gc.collect()
+    end_mem = display_memory_usage("Memory after final gc")
+
+    print(f"\nTotal memory increase: {end_mem - start_mem:.2f} MB")
+
+if __name__ == "__main__":
+    print("===== Testing pickle garbage collection =====")
+    test_pickle_gc()
+
+    print("\n\n===== Testing SQLite insertion memory behavior =====")
+    test_sqlite_insert()
edsl/scenarios/scenario_list_memory_test.py
@@ -0,0 +1,214 @@
+"""
+Memory usage test for ScenarioList with different data sizes.
+"""
+
+import gc
+import os
+import psutil
+import time
+import json
+from typing import Dict, List, Any, Tuple
+import matplotlib.pyplot as plt
+
+def get_memory_usage() -> float:
+    """Get current memory usage in MB"""
+    process = psutil.Process(os.getpid())
+    memory_info = process.memory_info()
+    memory_mb = memory_info.rss / (1024 * 1024)
+    return memory_mb
+
+def log_memory(label: str) -> float:
+    """Log and return current memory usage."""
+    mem = get_memory_usage()
+    print(f"{label}: {mem:.2f} MB")
+    return mem
+
+def run_memory_test(sizes: List[int], item_size_kb: int = 10) -> Dict[str, Dict[int, float]]:
+    """
+    Test memory usage for ScenarioList with different dataset sizes.
+
+    Args:
+        sizes: List of dataset sizes to test
+        item_size_kb: Size of text data in each scenario (in KB)
+
+    Returns:
+        Dictionary with memory usage metrics for each size
+    """
+    from edsl.scenarios import ScenarioList
+
+    results = {
+        "creation": {},
+        "filter": {},
+        "baseline": {},
+        "total": {}
+    }
+
+    for size in sizes:
+        print(f"\n{'='*50}")
+        print(f"Testing with {size} scenarios (each with {item_size_kb}KB text)")
+        print(f"{'='*50}")
+
+        # Force garbage collection before starting
+        gc.collect()
+        gc.collect()
+        time.sleep(1)  # Give system time to stabilize
+
+        baseline_mem = log_memory(f"Baseline memory")
+        results["baseline"][size] = baseline_mem
+
+        # Create test data
+        text_size = item_size_kb * 1024  # Convert KB to bytes
+        text = "x" * text_size
+
+        # Create scenarios
+        print(f"Creating {size} scenarios...")
+        scenarios = []
+        for i in range(size):
+            scenarios.append({
+                "id": i,
+                "text": text,
+                "category": "A" if i % 2 == 0 else "B",
+                "value": i * 10
+            })
+
+        # Measure memory after creating raw data
+        after_raw_mem = log_memory("Memory after creating raw data")
+
+        # Create ScenarioList
+        print(f"Creating ScenarioList...")
+        start_time = time.time()
+        sl = ScenarioList(scenarios)
+        creation_time = time.time() - start_time
+
+        # Measure memory after creating ScenarioList
+        after_creation_mem = log_memory("Memory after creating ScenarioList")
+        creation_mem_diff = after_creation_mem - after_raw_mem
+        results["creation"][size] = creation_mem_diff
+        print(f"Creation memory increase: {creation_mem_diff:.2f} MB")
+        print(f"Creation time: {creation_time:.2f} seconds")
+
+        # Filter ScenarioList
+        print(f"Filtering ScenarioList...")
+        start_time = time.time()
+        filtered = sl.filter("id % 2 == 0")
+        filter_time = time.time() - start_time
+
+        # Measure memory after filtering
+        after_filter_mem = log_memory("Memory after filtering")
+        filter_mem_diff = after_filter_mem - after_creation_mem
+        results["filter"][size] = filter_mem_diff
+        results["total"][size] = after_filter_mem - baseline_mem
+        print(f"Filter memory increase: {filter_mem_diff:.2f} MB")
+        print(f"Filter time: {filter_time:.2f} seconds")
+        print(f"Total memory increase: {after_filter_mem - baseline_mem:.2f} MB")
+
+        # Clean up to prepare for next iteration
+        del scenarios
+        del sl
+        del filtered
+        gc.collect()
+        gc.collect()
+        time.sleep(1)  # Give system time to stabilize
+
+    return results
+
+def plot_results(results: Dict[str, Dict[int, float]],
+                 output_path: str = "memory_usage_plot.png") -> None:
+    """
+    Plot memory usage results.
+
+    Args:
+        results: Dictionary with memory usage metrics
+        output_path: Path to save the plot
+    """
+    sizes = sorted(results["creation"].keys())
+
+    # Extract data for plotting
+    creation_memory = [results["creation"][size] for size in sizes]
+    filter_memory = [results["filter"][size] for size in sizes]
+    total_memory = [results["total"][size] for size in sizes]
+
+    # Create figure and axis
+    plt.figure(figsize=(12, 8))
+
+    # Plot memory usage
+    plt.subplot(2, 1, 1)
+    plt.plot(sizes, creation_memory, 'o-', label='ScenarioList Creation')
+    plt.plot(sizes, filter_memory, 's-', label='Filter Operation')
+    plt.plot(sizes, total_memory, '^-', label='Total Memory Usage')
+    plt.xlabel('Number of Scenarios')
+    plt.ylabel('Memory Usage (MB)')
+    plt.title('Memory Usage vs. Dataset Size')
+    plt.grid(True)
+    plt.legend()
+
+    # Plot memory usage per scenario
+    plt.subplot(2, 1, 2)
+    mem_per_scenario_creation = [mem/size for mem, size in zip(creation_memory, sizes)]
+    mem_per_scenario_filter = [mem/size for mem, size in zip(filter_memory, sizes)]
+    mem_per_scenario_total = [mem/size for mem, size in zip(total_memory, sizes)]
+
+    plt.plot(sizes, mem_per_scenario_creation, 'o-', label='Creation (per scenario)')
+    plt.plot(sizes, mem_per_scenario_filter, 's-', label='Filter (per scenario)')
+    plt.plot(sizes, mem_per_scenario_total, '^-', label='Total (per scenario)')
+    plt.xlabel('Number of Scenarios')
+    plt.ylabel('Memory Usage per Scenario (MB)')
+    plt.title('Memory Efficiency vs. Dataset Size')
+    plt.grid(True)
+    plt.legend()
+
+    # Set log scale for x-axis to better visualize the trend
+    plt.xscale('log')
+
+    plt.tight_layout()
+    plt.savefig(output_path)
+    print(f"Plot saved to {output_path}")
+
+def main():
+    """Run memory tests and plot results."""
+    # Test with increasing dataset sizes
+    sizes = [100, 500, 1000, 2000, 5000, 10000]
+
+    # Each scenario will have a 10KB text field
+    item_size_kb = 10
+
+    # Create output directory if it doesn't exist
+    os.makedirs("benchmark_logs/memory_reports", exist_ok=True)
+
+    # Run tests
+    results = run_memory_test(sizes, item_size_kb)
+
+    # Save results as JSON
+    timestamp = time.strftime("%Y%m%d_%H%M%S")
+    results_path = f"benchmark_logs/memory_reports/memory_test_results_{timestamp}.json"
+    with open(results_path, 'w') as f:
+        json.dump(results, f, indent=2)
+    print(f"\nResults saved to {results_path}")
+
+    # Plot results
+    plot_path = f"benchmark_logs/memory_reports/memory_usage_plot_{timestamp}.png"
+    plot_results(results, plot_path)
+
+
+def test_scenario_list_memory():
+    """
+    Simple test function for pytest to run a small memory test.
+    This is a simplified version of the main benchmarking function.
+    """
+    # Use very small sample size for pytest
+    sizes = [10, 20]
+    item_size_kb = 1
+
+    # Run a minimal test
+    results = run_memory_test(sizes, item_size_kb)
+
+    # Verify we got results
+    assert isinstance(results, dict)
+    assert "creation" in results
+    assert "filter" in results
+    assert "total" in results
+
+    # No need to return anything for pytest
+
+if __name__ == "__main__":
+    main()
edsl/scenarios/scenario_list_source_refactor.md
@@ -0,0 +1,35 @@
+# ScenarioList Source Refactoring Checklist
+
+This document outlines the refactoring process to move the `from_X` methods from `ScenarioList` to child classes of `Source`.
+
+## Refactoring Process
+
+For each source type, follow these steps:
+
+1. Create a new child class of `Source` in `scenario_source.py`
+2. Add a deprecated classmethod in `ScenarioList` that references the new source class
+3. Run pytest to confirm everything works correctly
+4. Move to the next source type
+
+## Source Types Checklist
+
+- [x] `urls` - Already implemented as `URLSource`
+- [x] `list` - Already implemented as `ListSource`
+- [x] `directory` - Implemented as `DirectorySource`
+- [x] `list_of_tuples` - Implemented as `TuplesSource`
+- [x] `sqlite` - Implemented as `SQLiteSource`
+- [x] `latex` - Implemented as `LaTeXSource`
+- [x] `google_doc` - Implemented as `GoogleDocSource`
+- [x] `pandas` - Implemented as `PandasSource`
+- [x] `dta` - Implemented as `StataSource`
+- [x] `wikipedia` - Implemented as `WikipediaSource`
+- [x] `excel` - Implemented as `ExcelSource`
+- [x] `google_sheet` - Implemented as `GoogleSheetSource`
+- [x] `delimited_file` - Implemented as `DelimitedFileSource`
+- [x] `csv` - Implemented as `CSVSource` (extending `DelimitedFileSource`)
+- [x] `tsv` - Implemented as `TSVSource` (extending `DelimitedFileSource`)
+- [ ] `dict` - Implement as `DictSource`
+- [ ] `nested_dict` - Implement as `NestedDictSource`
+- [x] `parquet` - Implemented as `ParquetSource`
+- [x] `pdf` - Implemented as `PDFSource`
+- [x] `pdf_to_image` - Implemented as `PDFImageSource`
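The refactoring process in the checklist above follows one repeatable pattern: each input format gets its own `Source` subclass in `scenario_source.py`, and the old `ScenarioList.from_X` classmethod becomes a thin, deprecated wrapper that delegates to it. The sketch below is only a minimal, self-contained illustration of that pattern: the `Source` base class shape, the `to_scenario_list()` hook, and the `DictSource` example (still unchecked in the list) are assumptions for illustration, not the API actually shipped in this diff.

```python
# Hypothetical sketch of the checklist's steps 1 and 2. The Source,
# DictSource, and ScenarioList stand-ins here are illustrative only;
# the real classes in edsl/scenarios/ may differ.
import warnings
from typing import Any, Dict, List


class Source:
    """Step 1: each input format gets its own Source subclass."""

    def to_scenario_list(self) -> List[Dict[str, Any]]:
        raise NotImplementedError


class DictSource(Source):
    """Builds scenario rows from a mapping of field name -> list of values."""

    def __init__(self, data: Dict[str, List[Any]]):
        self.data = data

    def to_scenario_list(self) -> List[Dict[str, Any]]:
        keys = list(self.data)
        # One row per position across the value lists.
        return [dict(zip(keys, row)) for row in zip(*self.data.values())]


class ScenarioListStub:
    """Step 2: the old from_X classmethod stays, but only warns and delegates."""

    @classmethod
    def from_dict(cls, data: Dict[str, List[Any]]):
        warnings.warn(
            "from_dict is deprecated; use DictSource instead.",
            DeprecationWarning,
            stacklevel=2,
        )
        return DictSource(data).to_scenario_list()


if __name__ == "__main__":
    # Existing call sites keep working, but now emit a DeprecationWarning.
    print(ScenarioListStub.from_dict({"id": [1, 2], "text": ["a", "b"]}))
```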
edsl/scenarios/scenario_selector.py
@@ -131,10 +131,11 @@ class ScenarioSelector:
                 f"No fields matched the given patterns: {patterns}. "
                 f"Available fields are: {self.available_fields}"
             )
-
-
-
-
+
+        new_sl = self.scenario_list.__class__(data=[], codebook=self.scenario_list.codebook)
+        for scenario in self.scenario_list:
+            new_sl.append(scenario.select(fields_to_select))
+        return new_sl
 
     def get_available_fields(self) -> list[str]:
         """