PyPI - lumibot - Versions diffs - 4.0.23__py3-none-any.whl → 4.1.1__py3-none-any.whl - Mend

lumibot 4.0.23__py3-none-any.whl → 4.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of lumibot might be problematic. Click here for more details.

Files changed (161) hide show

tests/backtest/profile_thetadata_vs_polygon.py ADDED Viewed

@@ -0,0 +1,255 @@
+"""
+Performance profiling comparison between ThetaData and Polygon.
+This script uses YAPPI (thread-safe profiler) to identify bottlenecks in both data sources.
+Usage:
+    python profile_thetadata_vs_polygon.py
+Requirements:
+    pip install yappi snakeviz
+To visualize results:
+    snakeviz thetadata_nocache.prof
+    snakeviz thetadata_cached.prof
+    snakeviz polygon_nocache.prof
+    snakeviz polygon_cached.prof
+"""
+import datetime
+import os
+import shutil
+from pathlib import Path
+import yappi
+from dotenv import load_dotenv
+from lumibot.strategies import Strategy
+from lumibot.backtesting import ThetaDataBacktesting, PolygonDataBacktesting
+from lumibot.entities import Asset
+# Load environment variables from .env file
+load_dotenv()
+class SimpleBacktestStrategy(Strategy):
+    """Minimal strategy used as a fixed workload while profiling.
+    Submits one buy order on the first trading iteration and nothing afterwards.
+    """
+    # Default symbol and share count used when no parameters are passed in.
+    parameters = {"symbol": "AMZN", "quantity": 10}
+    def initialize(self):
+        """Set ``sleeptime`` to "1D"."""
+        self.sleeptime = "1D"
+    def on_trading_iteration(self):
+        """Submit the one-off buy order; every later iteration is a no-op."""
+        if not self.first_iteration:
+            return
+        params = self.parameters
+        buy_order = self.create_order(Asset(params["symbol"]), params["quantity"], "buy")
+        self.submit_order(buy_order)
+def get_cache_dir():
+    """Return the path of the ``.lumibot`` directory inside the current user's home directory."""
+    return Path.home() / ".lumibot"
+def clear_cache():
+    """Delete the directory returned by ``get_cache_dir()`` if it exists, reporting what was done."""
+    target = get_cache_dir()
+    if not target.exists():
+        print("No cache to clear")
+        return
+    print(f"Clearing cache at {target}")
+    # Removes the whole directory tree, not just individual files inside it.
+    shutil.rmtree(target)
+    print("Cache cleared")
+def profile_backtest(data_source_class, name, profile_file, clear_cache_first=True):
+    """
+    Profile a backtest run.
+    Runs SimpleBacktestStrategy from 2024-08-01 to 2024-08-02 under yappi (wall clock),
+    saves the function stats in pstat format and prints the 30 entries with the
+    highest total time. The profile is saved and printed even when the backtest fails.
+    Args:
+        data_source_class: ThetaDataBacktesting or PolygonDataBacktesting
+        name: Name for logging
+        profile_file: Output file for profiling results
+        clear_cache_first: Whether to clear cache before running
+    Returns:
+        Elapsed wall-clock seconds of the ``run_backtest`` call.
+    Raises:
+        Exception: any error raised by ``run_backtest`` is re-raised after the
+            profile has been saved and printed.
+    """
+    if clear_cache_first:
+        clear_cache()
+    print(f"\n{'='*80}")
+    print(f"PROFILING: {name}")
+    print(f"Cache: {'CLEARED' if clear_cache_first else 'WARMED'}")
+    print(f"{'='*80}\n")
+    # Configure data source
+    start = datetime.datetime(2024, 8, 1)
+    end = datetime.datetime(2024, 8, 2)
+    # Get credentials (all three are passed to run_backtest whichever data source is profiled)
+    thetadata_username = os.environ.get("THETADATA_USERNAME")
+    thetadata_password = os.environ.get("THETADATA_PASSWORD")
+    polygon_api_key = os.environ.get("POLYGON_API_KEY")
+    # Start profiling
+    yappi.clear_stats()
+    yappi.set_clock_type("wall")  # Use wall clock time
+    yappi.start()
+    # Run backtest
+    start_time = datetime.datetime.now()
+    try:
+        _, strategy = SimpleBacktestStrategy.run_backtest(
+            data_source_class,
+            start,
+            end,
+            show_plot=False,
+            show_tearsheet=False,
+            save_tearsheet=False,
+            parameters={"symbol": "AMZN", "quantity": 10},
+            thetadata_username=thetadata_username,
+            thetadata_password=thetadata_password,
+            polygon_api_key=polygon_api_key,
+        )
+        end_time = datetime.datetime.now()
+        elapsed = (end_time - start_time).total_seconds()
+        print(f"✓ Backtest completed in {elapsed:.2f} seconds")
+        print(f"  Orders: {len(strategy.orders)}")
+        print(f"  Final portfolio value: ${strategy.get_portfolio_value():,.2f}")
+    except Exception as e:
+        print(f"✗ Backtest failed: {e}")
+        raise
+    finally:
+        # Stop profiling
+        yappi.stop()
+        # Save profiling results
+        func_stats = yappi.get_func_stats()
+        # Save to pstat format for snakeviz
+        func_stats.save(profile_file, type="pstat")
+        print(f"  Profile saved to: {profile_file}")
+        # Print top 30 time-consuming functions
+        print("\nTop 30 time-consuming functions:")
+        print("="*120)
+        func_stats.sort("totaltime", "desc")
+        # The column header is printed with the first row, so an empty profile prints no header
+        for i, stat in enumerate(func_stats[:30]):
+            if i == 0:
+                print(f"{'Function':<60} {'Calls':<10} {'TotTime':<12} {'PerCall':<12}")
+                print("-"*120)
+            print(f"{stat.name:<60} {stat.ncall:<10} {stat.ttot:<12.6f} {stat.tavg:<12.6f}")
+    # Return after the try statement, not from ``finally``: on failure ``elapsed`` is
+    # never assigned, and a return inside ``finally`` would replace the re-raised
+    # backtest error with an UnboundLocalError.
+    return elapsed
+def main():
+    """Run the four profiling scenarios and print a timing comparison.
+    Returns early, after printing an error, when the ThetaData or Polygon
+    credentials are missing from the environment.
+    """
+    env = os.environ
+    if not (env.get("THETADATA_USERNAME") and env.get("THETADATA_PASSWORD")):
+        print("ERROR: ThetaData credentials not found")
+        print("Set THETADATA_USERNAME and THETADATA_PASSWORD environment variables")
+        return
+    if not env.get("POLYGON_API_KEY"):
+        print("ERROR: Polygon API key not found")
+        print("Set POLYGON_API_KEY environment variable")
+        return
+    rule = "=" * 80
+    print("\n" + rule)
+    print("PERFORMANCE PROFILING: ThetaData vs Polygon")
+    print(rule)
+    print("Date range: 2024-08-01 to 2024-08-02 (1 trading day)")
+    print("Strategy: Buy & hold 10 shares of AMZN")
+    print(rule)
+    # (result key, data source class, display name, clear cache first), in run order.
+    # Each "cached" run relies on the cache left behind by the run just before it.
+    scenarios = (
+        ("thetadata_nocache", ThetaDataBacktesting, "ThetaData (NO CACHE)", True),
+        ("thetadata_cached", ThetaDataBacktesting, "ThetaData (CACHED)", False),
+        ("polygon_nocache", PolygonDataBacktesting, "Polygon (NO CACHE)", True),
+        ("polygon_cached", PolygonDataBacktesting, "Polygon (CACHED)", False),
+    )
+    results = {}
+    for key, source_class, label, wipe_cache in scenarios:
+        results[key] = profile_backtest(
+            source_class,
+            label,
+            f"{key}.prof",
+            clear_cache_first=wipe_cache,
+        )
+    # Summary
+    print("\n" + rule)
+    print("SUMMARY")
+    print(rule)
+    print(f"{'Test':<30} {'Time (s)':<15} {'Speedup vs ThetaData'}")
+    print("-" * 80)
+    baseline = results["thetadata_nocache"]
+    for key, elapsed in results.items():
+        if elapsed > 0:
+            speedup = baseline / elapsed
+        else:
+            speedup = 0
+        # A ratio of exactly 1.0 (the baseline row itself) is shown as a dash
+        speedup_str = "-" if speedup == 1.0 else f"{speedup:.1f}x"
+        print(f"{key:<30} {elapsed:>10.2f}      {speedup_str:>10}")
+    print("\n" + rule)
+    print("ANALYSIS")
+    print(rule)
+    theta_cold, theta_warm = results["thetadata_nocache"], results["thetadata_cached"]
+    polygon_cold, polygon_warm = results["polygon_nocache"], results["polygon_cached"]
+    theta_cache_benefit = theta_cold / theta_warm if theta_warm > 0 else 0
+    polygon_cache_benefit = polygon_cold / polygon_warm if polygon_warm > 0 else 0
+    print(f"ThetaData cache benefit: {theta_cache_benefit:.1f}x faster with cache")
+    print(f"Polygon cache benefit: {polygon_cache_benefit:.1f}x faster with cache")
+    # Compare cached performance (most relevant for production)
+    if theta_warm > polygon_warm:
+        slowdown = theta_warm / polygon_warm
+        print(f"\n⚠️  ThetaData (cached) is {slowdown:.1f}x SLOWER than Polygon (cached)")
+    else:
+        speedup = polygon_warm / theta_warm
+        print(f"\n✓ ThetaData (cached) is {speedup:.1f}x FASTER than Polygon (cached)")
+    print("\n" + rule)
+    print("PROFILING FILES GENERATED")
+    print(rule)
+    print("To visualize bottlenecks, run:")
+    for key in results:
+        print(f"  snakeviz {key}.prof")
+    print(rule + "\n")
+if __name__ == "__main__":
+    main()

tests/backtest/root_cause_analysis.py ADDED Viewed

@@ -0,0 +1,109 @@
+"""
+Root cause analysis: Is the +1 minute offset from ThetaData's API or our processing?
+"""
+import requests
+import pandas as pd
+import datetime
+print("="*100)
+print("ROOT CAUSE ANALYSIS: ThetaData +1 Minute Offset")
+print("="*100)
+# Get raw API response: 1-minute (ivl=60000 ms) AMZN OHLC bars for 2024-08-01
+# from the HTTP endpoint on 127.0.0.1:25510.
+# requests has no default timeout, so without one an unresponsive endpoint
+# would block this script forever.
+response = requests.get('http://127.0.0.1:25510/hist/stock/ohlc', params={
+    'root': 'AMZN',
+    'start_date': '20240801',
+    'end_date': '20240801',
+    'ivl': 60000,
+    'rth': 'true'
+}, timeout=30)
+# Fail with a clear HTTPError instead of trying to parse an error body as JSON
+response.raise_for_status()
+data = response.json()
+print("\n1. ThetaData RAW API Response (no processing):")
+print("-" * 100)
+print(f"{'Bar':<5} {'ms_of_day':<12} {'Time':<10} {'Volume':<12} {'Notes'}")
+print("-" * 100)
+# Show the first five rows exactly as returned; each row must unpack into eight fields.
+for bar_no, row in enumerate(data['response'][:5], start=1):
+    ms_of_day, _open, _high, _low, _close, volume, _count, _date = row
+    # Split milliseconds since midnight into whole hours and whole minutes
+    hours, remainder_ms = divmod(ms_of_day, 1000 * 60 * 60)
+    minutes = remainder_ms // (1000 * 60)
+    time_str = f"{hours:02d}:{minutes:02d}"
+    if bar_no == 1:
+        note = "← Should be pre-market if labeled correctly"
+    elif bar_no == 2 and volume > 1000000:
+        note = "← MASSIVE SPIKE (market open)"
+    else:
+        note = ""
+    print(f"{bar_no:<5} {ms_of_day:<12} {time_str:<10} {volume:<12,} {note}")
+print("\n2. After Our Code Processing (thetadata_helper.py):")
+print("-" * 100)
+# Replicate our processing from thetadata_helper.py:
+# same first five rows, with column names taken from the response header
+first_rows = data['response'][:5]
+column_names = data['header']['format']
+df = pd.DataFrame(first_rows, columns=column_names)
+def combine_datetime(row):
+    """Return a naive datetime built from ``row["date"]`` (YYYYMMDD) plus ``row["ms_of_day"]`` milliseconds."""
+    midnight = datetime.datetime.strptime(str(int(row["date"])), "%Y%m%d")
+    offset = datetime.timedelta(milliseconds=int(row["ms_of_day"]))
+    return midnight + offset
+# Add a "datetime" column computed row by row, then normalise it to pandas datetimes
+datetime_combined = df.apply(combine_datetime, axis=1)
+df = df.assign(datetime=datetime_combined)
+df["datetime"] = pd.to_datetime(df["datetime"])
+print(f"{'Bar':<5} {'Datetime':<30} {'Volume':<12} {'Notes'}")
+print("-" * 100)
+for bar_no, (_, row) in enumerate(df.iterrows(), start=1):
+    if bar_no == 1:
+        note = "← Should be pre-market if labeled correctly"
+    elif bar_no == 2 and row['volume'] > 1000000:
+        note = "← MASSIVE SPIKE (market open)"
+    else:
+        note = ""
+    print(f"{bar_no:<5} {str(row['datetime']):<30} {row['volume']:<12,} {note}")
+# NOTE: everything printed from here on is fixed text. The volumes quoted are
+# literals in this script; they are not read from `data` or `df`.
+heavy_rule = "=" * 100
+light_rule = "-" * 100
+report_lines = (
+    "\n3. Expected Correct Labeling (based on volume spike = market open at 9:30):",
+    light_rule,
+    "Bar 1 (10,434 volume):     Should be labeled 9:29 (pre-market)",
+    "Bar 2 (1,517,215 volume):  Should be labeled 9:30 (market open SPIKE)",
+    "",
+    "Actual ThetaData Labeling:",
+    "Bar 1 (10,434 volume):     Labeled as 9:30",
+    "Bar 2 (1,517,215 volume):  Labeled as 9:31",
+    "",
+    heavy_rule,
+    "CONCLUSION:",
+    heavy_rule,
+    "The +1 minute offset exists in ThetaData's RAW API response.",
+    "Our processing code does NOT introduce any shifts.",
+    "The ms_of_day values from ThetaData are already off by +1 minute.",
+    "",
+    "PROOF:",
+    "- ThetaData labels the low-volume bar as 9:30",
+    "- ThetaData labels the spike bar as 9:31",
+    "- But market opens at 9:30, so the spike SHOULD be labeled 9:30",
+    "- Therefore, ThetaData's timestamps are +1 minute ahead of reality",
+    heavy_rule,
+    "\n4. Checking ThetaData's Documentation Claim:",
+    light_rule,
+    "ThetaData docs say: 'bar timestamp <= trade time < bar timestamp + ivl'",
+    "For bar labeled 9:30 with ivl=60000ms (1 minute):",
+    "  Should include trades: 9:30:00.000 <= trade < 9:31:00.000",
+    "",
+    "But we observe:",
+    "  Bar labeled 9:30 has 10,434 volume (pre-market level)",
+    "  Bar labeled 9:31 has 1,517,215 volume (market open spike)",
+    "",
+    "This means:",
+    "  Bar labeled 9:30 actually contains 9:29:00-9:29:59 data",
+    "  Bar labeled 9:31 actually contains 9:30:00-9:30:59 data",
+    "",
+    "Therefore: ThetaData's bars are MISLABELED by +1 minute in their API",
+    heavy_rule,
+)
+for line in report_lines:
+    print(line)

tests/backtest/test_accuracy_verification.py ADDED Viewed

@@ -0,0 +1,244 @@
+"""
+Phase 1: Accuracy Verification Tests
+This test suite verifies that ThetaData price variance compared to Polygon
+remains acceptable over long time periods and across different price ranges.
+Goals:
+- Verify portfolio variance < 0.01% over 1 year
+- Verify price differences remain sub-penny across all price ranges
+- Verify no systematic bias (variance is random, not directional)
+"""
+import datetime
+import os
+import pytest
+from dotenv import load_dotenv
+from lumibot.strategies import Strategy
+from lumibot.backtesting import PolygonDataBacktesting, ThetaDataBacktesting
+from lumibot.entities import Asset
+# Load environment variables from .env file
+load_dotenv()
+# Get credentials from environment variables.
+# Each value is None when its variable is unset; the skipif marker on
+# TestAccuracyVerification skips the whole class if any of them is missing.
+POLYGON_API_KEY = os.environ.get("POLYGON_API_KEY")
+THETADATA_USERNAME = os.environ.get("THETADATA_USERNAME")
+THETADATA_PASSWORD = os.environ.get("THETADATA_PASSWORD")
+class AccuracyTestStrategy(Strategy):
+    """Buy-once-and-hold strategy shared by the accuracy tests."""
+    # Defaults; the tests pass their own symbol and quantity via ``parameters``.
+    parameters = {"symbol": "AMZN", "quantity": 10}
+    def initialize(self):
+        """Set ``sleeptime`` to "1D" and mark that nothing has been bought yet."""
+        self.sleeptime = "1D"
+        self.bought = False
+    def on_trading_iteration(self):
+        """Submit one buy order the first time through, logging the last price beforehand."""
+        if self.bought:
+            return
+        symbol = self.parameters["symbol"]
+        asset = Asset(symbol)
+        price = self.get_last_price(asset)
+        quantity = self.parameters["quantity"]
+        self.log_message(f"Buying {quantity} shares of {symbol} at ${price}")
+        self.submit_order(self.create_order(asset, quantity=quantity, side="buy"))
+        self.bought = True
+@pytest.mark.apitest
+@pytest.mark.skipif(
+    not POLYGON_API_KEY or not THETADATA_USERNAME or not THETADATA_PASSWORD,
+    reason="Requires both Polygon and ThetaData credentials"
+)
+class TestAccuracyVerification:
+    """Accuracy verification test suite.
+    Each test runs AccuracyTestStrategy against ThetaData and Polygon over the
+    same period and asserts that the final portfolio values differ by less than 0.01%.
+    """
+    @staticmethod
+    def _final_portfolio_value(datasource_class, start, end, symbol, quantity, **credentials):
+        """Backtest AccuracyTestStrategy on one data source and return its final portfolio value.
+        ``credentials`` are forwarded to ``run_backtest`` unchanged
+        (thetadata_username/thetadata_password or polygon_api_key).
+        """
+        _, strategy = AccuracyTestStrategy.run_backtest(
+            datasource_class,
+            start,
+            end,
+            benchmark_asset="SPY",
+            show_plot=False,
+            show_tearsheet=False,
+            save_tearsheet=False,
+            parameters={"symbol": symbol, "quantity": quantity},
+            **credentials,
+        )
+        return strategy.get_portfolio_value()
+    @staticmethod
+    def _difference(theta_final, polygon_final):
+        """Return (absolute difference, difference as a percentage of the Polygon value)."""
+        difference = abs(theta_final - polygon_final)
+        return difference, (difference / polygon_final) * 100
+    def test_one_year_amzn_accuracy(self):
+        """
+        Test 1: Verify AMZN accuracy over 1 year (2023)
+        Asserted:
+        - Portfolio variance < 0.01% ($10 on $100k portfolio)
+        Not asserted here (only the portfolio-level threshold is checked):
+        - Price differences remain sub-penny
+        - No systematic directional bias
+        """
+        backtesting_start = datetime.datetime(2023, 1, 3)  # First trading day of 2023
+        backtesting_end = datetime.datetime(2023, 12, 29)  # Last trading day of 2023
+        print("\n" + "="*80)
+        print("TEST 1: ONE YEAR ACCURACY VERIFICATION - AMZN")
+        print("="*80)
+        print(f"Period: {backtesting_start.date()} to {backtesting_end.date()}")
+        print("Symbol: AMZN")
+        print("Trading days: ~252")
+        # Run ThetaData backtest
+        print("\n[1/2] Running ThetaData backtest...")
+        theta_final = self._final_portfolio_value(
+            ThetaDataBacktesting,
+            backtesting_start,
+            backtesting_end,
+            "AMZN",
+            100,
+            thetadata_username=THETADATA_USERNAME,
+            thetadata_password=THETADATA_PASSWORD,
+        )
+        # Run Polygon backtest
+        print("\n[2/2] Running Polygon backtest...")
+        polygon_final = self._final_portfolio_value(
+            PolygonDataBacktesting,
+            backtesting_start,
+            backtesting_end,
+            "AMZN",
+            100,
+            polygon_api_key=POLYGON_API_KEY,
+        )
+        # Compare final portfolio values
+        difference, percent_diff = self._difference(theta_final, polygon_final)
+        print("\n" + "-"*80)
+        print("RESULTS:")
+        print("-"*80)
+        print(f"ThetaData Final Portfolio Value:  ${theta_final:,.2f}")
+        print(f"Polygon Final Portfolio Value:    ${polygon_final:,.2f}")
+        print(f"Absolute Difference:              ${difference:,.2f}")
+        print(f"Percentage Difference:            {percent_diff:.4f}%")
+        print(f"Acceptance Threshold:             0.01% (${polygon_final * 0.0001:,.2f})")
+        # Verify acceptance criteria
+        assert percent_diff < 0.01, f"Portfolio variance {percent_diff:.4f}% exceeds 0.01% threshold"
+        print(f"\n✓ TEST PASSED: Variance {percent_diff:.4f}% is within acceptable range")
+        print("="*80 + "\n")
+    def test_multi_symbol_price_ranges(self):
+        """
+        Test 2: Verify accuracy across different price ranges
+        Tests 5 symbols with different price points:
+        - AMZN: ~$180
+        - AAPL: ~$175
+        - GOOGL: ~$140
+        - SPY: ~$450
+        - BRK.B: ~$420
+        Asserted, per symbol and for the average and maximum across symbols:
+        - Final portfolio values differ by less than 0.01%
+        The approximate prices are used for display only.
+        """
+        backtesting_start = datetime.datetime(2024, 8, 1)
+        # Short window for speed: Thu 1 Aug to Mon 5 Aug 2024, i.e. a few trading days
+        backtesting_end = datetime.datetime(2024, 8, 5)
+        symbols = [
+            ("AMZN", 10, 180),   # ~$180/share, 10 shares
+            ("AAPL", 10, 175),   # ~$175/share, 10 shares
+            ("GOOGL", 10, 140),  # ~$140/share, 10 shares
+            ("SPY", 10, 450),    # ~$450/share, 10 shares
+            ("BRK.B", 5, 420),   # ~$420/share, 5 shares
+        ]
+        print("\n" + "="*80)
+        print("TEST 2: MULTI-SYMBOL PRICE RANGE VERIFICATION")
+        print("="*80)
+        print(f"Period: {backtesting_start.date()} to {backtesting_end.date()}")
+        print(f"Symbols: {len(symbols)}")
+        results_table = []
+        for symbol, qty, approx_price in symbols:
+            print(f"\n--- Testing {symbol} (~${approx_price}/share, {qty} shares) ---")
+            # Run ThetaData backtest
+            theta_final = self._final_portfolio_value(
+                ThetaDataBacktesting,
+                backtesting_start,
+                backtesting_end,
+                symbol,
+                qty,
+                thetadata_username=THETADATA_USERNAME,
+                thetadata_password=THETADATA_PASSWORD,
+            )
+            # Run Polygon backtest
+            polygon_final = self._final_portfolio_value(
+                PolygonDataBacktesting,
+                backtesting_start,
+                backtesting_end,
+                symbol,
+                qty,
+                polygon_api_key=POLYGON_API_KEY,
+            )
+            # Compare final portfolio values
+            difference, percent_diff = self._difference(theta_final, polygon_final)
+            results_table.append({
+                "symbol": symbol,
+                "price": approx_price,
+                "qty": qty,
+                "theta": theta_final,
+                "polygon": polygon_final,
+                "diff": difference,
+                "pct": percent_diff
+            })
+            print(f"  ThetaData:  ${theta_final:,.2f}")
+            print(f"  Polygon:    ${polygon_final:,.2f}")
+            print(f"  Difference: ${difference:,.2f} ({percent_diff:.4f}%)")
+            # Verify sub-0.01% variance for each symbol
+            assert percent_diff < 0.01, f"{symbol}: Variance {percent_diff:.4f}% exceeds 0.01%"
+        # Summary table
+        print("\n" + "-"*80)
+        print("SUMMARY TABLE:")
+        print("-"*80)
+        print(f"{'Symbol':<8} {'Price':<8} {'Qty':<5} {'ThetaData':<15} {'Polygon':<15} {'Diff':<10} {'%':<8}")
+        print("-"*80)
+        for r in results_table:
+            print(f"{r['symbol']:<8} ${r['price']:<7} {r['qty']:<5} ${r['theta']:<14,.2f} ${r['polygon']:<14,.2f} ${r['diff']:<9,.2f} {r['pct']:.4f}%")
+        # Calculate average variance
+        avg_pct = sum(r['pct'] for r in results_table) / len(results_table)
+        max_pct = max(r['pct'] for r in results_table)
+        print("-"*80)
+        print(f"Average Variance: {avg_pct:.4f}%")
+        print(f"Maximum Variance: {max_pct:.4f}%")
+        print("Threshold:        0.01%")
+        assert avg_pct < 0.01, f"Average variance {avg_pct:.4f}% exceeds 0.01%"
+        assert max_pct < 0.01, f"Max variance {max_pct:.4f}% exceeds 0.01%"
+        print("\n✓ TEST PASSED: All symbols within acceptable variance")
+        print("="*80 + "\n")
+if __name__ == "__main__":
+    # Run tests directly: -v lists each test, -s disables output capture so the
+    # print() reports above are shown
+    pytest.main([__file__, "-v", "-s"])

lumibot 4.0.23__py3-none-any.whl → 4.1.1__py3-none-any.whl

Potentially problematic release.

lumibot 4.0.23__py3-none-any.whl → 4.1.1__py3-none-any.whl