lumibot 4.1.2__py3-none-any.whl → 4.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of lumibot has been flagged as a potentially problematic release.

Files changed (164)
  1. lumibot/backtesting/__init__.py +19 -5
  2. lumibot/backtesting/backtesting_broker.py +98 -18
  3. lumibot/backtesting/databento_backtesting.py +5 -686
  4. lumibot/backtesting/databento_backtesting_pandas.py +738 -0
  5. lumibot/backtesting/databento_backtesting_polars.py +860 -546
  6. lumibot/backtesting/fix_debug.py +37 -0
  7. lumibot/backtesting/thetadata_backtesting.py +9 -355
  8. lumibot/backtesting/thetadata_backtesting_pandas.py +1178 -0
  9. lumibot/brokers/alpaca.py +8 -1
  10. lumibot/brokers/schwab.py +12 -2
  11. lumibot/credentials.py +13 -0
  12. lumibot/data_sources/__init__.py +5 -8
  13. lumibot/data_sources/data_source.py +6 -2
  14. lumibot/data_sources/data_source_backtesting.py +30 -0
  15. lumibot/data_sources/databento_data.py +5 -390
  16. lumibot/data_sources/databento_data_pandas.py +440 -0
  17. lumibot/data_sources/databento_data_polars.py +15 -9
  18. lumibot/data_sources/pandas_data.py +30 -17
  19. lumibot/data_sources/polars_data.py +986 -0
  20. lumibot/data_sources/polars_mixin.py +472 -96
  21. lumibot/data_sources/polygon_data_polars.py +5 -0
  22. lumibot/data_sources/yahoo_data.py +9 -2
  23. lumibot/data_sources/yahoo_data_polars.py +5 -0
  24. lumibot/entities/__init__.py +15 -0
  25. lumibot/entities/asset.py +5 -28
  26. lumibot/entities/bars.py +89 -20
  27. lumibot/entities/data.py +29 -6
  28. lumibot/entities/data_polars.py +668 -0
  29. lumibot/entities/position.py +38 -4
  30. lumibot/strategies/_strategy.py +31 -9
  31. lumibot/strategies/strategy.py +61 -49
  32. lumibot/tools/backtest_cache.py +284 -0
  33. lumibot/tools/databento_helper.py +65 -42
  34. lumibot/tools/databento_helper_polars.py +748 -778
  35. lumibot/tools/futures_roll.py +251 -0
  36. lumibot/tools/indicators.py +135 -104
  37. lumibot/tools/polars_utils.py +142 -0
  38. lumibot/tools/thetadata_helper.py +1068 -134
  39. {lumibot-4.1.2.dist-info → lumibot-4.2.0.dist-info}/METADATA +9 -1
  40. {lumibot-4.1.2.dist-info → lumibot-4.2.0.dist-info}/RECORD +72 -148
  41. tests/backtest/test_databento.py +37 -6
  42. tests/backtest/test_databento_comprehensive_trading.py +70 -87
  43. tests/backtest/test_databento_parity.py +31 -7
  44. tests/backtest/test_debug_avg_fill_price.py +1 -1
  45. tests/backtest/test_example_strategies.py +11 -1
  46. tests/backtest/test_futures_edge_cases.py +96 -63
  47. tests/backtest/test_futures_single_trade.py +2 -2
  48. tests/backtest/test_futures_ultra_simple.py +2 -2
  49. tests/backtest/test_polars_lru_eviction.py +470 -0
  50. tests/backtest/test_yahoo.py +42 -0
  51. tests/test_asset.py +4 -4
  52. tests/test_backtest_cache_manager.py +149 -0
  53. tests/test_backtesting_data_source_env.py +50 -10
  54. tests/test_continuous_futures_resolution.py +60 -48
  55. tests/test_data_polars_parity.py +160 -0
  56. tests/test_databento_asset_validation.py +23 -5
  57. tests/test_databento_backtesting.py +1 -1
  58. tests/test_databento_backtesting_polars.py +312 -192
  59. tests/test_databento_data.py +220 -463
  60. tests/test_databento_helper.py +6 -1
  61. tests/test_databento_live.py +10 -10
  62. tests/test_futures_roll.py +38 -0
  63. tests/test_indicator_subplots.py +101 -0
  64. tests/test_market_infinite_loop_bug.py +77 -3
  65. tests/test_polars_resample.py +67 -0
  66. tests/test_polygon_helper.py +46 -0
  67. tests/test_thetadata_backwards_compat.py +97 -0
  68. tests/test_thetadata_helper.py +222 -23
  69. tests/test_thetadata_pandas_verification.py +186 -0
  70. lumibot/__pycache__/__init__.cpython-312.pyc +0 -0
  71. lumibot/__pycache__/constants.cpython-312.pyc +0 -0
  72. lumibot/__pycache__/credentials.cpython-312.pyc +0 -0
  73. lumibot/backtesting/__pycache__/__init__.cpython-312.pyc +0 -0
  74. lumibot/backtesting/__pycache__/alpaca_backtesting.cpython-312.pyc +0 -0
  75. lumibot/backtesting/__pycache__/alpha_vantage_backtesting.cpython-312.pyc +0 -0
  76. lumibot/backtesting/__pycache__/backtesting_broker.cpython-312.pyc +0 -0
  77. lumibot/backtesting/__pycache__/ccxt_backtesting.cpython-312.pyc +0 -0
  78. lumibot/backtesting/__pycache__/databento_backtesting.cpython-312.pyc +0 -0
  79. lumibot/backtesting/__pycache__/interactive_brokers_rest_backtesting.cpython-312.pyc +0 -0
  80. lumibot/backtesting/__pycache__/pandas_backtesting.cpython-312.pyc +0 -0
  81. lumibot/backtesting/__pycache__/polygon_backtesting.cpython-312.pyc +0 -0
  82. lumibot/backtesting/__pycache__/thetadata_backtesting.cpython-312.pyc +0 -0
  83. lumibot/backtesting/__pycache__/yahoo_backtesting.cpython-312.pyc +0 -0
  84. lumibot/brokers/__pycache__/__init__.cpython-312.pyc +0 -0
  85. lumibot/brokers/__pycache__/alpaca.cpython-312.pyc +0 -0
  86. lumibot/brokers/__pycache__/bitunix.cpython-312.pyc +0 -0
  87. lumibot/brokers/__pycache__/broker.cpython-312.pyc +0 -0
  88. lumibot/brokers/__pycache__/ccxt.cpython-312.pyc +0 -0
  89. lumibot/brokers/__pycache__/example_broker.cpython-312.pyc +0 -0
  90. lumibot/brokers/__pycache__/interactive_brokers.cpython-312.pyc +0 -0
  91. lumibot/brokers/__pycache__/interactive_brokers_rest.cpython-312.pyc +0 -0
  92. lumibot/brokers/__pycache__/projectx.cpython-312.pyc +0 -0
  93. lumibot/brokers/__pycache__/schwab.cpython-312.pyc +0 -0
  94. lumibot/brokers/__pycache__/tradier.cpython-312.pyc +0 -0
  95. lumibot/brokers/__pycache__/tradovate.cpython-312.pyc +0 -0
  96. lumibot/data_sources/__pycache__/__init__.cpython-312.pyc +0 -0
  97. lumibot/data_sources/__pycache__/alpaca_data.cpython-312.pyc +0 -0
  98. lumibot/data_sources/__pycache__/alpha_vantage_data.cpython-312.pyc +0 -0
  99. lumibot/data_sources/__pycache__/bitunix_data.cpython-312.pyc +0 -0
  100. lumibot/data_sources/__pycache__/ccxt_backtesting_data.cpython-312.pyc +0 -0
  101. lumibot/data_sources/__pycache__/ccxt_data.cpython-312.pyc +0 -0
  102. lumibot/data_sources/__pycache__/data_source.cpython-312.pyc +0 -0
  103. lumibot/data_sources/__pycache__/data_source_backtesting.cpython-312.pyc +0 -0
  104. lumibot/data_sources/__pycache__/databento_data_polars_backtesting.cpython-312.pyc +0 -0
  105. lumibot/data_sources/__pycache__/databento_data_polars_live.cpython-312.pyc +0 -0
  106. lumibot/data_sources/__pycache__/example_broker_data.cpython-312.pyc +0 -0
  107. lumibot/data_sources/__pycache__/exceptions.cpython-312.pyc +0 -0
  108. lumibot/data_sources/__pycache__/interactive_brokers_data.cpython-312.pyc +0 -0
  109. lumibot/data_sources/__pycache__/interactive_brokers_rest_data.cpython-312.pyc +0 -0
  110. lumibot/data_sources/__pycache__/pandas_data.cpython-312.pyc +0 -0
  111. lumibot/data_sources/__pycache__/polars_mixin.cpython-312.pyc +0 -0
  112. lumibot/data_sources/__pycache__/polygon_data_polars.cpython-312.pyc +0 -0
  113. lumibot/data_sources/__pycache__/projectx_data.cpython-312.pyc +0 -0
  114. lumibot/data_sources/__pycache__/schwab_data.cpython-312.pyc +0 -0
  115. lumibot/data_sources/__pycache__/tradier_data.cpython-312.pyc +0 -0
  116. lumibot/data_sources/__pycache__/tradovate_data.cpython-312.pyc +0 -0
  117. lumibot/data_sources/__pycache__/yahoo_data_polars.cpython-312.pyc +0 -0
  118. lumibot/entities/__pycache__/__init__.cpython-312.pyc +0 -0
  119. lumibot/entities/__pycache__/asset.cpython-312.pyc +0 -0
  120. lumibot/entities/__pycache__/bar.cpython-312.pyc +0 -0
  121. lumibot/entities/__pycache__/bars.cpython-312.pyc +0 -0
  122. lumibot/entities/__pycache__/chains.cpython-312.pyc +0 -0
  123. lumibot/entities/__pycache__/data.cpython-312.pyc +0 -0
  124. lumibot/entities/__pycache__/dataline.cpython-312.pyc +0 -0
  125. lumibot/entities/__pycache__/order.cpython-312.pyc +0 -0
  126. lumibot/entities/__pycache__/position.cpython-312.pyc +0 -0
  127. lumibot/entities/__pycache__/quote.cpython-312.pyc +0 -0
  128. lumibot/entities/__pycache__/trading_fee.cpython-312.pyc +0 -0
  129. lumibot/example_strategies/__pycache__/__init__.cpython-312.pyc +0 -0
  130. lumibot/example_strategies/__pycache__/test_broker_functions.cpython-312-pytest-8.4.1.pyc +0 -0
  131. lumibot/strategies/__pycache__/__init__.cpython-312.pyc +0 -0
  132. lumibot/strategies/__pycache__/_strategy.cpython-312.pyc +0 -0
  133. lumibot/strategies/__pycache__/strategy.cpython-312.pyc +0 -0
  134. lumibot/strategies/__pycache__/strategy_executor.cpython-312.pyc +0 -0
  135. lumibot/tools/__pycache__/__init__.cpython-312.pyc +0 -0
  136. lumibot/tools/__pycache__/alpaca_helpers.cpython-312.pyc +0 -0
  137. lumibot/tools/__pycache__/bitunix_helpers.cpython-312.pyc +0 -0
  138. lumibot/tools/__pycache__/black_scholes.cpython-312.pyc +0 -0
  139. lumibot/tools/__pycache__/ccxt_data_store.cpython-312.pyc +0 -0
  140. lumibot/tools/__pycache__/databento_helper.cpython-312.pyc +0 -0
  141. lumibot/tools/__pycache__/databento_helper_polars.cpython-312.pyc +0 -0
  142. lumibot/tools/__pycache__/debugers.cpython-312.pyc +0 -0
  143. lumibot/tools/__pycache__/decorators.cpython-312.pyc +0 -0
  144. lumibot/tools/__pycache__/helpers.cpython-312.pyc +0 -0
  145. lumibot/tools/__pycache__/indicators.cpython-312.pyc +0 -0
  146. lumibot/tools/__pycache__/lumibot_logger.cpython-312.pyc +0 -0
  147. lumibot/tools/__pycache__/pandas.cpython-312.pyc +0 -0
  148. lumibot/tools/__pycache__/polygon_helper.cpython-312.pyc +0 -0
  149. lumibot/tools/__pycache__/polygon_helper_async.cpython-312.pyc +0 -0
  150. lumibot/tools/__pycache__/polygon_helper_polars_optimized.cpython-312.pyc +0 -0
  151. lumibot/tools/__pycache__/projectx_helpers.cpython-312.pyc +0 -0
  152. lumibot/tools/__pycache__/schwab_helper.cpython-312.pyc +0 -0
  153. lumibot/tools/__pycache__/thetadata_helper.cpython-312.pyc +0 -0
  154. lumibot/tools/__pycache__/types.cpython-312.pyc +0 -0
  155. lumibot/tools/__pycache__/yahoo_helper.cpython-312.pyc +0 -0
  156. lumibot/tools/__pycache__/yahoo_helper_polars_optimized.cpython-312.pyc +0 -0
  157. lumibot/traders/__pycache__/__init__.cpython-312.pyc +0 -0
  158. lumibot/traders/__pycache__/trader.cpython-312.pyc +0 -0
  159. lumibot/trading_builtins/__pycache__/__init__.cpython-312.pyc +0 -0
  160. lumibot/trading_builtins/__pycache__/custom_stream.cpython-312.pyc +0 -0
  161. lumibot/trading_builtins/__pycache__/safe_list.cpython-312.pyc +0 -0
  162. {lumibot-4.1.2.dist-info → lumibot-4.2.0.dist-info}/WHEEL +0 -0
  163. {lumibot-4.1.2.dist-info → lumibot-4.2.0.dist-info}/licenses/LICENSE +0 -0
  164. {lumibot-4.1.2.dist-info → lumibot-4.2.0.dist-info}/top_level.txt +0 -0
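The headline change in this file list is the split of the DataBento and ThetaData backtesting sources into separate pandas and polars implementations (databento_backtesting_pandas.py, thetadata_backtesting_pandas.py, and a rewritten databento_backtesting_polars.py), alongside new tooling in backtest_cache.py, futures_roll.py, and polars_utils.py. The diff that follows covers the polars DataBento source. A minimal construction sketch: the module path comes from the file list, the constructor arguments from the diff below, and the key value is a placeholder (note that 4.2.0's __init__ no longer falls back to the DATABENTO_API_KEY environment variable itself; see the removed lines in the diff):

    from datetime import datetime

    from lumibot.backtesting.databento_backtesting_polars import DataBentoDataBacktestingPolars

    data_source = DataBentoDataBacktestingPolars(
        datetime_start=datetime(2025, 1, 6),
        datetime_end=datetime(2025, 1, 31),
        api_key="db-...",  # placeholder; pass your DataBento key explicitly
        timeout=30,        # API request timeout in seconds (default 30)
        max_retries=3,     # maximum API retry attempts (default 3)
    )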
lumibot/backtesting/databento_backtesting_polars.py
@@ -1,36 +1,39 @@
-"""Ultra-optimized DataBento backtesting using pure polars"""
-
-from datetime import timedelta
+import traceback
+from datetime import datetime, timedelta
 
+import pandas as pd
 import polars as pl
-from polars.datatypes import Datetime as PlDatetime
-import pytz
 
-from lumibot.data_sources import DataSourceBacktesting
-from lumibot.entities import Asset, Bars
-from lumibot.tools import databento_helper_polars
-from lumibot.tools.lumibot_logger import get_logger
+from lumibot import LUMIBOT_DEFAULT_PYTZ
+from lumibot.data_sources import PolarsData
+from lumibot.entities import Asset, Data, Quote
+from lumibot.entities.data_polars import DataPolars
+from lumibot.tools import databento_helper_polars as databento_helper
+from lumibot.tools.databento_helper_polars import DataBentoAuthenticationError
+from lumibot.tools.helpers import to_datetime_aware
+from termcolor import colored
 
+from lumibot.tools.lumibot_logger import get_logger
 logger = get_logger(__name__)
 
+# Conversion tracking for optimization analysis
+def _log_conversion(operation, from_type, to_type, location):
+    """Log DataFrame conversions to track optimization progress."""
+    logger.debug(f"[CONVERSION] {operation} | {from_type} → {to_type} | {location}")
+
 START_BUFFER = timedelta(days=5)
 
 
-class DataBentoDataBacktestingPolars(DataSourceBacktesting):
+class DataBentoDataBacktestingPolars(PolarsData):
     """
-    Ultra-optimized backtesting implementation of DataBento data source using polars
-
-    This class provides DataBento-specific backtesting functionality with
-    3x+ performance improvement through polars operations and efficient caching.
+    Backtesting implementation of DataBento data source
+
+    This class extends PolarsData to provide DataBento-specific backtesting functionality.
+    Currently identical to pandas version - will be incrementally optimized to use Polars.
     """
 
-    SOURCE = "DATABENTO"
-    MIN_TIMESTEP = "minute"
-    TIMESTEP_MAPPING = [
-        {"timestep": "minute", "representations": ["1m", "minute", "1 minute"]},
-        {"timestep": "hour", "representations": ["1h", "hour", "1 hour"]},
-        {"timestep": "day", "representations": ["1d", "day", "1 day"]},
-    ]
+    # Override SOURCE so broker recognizes this as DataBento and applies correct timeshift
+    SOURCE = "DATABENTO_POLARS"
 
     def __init__(
         self,
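The hunk above re-parents the class from DataSourceBacktesting onto PolarsData, changes SOURCE to "DATABENTO_POLARS", and adds a _log_conversion helper that records every pandas/polars DataFrame conversion at DEBUG level. A sketch for surfacing those [CONVERSION] lines, assuming get_logger returns a stdlib-compatible logger named after the module (the handler setup inside lumibot_logger is not shown in this diff):

    import logging

    logging.basicConfig(format="%(name)s %(levelname)s %(message)s")
    # DEBUG must be enabled on the module's logger or the [CONVERSION]
    # records are filtered out before they reach any handler.
    logging.getLogger("lumibot.backtesting.databento_backtesting_polars").setLevel(logging.DEBUG)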
@@ -40,12 +43,10 @@ class DataBentoDataBacktestingPolars(DataSourceBacktesting):
         api_key=None,
         timeout=30,
         max_retries=3,
-        max_memory=None,
-        enable_cache=True,
         **kwargs,
     ):
         """
-        Initialize DataBento backtesting data source with polars optimization
+        Initialize DataBento backtesting data source
 
         Parameters
         ----------
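The hunk above removes max_memory and enable_cache from __init__, so a 4.1.2-style call site that still passes them now sends those keywords through **kwargs to the parent class. A small defensive sketch for shared call sites; the helper name is hypothetical, not part of lumibot:

    REMOVED_IN_4_2_0 = {"max_memory", "enable_cache"}

    def clean_databento_kwargs(kwargs: dict) -> dict:
        """Drop DataBentoDataBacktestingPolars keywords removed in 4.2.0."""
        return {k: v for k, v in kwargs.items() if k not in REMOVED_IN_4_2_0}

    print(clean_databento_kwargs({"api_key": "db-...", "enable_cache": True}))
    # -> {'api_key': 'db-...'}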
@@ -54,556 +55,719 @@ class DataBentoDataBacktestingPolars(DataSourceBacktesting):
54
55
  datetime_end : datetime
55
56
  End datetime for backtesting period
56
57
  pandas_data : dict, optional
57
- Pre-loaded pandas data (will be converted to polars)
58
+ Pre-loaded pandas data
58
59
  api_key : str
59
60
  DataBento API key
60
61
  timeout : int, optional
61
62
  API request timeout in seconds, default 30
62
63
  max_retries : int, optional
63
64
  Maximum number of API retry attempts, default 3
64
- max_memory : int, optional
65
- Maximum memory usage in bytes for data storage
66
- enable_cache : bool, optional
67
- Enable caching of fetched data, default True
68
65
  **kwargs
69
66
  Additional parameters passed to parent class
70
67
  """
71
- # Initialize parent
72
68
  super().__init__(
73
69
  datetime_start=datetime_start,
74
70
  datetime_end=datetime_end,
71
+ pandas_data=pandas_data,
75
72
  api_key=api_key,
76
73
  **kwargs
77
74
  )
78
75
 
79
- self.name = "databento"
80
- # Load API key from environment if not provided
81
- import os
82
- self._api_key = api_key or os.environ.get("DATABENTO_API_KEY")
83
- if not self._api_key:
84
- logger.error("DataBento API key not provided and DATABENTO_API_KEY environment variable not set")
85
- else:
86
- logger.info(f"DataBento API key loaded: {bool(self._api_key)}")
76
+ # Store DataBento-specific configuration
77
+ self._api_key = api_key
87
78
  self._timeout = timeout
88
79
  self._max_retries = max_retries
89
- self.MAX_STORAGE_BYTES = max_memory
90
- self.enable_cache = enable_cache
91
-
92
- # Optimized data storage - lazy frames for efficiency
93
- self._data_store = {} # Asset -> pl.LazyFrame
94
- self._eager_cache = {} # Asset -> pl.DataFrame
80
+
81
+ # Track which assets we've already fetched to avoid redundant requests
82
+ self._prefetched_assets = set()
83
+ # Track data requests to avoid repeated log messages
84
+ self._logged_requests = set()
95
85
 
96
- # Performance optimizations
97
- self._last_price_cache = {}
98
- self._cache_datetime = None
86
+ # OPTIMIZATION: Iteration-level caching to avoid redundant filtering
87
+ # Cache filtered DataFrames per iteration (datetime)
88
+ self._filtered_bars_cache = {} # {(asset_key, length, timestep, timeshift, dt): DataFrame}
89
+ self._last_price_cache = {} # {(asset_key, dt): price}
90
+ self._cache_datetime = None # Track when to invalidate cache
99
91
 
100
- # Column access optimization
101
- self._column_indices = {}
92
+ # Track which futures assets we've fetched multipliers for (to avoid redundant API calls)
93
+ self._multiplier_fetched_assets = set()
102
94
 
103
- # Pre-filtered data cache for massive speedup
104
- self._filtered_data_cache = {}
95
+ # Verify DataBento availability
96
+ if not databento_helper.DATABENTO_AVAILABLE:
97
+ logger.error("DataBento package not available. Please install with: pip install databento")
98
+ raise ImportError("DataBento package not available")
105
99
 
106
- # Cache metadata to avoid unnecessary collections
107
- self._cache_metadata = {} # cache_key -> {'min_dt': datetime, 'max_dt': datetime, 'count': int}
100
+ logger.debug(f"DataBento backtesting initialized for period: {datetime_start} to {datetime_end}")
108
101
 
109
- # Convert pandas_data to polars if provided
110
- if pandas_data:
111
- for asset, df in pandas_data.items():
112
- if not isinstance(df, pl.DataFrame):
113
- # Convert pandas to polars
114
- if hasattr(df, 'index') and hasattr(df.index, 'name'):
115
- pl_df = pl.from_pandas(df.reset_index())
116
- else:
117
- pl_df = pl.from_pandas(df)
118
- self._store_data(asset, pl_df)
119
- else:
120
- self._store_data(asset, df)
102
+ def _check_and_clear_cache(self):
103
+ """
104
+ OPTIMIZATION: Clear iteration caches when datetime changes.
105
+ This ensures fresh filtering for each new iteration while reusing
106
+ results within the same iteration.
107
+ """
108
+ current_dt = self.get_datetime()
109
+ if self._cache_datetime != current_dt:
110
+ self._filtered_bars_cache.clear()
111
+ self._last_price_cache.clear()
112
+ self._cache_datetime = current_dt
121
113
 
122
- def _to_naive_datetime(self, dt):
123
- """Convert datetime to naive (no timezone) for consistent comparisons."""
124
- if dt is None:
125
- return None
126
- if hasattr(dt, 'tzinfo') and dt.tzinfo is not None:
127
- return dt.replace(tzinfo=None)
128
- return dt
129
-
130
- def _ensure_strategy_timezone(self, df: pl.DataFrame, column: str = "datetime") -> pl.DataFrame:
131
- """Ensure dataframe datetime column aligns with the strategy timezone."""
132
- if df is None or column not in df.columns:
133
- return df
134
-
135
- dtype = df.schema.get(column)
136
- strategy_tz = self.tzinfo.zone if hasattr(self.tzinfo, "zone") else str(self.tzinfo)
137
- expr = pl.col(column)
138
-
139
- if isinstance(dtype, PlDatetime):
140
- if dtype.time_zone is None:
141
- expr = expr.dt.replace_time_zone(strategy_tz)
142
- elif dtype.time_zone != strategy_tz:
143
- expr = expr.dt.convert_time_zone(strategy_tz)
144
- else:
145
- expr = expr.cast(pl.Datetime(time_unit="ns")).dt.replace_time_zone(strategy_tz)
114
+ def _ensure_futures_multiplier(self, asset):
115
+ """
116
+ Ensure futures asset has correct multiplier set.
146
117
 
147
- return df.with_columns(expr.alias(column))
118
+ This method is idempotent and cached - safe to call multiple times.
119
+ Only fetches multiplier once per unique asset.
148
120
 
149
- def _store_data(self, asset, data):
150
- """Store data efficiently using lazy frames."""
151
- # Standardize column names
152
- rename_map = {
153
- "Open": "open", "High": "high", "Low": "low", "Close": "close",
154
- "Volume": "volume", "Dividends": "dividend", "Stock Splits": "stock_splits",
155
- "Adj Close": "adj_close", "index": "datetime", "Date": "datetime"
156
- }
121
+ Design rationale:
122
+ - Futures multipliers must be fetched from data provider (e.g., DataBento)
123
+ - Asset class defaults to multiplier=1
124
+ - Data source is responsible for updating multiplier on first use
125
+ - Lazy fetching is more efficient than prefetching all possible assets
157
126
 
158
- existing_renames = {k: v for k, v in rename_map.items() if k in data.columns}
159
- if existing_renames:
160
- data = data.rename(existing_renames)
127
+ Parameters
128
+ ----------
129
+ asset : Asset
130
+ The asset to ensure has correct multiplier
131
+ """
132
+ # Skip if not a futures asset
133
+ if asset.asset_type not in (Asset.AssetType.FUTURE, Asset.AssetType.CONT_FUTURE):
134
+ return
161
135
 
162
- data = self._ensure_strategy_timezone(data)
136
+ # Skip if multiplier already set to non-default value
137
+ if asset.multiplier != 1:
138
+ return
163
139
 
164
- # Use lazy evaluation
165
- lazy_data = data.lazy()
140
+ # Create cache key to track which assets we've already processed
141
+ # Use symbol + asset_type + expiration to handle different contracts
142
+ cache_key = (asset.symbol, asset.asset_type, getattr(asset, 'expiration', None))
166
143
 
167
- # Store lazy frame
168
- self._data_store[asset] = lazy_data
144
+ # Check if we already tried to fetch for this asset
145
+ if cache_key in self._multiplier_fetched_assets:
146
+ return # Already attempted (even if failed, don't retry every time)
169
147
 
170
- # DON'T cache eager version - collect on demand instead for memory efficiency
171
- # Remove this line: self._eager_cache[asset] = lazy_data.collect()
148
+ # Mark as attempted to avoid redundant API calls
149
+ self._multiplier_fetched_assets.add(cache_key)
172
150
 
173
- # Cache column indices from schema without collecting
151
+ # Fetch and set multiplier from DataBento
174
152
  try:
175
- schema = lazy_data.collect_schema()
176
- self._column_indices[asset] = {col: i for i, col in enumerate(schema.names())}
177
- except:
178
- # Fallback: collect a tiny sample for column info
179
- sample = lazy_data.limit(1).collect()
180
- self._column_indices[asset] = {col: i for i, col in enumerate(sample.columns)}
153
+ client = databento_helper.DataBentoClient(self._api_key)
181
154
 
182
- # Enforce storage limit
183
- self._enforce_storage_limit(self._data_store)
155
+ # Resolve symbol based on asset type
156
+ if asset.asset_type == Asset.AssetType.CONT_FUTURE:
157
+ resolved_symbol = databento_helper._format_futures_symbol_for_databento(
158
+ asset, reference_date=self.datetime_start
159
+ )
160
+ else:
161
+ resolved_symbol = databento_helper._format_futures_symbol_for_databento(asset)
184
162
 
185
- return lazy_data
163
+ # Fetch multiplier from DataBento instrument definition
164
+ databento_helper._fetch_and_update_futures_multiplier(
165
+ client=client,
166
+ asset=asset,
167
+ resolved_symbol=resolved_symbol,
168
+ dataset="GLBX.MDP3",
169
+ reference_date=self.datetime_start
170
+ )
186
171
 
187
- def _enforce_storage_limit(self, data_store):
188
- """Enforce storage limit by removing least recently used data."""
189
- if not self.MAX_STORAGE_BYTES:
190
- return
172
+ logger.debug(f"Successfully set multiplier for {asset.symbol}: {asset.multiplier}")
191
173
 
192
- # Estimate storage without collecting
193
- estimated_storage = 0
194
- items_with_sizes = []
174
+ except DataBentoAuthenticationError as e:
175
+ logger.error(colored(f"DataBento authentication failed while fetching multiplier for {asset.symbol}: {e}", "red"))
176
+ raise
177
+ except Exception as e:
178
+ logger.warning(f"Could not fetch multiplier for {asset.symbol}: {e}")
195
179
 
196
- for asset, lazy_df in data_store.items():
180
+ def prefetch_data(self, assets, timestep="minute"):
181
+ """
182
+ Prefetch all required data for the specified assets for the entire backtest period.
183
+ This reduces redundant API calls and log spam during backtesting.
184
+
185
+ Parameters
186
+ ----------
187
+ assets : list of Asset
188
+ List of assets to prefetch data for
189
+ timestep : str, optional
190
+ Timestep to fetch (default: "minute")
191
+ """
192
+ if not assets:
193
+ return
194
+
195
+ logger.debug(f"Prefetching DataBento data for {len(assets)} assets...")
196
+
197
+ for asset in assets:
198
+ # Create search key for the asset
199
+ quote_asset = Asset("USD", "forex")
200
+ search_asset = (asset, quote_asset)
201
+
202
+ # Skip if already prefetched
203
+ if search_asset in self._prefetched_assets:
204
+ continue
205
+
197
206
  try:
198
- # Estimate size without collecting
199
- schema = lazy_df.collect_schema()
200
- # Rough estimate: 8 bytes per numeric value, 50 bytes per string
201
- bytes_per_row = sum(8 if str(dtype).startswith('Float') or str(dtype).startswith('Int')
202
- else 50 for dtype in schema.dtypes())
203
-
204
- # Try to get row count without full collect
205
- estimated_rows = 10000 # Default estimate
206
- if asset in self._filtered_data_cache:
207
- # Use cached data to estimate
208
- for key in self._filtered_data_cache:
209
- if key[0] == asset:
210
- estimated_rows = len(self._filtered_data_cache[key])
211
- break
212
-
213
- estimated_bytes = bytes_per_row * estimated_rows
214
- estimated_storage += estimated_bytes
215
- items_with_sizes.append((asset, estimated_bytes))
216
- except:
217
- # If estimation fails, use default
218
- items_with_sizes.append((asset, 100000)) # 100KB default
219
-
220
- logger.debug(f"Estimated storage: {estimated_storage:,} bytes for {len(data_store)} items")
221
-
222
- # Remove items if over limit
223
- if estimated_storage > self.MAX_STORAGE_BYTES:
224
- # Sort by size and remove largest first
225
- items_with_sizes.sort(key=lambda x: x[1], reverse=True)
226
- for asset, _ in items_with_sizes[:len(items_with_sizes)//2]:
227
- if asset in data_store:
228
- del data_store[asset]
229
- if asset in self._eager_cache:
230
- del self._eager_cache[asset]
231
- if asset in self._column_indices:
232
- del self._column_indices[asset]
233
- if asset in self._filtered_data_cache:
234
- # Clear related cache entries
235
- keys_to_remove = [k for k in self._filtered_data_cache if k[0] == asset]
236
- for k in keys_to_remove:
237
- del self._filtered_data_cache[k]
238
- logger.debug(f"Storage limit exceeded. Evicted data for {asset}")
239
-
240
- def _convert_to_polars(self, df, asset=None):
241
- """Convert pandas DataFrame or raw data to polars DataFrame efficiently."""
242
- if df is None:
243
- return None
244
-
245
- if isinstance(df, pl.DataFrame):
246
- return df
247
-
248
- # Convert pandas to polars
249
- try:
250
- if hasattr(df, 'index') and hasattr(df.index, 'name'):
251
- pl_df = pl.from_pandas(df.reset_index())
252
- else:
253
- pl_df = pl.from_pandas(df)
254
-
255
- # Ensure datetime column exists
256
- datetime_cols = ['datetime', 'timestamp', 'ts_event', 'time']
257
- datetime_col = None
258
- for col in datetime_cols:
259
- if col in pl_df.columns:
260
- datetime_col = col
261
- break
207
+ # Calculate start with buffer for better data coverage
208
+ start_datetime = self.datetime_start - START_BUFFER
209
+ end_datetime = self.datetime_end + timedelta(days=1)
210
+
211
+ logger.debug(f"Fetching {asset.symbol} data from {start_datetime.date()} to {end_datetime.date()}")
212
+
213
+ # Get data from DataBento for entire period
214
+ df = databento_helper.get_price_data_from_databento(
215
+ api_key=self._api_key,
216
+ asset=asset,
217
+ start=start_datetime,
218
+ end=end_datetime,
219
+ timestep=timestep,
220
+ venue=None,
221
+ force_cache_update=False
222
+ )
262
223
 
263
- if datetime_col and datetime_col != 'datetime':
264
- pl_df = pl_df.rename({datetime_col: 'datetime'})
224
+ is_empty = False
225
+ if df is None:
226
+ is_empty = True
227
+ elif hasattr(df, "empty"):
228
+ is_empty = df.empty
229
+ elif hasattr(df, "is_empty"):
230
+ is_empty = df.is_empty()
231
+
232
+ if is_empty:
233
+ # For empty data, create an empty Data object with proper timezone handling
234
+ empty_df = pd.DataFrame(columns=['open', 'high', 'low', 'close', 'volume'])
235
+ # Create an empty DatetimeIndex with proper timezone
236
+ empty_df.index = pd.DatetimeIndex([], tz=LUMIBOT_DEFAULT_PYTZ, name='datetime')
237
+
238
+ data_obj = Data(
239
+ asset,
240
+ df=empty_df,
241
+ timestep=timestep,
242
+ quote=quote_asset,
243
+ # Explicitly set dates to avoid timezone issues
244
+ date_start=None,
245
+ date_end=None
246
+ )
247
+ self.pandas_data[search_asset] = data_obj
248
+ else:
249
+ pandas_df = df.to_pandas() if hasattr(df, "to_pandas") else df
250
+ # Create Data object and store
251
+ data_obj = Data(
252
+ asset,
253
+ df=pandas_df,
254
+ timestep=timestep,
255
+ quote=quote_asset,
256
+ )
257
+ self.pandas_data[search_asset] = data_obj
258
+ cached_len = len(pandas_df) if hasattr(pandas_df, "__len__") else 0
259
+ logger.debug(f"Cached {cached_len} rows for {asset.symbol}")
260
+
261
+ # Mark as prefetched
262
+ self._prefetched_assets.add(search_asset)
263
+
264
+ except DataBentoAuthenticationError as e:
265
+ logger.error(colored(f"DataBento authentication failed while prefetching {asset.symbol}: {e}", "red"))
266
+ raise
267
+ except Exception as e:
268
+ logger.error(f"Error prefetching data for {asset.symbol}: {str(e)}")
269
+ logger.error(traceback.format_exc())
270
+
271
+ def _update_pandas_data(self, asset, quote, length, timestep, start_dt=None):
272
+ """
273
+ Get asset data and update the self.pandas_data dictionary.
265
274
 
266
- return pl_df
267
- except Exception as e:
268
- logger.error(f"Error converting to polars DataFrame: {e}")
269
- return None
275
+ This method retrieves historical data from DataBento and caches it for backtesting use.
276
+ If data has already been prefetched, it skips redundant API calls.
270
277
 
278
+ Parameters
279
+ ----------
280
+ asset : Asset
281
+ The asset to get data for.
282
+ quote : Asset
283
+ The quote asset to use. For DataBento, this is typically not used.
284
+ length : int
285
+ The number of data points to get.
286
+ timestep : str
287
+ The timestep to use. For example, "minute", "hour", or "day".
288
+ start_dt : datetime, optional
289
+ The start datetime to use. If None, the current self.datetime_start will be used.
290
+ """
291
+ search_asset = asset
292
+ asset_separated = asset
293
+ quote_asset = quote if quote is not None else Asset("USD", "forex")
271
294
 
272
- def get_historical_prices(
273
- self,
274
- asset,
275
- length,
276
- timestep="minute",
277
- timeshift=None,
278
- quote=None,
279
- exchange=None,
280
- include_after_hours=True,
281
- return_polars=False,
282
- ):
283
- logger.info(
284
- "[get_historical_prices] Getting historical prices for %s, length=%s, timestep=%s, current_dt=%s, datetime_start=%s",
285
- asset.symbol,
286
- length,
287
- timestep,
288
- self.get_datetime(),
289
- self.datetime_start,
290
- )
295
+ # Handle tuple assets (asset, quote pairs)
296
+ if isinstance(search_asset, tuple):
297
+ asset_separated, quote_asset = search_asset
298
+ else:
299
+ search_asset = (search_asset, quote_asset)
291
300
 
292
- supported_asset_types = [Asset.AssetType.FUTURE, Asset.AssetType.CONT_FUTURE]
293
- if asset.asset_type not in supported_asset_types:
294
- error_msg = (
295
- f"DataBento only supports futures assets. Received '{asset.asset_type}' for '{asset.symbol}'"
296
- )
297
- logger.error(error_msg)
298
- raise ValueError(error_msg)
301
+ # Ensure futures have correct multiplier set
302
+ self._ensure_futures_multiplier(asset_separated)
299
303
 
300
- cache_key = (asset, timestep)
304
+ # If this asset was already prefetched, we don't need to do anything
305
+ if search_asset in self._prefetched_assets:
306
+ logger.debug(f"[CACHE HIT] Asset {asset_separated.symbol} already prefetched")
307
+ return
301
308
 
302
- current_dt = self.get_datetime()
303
- if current_dt.tzinfo is None:
304
- current_dt = self.tzinfo.localize(current_dt)
309
+ # Check if we already have adequate data for this asset
310
+ if search_asset in self.pandas_data:
311
+ logger.debug(f"[CACHE CHECK] Checking existing data for {asset_separated.symbol}")
312
+ asset_data = self.pandas_data[search_asset]
313
+
314
+ # OPTIMIZATION: For DataPolars, check polars_df directly without converting to pandas
315
+ if isinstance(asset_data, DataPolars):
316
+ # Use polars DataFrame directly to avoid conversion overhead
317
+ polars_df = asset_data.polars_df
318
+ if polars_df.height > 0:
319
+ # Get datetime bounds from polars DataFrame
320
+ data_start_datetime = polars_df["datetime"].min()
321
+ data_end_datetime = polars_df["datetime"].max()
322
+
323
+ # Convert polars datetime to pandas Timestamp
324
+ data_start_datetime = pd.Timestamp(data_start_datetime)
325
+ data_end_datetime = pd.Timestamp(data_end_datetime)
326
+
327
+ # Convert UTC to default timezone for proper comparison
328
+ if data_start_datetime.tz is not None:
329
+ data_start_datetime = data_start_datetime.tz_convert(LUMIBOT_DEFAULT_PYTZ)
330
+ else:
331
+ data_start_datetime = data_start_datetime.tz_localize(LUMIBOT_DEFAULT_PYTZ)
305
332
 
306
- effective_dt = current_dt
307
- if timeshift:
308
- if isinstance(timeshift, int):
309
- effective_dt = effective_dt - timedelta(minutes=timeshift)
310
- else:
311
- effective_dt = effective_dt - timeshift
312
-
313
- current_dt_utc = effective_dt.astimezone(pytz.UTC)
314
- current_dt_naive_utc = current_dt_utc.replace(tzinfo=None)
315
-
316
- future_end = self.datetime_end
317
- if future_end.tzinfo is None:
318
- future_end = self.tzinfo.localize(future_end)
319
- future_end_naive = future_end.astimezone(pytz.UTC).replace(tzinfo=None)
320
-
321
- earliest_start = self.datetime_start
322
- if earliest_start.tzinfo is None:
323
- earliest_start = self.tzinfo.localize(earliest_start)
324
- earliest_start_naive = earliest_start.astimezone(pytz.UTC).replace(tzinfo=None)
325
-
326
- if timestep == "day":
327
- buffer_days = max(10, length // 2)
328
- dynamic_start = current_dt_naive_utc - timedelta(days=length + buffer_days)
329
- start_dt = min(dynamic_start, earliest_start_naive - timedelta(days=buffer_days))
330
- end_dt = future_end_naive
331
- coverage_buffer = timedelta(days=2)
332
- bar_delta = timedelta(days=1)
333
- elif timestep == "hour":
334
- buffer_hours = max(24, length // 2)
335
- start_dt = current_dt_naive_utc - timedelta(hours=length + buffer_hours)
336
- end_dt = min(current_dt_naive_utc + timedelta(days=30), future_end_naive)
337
- coverage_buffer = timedelta(hours=6)
338
- bar_delta = timedelta(hours=1)
339
- else:
340
- buffer_minutes = max(720, length + 100)
341
- start_dt = current_dt_naive_utc - timedelta(minutes=buffer_minutes)
342
- end_dt = min(current_dt_naive_utc + timedelta(days=3), future_end_naive)
343
- coverage_buffer = timedelta(minutes=30)
344
- bar_delta = timedelta(minutes=1)
345
-
346
- start_dt = self._to_naive_datetime(start_dt)
347
- end_dt = self._to_naive_datetime(end_dt)
348
-
349
- # Guarantee the requested window spans at least a full bar to avoid inverted ranges
350
- min_required_end = start_dt + bar_delta
351
- if end_dt <= start_dt:
352
- end_dt = min_required_end
353
- elif end_dt < min_required_end:
354
- end_dt = min_required_end
355
-
356
- cached_df = None
357
- coverage_ok = False
358
- if cache_key in self._filtered_data_cache:
359
- cached_df = self._ensure_strategy_timezone(self._filtered_data_cache[cache_key])
360
- self._filtered_data_cache[cache_key] = cached_df
361
-
362
- metadata = self._cache_metadata.get(cache_key)
363
- if metadata:
364
- cached_min = self._to_naive_datetime(metadata.get("min_dt"))
365
- cached_max = self._to_naive_datetime(metadata.get("max_dt"))
333
+ if data_end_datetime.tz is not None:
334
+ data_end_datetime = data_end_datetime.tz_convert(LUMIBOT_DEFAULT_PYTZ)
335
+ else:
336
+ data_end_datetime = data_end_datetime.tz_localize(LUMIBOT_DEFAULT_PYTZ)
337
+
338
+ data_timestep = asset_data.timestep
339
+
340
+ if data_timestep == timestep:
341
+ # Use timezone-aware timestamps for comparison
342
+ data_start_tz = data_start_datetime
343
+ data_end_tz = data_end_datetime
344
+
345
+ start_datetime, _ = self.get_start_datetime_and_ts_unit(
346
+ length, timestep, start_dt, start_buffer=START_BUFFER
347
+ )
348
+ start_tz = to_datetime_aware(start_datetime)
349
+
350
+ # start_tz already includes START_BUFFER from get_start_datetime_and_ts_unit
351
+ needed_start = start_tz
352
+ needed_end = self.datetime_end
353
+
354
+ if data_start_tz <= needed_start and data_end_tz >= needed_end:
355
+ # Data is already sufficient - return without converting to pandas!
356
+ logger.debug(f"[CACHE HIT] Data sufficient for {asset_separated.symbol}, returning early")
357
+ return
358
+ else:
359
+ logger.debug(f"[CACHE MISS] Data insufficient - need: {needed_start} to {needed_end}, have: {data_start_tz} to {data_end_tz}")
366
360
  else:
367
- cached_min = cached_df.lazy().select(pl.col("datetime").min()).collect().item()
368
- cached_max = cached_df.lazy().select(pl.col("datetime").max()).collect().item()
369
- cached_min = self._to_naive_datetime(cached_min)
370
- cached_max = self._to_naive_datetime(cached_max)
371
- self._cache_metadata[cache_key] = {
372
- "min_dt": cached_min,
373
- "max_dt": cached_max,
374
- "count": cached_df.height,
375
- }
376
-
377
- if cached_min is not None and cached_max is not None:
378
- coverage_ok = cached_min <= start_dt and cached_max >= (end_dt - coverage_buffer)
379
-
380
- logger.debug(
381
- "[get_historical_prices] cache window for %s (%s): min=%s max=%s required=[%s, %s] buffer=%s",
382
- asset.symbol,
383
- timestep,
384
- cached_min,
385
- cached_max,
386
- start_dt,
387
- end_dt,
388
- coverage_buffer,
361
+ # For pandas Data objects, use the regular .df property
362
+ asset_data_df = asset_data.df
363
+
364
+ # Only check if we have actual data (not empty DataFrame)
365
+ if not asset_data_df.empty and len(asset_data_df.index) > 0:
366
+ data_start_datetime = asset_data_df.index[0]
367
+ data_end_datetime = asset_data_df.index[-1]
368
+
369
+ # Get the timestep of the existing data
370
+ data_timestep = asset_data.timestep
371
+
372
+ # If the timestep matches, check if we have sufficient coverage
373
+ if data_timestep == timestep:
374
+ # Ensure both datetimes are timezone-aware for comparison
375
+ data_start_tz = to_datetime_aware(data_start_datetime)
376
+ data_end_tz = to_datetime_aware(data_end_datetime)
377
+
378
+ # Get the start datetime with buffer
379
+ start_datetime, _ = self.get_start_datetime_and_ts_unit(
380
+ length, timestep, start_dt, start_buffer=START_BUFFER
381
+ )
382
+ start_tz = to_datetime_aware(start_datetime)
383
+
384
+ # start_tz already includes START_BUFFER from get_start_datetime_and_ts_unit
385
+ needed_start = start_tz
386
+ needed_end = self.datetime_end
387
+
388
+ if data_start_tz <= needed_start and data_end_tz >= needed_end:
389
+ # Data is already sufficient - return silently
390
+ return
391
+
392
+ # We need to fetch new data from DataBento
393
+ # Create a unique key for logging to avoid spam
394
+ log_key = f"{asset_separated.symbol}_{timestep}"
395
+
396
+ try:
397
+ # Only log fetch message once per asset/timestep combination
398
+ if log_key not in self._logged_requests:
399
+ logger.debug(f"Fetching {timestep} data for {asset_separated.symbol}")
400
+ self._logged_requests.add(log_key)
401
+
402
+ # Get the start datetime and timestep unit
403
+ start_datetime, ts_unit = self.get_start_datetime_and_ts_unit(
404
+ length, timestep, start_dt, start_buffer=START_BUFFER
389
405
  )
390
406
 
391
- if coverage_ok:
392
- allow_current_bar = getattr(self, "_include_current_bar_for_orders", False)
393
- if isinstance(timeshift, int) and timeshift > 0:
394
- allow_current_bar = True
395
- elif isinstance(timeshift, timedelta) and timeshift.total_seconds() > 0:
396
- allow_current_bar = True
397
-
398
- cutoff_dt = effective_dt if allow_current_bar else effective_dt - bar_delta
399
-
400
- df_result = (
401
- cached_df.lazy()
402
- .filter(pl.col("datetime") <= pl.lit(cutoff_dt))
403
- .sort("datetime")
404
- .tail(length)
405
- .collect()
406
- )
407
-
408
- if df_result.height >= length:
409
- return Bars(
410
- df=df_result,
411
- source=self.SOURCE,
412
- asset=asset,
413
- quote=quote,
414
- return_polars=return_polars,
415
- )
416
- else:
417
- logger.debug(
418
- "Cache coverage insufficient for %s (%s); requesting additional data.",
419
- asset.symbol,
420
- timestep,
421
- )
407
+ # Calculate end datetime (use current backtest end or a bit beyond)
408
+ end_datetime = self.datetime_end + timedelta(days=1)
422
409
 
423
- logger.debug(
424
- "[get_historical_prices] Requesting DataBento data for %s from %s to %s",
425
- asset.symbol,
426
- start_dt,
427
- end_dt,
428
- )
410
+ # NOTE: Sliding window clamping is disabled during initial data fetch
411
+ # to ensure we have sufficient data for the entire backtest period.
412
+ # Runtime trimming is handled by _trim_cached_data() which is called
413
+ # periodically during get_historical_prices().
414
+ #
415
+ # Premature clamping here causes accuracy issues when strategies request
416
+ # more lookback than the window size (e.g., 500 bars with 5000 bar window)
429
417
 
430
- try:
431
- df = databento_helper_polars.get_price_data_from_databento_polars(
418
+ # Get data from DataBento (returns polars DataFrame by default)
419
+ _log_conversion("FETCH", "DataBento", "polars", "_update_pandas_data")
420
+ df = databento_helper.get_price_data_from_databento(
432
421
  api_key=self._api_key,
433
- asset=asset,
434
- start=start_dt,
435
- end=end_dt,
436
- timestep=timestep,
437
- venue=exchange,
438
- reference_date=effective_dt,
422
+ asset=asset_separated,
423
+ start=start_datetime,
424
+ end=end_datetime,
425
+ timestep=ts_unit,
426
+ venue=None, # Could add venue support later
427
+ force_cache_update=False,
428
+ return_polars=True # Fetch as polars for optimal performance
439
429
  )
440
430
 
441
- if df is None:
442
- logger.error(
443
- "[get_historical_prices] No data returned from DataBento for %s - df is None",
444
- asset.symbol,
431
+ # Check if DataFrame is empty (works for both pandas and polars)
432
+ is_empty = df is None or (hasattr(df, 'is_empty') and df.is_empty()) or (hasattr(df, 'empty') and df.empty)
433
+
434
+ if is_empty:
435
+ # For empty data, create an empty Data object with proper timezone handling
436
+ # to maintain backward compatibility with tests
437
+ empty_df = pd.DataFrame(columns=['open', 'high', 'low', 'close', 'volume'])
438
+ # Create an empty DatetimeIndex with proper timezone
439
+ empty_df.index = pd.DatetimeIndex([], tz=LUMIBOT_DEFAULT_PYTZ, name='datetime')
440
+
441
+ data_obj = Data(
442
+ asset_separated,
443
+ df=empty_df,
444
+ timestep=ts_unit,
445
+ quote=quote_asset,
446
+ # Use timezone-aware dates to avoid timezone issues
447
+ date_start=LUMIBOT_DEFAULT_PYTZ.localize(datetime(2000, 1, 1)),
448
+ date_end=LUMIBOT_DEFAULT_PYTZ.localize(datetime(2000, 1, 1))
445
449
  )
446
- return None
447
- if df.is_empty():
448
- logger.error(
449
- "[get_historical_prices] No data returned from DataBento for %s - df is empty",
450
- asset.symbol,
450
+ self.pandas_data[search_asset] = data_obj
451
+ return
452
+
453
+ # Handle polars DataFrame (has 'datetime' column) or pandas DataFrame (has datetime index)
454
+ if isinstance(df, pl.DataFrame):
455
+ _log_conversion("STORE", "polars", "DataPolars", "_update_pandas_data")
456
+ logger.debug(f"[POLARS] Storing polars DataFrame for {asset_separated.symbol}: {df.height} rows")
457
+ # Create DataPolars object with polars DataFrame (keeps polars end-to-end)
458
+ data_obj = DataPolars(
459
+ asset_separated,
460
+ df=df,
461
+ timestep=ts_unit,
462
+ quote=quote_asset,
451
463
  )
452
- return None
453
-
454
- df = self._ensure_strategy_timezone(df)
455
-
456
- if self.enable_cache:
457
- if cached_df is not None:
458
- combined_df = pl.concat([cached_df, df], how="vertical", rechunk=True)
459
- combined_df = combined_df.unique(subset=["datetime"]).sort("datetime")
460
- else:
461
- combined_df = df
462
-
463
- self._filtered_data_cache[cache_key] = combined_df
464
-
465
- cache_min = combined_df.lazy().select(pl.col("datetime").min()).collect().item()
466
- cache_max = combined_df.lazy().select(pl.col("datetime").max()).collect().item()
467
- cache_min = self._to_naive_datetime(cache_min)
468
- cache_max = self._to_naive_datetime(cache_max)
469
- self._cache_metadata[cache_key] = {
470
- "min_dt": cache_min,
471
- "max_dt": cache_max,
472
- "count": combined_df.height,
473
- }
474
- df_to_use = combined_df
475
- else:
476
- df_to_use = df
477
-
478
- allow_current_bar = getattr(self, "_include_current_bar_for_orders", False)
479
- if isinstance(timeshift, int) and timeshift > 0:
480
- allow_current_bar = True
481
- elif isinstance(timeshift, timedelta) and timeshift.total_seconds() > 0:
482
- allow_current_bar = True
483
-
484
- cutoff_dt_api = effective_dt if allow_current_bar else effective_dt - bar_delta
485
-
486
- df_result = (
487
- df_to_use.lazy()
488
- .filter(pl.col("datetime") <= pl.lit(cutoff_dt_api))
489
- .sort("datetime")
490
- .tail(length)
491
- .collect()
492
- )
493
-
494
- if df_result.is_empty():
495
- logger.warning(
496
- "No data available for %s up to %s",
497
- asset.symbol,
498
- effective_dt,
464
+ elif isinstance(df, pd.DataFrame):
465
+ # Ensure the pandas DataFrame has a datetime index
466
+ if not isinstance(df.index, pd.DatetimeIndex):
467
+ logger.error(f"DataBento data for {asset_separated.symbol} doesn't have datetime index")
468
+ return
469
+ # Create Data object with pandas DataFrame
470
+ data_obj = Data(
471
+ asset_separated,
472
+ df=df,
473
+ timestep=ts_unit,
474
+ quote=quote_asset,
499
475
  )
500
- return None
476
+ else:
477
+ logger.error(f"Unexpected DataFrame type: {type(df)}")
478
+ return
501
479
 
502
- return Bars(
503
- df=df_result,
504
- source=self.SOURCE,
505
- asset=asset,
506
- quote=quote,
507
- return_polars=return_polars,
508
- tzinfo=self.tzinfo,
509
- )
480
+ self.pandas_data[search_asset] = data_obj
510
481
 
482
+ except DataBentoAuthenticationError as e:
483
+ logger.error(colored(f"DataBento authentication failed for {asset_separated.symbol}: {e}", "red"))
484
+ raise
511
485
  except Exception as e:
512
- logger.error(f"Error getting data from DataBento for {asset.symbol}: {e}")
513
- return None
486
+ logger.error(f"Error updating pandas data for {asset_separated.symbol}: {str(e)}")
487
+ logger.error(traceback.format_exc())
514
488
 
515
489
  def get_last_price(self, asset, quote=None, exchange=None):
516
490
  """
517
- Get the last known price for an asset using cached data when possible
518
-
491
+ Get the last price for an asset at the current backtest time
492
+
519
493
  Parameters
520
494
  ----------
521
495
  asset : Asset
522
- The asset to get the last price for
496
+ Asset to get the price for
523
497
  quote : Asset, optional
524
- Quote asset (not used for DataBento)
498
+ Quote asset (not typically used with DataBento)
525
499
  exchange : str, optional
526
- Exchange/venue filter
527
-
500
+ Exchange filter
501
+
528
502
  Returns
529
503
  -------
530
- float or None
531
- Last known price of the asset
504
+ float, Decimal, or None
505
+ Last price at current backtest time
532
506
  """
533
- # Check cache first
534
- cache_key = (asset, self.get_datetime())
535
- if cache_key in self._last_price_cache:
536
- cached_price = self._last_price_cache[cache_key]
537
- logger.debug(f"Using cached last price for {asset.symbol}: {cached_price}")
538
- return cached_price
539
-
540
- logger.debug(f"Getting last price for {asset.symbol}")
541
-
542
- # Try to get from lazy data first (more memory efficient)
543
- if asset in self._data_store:
544
- lazy_df = self._data_store[asset]
545
-
546
- # Get current time for filtering
507
+ try:
508
+ # OPTIMIZATION: Check cache first
509
+ self._check_and_clear_cache()
547
510
  current_dt = self.get_datetime()
548
511
 
549
- # Make timezone-naive for comparison
550
- if current_dt.tzinfo is not None:
551
- current_dt_naive = current_dt.replace(tzinfo=None)
552
- else:
553
- current_dt_naive = current_dt
554
-
555
- # Get last price with single lazy operation
556
- try:
557
- cutoff_dt_lp = current_dt_naive - timedelta(minutes=1)
558
- last_price = (
559
- lazy_df
560
- .filter(pl.col('datetime') <= pl.lit(cutoff_dt_lp))
561
- .select(pl.col('close').tail(1))
562
- .collect()
563
- .item()
564
- )
512
+ # Try to get data from our cached pandas_data first
513
+ search_asset = asset
514
+ quote_asset = quote if quote is not None else Asset("USD", "forex")
565
515
 
566
- if last_price is not None:
567
- last_price = float(last_price)
568
- cache_key = (asset, self.get_datetime())
569
- self._last_price_cache[asset] = last_price
570
- logger.debug(f"Last price from lazy data for {asset.symbol}: {last_price}")
571
- return last_price
572
- except:
573
- pass # Fall back to historical prices
574
-
575
- # Fall back to getting historical prices
576
- bars = self.get_historical_prices(asset, 1, "minute", exchange=exchange)
577
- if bars and not bars.empty:
578
- # Get the last close price - handle both index types
579
- df = bars.df
580
- if 'close' in df.columns:
581
- last_price = float(df['close'].iloc[-1])
582
- cache_key = (asset, self.get_datetime())
583
- self._last_price_cache[asset] = last_price
584
- logger.debug(f"Last price from historical for {asset.symbol}: {last_price}")
585
- return last_price
586
-
587
- logger.warning(f"No last price available for {asset.symbol}")
588
- return None
516
+ if isinstance(search_asset, tuple):
517
+ asset_separated, quote_asset = search_asset
518
+ else:
519
+ search_asset = (search_asset, quote_asset)
520
+ asset_separated = asset
521
+
522
+ # Ensure futures have correct multiplier set
523
+ self._ensure_futures_multiplier(asset_separated)
524
+
525
+ # OPTIMIZATION: Check iteration cache
526
+ cache_key = (search_asset, current_dt)
527
+ if cache_key in self._last_price_cache:
528
+ return self._last_price_cache[cache_key]
529
+
530
+ if search_asset not in self.pandas_data:
531
+ fetch_timestep = getattr(self, '_timestep', self.MIN_TIMESTEP if hasattr(self, 'MIN_TIMESTEP') else 'minute')
532
+ self._update_pandas_data(asset_separated, quote_asset, length=10, timestep=fetch_timestep)
533
+
534
+ if search_asset in self.pandas_data:
535
+ asset_data = self.pandas_data[search_asset]
536
+
537
+ # OPTIMIZATION: If asset_data is DataPolars, work with polars directly to avoid conversion
538
+ if isinstance(asset_data, DataPolars):
539
+ polars_df = asset_data.polars_df
540
+
541
+ if polars_df.height > 0 and 'close' in polars_df.columns:
542
+ # Ensure current_dt is timezone-aware for comparison
543
+ current_dt_aware = to_datetime_aware(current_dt)
544
+
545
+ # Step back one bar so only fully closed bars are visible
546
+ bar_delta = timedelta(minutes=1)
547
+ if asset_data.timestep == "hour":
548
+ bar_delta = timedelta(hours=1)
549
+ elif asset_data.timestep == "day":
550
+ bar_delta = timedelta(days=1)
551
+
552
+ cutoff_dt = current_dt_aware - bar_delta
553
+
554
+ # Convert to UTC for polars comparison (polars DataFrame datetime is in UTC)
555
+ polars_tz = polars_df["datetime"].dtype.time_zone
556
+ if polars_tz:
557
+ cutoff_dt_compat = pd.Timestamp(cutoff_dt).tz_convert(polars_tz)
558
+ current_dt_compat = pd.Timestamp(current_dt_aware).tz_convert(polars_tz)
559
+ else:
560
+ cutoff_dt_compat = cutoff_dt
561
+ current_dt_compat = current_dt_aware
562
+
563
+ # Filter using polars operations (no conversion!)
564
+ filtered_df = polars_df.filter(pl.col("datetime") <= cutoff_dt_compat)
565
+
566
+ # If we have no prior bar (e.g., first iteration), allow the current timestamp
567
+ if filtered_df.height == 0:
568
+ filtered_df = polars_df.filter(pl.col("datetime") <= current_dt_compat)
569
+
570
+ if filtered_df.height > 0:
571
+ last_price = filtered_df['close'][-1]
572
+ if not pd.isna(last_price):
573
+ price = float(last_price)
574
+ # OPTIMIZATION: Cache the result
575
+ self._last_price_cache[cache_key] = price
576
+ return price
577
+ else:
578
+ # For regular Data objects, use pandas operations
579
+ df = asset_data.df
580
+
581
+ if not df.empty and 'close' in df.columns:
582
+ # Ensure current_dt is timezone-aware for comparison
583
+ current_dt_aware = to_datetime_aware(current_dt)
584
+
585
+ # Step back one bar so only fully closed bars are visible
586
+ bar_delta = timedelta(minutes=1)
587
+ if asset_data.timestep == "hour":
588
+ bar_delta = timedelta(hours=1)
589
+ elif asset_data.timestep == "day":
590
+ bar_delta = timedelta(days=1)
591
+
592
+ cutoff_dt = current_dt_aware - bar_delta
593
+
594
+ # Filter to data up to current backtest time (exclude current bar unless broker overrides)
595
+ filtered_df = df[df.index <= cutoff_dt]
596
+
597
+ # If we have no prior bar (e.g., first iteration), allow the current timestamp
598
+ if filtered_df.empty:
599
+ filtered_df = df[df.index <= current_dt_aware]
600
+
601
+ if not filtered_df.empty:
602
+ last_price = filtered_df['close'].iloc[-1]
603
+ if not pd.isna(last_price):
604
+ price = float(last_price)
605
+ # OPTIMIZATION: Cache the result
606
+ self._last_price_cache[cache_key] = price
607
+ return price
608
+
609
+ # If no cached data, try to get recent data
610
+ logger.warning(f"No cached data for {asset.symbol}, attempting direct fetch")
611
+ return databento_helper.get_last_price_from_databento(
612
+ api_key=self._api_key,
613
+ asset=asset_separated,
614
+ venue=exchange
615
+ )
616
+
617
+ except DataBentoAuthenticationError as e:
618
+ logger.error(colored(f"DataBento authentication failed while getting last price for {asset.symbol}: {e}", "red"))
619
+ raise
620
+ except Exception as e:
621
+ logger.error(f"Error getting last price for {asset.symbol}: {e}")
622
+ return None
589
623
 
590
624
  def get_chains(self, asset, quote=None):
591
- """DataBento doesn't provide options chain data"""
625
+ """
626
+ Get option chains for an asset
627
+
628
+ DataBento doesn't provide options chain data, so this returns an empty dict.
629
+
630
+ Parameters
631
+ ----------
632
+ asset : Asset
633
+ Asset to get chains for
634
+ quote : Asset, optional
635
+ Quote asset
636
+
637
+ Returns
638
+ -------
639
+ dict
640
+ Empty dictionary
641
+ """
592
642
  logger.warning("DataBento does not provide options chain data")
593
643
  return {}
594
644
 
595
645
  def get_quote(self, asset, quote=None):
596
- """Get current quote for an asset"""
597
- return self.get_last_price(asset, quote=quote)
598
-
599
- def clear_cache(self):
600
- """Clear all cached data to free memory"""
601
- self._data_store.clear()
602
- self._eager_cache.clear()
603
- self._column_indices.clear()
604
- self._filtered_data_cache.clear()
605
- self._last_price_cache.clear()
606
- logger.info("Cleared all DataBento data caches")
646
+ """Return a Quote object using cached bars or a direct fetch."""
647
+ try:
648
+ search_asset = asset if isinstance(asset, tuple) else (asset, Asset("USD", "forex"))
649
+ asset_data = self.pandas_data.get(search_asset)
650
+ df = None
651
+ if isinstance(asset_data, DataPolars):
652
+ df = asset_data.polars_df
653
+ elif asset_data is not None:
654
+ df = asset_data.polars_df if hasattr(asset_data, "polars_df") else asset_data.df
655
+ if df is None:
656
+ default_timestep = getattr(self, "_timestep", self.MIN_TIMESTEP if hasattr(self, "MIN_TIMESTEP") else "minute")
657
+ df = self._pull_source_symbol_bars(asset, length=1, timestep=default_timestep)
658
+ bid = ask = price = volume = mid = None
659
+ if isinstance(df, pl.DataFrame) and df.height > 0:
660
+ row = df.row(0, named=True)
661
+ bid = row.get("bid")
662
+ ask = row.get("ask")
663
+ price = row.get("close")
664
+ volume = row.get("volume")
665
+ elif isinstance(df, pd.DataFrame) and not df.empty:
666
+ row = df.iloc[-1]
667
+ bid = row.get("bid")
668
+ ask = row.get("ask")
669
+ price = row.get("close")
670
+ volume = row.get("volume")
671
+ if bid is not None and ask is not None:
672
+ mid = float(bid + ask) / 2.0
673
+ quote_obj = Quote(
674
+ asset if not isinstance(asset, tuple) else asset[0],
675
+ price=float(price) if price is not None else None,
676
+ bid=float(bid) if bid is not None else None,
677
+ ask=float(ask) if ask is not None else None,
678
+ volume=float(volume) if volume is not None else None,
679
+ mid_price=mid,
680
+ raw_data={"bid": bid, "ask": ask, "price": price},
681
+ )
682
+ quote_obj.source = "polars"
683
+ return quote_obj
684
+ except DataBentoAuthenticationError as exc:
685
+ logger.error(colored(f"DataBento authentication failed while getting quote for {asset}: {exc}", "red"))
686
+ raise
687
+ except Exception as exc:
688
+ logger.error(f"Error getting quote for {asset}: {exc}")
689
+ return Quote(asset if not isinstance(asset, tuple) else asset[0], raw_data={})
690
+
+    def _get_bars_dict(self, assets, length, timestep, timeshift=None):
+        """
+        Override parent method to handle DataBento-specific data retrieval.
+
+        Parameters
+        ----------
+        assets : list
+            List of assets to get data for
+        length : int
+            Number of bars to retrieve
+        timestep : str
+            Timestep for the data
+        timeshift : timedelta, optional
+            Time shift to apply
+
+        Returns
+        -------
+        dict
+            Dictionary mapping assets to their bar data
+        """
+        result = {}
+
+        for asset in assets:
+            try:
+                # Update pandas data if needed
+                self._update_pandas_data(asset, None, length, timestep)
+
+                # Get data from pandas_data
+                search_asset = asset
+                if not isinstance(search_asset, tuple):
+                    search_asset = (search_asset, Asset("USD", "forex"))
+
+                if search_asset in self.pandas_data:
+                    asset_data = self.pandas_data[search_asset]
+                    df = asset_data.df
+
+                    if not df.empty:
+                        # Apply timeshift if specified
+                        current_dt = self.get_datetime()
+                        shift_seconds = 0
+                        if timeshift:
+                            if isinstance(timeshift, int):
+                                shift_seconds = timeshift * 60
+                                current_dt = current_dt - timedelta(minutes=timeshift)
+                            else:
+                                shift_seconds = timeshift.total_seconds()
+                                current_dt = current_dt - timeshift
+
+                        # Ensure current_dt is timezone-aware for comparison
+                        current_dt_aware = to_datetime_aware(current_dt)
+
+                        # Filter data up to current backtest time (exclude current bar unless broker overrides)
+                        include_current = getattr(self, "_include_current_bar_for_orders", False)
+                        allow_current = include_current or shift_seconds > 0
+                        mask = df.index <= current_dt_aware if allow_current else df.index < current_dt_aware
+                        filtered_df = df[mask]
+
+                        # Take the last 'length' bars
+                        result_df = filtered_df.tail(length)
+
+                        if not result_df.empty:
+                            result[asset] = result_df
+                        else:
+                            logger.warning(f"No data available for {asset.symbol} at {current_dt}")
+                            result[asset] = None
+                    else:
+                        logger.warning(f"Empty data for {asset.symbol}")
+                        result[asset] = None
+                else:
+                    logger.warning(f"No data found for {asset.symbol}")
+                    result[asset] = None
+
+            except DataBentoAuthenticationError as e:
+                logger.error(colored(f"DataBento authentication failed while getting bars for {asset}: {e}", "red"))
+                raise
+            except Exception as e:
+                logger.error(f"Error getting bars for {asset}: {e}")
+                result[asset] = None
+
+        return result
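On the timeshift handling above: an int is interpreted as minutes, a timedelta is used as-is, and both move the effective "now" backwards. A self-contained sketch of that normalization (helper name is illustrative):

    from datetime import datetime, timedelta

    def shifted_now(current_dt, timeshift=None):
        # Mirrors _get_bars_dict: int -> minutes, timedelta -> as-is
        if not timeshift:
            return current_dt, 0
        if isinstance(timeshift, int):
            return current_dt - timedelta(minutes=timeshift), timeshift * 60
        return current_dt - timeshift, timeshift.total_seconds()

    now = datetime(2024, 1, 2, 9, 35)
    assert shifted_now(now, 5) == (datetime(2024, 1, 2, 9, 30), 300)
    assert shifted_now(now, timedelta(hours=1))[0] == datetime(2024, 1, 2, 8, 35)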
 
     def _pull_source_symbol_bars(
         self,
@@ -616,62 +780,212 @@ class DataBentoDataBacktestingPolars(DataSourceBacktesting):
         include_after_hours=True,
     ):
         """
-        Pull historical bars from DataBento data source.
-
-        This is the critical method that the backtesting framework calls to get data.
-        It must return a pandas DataFrame for compatibility with the backtesting engine.
-
-        Parameters
-        ----------
-        asset : Asset
-            The asset to get data for
-        length : int
-            Number of bars to retrieve
-        timestep : str
-            Timestep for the data ('minute', 'hour', 'day')
-        timeshift : int
-            Minutes to shift back in time
-        quote : Asset, optional
-            Quote asset (not used for DataBento)
-        exchange : str, optional
-            Exchange/venue filter
-        include_after_hours : bool
-            Whether to include after-hours data
-
-        Returns
-        -------
-        pandas.DataFrame
-            Historical price data with datetime index
+        Override parent method to fetch data from DataBento instead of the pre-loaded data store.
+
+        This method is called by get_historical_prices and is responsible for actually
+        fetching the data from the DataBento API.
         """
         timestep = timestep if timestep else "minute"
 
-        logger.debug(f"[_pull_source_symbol_bars] Called with asset={asset.symbol}, length={length}, timestep={timestep}, timeshift={timeshift}")
-
-        # Get historical prices using our existing method
-        bars = self.get_historical_prices(
-            asset=asset,
-            length=length,
-            timestep=timestep,
-            timeshift=timedelta(minutes=timeshift) if timeshift else None,
-            quote=quote,
-            exchange=exchange,
-            include_after_hours=include_after_hours
-        )
+        # OPTIMIZATION: Check iteration cache first
+        self._check_and_clear_cache()
+        current_dt = self.get_datetime()
 
-        if bars is None:
-            logger.warning(f"[_pull_source_symbol_bars] bars is None for {asset.symbol}")
-            return None
+        # Get data from our cached pandas_data
+        search_asset = asset
+        quote_asset = quote if quote is not None else Asset("USD", "forex")
 
-        if bars.empty:
-            logger.warning(f"[_pull_source_symbol_bars] bars is empty for {asset.symbol}")
-            return None
+        if isinstance(search_asset, tuple):
+            asset_separated, quote_asset = search_asset
+        else:
+            search_asset = (search_asset, quote_asset)
+            asset_separated = asset
 
-        # Return the pandas DataFrame from the Bars object
-        # The Bars.df property already converts to pandas when accessed
-        result_df = bars.df
-        logger.debug(f"[_pull_source_symbol_bars] Returning DataFrame with shape {result_df.shape} for {asset.symbol}")
-        if not result_df.empty:
-            logger.debug(f"[_pull_source_symbol_bars] DataFrame columns: {result_df.columns.tolist()}")
-            logger.debug(f"[_pull_source_symbol_bars] First row: {result_df.iloc[0].to_dict() if len(result_df) > 0 else 'N/A'}")
-            logger.debug(f"[_pull_source_symbol_bars] Last row: {result_df.iloc[-1].to_dict() if len(result_df) > 0 else 'N/A'}")
-        return result_df
+        # OPTIMIZATION: Build cache key and check cache
+        # Convert timeshift to consistent format for caching
+        timeshift_key = 0
+        if timeshift:
+            if isinstance(timeshift, int):
+                timeshift_key = timeshift
+            else:
+                timeshift_key = int(timeshift.total_seconds() / 60)
+
+        cache_key = (search_asset, length, timestep, timeshift_key, current_dt)
+        if cache_key in self._filtered_bars_cache:
+            return self._filtered_bars_cache[cache_key]
+
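The cache key above includes current_dt, so an entry can only be reused within the same backtest iteration, and the normalization makes an int timeshift and its timedelta equivalent hash to the same key. A small sketch of that step (function name is illustrative):

    from datetime import timedelta

    def timeshift_key(timeshift):
        # Mirrors the normalization above: whole minutes for both input forms
        if not timeshift:
            return 0
        if isinstance(timeshift, int):
            return timeshift
        return int(timeshift.total_seconds() / 60)

    assert timeshift_key(5) == timeshift_key(timedelta(minutes=5)) == 5
    assert timeshift_key(None) == 0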
+        # Check if we need to fetch data by calling _update_pandas_data first
+        # This will only fetch if data is not already cached or prefetched
+        self._update_pandas_data(asset, quote, length, timestep)
+
+        # Check if we have data in pandas_data cache
+        if search_asset in self.pandas_data:
+            asset_data = self.pandas_data[search_asset]
+
+            # OPTIMIZATION: If asset_data is DataPolars, work with polars directly to avoid conversion
+            if isinstance(asset_data, DataPolars):
+                polars_df = asset_data.polars_df
+
+                if polars_df.height > 0:
+                    # ========================================================================
+                    # CRITICAL: NEGATIVE TIMESHIFT ARITHMETIC FOR LOOKAHEAD (MATCHES PANDAS)
+                    # ========================================================================
+                    # Negative timeshift allows broker to "peek ahead" for realistic fills.
+                    # This arithmetic MUST match pandas exactly: current_dt - timeshift
+                    # With timeshift=-2: current_dt - (-2) = current_dt + 2 minutes ✓
+                    # ========================================================================
+                    shift_seconds = 0
+                    if timeshift:
+                        if isinstance(timeshift, int):
+                            shift_seconds = timeshift * 60
+                            current_dt = current_dt - timedelta(minutes=timeshift)  # FIXED: was +, now matches pandas
+                        else:
+                            shift_seconds = timeshift.total_seconds()
+                            current_dt = current_dt - timeshift  # FIXED: was +, now matches pandas
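+                    # Worked example: with broker time 09:30 and timeshift=-2 (int minutes),
+                    # shift_seconds = -120 and current_dt = 09:30 - (-2 min) = 09:32 (peek ahead);
+                    # with timeshift=timedelta(minutes=5), current_dt = 09:25 (look back).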
+
+                    # Ensure current_dt is timezone-aware for comparison
+                    current_dt_aware = to_datetime_aware(current_dt)
+
+                    # Step back one bar to avoid exposing the in-progress bar
+                    bar_delta = timedelta(minutes=1)
+                    if asset_data.timestep == "hour":
+                        bar_delta = timedelta(hours=1)
+                    elif asset_data.timestep == "day":
+                        bar_delta = timedelta(days=1)
+
+                    cutoff_dt = current_dt_aware - bar_delta
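+                    # e.g. minute bars with current_dt_aware = 09:31 give cutoff_dt = 09:30, so with a
+                    # positive timeshift the <= cutoff branch below returns bars stamped 09:30 or earlier
+                    # (the 09:30 bar is the latest fully closed bar).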
+
+                    # Convert to UTC for polars comparison (polars DataFrame datetime is in UTC)
+                    # Get the timezone from polars DataFrame
+                    polars_tz = polars_df["datetime"].dtype.time_zone
+                    if polars_tz:
+                        # Convert current_dt_aware to match polars timezone
+                        cutoff_dt_compat = pd.Timestamp(cutoff_dt).tz_convert(polars_tz)
+                        current_dt_compat = pd.Timestamp(current_dt_aware).tz_convert(polars_tz)
+                    else:
+                        cutoff_dt_compat = cutoff_dt
+                        current_dt_compat = current_dt_aware
+
+                    # INSTRUMENTATION: Log timeshift application and filtering
+                    broker_dt_orig = self.get_datetime()
+                    filter_branch = "shift_seconds > 0 (<=cutoff)" if shift_seconds > 0 else "shift_seconds <= 0 (<current)"
+
+                    # Filter using polars operations (no conversion!)
+                    if shift_seconds > 0:
+                        filtered_df = polars_df.filter(pl.col("datetime") <= cutoff_dt_compat)
+                    else:
+                        filtered_df = polars_df.filter(pl.col("datetime") < current_dt_compat)
+
+                    # Log what bar we're returning
+                    if filtered_df.height > 0:
+                        returned_bar_dt = filtered_df["datetime"][-1]
+                        logger.debug(f"[TIMESHIFT_POLARS] asset={asset_separated.symbol} broker_dt={broker_dt_orig} "
+                                     f"timeshift={timeshift} shift_seconds={shift_seconds} "
+                                     f"shifted_dt={current_dt_aware} cutoff_dt={cutoff_dt} "
+                                     f"filter={filter_branch} returned_bar={returned_bar_dt}")
+
+                    # Take the last 'length' bars
+                    result_df = filtered_df.tail(length)
+
+                    # OPTIMIZATION: Cache the result before returning
+                    if result_df.height > 0:
+                        self._filtered_bars_cache[cache_key] = result_df
+                        return result_df
+                    else:
+                        self._filtered_bars_cache[cache_key] = None
+                        return None
+                else:
+                    return None
+            else:
+                # For regular Data objects, use pandas operations
+                df = asset_data.df
+
+                if not df.empty:
+                    # ========================================================================
+                    # CRITICAL: NEGATIVE TIMESHIFT ARITHMETIC FOR LOOKAHEAD (MATCHES PANDAS)
+                    # ========================================================================
+                    # Negative timeshift allows broker to "peek ahead" for realistic fills.
+                    # This arithmetic MUST match pandas exactly: current_dt - timeshift
+                    # With timeshift=-2: current_dt - (-2) = current_dt + 2 minutes ✓
+                    # ========================================================================
+                    shift_seconds = 0
+                    if timeshift:
+                        if isinstance(timeshift, int):
+                            shift_seconds = timeshift * 60
+                            current_dt = current_dt - timedelta(minutes=timeshift)  # FIXED: was +, now matches pandas
+                        else:
+                            shift_seconds = timeshift.total_seconds()
+                            current_dt = current_dt - timeshift  # FIXED: was +, now matches pandas
+
+                    # Ensure current_dt is timezone-aware for comparison
+                    current_dt_aware = to_datetime_aware(current_dt)
+
+                    # Step back one bar to avoid exposing the in-progress bar
+                    bar_delta = timedelta(minutes=1)
+                    if asset_data.timestep == "hour":
+                        bar_delta = timedelta(hours=1)
+                    elif asset_data.timestep == "day":
+                        bar_delta = timedelta(days=1)
+
+                    cutoff_dt = current_dt_aware - bar_delta
+
+                    # INSTRUMENTATION: Log timeshift application and filtering (pandas fallback)
+                    broker_dt_orig = self.get_datetime()
+                    filter_branch = "shift_seconds > 0 (<=cutoff)" if shift_seconds > 0 else "shift_seconds <= 0 (<current)"
+
+                    # Filter data up to current backtest time (exclude current bar unless broker overrides)
+                    filtered_df = df[df.index <= cutoff_dt] if shift_seconds > 0 else df[df.index < current_dt_aware]
+
+                    # Log what bar we're returning
+                    if not filtered_df.empty:
+                        returned_bar_dt = filtered_df.index[-1]
+                        logger.debug(f"[TIMESHIFT_POLARS_PD] asset={asset_separated.symbol} broker_dt={broker_dt_orig} "
+                                     f"timeshift={timeshift} shift_seconds={shift_seconds} "
+                                     f"shifted_dt={current_dt_aware} cutoff_dt={cutoff_dt} "
+                                     f"filter={filter_branch} returned_bar={returned_bar_dt}")
+
+                    # Take the last 'length' bars
+                    result_df = filtered_df.tail(length)
+
+                    # OPTIMIZATION: Cache the result before returning
+                    if not result_df.empty:
+                        self._filtered_bars_cache[cache_key] = result_df
+                        return result_df
+                    else:
+                        self._filtered_bars_cache[cache_key] = None
+                        return None
+                else:
+                    return None
+        else:
+            return None
+
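The timezone handling above is the subtle part: the cutoff is converted into the polars column's own timezone before the comparison, so the filter compares like with like. A self-contained sketch of the same pattern (toy data and timestamps; exact behavior may vary slightly across polars versions):

    from datetime import datetime, timedelta, timezone

    import pandas as pd
    import polars as pl

    start = datetime(2024, 1, 2, 9, 30, tzinfo=timezone.utc)
    bars = pl.DataFrame({
        "datetime": [start + timedelta(minutes=i) for i in range(5)],  # 09:30..09:34 UTC
        "close": [1.0, 2.0, 3.0, 4.0, 5.0],
    })

    tz = bars["datetime"].dtype.time_zone  # "UTC" for tz-aware input
    # A cutoff expressed in another zone is converted before filtering
    cutoff = pd.Timestamp("2024-01-02 04:32", tz="America/New_York").tz_convert(tz)
    assert bars.filter(pl.col("datetime") <= cutoff).height == 3  # 09:30, 09:31, 09:32 UTC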
+    def initialize_data_for_backtest(self, strategy_assets, timestep="minute"):
+        """
+        Convenience method to prefetch all required data for a backtest strategy.
+        This should be called during strategy initialization to load all data up front.
+
+        Parameters
+        ----------
+        strategy_assets : list of Asset or list of str
+            List of assets or asset symbols that the strategy will use
+        timestep : str, optional
+            Primary timestep for the data (default: "minute")
+        """
+        # Convert string symbols to Asset objects if needed
+        assets = []
+        for asset in strategy_assets:
+            if isinstance(asset, str):
+                # Try to determine asset type from symbol format
+                if any(month in asset for month in ['F', 'G', 'H', 'J', 'K', 'M', 'N', 'Q', 'U', 'V', 'X', 'Z']):
+                    # Looks like a futures symbol
+                    assets.append(Asset(asset, "future"))
+                else:
+                    # Default to stock
+                    assets.append(Asset(asset, "stock"))
+            else:
+                assets.append(asset)
+
+        # Prefetch data for all assets
+        self.prefetch_data(assets, timestep)
+
+        logger.debug(f"Initialized DataBento backtesting with prefetched data for {len(assets)} assets")
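One caveat on the symbol heuristic above: the twelve futures month codes cover so much of the alphabet that almost any equity ticker (e.g. "MSFT") also matches, so string symbols will usually be classified as futures. A stricter, hypothetical check (not part of the package) would anchor on the root-month-year shape instead:

    import re

    # Hypothetical alternative: 1-3 char root + month code + 1-2 digit year, e.g. ESZ4, CLH25
    FUTURES_RE = re.compile(r"^[A-Z]{1,3}[FGHJKMNQUVXZ]\d{1,2}$")

    def looks_like_futures_symbol(symbol: str) -> bool:
        return bool(FUTURES_RE.match(symbol))

    assert looks_like_futures_symbol("ESZ4")
    assert looks_like_futures_symbol("CLH25")
    assert not looks_like_futures_symbol("MSFT")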