lumibot 4.1.3__py3-none-any.whl → 4.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of lumibot might be problematic. Click here for more details.

Files changed (163)
  1. lumibot/backtesting/__init__.py +19 -5
  2. lumibot/backtesting/backtesting_broker.py +98 -18
  3. lumibot/backtesting/databento_backtesting.py +5 -686
  4. lumibot/backtesting/databento_backtesting_pandas.py +738 -0
  5. lumibot/backtesting/databento_backtesting_polars.py +860 -546
  6. lumibot/backtesting/fix_debug.py +37 -0
  7. lumibot/backtesting/thetadata_backtesting.py +9 -355
  8. lumibot/backtesting/thetadata_backtesting_pandas.py +1178 -0
  9. lumibot/brokers/alpaca.py +8 -1
  10. lumibot/brokers/schwab.py +12 -2
  11. lumibot/credentials.py +13 -0
  12. lumibot/data_sources/__init__.py +5 -8
  13. lumibot/data_sources/data_source.py +6 -2
  14. lumibot/data_sources/data_source_backtesting.py +30 -0
  15. lumibot/data_sources/databento_data.py +5 -390
  16. lumibot/data_sources/databento_data_pandas.py +440 -0
  17. lumibot/data_sources/databento_data_polars.py +15 -9
  18. lumibot/data_sources/pandas_data.py +30 -17
  19. lumibot/data_sources/polars_data.py +986 -0
  20. lumibot/data_sources/polars_mixin.py +472 -96
  21. lumibot/data_sources/polygon_data_polars.py +5 -0
  22. lumibot/data_sources/yahoo_data.py +9 -2
  23. lumibot/data_sources/yahoo_data_polars.py +5 -0
  24. lumibot/entities/__init__.py +15 -0
  25. lumibot/entities/asset.py +5 -28
  26. lumibot/entities/bars.py +89 -20
  27. lumibot/entities/data.py +29 -6
  28. lumibot/entities/data_polars.py +668 -0
  29. lumibot/entities/position.py +38 -4
  30. lumibot/strategies/_strategy.py +2 -1
  31. lumibot/strategies/strategy.py +61 -49
  32. lumibot/tools/backtest_cache.py +284 -0
  33. lumibot/tools/databento_helper.py +35 -35
  34. lumibot/tools/databento_helper_polars.py +738 -775
  35. lumibot/tools/futures_roll.py +251 -0
  36. lumibot/tools/indicators.py +135 -104
  37. lumibot/tools/polars_utils.py +142 -0
  38. lumibot/tools/thetadata_helper.py +1068 -134
  39. {lumibot-4.1.3.dist-info → lumibot-4.2.0.dist-info}/METADATA +9 -1
  40. {lumibot-4.1.3.dist-info → lumibot-4.2.0.dist-info}/RECORD +71 -147
  41. tests/backtest/test_databento.py +37 -6
  42. tests/backtest/test_databento_comprehensive_trading.py +8 -4
  43. tests/backtest/test_databento_parity.py +4 -2
  44. tests/backtest/test_debug_avg_fill_price.py +1 -1
  45. tests/backtest/test_example_strategies.py +11 -1
  46. tests/backtest/test_futures_edge_cases.py +3 -3
  47. tests/backtest/test_futures_single_trade.py +2 -2
  48. tests/backtest/test_futures_ultra_simple.py +2 -2
  49. tests/backtest/test_polars_lru_eviction.py +470 -0
  50. tests/backtest/test_yahoo.py +42 -0
  51. tests/test_asset.py +4 -4
  52. tests/test_backtest_cache_manager.py +149 -0
  53. tests/test_backtesting_data_source_env.py +6 -0
  54. tests/test_continuous_futures_resolution.py +60 -48
  55. tests/test_data_polars_parity.py +160 -0
  56. tests/test_databento_asset_validation.py +23 -5
  57. tests/test_databento_backtesting.py +1 -1
  58. tests/test_databento_backtesting_polars.py +312 -192
  59. tests/test_databento_data.py +220 -463
  60. tests/test_databento_live.py +10 -10
  61. tests/test_futures_roll.py +38 -0
  62. tests/test_indicator_subplots.py +101 -0
  63. tests/test_market_infinite_loop_bug.py +77 -3
  64. tests/test_polars_resample.py +67 -0
  65. tests/test_polygon_helper.py +46 -0
  66. tests/test_thetadata_backwards_compat.py +97 -0
  67. tests/test_thetadata_helper.py +222 -23
  68. tests/test_thetadata_pandas_verification.py +186 -0
  69. lumibot/__pycache__/__init__.cpython-312.pyc +0 -0
  70. lumibot/__pycache__/constants.cpython-312.pyc +0 -0
  71. lumibot/__pycache__/credentials.cpython-312.pyc +0 -0
  72. lumibot/backtesting/__pycache__/__init__.cpython-312.pyc +0 -0
  73. lumibot/backtesting/__pycache__/alpaca_backtesting.cpython-312.pyc +0 -0
  74. lumibot/backtesting/__pycache__/alpha_vantage_backtesting.cpython-312.pyc +0 -0
  75. lumibot/backtesting/__pycache__/backtesting_broker.cpython-312.pyc +0 -0
  76. lumibot/backtesting/__pycache__/ccxt_backtesting.cpython-312.pyc +0 -0
  77. lumibot/backtesting/__pycache__/databento_backtesting.cpython-312.pyc +0 -0
  78. lumibot/backtesting/__pycache__/interactive_brokers_rest_backtesting.cpython-312.pyc +0 -0
  79. lumibot/backtesting/__pycache__/pandas_backtesting.cpython-312.pyc +0 -0
  80. lumibot/backtesting/__pycache__/polygon_backtesting.cpython-312.pyc +0 -0
  81. lumibot/backtesting/__pycache__/thetadata_backtesting.cpython-312.pyc +0 -0
  82. lumibot/backtesting/__pycache__/yahoo_backtesting.cpython-312.pyc +0 -0
  83. lumibot/brokers/__pycache__/__init__.cpython-312.pyc +0 -0
  84. lumibot/brokers/__pycache__/alpaca.cpython-312.pyc +0 -0
  85. lumibot/brokers/__pycache__/bitunix.cpython-312.pyc +0 -0
  86. lumibot/brokers/__pycache__/broker.cpython-312.pyc +0 -0
  87. lumibot/brokers/__pycache__/ccxt.cpython-312.pyc +0 -0
  88. lumibot/brokers/__pycache__/example_broker.cpython-312.pyc +0 -0
  89. lumibot/brokers/__pycache__/interactive_brokers.cpython-312.pyc +0 -0
  90. lumibot/brokers/__pycache__/interactive_brokers_rest.cpython-312.pyc +0 -0
  91. lumibot/brokers/__pycache__/projectx.cpython-312.pyc +0 -0
  92. lumibot/brokers/__pycache__/schwab.cpython-312.pyc +0 -0
  93. lumibot/brokers/__pycache__/tradier.cpython-312.pyc +0 -0
  94. lumibot/brokers/__pycache__/tradovate.cpython-312.pyc +0 -0
  95. lumibot/data_sources/__pycache__/__init__.cpython-312.pyc +0 -0
  96. lumibot/data_sources/__pycache__/alpaca_data.cpython-312.pyc +0 -0
  97. lumibot/data_sources/__pycache__/alpha_vantage_data.cpython-312.pyc +0 -0
  98. lumibot/data_sources/__pycache__/bitunix_data.cpython-312.pyc +0 -0
  99. lumibot/data_sources/__pycache__/ccxt_backtesting_data.cpython-312.pyc +0 -0
  100. lumibot/data_sources/__pycache__/ccxt_data.cpython-312.pyc +0 -0
  101. lumibot/data_sources/__pycache__/data_source.cpython-312.pyc +0 -0
  102. lumibot/data_sources/__pycache__/data_source_backtesting.cpython-312.pyc +0 -0
  103. lumibot/data_sources/__pycache__/databento_data_polars_backtesting.cpython-312.pyc +0 -0
  104. lumibot/data_sources/__pycache__/databento_data_polars_live.cpython-312.pyc +0 -0
  105. lumibot/data_sources/__pycache__/example_broker_data.cpython-312.pyc +0 -0
  106. lumibot/data_sources/__pycache__/exceptions.cpython-312.pyc +0 -0
  107. lumibot/data_sources/__pycache__/interactive_brokers_data.cpython-312.pyc +0 -0
  108. lumibot/data_sources/__pycache__/interactive_brokers_rest_data.cpython-312.pyc +0 -0
  109. lumibot/data_sources/__pycache__/pandas_data.cpython-312.pyc +0 -0
  110. lumibot/data_sources/__pycache__/polars_mixin.cpython-312.pyc +0 -0
  111. lumibot/data_sources/__pycache__/polygon_data_polars.cpython-312.pyc +0 -0
  112. lumibot/data_sources/__pycache__/projectx_data.cpython-312.pyc +0 -0
  113. lumibot/data_sources/__pycache__/schwab_data.cpython-312.pyc +0 -0
  114. lumibot/data_sources/__pycache__/tradier_data.cpython-312.pyc +0 -0
  115. lumibot/data_sources/__pycache__/tradovate_data.cpython-312.pyc +0 -0
  116. lumibot/data_sources/__pycache__/yahoo_data_polars.cpython-312.pyc +0 -0
  117. lumibot/entities/__pycache__/__init__.cpython-312.pyc +0 -0
  118. lumibot/entities/__pycache__/asset.cpython-312.pyc +0 -0
  119. lumibot/entities/__pycache__/bar.cpython-312.pyc +0 -0
  120. lumibot/entities/__pycache__/bars.cpython-312.pyc +0 -0
  121. lumibot/entities/__pycache__/chains.cpython-312.pyc +0 -0
  122. lumibot/entities/__pycache__/data.cpython-312.pyc +0 -0
  123. lumibot/entities/__pycache__/dataline.cpython-312.pyc +0 -0
  124. lumibot/entities/__pycache__/order.cpython-312.pyc +0 -0
  125. lumibot/entities/__pycache__/position.cpython-312.pyc +0 -0
  126. lumibot/entities/__pycache__/quote.cpython-312.pyc +0 -0
  127. lumibot/entities/__pycache__/trading_fee.cpython-312.pyc +0 -0
  128. lumibot/example_strategies/__pycache__/__init__.cpython-312.pyc +0 -0
  129. lumibot/example_strategies/__pycache__/test_broker_functions.cpython-312-pytest-8.4.1.pyc +0 -0
  130. lumibot/strategies/__pycache__/__init__.cpython-312.pyc +0 -0
  131. lumibot/strategies/__pycache__/_strategy.cpython-312.pyc +0 -0
  132. lumibot/strategies/__pycache__/strategy.cpython-312.pyc +0 -0
  133. lumibot/strategies/__pycache__/strategy_executor.cpython-312.pyc +0 -0
  134. lumibot/tools/__pycache__/__init__.cpython-312.pyc +0 -0
  135. lumibot/tools/__pycache__/alpaca_helpers.cpython-312.pyc +0 -0
  136. lumibot/tools/__pycache__/bitunix_helpers.cpython-312.pyc +0 -0
  137. lumibot/tools/__pycache__/black_scholes.cpython-312.pyc +0 -0
  138. lumibot/tools/__pycache__/ccxt_data_store.cpython-312.pyc +0 -0
  139. lumibot/tools/__pycache__/databento_helper.cpython-312.pyc +0 -0
  140. lumibot/tools/__pycache__/databento_helper_polars.cpython-312.pyc +0 -0
  141. lumibot/tools/__pycache__/debugers.cpython-312.pyc +0 -0
  142. lumibot/tools/__pycache__/decorators.cpython-312.pyc +0 -0
  143. lumibot/tools/__pycache__/helpers.cpython-312.pyc +0 -0
  144. lumibot/tools/__pycache__/indicators.cpython-312.pyc +0 -0
  145. lumibot/tools/__pycache__/lumibot_logger.cpython-312.pyc +0 -0
  146. lumibot/tools/__pycache__/pandas.cpython-312.pyc +0 -0
  147. lumibot/tools/__pycache__/polygon_helper.cpython-312.pyc +0 -0
  148. lumibot/tools/__pycache__/polygon_helper_async.cpython-312.pyc +0 -0
  149. lumibot/tools/__pycache__/polygon_helper_polars_optimized.cpython-312.pyc +0 -0
  150. lumibot/tools/__pycache__/projectx_helpers.cpython-312.pyc +0 -0
  151. lumibot/tools/__pycache__/schwab_helper.cpython-312.pyc +0 -0
  152. lumibot/tools/__pycache__/thetadata_helper.cpython-312.pyc +0 -0
  153. lumibot/tools/__pycache__/types.cpython-312.pyc +0 -0
  154. lumibot/tools/__pycache__/yahoo_helper.cpython-312.pyc +0 -0
  155. lumibot/tools/__pycache__/yahoo_helper_polars_optimized.cpython-312.pyc +0 -0
  156. lumibot/traders/__pycache__/__init__.cpython-312.pyc +0 -0
  157. lumibot/traders/__pycache__/trader.cpython-312.pyc +0 -0
  158. lumibot/trading_builtins/__pycache__/__init__.cpython-312.pyc +0 -0
  159. lumibot/trading_builtins/__pycache__/custom_stream.cpython-312.pyc +0 -0
  160. lumibot/trading_builtins/__pycache__/safe_list.cpython-312.pyc +0 -0
  161. {lumibot-4.1.3.dist-info → lumibot-4.2.0.dist-info}/WHEEL +0 -0
  162. {lumibot-4.1.3.dist-info → lumibot-4.2.0.dist-info}/licenses/LICENSE +0 -0
  163. {lumibot-4.1.3.dist-info → lumibot-4.2.0.dist-info}/top_level.txt +0 -0
@@ -3,7 +3,7 @@
3
3
  This mixin provides common polars operations without disrupting inheritance.
4
4
  """
5
5
 
6
- from datetime import datetime
6
+ from datetime import datetime, timedelta
7
7
  from typing import Any, Dict, Optional
8
8
 
9
9
  import polars as pl
@@ -13,6 +13,9 @@ from lumibot.tools.lumibot_logger import get_logger
13
13
 
14
14
  logger = get_logger(__name__)
15
15
 
16
+ # DEBUG-LOG: Always-on debug logging (will be removed after debugging is complete)
17
+ _THETA_PARITY_DEBUG = False
18
+
16
19
 
17
20
  class PolarsMixin:
18
21
  """Mixin for polars-based data sources with common functionality."""
@@ -116,17 +119,71 @@ class PolarsMixin:
116
119
  Bars
117
120
  Parsed bars object
118
121
  """
122
+ # DEBUG-LOG: Entry with response details
123
+ if _THETA_PARITY_DEBUG:
124
+ logger.debug(
125
+ "[POLARS_MIXIN][PARSE][ENTRY] asset=%s source=%s response_type=%s response_is_none=%s response_shape=%s return_polars=%s",
126
+ getattr(asset, 'symbol', asset),
127
+ source,
128
+ type(response).__name__,
129
+ response is None,
130
+ (response.height, len(response.columns)) if response is not None and hasattr(response, 'height') else 'NO_SHAPE',
131
+ return_polars
132
+ )
133
+
134
+ # DEBUG-LOG: Check for empty response
119
135
  if response is None or response.is_empty():
136
+ if _THETA_PARITY_DEBUG:
137
+ logger.warning(
138
+ "[POLARS_MIXIN][PARSE][EMPTY_INPUT] asset=%s source=%s response_is_none=%s response_is_empty=%s returning_empty_bars=True",
139
+ getattr(asset, 'symbol', asset),
140
+ source,
141
+ response is None,
142
+ response.is_empty() if response is not None else 'N/A'
143
+ )
120
144
  return Bars(response, source, asset, raw=response)
121
145
 
122
146
  # Limit length if specified
123
- if length and len(response) > length:
147
+ # DEBUG-LOG: Length limiting
148
+ if length and response.height > length:
149
+ if _THETA_PARITY_DEBUG:
150
+ logger.debug(
151
+ "[POLARS_MIXIN][PARSE][BEFORE_LENGTH_LIMIT] asset=%s source=%s height=%s length=%s will_truncate=True",
152
+ getattr(asset, 'symbol', asset),
153
+ source,
154
+ response.height,
155
+ length
156
+ )
124
157
  response = response.tail(length)
158
+ if _THETA_PARITY_DEBUG:
159
+ logger.debug(
160
+ "[POLARS_MIXIN][PARSE][AFTER_LENGTH_LIMIT] asset=%s source=%s new_height=%s",
161
+ getattr(asset, 'symbol', asset),
162
+ source,
163
+ response.height
164
+ )
125
165
 
126
166
  # Filter to only keep OHLCV + datetime columns (remove DataBento metadata like rtype, publisher_id, etc.)
127
167
  # Required columns for strategies
128
168
  required_cols = ['open', 'high', 'low', 'close', 'volume']
129
- optional_cols = ['datetime', 'timestamp', 'date', 'time', 'dividend', 'stock_splits', 'symbol']
169
+ optional_cols = [
170
+ 'datetime',
171
+ 'timestamp',
172
+ 'date',
173
+ 'time',
174
+ 'dividend',
175
+ 'stock_splits',
176
+ 'symbol',
177
+ 'bid',
178
+ 'ask',
179
+ 'bid_size',
180
+ 'ask_size',
181
+ 'bid_condition',
182
+ 'ask_condition',
183
+ 'bid_exchange',
184
+ 'ask_exchange',
185
+ 'missing',
186
+ ]
130
187
 
131
188
  # Determine which columns to keep
132
189
  keep_cols = []
@@ -138,8 +195,84 @@ class PolarsMixin:
138
195
  if keep_cols:
139
196
  response = response.select(keep_cols)
140
197
 
198
+ # DEBUG-LOG: Columns after selection
199
+ if _THETA_PARITY_DEBUG:
200
+ logger.debug(
201
+ "[POLARS_MIXIN][PARSE][AFTER_COLUMN_SELECT] asset=%s source=%s shape=%s columns=%s has_datetime=%s has_missing=%s",
202
+ getattr(asset, 'symbol', asset),
203
+ source,
204
+ (response.height, len(response.columns)),
205
+ response.columns,
206
+ 'datetime' in response.columns,
207
+ 'missing' in response.columns
208
+ )
209
+
141
210
  # Create bars object
142
- bars = Bars(response, source, asset, raw=response, quote=quote, return_polars=return_polars)
211
+ tzinfo = getattr(self, "tzinfo", None)
212
+ if (
213
+ tzinfo is not None
214
+ and isinstance(response, pl.DataFrame)
215
+ and "datetime" in response.columns
216
+ ):
217
+ target_tz = getattr(tzinfo, "zone", None) or getattr(tzinfo, "key", None)
218
+ if target_tz:
219
+ current_dtype = response.schema.get("datetime")
220
+ if hasattr(current_dtype, "time_zone"):
221
+ current_tz = current_dtype.time_zone
222
+ else:
223
+ current_tz = None
224
+ if current_tz != target_tz:
225
+ datetime_col = pl.col("datetime")
226
+ if current_tz is None:
227
+ response = response.with_columns(
228
+ datetime_col.dt.replace_time_zone(target_tz)
229
+ )
230
+ else:
231
+ response = response.with_columns(
232
+ datetime_col.dt.convert_time_zone(target_tz)
233
+ )
234
+
235
+ # DEBUG-LOG: Creating Bars object
236
+ if _THETA_PARITY_DEBUG:
237
+ sample_data = {}
238
+ for col in ['open', 'high', 'low', 'close', 'volume', 'missing']:
239
+ if col in response.columns:
240
+ try:
241
+ sample_data[col] = response[col][:3].to_list()
242
+ except Exception:
243
+ sample_data[col] = 'ERROR'
244
+ logger.debug(
245
+ "[POLARS_MIXIN][PARSE][BEFORE_BARS] asset=%s source=%s response_type=%s response_shape=%s return_polars=%s sample_data=%s",
246
+ getattr(asset, 'symbol', asset),
247
+ source,
248
+ type(response).__name__,
249
+ (response.height, len(response.columns)),
250
+ return_polars,
251
+ sample_data
252
+ )
253
+
254
+ bars = Bars(
255
+ response,
256
+ source,
257
+ asset,
258
+ raw=response,
259
+ quote=quote,
260
+ return_polars=return_polars,
261
+ tzinfo=tzinfo,
262
+ )
263
+
264
+ # DEBUG-LOG: Bars object created
265
+ if _THETA_PARITY_DEBUG:
266
+ logger.debug(
267
+ "[POLARS_MIXIN][PARSE][AFTER_BARS] asset=%s source=%s bars_type=%s bars._df_type=%s bars._df_shape=%s bars._return_polars=%s",
268
+ getattr(asset, 'symbol', asset),
269
+ source,
270
+ type(bars).__name__,
271
+ type(bars._df).__name__ if hasattr(bars, '_df') else 'NO_DF',
272
+ (bars._df.height, len(bars._df.columns)) if hasattr(bars, '_df') and hasattr(bars._df, 'height') else bars._df.shape if hasattr(bars, '_df') and hasattr(bars._df, 'shape') else 'NO_SHAPE',
273
+ bars._return_polars if hasattr(bars, '_return_polars') else 'NO_ATTR'
274
+ )
275
+
143
276
  return bars
144
277
 
145
278
  def _clear_cache_polars(self, asset: Optional[Asset] = None):
@@ -348,130 +481,373 @@ class PolarsMixin:
348
481
  Optional[pl.DataFrame]
349
482
  Filtered dataframe or None
350
483
  """
484
+ # DEBUG-LOG: Filter entry with parameters
485
+ if _THETA_PARITY_DEBUG:
486
+ logger.debug(
487
+ "[POLARS_MIXIN][FILTER][ENTRY] asset=%s end_filter=%s end_filter_tz=%s length=%s timestep=%s use_strict_less_than=%s",
488
+ getattr(asset, 'symbol', asset),
489
+ end_filter,
490
+ end_filter.tzinfo if hasattr(end_filter, 'tzinfo') else 'N/A',
491
+ length,
492
+ timestep,
493
+ use_strict_less_than
494
+ )
495
+
351
496
  # DEBUG
352
497
  logger.debug(f"[POLARS FILTER] end_filter={end_filter}, tzinfo={end_filter.tzinfo if hasattr(end_filter, 'tzinfo') else 'N/A'}, length={length}")
353
498
 
354
499
  # Convert end_filter to naive
355
500
  end_filter_naive = self._convert_datetime_for_filtering(end_filter)
356
501
 
502
+ # DEBUG-LOG: Naive end filter calculation
503
+ if _THETA_PARITY_DEBUG:
504
+ logger.debug(
505
+ "[POLARS_MIXIN][FILTER][END_FILTER_NAIVE] asset=%s end_filter_naive=%s",
506
+ getattr(asset, 'symbol', asset),
507
+ end_filter_naive
508
+ )
509
+
357
510
  # DEBUG
358
511
  logger.debug(f"[POLARS FILTER] end_filter_naive={end_filter_naive}")
359
512
 
360
- # For daily timestep, use caching
513
+ # Derive naive UTC end filter and compute matching start filter
361
514
  if timestep == "day":
362
515
  current_date = end_filter.date() if hasattr(end_filter, 'date') else end_filter
363
516
  cache_key = (asset, current_date, timestep)
517
+ else:
518
+ current_date = None
519
+ cache_key = None
520
+
521
+ # Determine datetime column name
522
+ schema = lazy_data.collect_schema()
523
+ dt_col = None
524
+ for col_name in schema.names():
525
+ if col_name in ['datetime', 'date', 'timestamp']:
526
+ dt_col = col_name
527
+ break
528
+
529
+ if dt_col is None:
530
+ logger.error("No datetime column found")
531
+ return None
364
532
 
365
- # Check cache first
366
- if cache_key in self._filtered_data_cache:
367
- result = self._filtered_data_cache[cache_key]
368
- if len(result) >= length:
369
- return result.tail(length)
370
-
371
- # Fetch extra for caching
372
- fetch_length = max(length * 2, 100)
373
-
374
- # Find datetime column
375
- schema = lazy_data.collect_schema()
376
- dt_col = None
377
- for col_name in schema.names():
378
- if col_name in ['datetime', 'date', 'timestamp']:
379
- dt_col = col_name
380
- break
381
-
382
- if dt_col is None:
383
- logger.error("No datetime column found")
384
- return None
385
-
386
- # Filter and collect
387
- # CRITICAL FIX: Keep timezone info! Match the DataFrame's timezone
388
- # Get the DataFrame column's timezone from schema
389
- dt_dtype = schema[dt_col]
533
+ dt_dtype = schema[dt_col]
534
+ if hasattr(dt_dtype, 'time_zone') and dt_dtype.time_zone:
535
+ import pytz
536
+ df_tz = pytz.timezone(dt_dtype.time_zone)
537
+ end_filter_with_tz = pytz.utc.localize(end_filter_naive).astimezone(df_tz)
538
+ else:
539
+ end_filter_with_tz = end_filter_naive
390
540
 
391
- # Convert filter to match DataFrame's timezone
541
+ start_filter_with_tz = None
542
+ if length and length > 0:
543
+ try:
544
+ if hasattr(self, "get_start_datetime_and_ts_unit"):
545
+ start_candidate, _ = self.get_start_datetime_and_ts_unit(length, timestep, start_dt=end_filter)
546
+ else:
547
+ delta, unit = self.convert_timestep_str_to_timedelta(timestep)
548
+ if unit == "day":
549
+ delta = timedelta(days=length)
550
+ else:
551
+ delta *= length
552
+ start_candidate = end_filter - delta
553
+ except Exception:
554
+ delta, unit = self.convert_timestep_str_to_timedelta(timestep)
555
+ if unit == "day":
556
+ delta = timedelta(days=length)
557
+ else:
558
+ delta *= length
559
+ start_candidate = end_filter - delta
560
+
561
+ start_naive = self._convert_datetime_for_filtering(start_candidate)
392
562
  if hasattr(dt_dtype, 'time_zone') and dt_dtype.time_zone:
393
- # DataFrame has timezone, convert filter to match
394
563
  import pytz
395
- df_tz = pytz.timezone(dt_dtype.time_zone)
396
- end_filter_with_tz = pytz.utc.localize(end_filter_naive).astimezone(df_tz)
564
+ start_filter_with_tz = pytz.utc.localize(start_naive).astimezone(df_tz)
397
565
  else:
398
- # DataFrame is naive, use UTC
399
- from datetime import timezone as tz
400
- end_filter_with_tz = datetime.combine(
401
- end_filter_naive.date(),
402
- end_filter_naive.time(),
403
- tzinfo=tz.utc
404
- )
405
-
406
- # CRITICAL FIX: Deduplicate before caching
407
- # Use < or <= based on use_strict_less_than flag
566
+ start_filter_with_tz = start_naive
567
+
568
+ if timestep == "day" and cache_key in self._filtered_data_cache:
569
+ cached = self._filtered_data_cache[cache_key]
570
+ if len(cached) >= length:
571
+ return cached.tail(length)
572
+
573
+ dt_time_zone = getattr(dt_dtype, "time_zone", None)
574
+ target_dtype = pl.Datetime(time_unit="ns", time_zone=dt_time_zone)
575
+ end_literal = pl.lit(end_filter_with_tz).cast(target_dtype)
576
+ filter_expr = pl.col(dt_col) <= end_literal
577
+ if start_filter_with_tz is not None:
578
+ start_literal = pl.lit(start_filter_with_tz).cast(target_dtype)
408
579
  if use_strict_less_than:
409
- filter_expr = pl.col(dt_col) < end_filter_with_tz
580
+ filter_expr = (pl.col(dt_col) < end_literal) & (pl.col(dt_col) >= start_literal)
410
581
  else:
411
- filter_expr = pl.col(dt_col) <= end_filter_with_tz
582
+ filter_expr = (pl.col(dt_col) <= end_literal) & (pl.col(dt_col) >= start_literal)
583
+ elif use_strict_less_than:
584
+ filter_expr = pl.col(dt_col) < end_literal
585
+
586
+ # DEBUG-LOG: Before filtering with expression
587
+ if _THETA_PARITY_DEBUG:
588
+ logger.debug(
589
+ "[POLARS_MIXIN][FILTER][BEFORE_FILTER_EXPR] asset=%s start_filter_with_tz=%s end_filter_with_tz=%s use_strict_less_than=%s dt_col=%s",
590
+ getattr(asset, 'symbol', asset),
591
+ start_filter_with_tz,
592
+ end_filter_with_tz,
593
+ use_strict_less_than,
594
+ dt_col
595
+ )
596
+
597
+ result = (
598
+ lazy_data
599
+ .filter(filter_expr)
600
+ .sort(dt_col)
601
+ .unique(subset=[dt_col], keep='last', maintain_order=True)
602
+ .collect()
603
+ )
604
+
605
+ # DEBUG-LOG: After filtering
606
+ if _THETA_PARITY_DEBUG:
607
+ logger.debug(
608
+ "[POLARS_MIXIN][FILTER][AFTER_FILTER_EXPR] asset=%s result_shape=%s result_is_empty=%s",
609
+ getattr(asset, 'symbol', asset),
610
+ (result.height, len(result.columns)),
611
+ result.is_empty()
612
+ )
412
613
 
413
- result = (
614
+ if result.is_empty() and length and length > 0:
615
+ # DEBUG-LOG: Fallback triggered
616
+ if _THETA_PARITY_DEBUG:
617
+ logger.warning(
618
+ "[POLARS_MIXIN][FILTER][FALLBACK_TRIGGERED] asset=%s length=%s reason=empty_result_after_filter",
619
+ getattr(asset, 'symbol', asset),
620
+ length
621
+ )
622
+ fallback = (
414
623
  lazy_data
415
- .filter(filter_expr)
416
624
  .sort(dt_col)
417
625
  .unique(subset=[dt_col], keep='last', maintain_order=True)
418
- .tail(fetch_length)
626
+ .tail(length)
419
627
  .collect()
420
628
  )
629
+ if not fallback.is_empty():
630
+ logger.debug(
631
+ '[POLARS-FILTER][FALLBACK] asset=%s timestep=%s length=%s rows=%s',
632
+ getattr(asset, 'symbol', asset) if hasattr(asset, 'symbol') else asset,
633
+ timestep,
634
+ length,
635
+ fallback.height,
636
+ )
637
+ # DEBUG-LOG: Fallback succeeded
638
+ if _THETA_PARITY_DEBUG:
639
+ logger.debug(
640
+ "[POLARS_MIXIN][FILTER][FALLBACK_SUCCESS] asset=%s fallback_shape=%s",
641
+ getattr(asset, 'symbol', asset),
642
+ (fallback.height, len(fallback.columns))
643
+ )
644
+ result = fallback
645
+ else:
646
+ # DEBUG-LOG: Fallback also empty
647
+ if _THETA_PARITY_DEBUG:
648
+ logger.warning(
649
+ "[POLARS_MIXIN][FILTER][FALLBACK_EMPTY] asset=%s lazy_data_has_no_rows=True",
650
+ getattr(asset, 'symbol', asset)
651
+ )
652
+
653
+ has_price_columns = {"open", "high", "low", "close"} <= set(result.columns)
654
+
655
+ # DEBUG-LOG: Before missing flag computation
656
+ if _THETA_PARITY_DEBUG:
657
+ logger.debug(
658
+ "[POLARS_MIXIN][FILTER][BEFORE_MISSING_FLAG] asset=%s has_price_columns=%s result_columns=%s",
659
+ getattr(asset, 'symbol', asset),
660
+ has_price_columns,
661
+ result.columns
662
+ )
421
663
 
422
- # Cache the result
423
- self._filtered_data_cache[cache_key] = result
424
-
425
- # Return requested length
426
- return result.tail(length) if len(result) > length else result
664
+ if has_price_columns:
665
+ # CRITICAL FIX: Match pandas missing flag logic exactly
666
+ # Pandas uses .isna().all(axis=1) which means ALL OHLCV must be NaN for missing=True
667
+ # NOT any single column - this is a critical difference from previous implementation
668
+ missing_price_expr = (
669
+ (pl.col("open").is_null() | pl.col("open").is_nan()) &
670
+ (pl.col("high").is_null() | pl.col("high").is_nan()) &
671
+ (pl.col("low").is_null() | pl.col("low").is_nan()) &
672
+ (pl.col("close").is_null() | pl.col("close").is_nan())
673
+ )
674
+ # Add volume check if it exists (pandas does this too)
675
+ if "volume" in result.columns:
676
+ missing_price_expr = missing_price_expr & (
677
+ pl.col("volume").is_null() | pl.col("volume").is_nan()
678
+ )
427
679
  else:
428
- # For minute data, don't cache
429
- schema = lazy_data.collect_schema()
430
- dt_col = None
431
- for col_name in schema.names():
432
- if col_name in ['datetime', 'date', 'timestamp']:
433
- dt_col = col_name
434
- break
680
+ missing_price_expr = pl.lit(False)
435
681
 
436
- if dt_col is None:
437
- logger.error("No datetime column found")
438
- return None
682
+ result = result.with_columns(missing_price_expr.alias("_lumibot_missing_price"))
439
683
 
440
- # CRITICAL FIX: Keep timezone info during filtering!
441
- # Match the DataFrame's timezone to avoid comparison errors
442
- # Get the DataFrame column's timezone from schema
443
- dt_dtype = schema[dt_col]
684
+ # DEBUG-LOG: After missing flag computation
685
+ if _THETA_PARITY_DEBUG:
686
+ try:
687
+ missing_count = int(result.select(pl.col("_lumibot_missing_price").cast(pl.Int64).sum()).item())
688
+ logger.debug(
689
+ "[POLARS_MIXIN][FILTER][AFTER_MISSING_FLAG] asset=%s missing_count=%s total_rows=%s",
690
+ getattr(asset, 'symbol', asset),
691
+ missing_count,
692
+ result.height
693
+ )
694
+ except Exception as e:
695
+ logger.debug(
696
+ "[POLARS_MIXIN][FILTER][AFTER_MISSING_FLAG] asset=%s missing_count=ERROR error=%s",
697
+ getattr(asset, 'symbol', asset),
698
+ str(e)
699
+ )
444
700
 
445
- # Convert filter to match DataFrame's timezone
446
- if hasattr(dt_dtype, 'time_zone') and dt_dtype.time_zone:
447
- # DataFrame has timezone, convert filter to match
448
- import pytz
449
- df_tz = pytz.timezone(dt_dtype.time_zone)
450
- end_filter_with_tz = pytz.utc.localize(end_filter_naive).astimezone(df_tz)
451
- else:
452
- # DataFrame is naive, use UTC
453
- from datetime import timezone as tz
454
- end_filter_with_tz = datetime.combine(
455
- end_filter_naive.date(),
456
- end_filter_naive.time(),
457
- tzinfo=tz.utc
701
+ if timestep != "day":
702
+ if {"open", "high", "low", "close", "volume"} <= set(result.columns):
703
+ open_ffill = pl.col("open").fill_nan(None).fill_null(strategy="forward")
704
+ high_ffill = pl.col("high").fill_nan(None).fill_null(strategy="forward")
705
+ low_ffill = pl.col("low").fill_nan(None).fill_null(strategy="forward")
706
+ close_ffill = pl.col("close").fill_nan(None).fill_null(strategy="forward")
707
+ close_fallback = pl.coalesce(
708
+ [close_ffill, open_ffill, high_ffill, low_ffill]
709
+ )
710
+ missing_price_mask = pl.col("_lumibot_missing_price")
711
+ price_null_mask = (
712
+ pl.col("open").is_null()
713
+ | pl.col("open").is_nan()
714
+ | pl.col("high").is_null()
715
+ | pl.col("high").is_nan()
716
+ | pl.col("low").is_null()
717
+ | pl.col("low").is_nan()
718
+ | pl.col("close").is_null()
719
+ | pl.col("close").is_nan()
720
+ )
721
+ normalized_volume = pl.coalesce([pl.col("volume"), pl.lit(0.0)])
722
+ has_quote_cols = {"bid", "ask"} <= set(result.columns)
723
+ if has_quote_cols:
724
+ valid_mid_mask = (
725
+ pl.col("bid").is_not_null()
726
+ & ~pl.col("bid").is_nan()
727
+ & pl.col("ask").is_not_null()
728
+ & ~pl.col("ask").is_nan()
729
+ )
730
+ mid_price_expr = pl.when(valid_mid_mask).then((pl.col("bid") + pl.col("ask")) / 2.0).otherwise(close_fallback)
731
+ else:
732
+ valid_mid_mask = pl.lit(False)
733
+ mid_price_expr = close_fallback
734
+ adjust_condition = missing_price_mask | price_null_mask | ((normalized_volume <= 0) & valid_mid_mask)
735
+ result = result.with_columns(
736
+ [
737
+ pl.when(adjust_condition)
738
+ .then(mid_price_expr)
739
+ .otherwise(pl.col("open"))
740
+ .alias("open"),
741
+ pl.when(adjust_condition)
742
+ .then(mid_price_expr)
743
+ .otherwise(pl.col("high"))
744
+ .alias("high"),
745
+ pl.when(adjust_condition)
746
+ .then(mid_price_expr)
747
+ .otherwise(pl.col("low"))
748
+ .alias("low"),
749
+ pl.when(adjust_condition)
750
+ .then(mid_price_expr)
751
+ .otherwise(pl.col("close"))
752
+ .alias("close"),
753
+ pl.when(missing_price_mask | normalized_volume.is_null())
754
+ .then(pl.lit(0.0))
755
+ .otherwise(normalized_volume)
756
+ .alias("volume"),
757
+ ]
758
+ )
759
+ elif has_price_columns:
760
+ open_ffill = pl.col("open").fill_nan(None).fill_null(strategy="forward")
761
+ high_ffill = pl.col("high").fill_nan(None).fill_null(strategy="forward")
762
+ low_ffill = pl.col("low").fill_nan(None).fill_null(strategy="forward")
763
+ close_ffill = pl.col("close").fill_nan(None).fill_null(strategy="forward")
764
+ close_fallback = pl.coalesce(
765
+ [close_ffill, open_ffill, high_ffill, low_ffill]
766
+ )
767
+ missing_price_mask = pl.col("_lumibot_missing_price")
768
+ result = result.with_columns(
769
+ [
770
+ pl.when(missing_price_mask)
771
+ .then(close_fallback)
772
+ .otherwise(pl.col(col_name))
773
+ .alias(col_name)
774
+ for col_name in ["open", "high", "low", "close"]
775
+ if col_name in result.columns
776
+ ]
458
777
  )
459
778
 
460
- # CRITICAL FIX: Deduplicate before returning
461
- # Sometimes lazy operations can create duplicates
462
- # Use < or <= based on use_strict_less_than flag
463
- if use_strict_less_than:
464
- filter_expr = pl.col(dt_col) < end_filter_with_tz
465
- else:
466
- filter_expr = pl.col(dt_col) <= end_filter_with_tz
779
+ forward_fill_columns = [
780
+ col_name
781
+ for col_name in ("open", "high", "low", "close", "volume", "bid", "ask")
782
+ if col_name in result.columns
783
+ ]
784
+ if forward_fill_columns:
785
+ result = result.with_columns(
786
+ [
787
+ pl.col(col_name)
788
+ .fill_nan(None)
789
+ .fill_null(strategy="forward")
790
+ for col_name in forward_fill_columns
791
+ ]
792
+ )
467
793
 
468
- result = (
469
- lazy_data
470
- .filter(filter_expr)
471
- .sort(dt_col)
472
- .unique(subset=[dt_col], keep='last', maintain_order=True)
473
- .tail(length)
474
- .collect()
794
+ if "return" in result.columns:
795
+ result = result.with_columns(
796
+ pl.col("return").fill_null(0.0).fill_nan(0.0)
475
797
  )
798
+ if "price_change" in result.columns:
799
+ result = result.with_columns(
800
+ pl.col("price_change").fill_null(0.0).fill_nan(0.0)
801
+ )
802
+ if "dividend_yield" in result.columns:
803
+ result = result.with_columns(
804
+ pl.col("dividend_yield").fill_null(0.0).fill_nan(0.0)
805
+ )
806
+
807
+ if timestep == "day" and cache_key:
808
+ self._filtered_data_cache[cache_key] = result
476
809
 
477
- return result
810
+ if "_lumibot_missing_price" in result.columns:
811
+ missing_flag = pl.col("_lumibot_missing_price").cast(pl.Boolean)
812
+ if "missing" in result.columns:
813
+ result = result.with_columns(
814
+ pl.when(pl.col("missing").cast(pl.Boolean))
815
+ .then(True)
816
+ .otherwise(missing_flag)
817
+ .alias("missing")
818
+ )
819
+ else:
820
+ result = result.with_columns(missing_flag.alias("missing"))
821
+ result = result.drop("_lumibot_missing_price")
822
+
823
+ if length and len(result) > length:
824
+ result = result.tail(length)
825
+
826
+ try:
827
+ first_dt = result["datetime"][0] if "datetime" in result.columns and len(result) else None
828
+ except Exception:
829
+ first_dt = None
830
+ try:
831
+ last_dt = result["datetime"][-1] if "datetime" in result.columns and len(result) else None
832
+ except Exception:
833
+ last_dt = None
834
+ missing_true = None
835
+ if "missing" in result.columns and len(result):
836
+ try:
837
+ missing_true = int(
838
+ result.select(pl.col("missing").cast(pl.Int64).sum()).item()
839
+ )
840
+ except Exception:
841
+ missing_true = None
842
+ logger.debug(
843
+ "[POLARS-FILTER] asset=%s timestep=%s length=%s rows=%s first_dt=%s last_dt=%s missing_true=%s columns=%s",
844
+ getattr(asset, "symbol", asset) if hasattr(asset, "symbol") else asset,
845
+ timestep,
846
+ length,
847
+ len(result),
848
+ first_dt,
849
+ last_dt,
850
+ missing_true,
851
+ result.columns,
852
+ )
853
+ return result
@@ -7,6 +7,11 @@ This implementation:
7
7
  4. Efficient caching with parquet files
8
8
  5. Vectorized operations only
9
9
  """
10
+ # NOTE: This module is intentionally disabled. The DataBento Polars migration only
11
+ # supports Polars for DataBento; other data sources must use the pandas implementations.
12
+ raise RuntimeError('Yahoo/Polygon Polars backends are not production-ready; use the pandas data sources instead.')
13
+
14
+
10
15
 
11
16
  import traceback
12
17
  from datetime import timedelta
@@ -382,8 +382,15 @@ class YahooData(DataSourceBacktesting):
382
382
  if cache_key in self._last_price_cache:
383
383
  return self._last_price_cache[cache_key]
384
384
 
385
- # Use -1 timeshift to get the price for the current bar (otherwise gets yesterdays prices)
386
- bars = self.get_historical_prices(asset, 1, timestep=timestep, quote=quote, timeshift=timedelta(days=-1))
385
+ # Daily bars are stamped at the session close. Leaving the timeshift unset for daily
386
+ # requests ensures we only reference the most recent fully closed bar (no lookahead).
387
+ # Intraday paths still step back one interval to avoid peeking ahead.
388
+ if isinstance(timestep, str) and 'day' in timestep.lower():
389
+ timeshift_delta = None
390
+ else:
391
+ timeshift_delta = timedelta(days=-1)
392
+
393
+ bars = self.get_historical_prices(asset, 1, timestep=timestep, quote=quote, timeshift=timeshift_delta)
387
394
 
388
395
  if isinstance(bars, float):
389
396
  return bars