lumibot-4.1.3-py3-none-any.whl → lumibot-4.2.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of lumibot might be problematic.
- lumibot/backtesting/__init__.py +19 -5
- lumibot/backtesting/backtesting_broker.py +98 -18
- lumibot/backtesting/databento_backtesting.py +5 -686
- lumibot/backtesting/databento_backtesting_pandas.py +738 -0
- lumibot/backtesting/databento_backtesting_polars.py +860 -546
- lumibot/backtesting/fix_debug.py +37 -0
- lumibot/backtesting/thetadata_backtesting.py +9 -355
- lumibot/backtesting/thetadata_backtesting_pandas.py +1167 -0
- lumibot/brokers/alpaca.py +8 -1
- lumibot/brokers/schwab.py +12 -2
- lumibot/credentials.py +13 -0
- lumibot/data_sources/__init__.py +5 -8
- lumibot/data_sources/data_source.py +6 -2
- lumibot/data_sources/data_source_backtesting.py +30 -0
- lumibot/data_sources/databento_data.py +5 -390
- lumibot/data_sources/databento_data_pandas.py +440 -0
- lumibot/data_sources/databento_data_polars.py +15 -9
- lumibot/data_sources/pandas_data.py +30 -17
- lumibot/data_sources/polars_data.py +986 -0
- lumibot/data_sources/polars_mixin.py +472 -96
- lumibot/data_sources/polygon_data_polars.py +5 -0
- lumibot/data_sources/yahoo_data.py +9 -2
- lumibot/data_sources/yahoo_data_polars.py +5 -0
- lumibot/entities/__init__.py +15 -0
- lumibot/entities/asset.py +5 -28
- lumibot/entities/bars.py +89 -20
- lumibot/entities/data.py +29 -6
- lumibot/entities/data_polars.py +668 -0
- lumibot/entities/position.py +38 -4
- lumibot/strategies/_strategy.py +2 -1
- lumibot/strategies/strategy.py +61 -49
- lumibot/tools/backtest_cache.py +284 -0
- lumibot/tools/databento_helper.py +35 -35
- lumibot/tools/databento_helper_polars.py +738 -775
- lumibot/tools/futures_roll.py +251 -0
- lumibot/tools/indicators.py +135 -104
- lumibot/tools/polars_utils.py +142 -0
- lumibot/tools/thetadata_helper.py +1068 -134
- {lumibot-4.1.3.dist-info → lumibot-4.2.1.dist-info}/METADATA +9 -1
- {lumibot-4.1.3.dist-info → lumibot-4.2.1.dist-info}/RECORD +71 -147
- tests/backtest/test_databento.py +37 -6
- tests/backtest/test_databento_comprehensive_trading.py +8 -4
- tests/backtest/test_databento_parity.py +4 -2
- tests/backtest/test_debug_avg_fill_price.py +1 -1
- tests/backtest/test_example_strategies.py +11 -1
- tests/backtest/test_futures_edge_cases.py +3 -3
- tests/backtest/test_futures_single_trade.py +2 -2
- tests/backtest/test_futures_ultra_simple.py +2 -2
- tests/backtest/test_polars_lru_eviction.py +470 -0
- tests/backtest/test_yahoo.py +42 -0
- tests/test_asset.py +4 -4
- tests/test_backtest_cache_manager.py +149 -0
- tests/test_backtesting_data_source_env.py +6 -0
- tests/test_continuous_futures_resolution.py +60 -48
- tests/test_data_polars_parity.py +160 -0
- tests/test_databento_asset_validation.py +23 -5
- tests/test_databento_backtesting.py +1 -1
- tests/test_databento_backtesting_polars.py +312 -192
- tests/test_databento_data.py +220 -463
- tests/test_databento_live.py +10 -10
- tests/test_futures_roll.py +38 -0
- tests/test_indicator_subplots.py +101 -0
- tests/test_market_infinite_loop_bug.py +77 -3
- tests/test_polars_resample.py +67 -0
- tests/test_polygon_helper.py +46 -0
- tests/test_thetadata_backwards_compat.py +97 -0
- tests/test_thetadata_helper.py +222 -23
- tests/test_thetadata_pandas_verification.py +186 -0
- lumibot/__pycache__/__init__.cpython-312.pyc +0 -0
- lumibot/__pycache__/constants.cpython-312.pyc +0 -0
- lumibot/__pycache__/credentials.cpython-312.pyc +0 -0
- lumibot/backtesting/__pycache__/__init__.cpython-312.pyc +0 -0
- lumibot/backtesting/__pycache__/alpaca_backtesting.cpython-312.pyc +0 -0
- lumibot/backtesting/__pycache__/alpha_vantage_backtesting.cpython-312.pyc +0 -0
- lumibot/backtesting/__pycache__/backtesting_broker.cpython-312.pyc +0 -0
- lumibot/backtesting/__pycache__/ccxt_backtesting.cpython-312.pyc +0 -0
- lumibot/backtesting/__pycache__/databento_backtesting.cpython-312.pyc +0 -0
- lumibot/backtesting/__pycache__/interactive_brokers_rest_backtesting.cpython-312.pyc +0 -0
- lumibot/backtesting/__pycache__/pandas_backtesting.cpython-312.pyc +0 -0
- lumibot/backtesting/__pycache__/polygon_backtesting.cpython-312.pyc +0 -0
- lumibot/backtesting/__pycache__/thetadata_backtesting.cpython-312.pyc +0 -0
- lumibot/backtesting/__pycache__/yahoo_backtesting.cpython-312.pyc +0 -0
- lumibot/brokers/__pycache__/__init__.cpython-312.pyc +0 -0
- lumibot/brokers/__pycache__/alpaca.cpython-312.pyc +0 -0
- lumibot/brokers/__pycache__/bitunix.cpython-312.pyc +0 -0
- lumibot/brokers/__pycache__/broker.cpython-312.pyc +0 -0
- lumibot/brokers/__pycache__/ccxt.cpython-312.pyc +0 -0
- lumibot/brokers/__pycache__/example_broker.cpython-312.pyc +0 -0
- lumibot/brokers/__pycache__/interactive_brokers.cpython-312.pyc +0 -0
- lumibot/brokers/__pycache__/interactive_brokers_rest.cpython-312.pyc +0 -0
- lumibot/brokers/__pycache__/projectx.cpython-312.pyc +0 -0
- lumibot/brokers/__pycache__/schwab.cpython-312.pyc +0 -0
- lumibot/brokers/__pycache__/tradier.cpython-312.pyc +0 -0
- lumibot/brokers/__pycache__/tradovate.cpython-312.pyc +0 -0
- lumibot/data_sources/__pycache__/__init__.cpython-312.pyc +0 -0
- lumibot/data_sources/__pycache__/alpaca_data.cpython-312.pyc +0 -0
- lumibot/data_sources/__pycache__/alpha_vantage_data.cpython-312.pyc +0 -0
- lumibot/data_sources/__pycache__/bitunix_data.cpython-312.pyc +0 -0
- lumibot/data_sources/__pycache__/ccxt_backtesting_data.cpython-312.pyc +0 -0
- lumibot/data_sources/__pycache__/ccxt_data.cpython-312.pyc +0 -0
- lumibot/data_sources/__pycache__/data_source.cpython-312.pyc +0 -0
- lumibot/data_sources/__pycache__/data_source_backtesting.cpython-312.pyc +0 -0
- lumibot/data_sources/__pycache__/databento_data_polars_backtesting.cpython-312.pyc +0 -0
- lumibot/data_sources/__pycache__/databento_data_polars_live.cpython-312.pyc +0 -0
- lumibot/data_sources/__pycache__/example_broker_data.cpython-312.pyc +0 -0
- lumibot/data_sources/__pycache__/exceptions.cpython-312.pyc +0 -0
- lumibot/data_sources/__pycache__/interactive_brokers_data.cpython-312.pyc +0 -0
- lumibot/data_sources/__pycache__/interactive_brokers_rest_data.cpython-312.pyc +0 -0
- lumibot/data_sources/__pycache__/pandas_data.cpython-312.pyc +0 -0
- lumibot/data_sources/__pycache__/polars_mixin.cpython-312.pyc +0 -0
- lumibot/data_sources/__pycache__/polygon_data_polars.cpython-312.pyc +0 -0
- lumibot/data_sources/__pycache__/projectx_data.cpython-312.pyc +0 -0
- lumibot/data_sources/__pycache__/schwab_data.cpython-312.pyc +0 -0
- lumibot/data_sources/__pycache__/tradier_data.cpython-312.pyc +0 -0
- lumibot/data_sources/__pycache__/tradovate_data.cpython-312.pyc +0 -0
- lumibot/data_sources/__pycache__/yahoo_data_polars.cpython-312.pyc +0 -0
- lumibot/entities/__pycache__/__init__.cpython-312.pyc +0 -0
- lumibot/entities/__pycache__/asset.cpython-312.pyc +0 -0
- lumibot/entities/__pycache__/bar.cpython-312.pyc +0 -0
- lumibot/entities/__pycache__/bars.cpython-312.pyc +0 -0
- lumibot/entities/__pycache__/chains.cpython-312.pyc +0 -0
- lumibot/entities/__pycache__/data.cpython-312.pyc +0 -0
- lumibot/entities/__pycache__/dataline.cpython-312.pyc +0 -0
- lumibot/entities/__pycache__/order.cpython-312.pyc +0 -0
- lumibot/entities/__pycache__/position.cpython-312.pyc +0 -0
- lumibot/entities/__pycache__/quote.cpython-312.pyc +0 -0
- lumibot/entities/__pycache__/trading_fee.cpython-312.pyc +0 -0
- lumibot/example_strategies/__pycache__/__init__.cpython-312.pyc +0 -0
- lumibot/example_strategies/__pycache__/test_broker_functions.cpython-312-pytest-8.4.1.pyc +0 -0
- lumibot/strategies/__pycache__/__init__.cpython-312.pyc +0 -0
- lumibot/strategies/__pycache__/_strategy.cpython-312.pyc +0 -0
- lumibot/strategies/__pycache__/strategy.cpython-312.pyc +0 -0
- lumibot/strategies/__pycache__/strategy_executor.cpython-312.pyc +0 -0
- lumibot/tools/__pycache__/__init__.cpython-312.pyc +0 -0
- lumibot/tools/__pycache__/alpaca_helpers.cpython-312.pyc +0 -0
- lumibot/tools/__pycache__/bitunix_helpers.cpython-312.pyc +0 -0
- lumibot/tools/__pycache__/black_scholes.cpython-312.pyc +0 -0
- lumibot/tools/__pycache__/ccxt_data_store.cpython-312.pyc +0 -0
- lumibot/tools/__pycache__/databento_helper.cpython-312.pyc +0 -0
- lumibot/tools/__pycache__/databento_helper_polars.cpython-312.pyc +0 -0
- lumibot/tools/__pycache__/debugers.cpython-312.pyc +0 -0
- lumibot/tools/__pycache__/decorators.cpython-312.pyc +0 -0
- lumibot/tools/__pycache__/helpers.cpython-312.pyc +0 -0
- lumibot/tools/__pycache__/indicators.cpython-312.pyc +0 -0
- lumibot/tools/__pycache__/lumibot_logger.cpython-312.pyc +0 -0
- lumibot/tools/__pycache__/pandas.cpython-312.pyc +0 -0
- lumibot/tools/__pycache__/polygon_helper.cpython-312.pyc +0 -0
- lumibot/tools/__pycache__/polygon_helper_async.cpython-312.pyc +0 -0
- lumibot/tools/__pycache__/polygon_helper_polars_optimized.cpython-312.pyc +0 -0
- lumibot/tools/__pycache__/projectx_helpers.cpython-312.pyc +0 -0
- lumibot/tools/__pycache__/schwab_helper.cpython-312.pyc +0 -0
- lumibot/tools/__pycache__/thetadata_helper.cpython-312.pyc +0 -0
- lumibot/tools/__pycache__/types.cpython-312.pyc +0 -0
- lumibot/tools/__pycache__/yahoo_helper.cpython-312.pyc +0 -0
- lumibot/tools/__pycache__/yahoo_helper_polars_optimized.cpython-312.pyc +0 -0
- lumibot/traders/__pycache__/__init__.cpython-312.pyc +0 -0
- lumibot/traders/__pycache__/trader.cpython-312.pyc +0 -0
- lumibot/trading_builtins/__pycache__/__init__.cpython-312.pyc +0 -0
- lumibot/trading_builtins/__pycache__/custom_stream.cpython-312.pyc +0 -0
- lumibot/trading_builtins/__pycache__/safe_list.cpython-312.pyc +0 -0
- {lumibot-4.1.3.dist-info → lumibot-4.2.1.dist-info}/WHEEL +0 -0
- {lumibot-4.1.3.dist-info → lumibot-4.2.1.dist-info}/licenses/LICENSE +0 -0
- {lumibot-4.1.3.dist-info → lumibot-4.2.1.dist-info}/top_level.txt +0 -0

lumibot/data_sources/polars_data.py
@@ -0,0 +1,986 @@
from collections import OrderedDict, defaultdict
from datetime import timedelta
from decimal import Decimal
from typing import Union

import pandas as pd

from lumibot.data_sources import DataSourceBacktesting
from lumibot.entities import Asset, Bars, Quote
from lumibot.tools.lumibot_logger import get_logger

logger = get_logger(__name__)


class PolarsData(DataSourceBacktesting):
    """
    PolarsData is a Backtesting-only DataSource that will be optimized to use Polars DataFrames.
    Currently identical to PandasData as a baseline. Will be incrementally converted to use Polars.
    """

    SOURCE = "POLARS"
    TIMESTEP_MAPPING = [
        {"timestep": "day", "representations": ["1D", "day"]},
        {"timestep": "minute", "representations": ["1M", "minute"]},
    ]

    def __init__(self, *args, pandas_data=None, auto_adjust=True, allow_option_quote_fallback: bool = False, **kwargs):
        super().__init__(*args, **kwargs)
        self.option_quote_fallback_allowed = allow_option_quote_fallback
        self.name = "polars"
        self.pandas_data = self._set_pandas_data_keys(pandas_data)
        self.auto_adjust = auto_adjust
        self._data_store = self.pandas_data
        self._date_index = None
        self._date_supply = None
        self._timestep = "minute"

        # Sliding window configuration (always-on, optimized for speed)
        self._HISTORY_WINDOW_BARS = 5000  # Fixed window size
        self._FUTURE_WINDOW_BARS = 1000  # Look-ahead buffer for efficiency
        self._TRIM_FREQUENCY_BARS = 1000  # Trim every 1000 iterations
        self._trim_iteration_count = 0  # Counter for periodic trimming

        # Aggregated bars cache (separate from pandas_data)
        # Uses existing OrderedDict infrastructure for LRU tracking
        self._aggregated_cache = OrderedDict()

        # Memory limits (1 GB hard cap)
        self.MAX_STORAGE_BYTES = 1_000_000_000

    def _trim_cached_data(self):
        """Periodically trim cached data to maintain sliding window.

        Called every _TRIM_FREQUENCY_BARS iterations to remove old bars
        that are outside the sliding window. This keeps memory usage low
        while maintaining enough history for lookback calculations.

        This is always-on and requires no user configuration.
        """
        # Increment iteration counter
        self._trim_iteration_count += 1

        # Only trim every TRIM_FREQUENCY_BARS iterations
        if self._trim_iteration_count < self._TRIM_FREQUENCY_BARS:
            return

        # Reset counter
        self._trim_iteration_count = 0

        # Get current datetime for window calculation
        current_dt = self.get_datetime()

        # Trim each DataPolars object in the data store
        # CRITICAL: Use each data object's own timestep, not global self._timestep
        # A backtest can have mixed timeframes (1m, 5m, 1h, 1d for same asset)
        trimmed_count = 0
        for asset_key, data in self._data_store.items():
            # Only trim if data is a DataPolars object (has trim_before method)
            if not hasattr(data, 'trim_before'):
                continue

            try:
                # Get this data object's timestep (not the global self._timestep!)
                data_timestep = getattr(data, 'timestep', 'minute')

                # Use convert_timestep_str_to_timedelta for robust conversion
                base_delta, _ = self.convert_timestep_str_to_timedelta(data_timestep)

                # Calculate cutoff for this specific data object
                # Keep HISTORY_WINDOW_BARS bars of this timestep before current time
                window_delta = base_delta * self._HISTORY_WINDOW_BARS
                cutoff_dt = current_dt - window_delta

                # Trim with the correct per-asset cutoff
                data.trim_before(cutoff_dt)

                trimmed_count += 1

            except Exception as e:
                logger.warning(f"Failed to trim data for {asset_key}: {e}")

        if trimmed_count > 0:
            logger.debug(f"[SLIDING WINDOW] Trimmed {trimmed_count} assets at iteration {self._TRIM_FREQUENCY_BARS}")

    def _get_aggregation_cache_key(self, asset, quote, timestep):
        """Generate a unique cache key for aggregated bars.

        Parameters
        ----------
        asset : Asset
            The asset
        quote : Asset
            The quote asset
        timestep : str
            The timestep (e.g., "5 minutes", "15 minutes", "hour", "day")

        Returns
        -------
        tuple
            Cache key (asset, quote, timestep)
        """
        if isinstance(asset, tuple):
            asset, quote = asset
        return (asset, quote, timestep)

    def _aggregate_polars_bars(self, source_data, target_timestep):
        """Aggregate minute-level polars data to higher timeframes.

        This is a critical performance optimization - aggregating once and caching
        is much faster than re-aggregating every iteration.

        Parameters
        ----------
        source_data : DataPolars
            Source data (typically 1-minute bars)
        target_timestep : str
            Target timestep ("5 minutes", "15 minutes", "hour", "day")

        Returns
        -------
        polars.DataFrame or None
            Aggregated data, or None if aggregation not possible
        """
        try:
            import polars as pl

            # Get the polars DataFrame from DataPolars
            if not hasattr(source_data, 'polars_df'):
                return None

            df = source_data.polars_df
            if df.height == 0:
                return None

            # Map timestep to polars interval
            interval_mapping = {
                "5 minutes": "5m",
                "15 minutes": "15m",
                "30 minutes": "30m",
                "hour": "1h",
                "2 hours": "2h",
                "4 hours": "4h",
                "day": "1d",
            }

            interval = interval_mapping.get(target_timestep)
            if not interval:
                logger.warning(f"Unsupported aggregation timestep: {target_timestep}")
                return None

            # Aggregate using polars group_by_dynamic (fast!)
            # This is the core optimization - polars aggregation is 10-100x faster than pandas
            aggregated = df.group_by_dynamic(
                "datetime",
                every=interval,
                closed="left",
                label="left"
            ).agg([
                pl.col("open").first(),
                pl.col("high").max(),
                pl.col("low").min(),
                pl.col("close").last(),
                pl.col("volume").sum(),
            ])

            logger.debug(f"[AGGREGATION] {source_data.asset.symbol}: {df.height} rows ({source_data.timestep}) → {aggregated.height} rows ({target_timestep})")
            return aggregated

        except Exception as e:
            logger.error(f"Error aggregating data: {e}")
            return None

    def _get_or_aggregate_bars(self, asset, quote, length, source_timestep, target_timestep):
        """Get aggregated bars from cache or create them.

        This method implements the aggregated bars cache to avoid re-aggregating
        5m/15m/1h bars from 1-minute data on every iteration.

        Parameters
        ----------
        asset : Asset
            The asset
        quote : Asset
            The quote asset
        length : int
            Number of bars requested
        source_timestep : str
            Source timestep (typically "minute")
        target_timestep : str
            Target timestep (e.g., "5 minutes", "15 minutes", "hour")

        Returns
        -------
        polars.DataFrame or None
            Aggregated bars, or None if not available
        """
        # Generate cache key
        cache_key = self._get_aggregation_cache_key(asset, quote, target_timestep)

        # Check if we already have aggregated data cached
        if cache_key in self._aggregated_cache:
            # Move to end (LRU tracking)
            self._aggregated_cache.move_to_end(cache_key)
            logger.debug(f"[AGG CACHE HIT] {asset.symbol} {target_timestep}")
            return self._aggregated_cache[cache_key]

        # Need to aggregate from source data
        asset_key = self.find_asset_in_data_store(asset, quote)
        if not asset_key or asset_key not in self._data_store:
            return None

        source_data = self._data_store[asset_key]

        # Only aggregate from DataPolars objects (has polars_df)
        if not hasattr(source_data, 'polars_df'):
            logger.warning(f"Cannot aggregate - source data is not DataPolars: {type(source_data)}")
            return None

        # Perform aggregation
        aggregated_df = self._aggregate_polars_bars(source_data, target_timestep)
        if aggregated_df is None:
            return None

        # Cache the result (LRU cache)
        self._aggregated_cache[cache_key] = aggregated_df
        logger.debug(f"[AGG CACHE MISS] {asset.symbol} {target_timestep} - cached {aggregated_df.height} rows")

        # Note: Memory limits are enforced periodically in get_historical_prices()
        # Don't enforce here to avoid immediate eviction after caching

        return aggregated_df

    def _enforce_memory_limits(self):
        """Enforce memory limits using LRU eviction.

        This method ensures total memory usage stays under MAX_STORAGE_BYTES (1GB)
        by evicting least-recently-used items from both _data_store and _aggregated_cache.

        Uses the proven LRU pattern from polygon_backtesting_pandas.py.

        PERFORMANCE: Only checks every _TRIM_FREQUENCY_BARS iterations (same as trim).
        Checking memory on every get_historical_prices() call is expensive!
        """
        # Use the same periodic counter as _trim_cached_data
        # Only check memory limits when we actually trim (every 1000 iterations)
        # This avoids iterating all data on every get_historical_prices call
        if self._trim_iteration_count != 0:
            return  # Not time to check yet

        try:
            # Calculate total memory usage
            storage_used = 0

            # Memory from _data_store (DataPolars objects)
            for data in self._data_store.values():
                if hasattr(data, 'polars_df'):
                    # Estimate polars DataFrame memory
                    df = data.polars_df
                    if df.height > 0:
                        # Polars estimated_size() returns bytes
                        storage_used += df.estimated_size()

            # Memory from _aggregated_cache (polars DataFrames)
            for agg_df in self._aggregated_cache.values():
                if agg_df is not None and hasattr(agg_df, 'estimated_size'):
                    storage_used += agg_df.estimated_size()

            if storage_used <= self.MAX_STORAGE_BYTES:
                return  # Under limit, nothing to do

            logger.debug(f"[MEMORY] Storage used: {storage_used:,} bytes ({len(self._data_store)} data + {len(self._aggregated_cache)} aggregated)")
            logger.warning(f"[MEMORY] Exceeds limit of {self.MAX_STORAGE_BYTES:,} bytes, evicting LRU items...")

            # Evict from aggregated cache first (less critical than source data)
            while storage_used > self.MAX_STORAGE_BYTES and len(self._aggregated_cache) > 0:
                # popitem(last=False) removes oldest (LRU)
                k, agg_df = self._aggregated_cache.popitem(last=False)
                if agg_df is not None and hasattr(agg_df, 'estimated_size'):
                    freed = agg_df.estimated_size()
                    storage_used -= freed
                    logger.debug(f"[MEMORY] Evicted aggregated cache for {k}: freed {freed:,} bytes")
                else:
                    # Item has no size - assume 0 bytes freed but continue evicting
                    logger.warning(f"[MEMORY] Evicted aggregated cache for {k}: no estimated_size(), assuming 0 bytes")

            # If still over limit, evict from data_store (more aggressive)
            evicted_data_items = 0
            while storage_used > self.MAX_STORAGE_BYTES and len(self._data_store) > 0:
                # popitem(last=False) removes oldest (LRU)
                k, data = self._data_store.popitem(last=False)
                if hasattr(data, 'polars_df'):
                    df = data.polars_df
                    if df.height > 0:
                        freed = df.estimated_size()
                        storage_used -= freed
                        evicted_data_items += 1
                        logger.warning(f"[MEMORY] Evicted data_store for {k}: freed {freed:,} bytes")
                    else:
                        # DataFrame is empty - assume 0 bytes
                        evicted_data_items += 1
                        logger.warning(f"[MEMORY] Evicted data_store for {k}: empty DataFrame, 0 bytes freed")
                else:
                    # Not a DataPolars object - assume 0 bytes
                    logger.warning(f"[MEMORY] Evicted data_store for {k}: no polars_df, assuming 0 bytes")

            if evicted_data_items > 0:
                logger.warning(f"[MEMORY] Evicted {evicted_data_items} data items to stay under {self.MAX_STORAGE_BYTES:,} bytes")

            logger.debug(f"[MEMORY] After eviction: {storage_used:,} bytes ({len(self._data_store)} data + {len(self._aggregated_cache)} aggregated)")

        except Exception as e:
            logger.error(f"Error enforcing memory limits: {e}")

    @staticmethod
    def _set_pandas_data_keys(pandas_data):
        # OrderedDict tracks the LRU dataframes for when it comes time to do evictions.
        new_pandas_data = OrderedDict()

        def _get_new_pandas_data_key(data):
            # Always save the asset as a tuple of Asset and quote
            if isinstance(data.asset, tuple):
                return data.asset
            elif isinstance(data.asset, Asset):
                # If quote is not specified, use USD as the quote
                if data.quote is None:
                    # Warn that USD is being used as the quote
                    logger.warning(f"No quote specified for {data.asset}. Using USD as the quote.")
                    return data.asset, Asset(symbol="USD", asset_type="forex")
                return data.asset, data.quote
            else:
                raise ValueError("Asset must be an Asset or a tuple of Asset and quote")

        # Check if pandas_data is a dictionary
        if isinstance(pandas_data, dict):
            for k, data in pandas_data.items():
                key = _get_new_pandas_data_key(data)
                new_pandas_data[key] = data

        # Check if pandas_data is a list
        elif isinstance(pandas_data, list):
            for data in pandas_data:
                key = _get_new_pandas_data_key(data)
                new_pandas_data[key] = data

        return new_pandas_data

    def load_data(self):
        self._data_store = self.pandas_data
        self._date_index = self.update_date_index()

        if len(self._data_store.values()) > 0:
            self._timestep = list(self._data_store.values())[0].timestep

        pcal = self.get_trading_days_pandas()
        self._date_index = self.clean_trading_times(self._date_index, pcal)
        for _, data in self._data_store.items():
            data.repair_times_and_fill(self._date_index)
        return pcal

    def clean_trading_times(self, dt_index, pcal):
        """Fill gaps within trading days using the supplied market calendar.

        Parameters
        ----------
        dt_index : pandas.DatetimeIndex
            Original datetime index.
        pcal : pandas.DataFrame
            Calendar with ``market_open`` and ``market_close`` columns indexed by date.

        Returns
        -------
        pandas.DatetimeIndex
            Cleaned index with one-minute frequency during market hours.
        """
        # Ensure the datetime index is in datetime format and drop duplicate timestamps
        dt_index = pd.to_datetime(dt_index).drop_duplicates()

        # Create a DataFrame with dt_index as the index and sort it
        df = pd.DataFrame(range(len(dt_index)), index=dt_index)
        df = df.sort_index()

        # Create a column for the date portion only (normalize to date, keeping as datetime64 type)
        df["dates"] = df.index.normalize()

        # Merge with the trading calendar on the 'dates' column to get market open/close times.
        # Use a left join to keep all rows from the original index.
        df = df.merge(
            pcal[["market_open", "market_close"]],
            left_on="dates",
            right_index=True,
            how="left"
        )

        if self._timestep == "minute":
            # Resample to a 1-minute frequency, using pad to fill missing times.
            # At this point, the index is unique so asfreq will work correctly.
            df = df.asfreq("1min", method="pad")

            # Filter to include only the rows that fall within market open and close times.
            result_index = df.loc[
                (df.index >= df["market_open"]) & (df.index <= df["market_close"])
            ].index
        else:
            result_index = df.index

        return result_index

    def get_trading_days_pandas(self):
        pcal = pd.DataFrame(self._date_index)

        if pcal.empty:
            # Create a dummy dataframe that spans the entire date range with market_open and market_close
            # set to 00:00:00 and 23:59:59 respectively.
            result = pd.DataFrame(
                index=pd.date_range(start=self.datetime_start, end=self.datetime_end, freq="D"),
                columns=["market_open", "market_close"],
            )
            result["market_open"] = result.index.floor("D")
            result["market_close"] = result.index.ceil("D") - pd.Timedelta("1s")
            return result

        else:
            pcal.columns = ["datetime"]
            # Normalize to date but keep as datetime64 type (not date objects)
            pcal["date"] = pcal["datetime"].dt.normalize()
            result = pcal.groupby("date").agg(
                market_open=(
                    "datetime",
                    "first",
                ),
                market_close=(
                    "datetime",
                    "last",
                ),
            )
            return result

    def get_assets(self):
        return list(self._data_store.keys())

    def get_asset_by_name(self, name):
        return [asset for asset in self.get_assets() if asset.name == name]

    def get_asset_by_symbol(self, symbol, asset_type=None):
        """Finds the assets that match the symbol. If type is specified
        finds the assets matching symbol and type.

        Parameters
        ----------
        symbol : str
            The symbol of the asset.
        asset_type : str
            Asset type. One of:
            - stock
            - future
            - option
            - forex

        Returns
        -------
        list of Asset
        """
        store_assets = self.get_assets()
        if asset_type is None:
            return [asset for asset in store_assets if asset.symbol == symbol]
        else:
            return [asset for asset in store_assets if (asset.symbol == symbol and asset.asset_type == asset_type)]

    def update_date_index(self):
        dt_index = None
        for asset, data in self._data_store.items():
            if dt_index is None:
                df = data.df
                dt_index = df.index
            else:
                dt_index = dt_index.join(data.df.index, how="outer")

        if dt_index is None:
            # Build a dummy index
            freq = "1min" if self._timestep == "minute" else "1D"
            dt_index = pd.date_range(start=self.datetime_start, end=self.datetime_end, freq=freq)

        else:
            if self.datetime_end < dt_index[0]:
                raise ValueError(
                    f"The ending date for the backtest was set for {self.datetime_end}. "
                    f"The earliest data entered is {dt_index[0]}. \nNo backtest can "
                    f"be run since there is no data before the backtest end date."
                )
            elif self.datetime_start > dt_index[-1]:
                raise ValueError(
                    f"The starting date for the backtest was set for {self.datetime_start}. "
                    f"The latest data entered is {dt_index[-1]}. \nNo backtest can "
                    f"be run since there is no data after the backtest start date."
                )

        return dt_index

    def get_last_price(self, asset, quote=None, exchange=None) -> Union[float, Decimal, None]:
        # Takes an asset and returns the last known price
        tuple_to_find = self.find_asset_in_data_store(asset, quote)

        if tuple_to_find in self._data_store:
            # LRU tracking - mark this data as recently used
            self._data_store.move_to_end(tuple_to_find)
            data = self._data_store[tuple_to_find]
            try:
                dt = self.get_datetime()
                price = data.get_last_price(dt)

                # Check if price is NaN
                if pd.isna(price):
                    # Provide more specific error message for index assets
                    if hasattr(asset, 'asset_type') and asset.asset_type == Asset.AssetType.INDEX:
                        logger.warning(f"Index asset `{asset.symbol}` returned NaN price. This could be due to missing data for the index or a subscription issue if using Polygon.io. Note that some index data (like SPX) requires a paid subscription. Consider using Yahoo Finance for broader index data coverage.")
                    else:
                        logger.debug(f"Error getting last price for {tuple_to_find}: price is NaN")
                    return None

                return price
            except Exception as e:
                logger.debug(f"Error getting last price for {tuple_to_find}: {e}")
                return None
        else:
            # Provide more specific error message when asset not found in data store
            if hasattr(asset, 'asset_type') and asset.asset_type == Asset.AssetType.INDEX:
                logger.warning(f"The index asset `{asset.symbol}` does not exist or does not have data. Index data may not be available from this data source. If using Polygon, note that some index data (like SPX) requires a paid subscription. Consider using Yahoo Finance for broader index data coverage.")
            return None

    def get_quote(self, asset, quote=None, exchange=None) -> Quote:
        """
        Get the latest quote for an asset.
        Returns a Quote object with bid, ask, last, and other fields if available.

        Parameters
        ----------
        asset : Asset object
            The asset for which the quote is needed.
        quote : Asset object, optional
            The quote asset for cryptocurrency pairs.
        exchange : str, optional
            The exchange to get the quote from.

        Returns
        -------
        Quote
            A Quote object with the quote information.
        """
        from lumibot.entities import Quote

        # Takes an asset and returns the last known price
        tuple_to_find = self.find_asset_in_data_store(asset, quote)

        if tuple_to_find in self._data_store:
            # LRU tracking - mark this data as recently used
            self._data_store.move_to_end(tuple_to_find)
            data = self._data_store[tuple_to_find]
            dt = self.get_datetime()
            ohlcv_bid_ask_dict = data.get_quote(dt)

            # Check if ohlcv_bid_ask_dict is NaN
            if pd.isna(ohlcv_bid_ask_dict):
                logger.debug(f"Error getting ohlcv_bid_ask for {tuple_to_find}: ohlcv_bid_ask_dict is NaN")
                return Quote(asset=asset)

            # Convert dictionary to Quote object
            return Quote(
                asset=asset,
                price=ohlcv_bid_ask_dict.get('close'),
                bid=ohlcv_bid_ask_dict.get('bid'),
                ask=ohlcv_bid_ask_dict.get('ask'),
                volume=ohlcv_bid_ask_dict.get('volume'),
                timestamp=dt,
                bid_size=ohlcv_bid_ask_dict.get('bid_size'),
                ask_size=ohlcv_bid_ask_dict.get('ask_size'),
                raw_data=ohlcv_bid_ask_dict
            )
        else:
            return Quote(asset=asset)

    def get_last_prices(self, assets, quote=None, exchange=None, **kwargs):
        result = {}
        for asset in assets:
            result[asset] = self.get_last_price(asset, quote=quote, exchange=exchange)
        return result

    def _get_polars_data_entry(self, asset, quote, timestep):
        """Retrieve a cached DataPolars entry for a specific timestep if available."""
        polars_cache = getattr(self, "_polars_data", {})

        # Build candidate quotes: exact match first, then USD fallback (default storage)
        quote_candidates = []
        if quote is not None:
            quote_candidates.append(quote)
        quote_candidates.append(Asset(symbol="USD", asset_type="forex"))

        for candidate_quote in quote_candidates:
            key = (asset, candidate_quote, timestep)
            entry = polars_cache.get(key)
            if entry is not None:
                return entry

        # Final attempt: linear scan to cope with differing Asset instances
        for (cached_asset, cached_quote, cached_timestep), entry in polars_cache.items():
            if cached_asset == asset and cached_timestep == timestep:
                if quote is None or cached_quote == quote:
                    return entry
        return None

    def find_asset_in_data_store(self, asset, quote=None, timestep=None):
        """
        Locate the cache key for an asset, preferring timestep-aware keys but
        gracefully falling back to legacy (asset, quote) entries for backward
        compatibility.
        """
        candidates = []

        if timestep is not None:
            base_quote = quote if quote is not None else Asset("USD", "forex")
            candidates.append((asset, base_quote, timestep))
            # If a quote was explicitly supplied, also consider the USD fallback to
            # match historical cache entries that were stored with USD.
            if quote is not None:
                candidates.append((asset, Asset("USD", "forex"), timestep))

        if quote is not None:
            candidates.append((asset, quote))

        if isinstance(asset, Asset):
            candidates.append((asset, Asset("USD", "forex")))

        candidates.append(asset)

        for key in candidates:
            if key in self._data_store:
                return key
        return None

    def _pull_source_symbol_bars(
        self,
        asset,
        length,
        timestep="",
        timeshift=0,
        quote=None,
        exchange=None,
        include_after_hours=True,
    ):
        timestep = timestep if timestep else self.MIN_TIMESTEP
        if exchange is not None:
            logger.warning(
                f"the exchange parameter is not implemented for PandasData, but {exchange} was passed as the exchange"
            )

        if not timeshift:
            timeshift = 0

        asset_to_find = self.find_asset_in_data_store(asset, quote, timestep)

        if asset_to_find in self._data_store:
            # LRU tracking - mark this data as recently used
            self._data_store.move_to_end(asset_to_find)
            data = self._data_store[asset_to_find]
        else:
            if hasattr(asset, 'asset_type') and asset.asset_type == Asset.AssetType.INDEX:
                logger.warning(f"The index asset `{asset.symbol}` does not exist or does not have data. Index data may not be available from this data source. If using Polygon, note that some index data (like SPX) requires a paid subscription. Consider using Yahoo Finance for broader index data coverage.")
            else:
                logger.warning(f"The asset: `{asset}` does not exist or does not have data.")
            return

        desired_timestep = timestep

        # Prefer a direct DataPolars match for the requested timestep (if available) to
        # avoid aggregating from trimmed minute windows.
        current_timestep = getattr(data, "timestep", None)
        if desired_timestep and current_timestep != desired_timestep:
            direct_match = self._get_polars_data_entry(asset, quote, desired_timestep)
            if direct_match is not None:
                data = direct_match
                current_timestep = data.timestep

        # OPTIMIZATION: Use aggregated bars cache for different timesteps
        # This avoids re-aggregating 5m/15m/1h bars from minute data every iteration
        source_timestep = current_timestep
        can_aggregate = (
            source_timestep == "minute"
            and timestep != source_timestep
            and hasattr(data, 'polars_df')  # Only for DataPolars objects
            and timestep in ["5 minutes", "15 minutes", "30 minutes", "hour", "2 hours", "4 hours", "day"]
        )

        if can_aggregate:
            # Try to get aggregated bars from cache
            aggregated_df = self._get_or_aggregate_bars(asset, quote, length, source_timestep, timestep)
            if aggregated_df is not None:
                # We have aggregated data - now filter and tail it like get_bars would
                import polars as pl

                now = self.get_datetime()
                # Apply timeshift if specified
                # CRITICAL: Integer timeshift represents BAR offsets, not minute deltas!
                # Must calculate adjustment based on the actual timestep being requested.
                if timeshift:
                    from datetime import timedelta
                    if isinstance(timeshift, int):
                        # Calculate timedelta for one bar of this timestep
                        timestep_delta, _ = self.convert_timestep_str_to_timedelta(timestep)
                        # Multiply by timeshift to get total adjustment
                        # Example: timestep="5 minutes", timeshift=-2 → adjustment = -10 minutes
                        now = now + (timestep_delta * timeshift)
                    else:
                        # Timeshift is already a timedelta - use it directly
                        now = now + timeshift

                # Filter to current time and take last 'length' bars
                # Convert now to match polars DataFrame timezone
                import pytz
                if now.tzinfo is None:
                    now_aware = pytz.utc.localize(now)
                else:
                    now_aware = now

                polars_tz = aggregated_df["datetime"].dtype.time_zone
                if polars_tz:
                    import pandas as pd
                    now_compat = pd.Timestamp(now_aware).tz_convert(polars_tz)
                else:
                    now_compat = now_aware

                filtered = aggregated_df.filter(pl.col("datetime") <= now_compat)
                result = filtered.tail(length)

                if result.height >= length:
                    logger.debug(f"[AGG CACHE] {asset.symbol} {timestep}: returning {result.height} bars from cache")
                    return result

                # Aggregated slice is insufficient—evict this cache entry and try to fall back
                logger.warning(
                    "[AGG CACHE] %s %s: insufficient rows (requested=%s, filtered=%s, returning=%s); falling back",
                    asset.symbol,
                    timestep,
                    length,
                    filtered.height,
                    result.height,
                )
                cache_key = self._get_aggregation_cache_key(asset, quote, timestep)
                self._aggregated_cache.pop(cache_key, None)

                direct_match = self._get_polars_data_entry(asset, quote, timestep)
                if direct_match is not None:
                    data = direct_match
                    source_timestep = data.timestep
                # Fall through to regular get_bars

        # Regular path - use data.get_bars() which handles timestep conversion internally
        now = self.get_datetime()

        try:
            res = data.get_bars(now, length=length, timestep=timestep, timeshift=timeshift)
        # Return None if data.get_bars returns a ValueError
        except ValueError as e:
            logger.debug(f"Error getting bars for {asset}: {e}")
            return None

        return res

    def _pull_source_symbol_bars_between_dates(
        self,
        asset,
        timestep="",
        quote=None,
        exchange=None,
        include_after_hours=True,
        start_date=None,
        end_date=None,
    ):
        """Pull all bars for an asset"""
        timestep = timestep if timestep else self.MIN_TIMESTEP
        asset_to_find = self.find_asset_in_data_store(asset, quote)

        if asset_to_find in self._data_store:
            # LRU tracking - mark this data as recently used
            self._data_store.move_to_end(asset_to_find)
            data = self._data_store[asset_to_find]
        else:
            if hasattr(asset, 'asset_type') and asset.asset_type == Asset.AssetType.INDEX:
                logger.warning(f"The index asset `{asset.symbol}` does not exist or does not have data. Index data may not be available from this data source. If using Polygon, note that some index data (like SPX) requires a paid subscription. Consider using Yahoo Finance for broader index data coverage.")
            else:
                logger.warning(f"The asset: `{asset}` does not exist or does not have data.")
            return

        try:
            res = data.get_bars_between_dates(start_date=start_date, end_date=end_date, timestep=timestep)
        # Return None if data.get_bars returns a ValueError
        except ValueError as e:
            logger.debug(f"Error getting bars for {asset}: {e}")
            res = None
        return res

    def _pull_source_bars(
        self,
        assets,
        length,
        timestep="",
        timeshift=None,
        quote=None,
        include_after_hours=True,
    ):
        """pull broker bars for a list assets"""
        timestep = timestep if timestep else self.MIN_TIMESTEP
        self._parse_source_timestep(timestep, reverse=True)

        result = {}
        for asset in assets:
            result[asset] = self._pull_source_symbol_bars(
                asset, length, timestep=timestep, timeshift=timeshift, quote=quote
            )
            # remove assets that have no data from the result
            if result[asset] is None:
                result.pop(asset)

        return result

    def _parse_source_symbol_bars(self, response, asset, quote=None, length=None, return_polars=False):
        """parse broker response for a single asset

        CRITICAL: return_polars defaults to False for backwards compatibility.
        Existing strategies expect pandas DataFrames!
        """
        asset1 = asset
        asset2 = quote
        if isinstance(asset, tuple):
            asset1, asset2 = asset
        bars = Bars(response, self.SOURCE, asset1, quote=asset2, raw=response, return_polars=return_polars)
        return bars

    def get_yesterday_dividend(self, asset, quote=None):
        pass

    def get_yesterday_dividends(self, assets, quote=None):
        pass

    # =======Options methods.=================
    def get_chains(self, asset: Asset, quote: Asset = None, exchange: str = None):
        """Returns option chains.

        Obtains option chain information for the asset (stock) from each
        of the exchanges the options trade on and returns a dictionary
        for each exchange.

        Parameters
        ----------
        asset : Asset object
            The stock whose option chain is being fetched. Represented
            as an asset object.
        quote : Asset object, optional
            The quote asset. Default is None.
        exchange : str, optional
            The exchange to fetch the option chains from. For PandasData, will only use "SMART".

        Returns
        -------
        dict
            Mapping with keys such as ``Multiplier`` (e.g. ``"100"``) and ``Chains``.
            ``Chains`` is a nested dictionary where expiration dates map to strike lists,
            e.g. ``chains['Chains']['CALL']['2023-07-31'] = [strike1, strike2, ...]``.
        """
        chains = dict(
            Multiplier=100,
            Exchange="SMART",
            Chains={"CALL": defaultdict(list), "PUT": defaultdict(list)},
        )

        for store_item, data in self._data_store.items():
            store_asset = store_item[0]
            if store_asset.asset_type != "option":
                continue
            if store_asset.symbol != asset.symbol:
                continue
            chains["Chains"][store_asset.right][store_asset.expiration].append(store_asset.strike)

        return chains

    def get_start_datetime_and_ts_unit(self, length, timestep, start_dt=None, start_buffer=timedelta(days=5)):
        """
        Get the start datetime for the data.

        Parameters
        ----------
        length : int
            The number of data points to get.
        timestep : str
            The timestep to use. For example, "1minute" or "1hour" or "1day".

        Returns
        -------
        datetime
            The start datetime.
        str
            The timestep unit.
        """
        # Convert timestep string to timedelta and get start datetime
        td, ts_unit = self.convert_timestep_str_to_timedelta(timestep)

        if ts_unit == "day":
            weeks_requested = length // 5  # Full trading week is 5 days
            extra_padding_days = weeks_requested * 3  # to account for 3day weekends
            td = timedelta(days=length + extra_padding_days)
        else:
            td *= length

        if start_dt is not None:
            start_datetime = start_dt - td
        else:
            start_datetime = self.datetime_start - td

        # Subtract an extra 5 days to the start datetime to make sure we have enough
        # data when it's a sparsely traded asset, especially over weekends
        start_datetime = start_datetime - start_buffer

        return start_datetime, ts_unit

    def get_historical_prices(
        self,
        asset: Asset,
        length: int,
        timestep: str = None,
        timeshift: int = None,
        quote: Asset = None,
        exchange: str = None,
        include_after_hours: bool = True,
        # PolarsData supports return_polars to enable polars-backed Bars for performance.
        # When True, returns Bars with polars DataFrame internally (lazy conversion to pandas).
        # CRITICAL: Default MUST be False for backwards compatibility with existing strategies!
        return_polars: bool = False,
    ):
        """Get bars for a given asset"""
        # Periodically trim cached data to maintain sliding window
        self._trim_cached_data()

        # Enforce memory limits after trimming (same periodic frequency)
        # This ensures total memory usage stays under 1GB cap
        self._enforce_memory_limits()

        if isinstance(asset, str):
            asset = Asset(symbol=asset)

        if not timestep:
            timestep = self.get_timestep()
        response = self._pull_source_symbol_bars(
            asset,
            length,
            timestep=timestep,
            timeshift=timeshift,
            quote=quote,
            exchange=exchange,
            include_after_hours=include_after_hours,
        )
        if isinstance(response, float):
            return response
        elif response is None:
            return None

        bars = self._parse_source_symbol_bars(response, asset, quote=quote, length=length, return_polars=return_polars)
        return bars
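Below is a minimal usage sketch of the new data source; it is not part of the diff. It assumes the DataSourceBacktesting base class still accepts datetime_start/datetime_end keyword arguments (as in earlier lumibot releases) and that per-asset Data/DataPolars objects are supplied through pandas_data; the return_polars flag on get_historical_prices comes straight from the signature above and defaults to False for backwards compatibility.

from datetime import datetime

from lumibot.data_sources.polars_data import PolarsData
from lumibot.entities import Asset

# Hypothetical setup: real use would pass per-asset Data/DataPolars objects in pandas_data;
# with an empty list the calls below simply log a warning and return None.
data_source = PolarsData(
    datetime_start=datetime(2025, 1, 2),
    datetime_end=datetime(2025, 1, 31),
    pandas_data=[],  # per-asset data containers would go here
)
data_source.load_data()

asset = Asset(symbol="SPY")

# Backwards-compatible default: pandas-backed Bars (return_polars=False).
bars = data_source.get_historical_prices(asset, length=50, timestep="5 minutes")

# Opt-in polars-backed Bars via the new return_polars flag.
bars_pl = data_source.get_historical_prices(
    asset, length=50, timestep="5 minutes", return_polars=True
)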