lumibot 4.0.21__py3-none-any.whl → 4.0.23__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of lumibot might be problematic.

@@ -0,0 +1,793 @@
+"""DataBento data source using Polars with proper Live API integration - FIXED VERSION.
+
+This implementation uses:
+- Live API for real-time data streaming
+- Historical API for data >24 hours old
+- Proper handling of DataBento message types
+- Correct price conversion from fixed-point format
+"""
+
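DataBento encodes prices as 64-bit integers scaled by 1e9. The module below converts defensively, dividing only when a value is too large to plausibly be an already-converted float. A minimal standalone sketch of that heuristic (the helper name is illustrative, not part of the package):

    def normalize_price(raw) -> float:
        """Convert a DataBento fixed-point price to a float.

        Values above 1e10 are assumed to be raw fixed-point integers
        (e.g. 5300250000000 -> 5300.25); smaller values are assumed to be
        already-converted and are passed through unchanged.
        """
        raw = float(raw)
        return raw / 1e9 if raw > 1e10 else raw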
+from datetime import datetime, timedelta, timezone
+from decimal import Decimal
+from typing import Dict, Optional, Union
+import time
+import threading
+import queue
+from collections import defaultdict
+
+import polars as pl
+import databento as db
+
+from lumibot.data_sources import DataSource
+from lumibot.data_sources.polars_mixin import PolarsMixin
+from lumibot.entities import Asset, Bars, Quote
+from lumibot.tools import databento_helper_polars
+from lumibot.tools.databento_helper_polars import _ensure_polars_datetime_timezone as _ensure_polars_tz
+from lumibot.tools.lumibot_logger import get_logger
+
+logger = get_logger(__name__)
+
+
+class DataBentoDataPolarsLive(PolarsMixin, DataSource):
+    """
+    DataBento data source optimized with Polars and proper Live API usage.
+
+    Uses Live API for real-time trade streaming to achieve <1 minute lag.
+    Falls back to Historical API for older data.
+    """
+
+    SOURCE = "DATABENTO"
+    MIN_TIMESTEP = "minute"
+    TIMESTEP_MAPPING = {
+        "minute": "1m",
+        "day": "1d",
+        "hour": "1h",
+    }
+
+    def __init__(
+        self,
+        api_key: str,
+        has_paid_subscription: bool = False,
+        enable_cache: bool = True,
+        cache_duration_minutes: int = 60,
+        enable_live_stream: bool = True,
+        timeout: int = None,  # For backwards compatibility
+        max_retries: int = None,  # For backwards compatibility
+    ):
+        """Initialize DataBento data source with Live API support"""
+        super().__init__(api_key=api_key, has_paid_subscription=has_paid_subscription)
+
+        # Core configuration
+        self._api_key = api_key
+        self.has_paid_subscription = has_paid_subscription
+        self.enable_cache = enable_cache
+        self.cache_duration = timedelta(minutes=cache_duration_minutes)
+        self.enable_live_stream = enable_live_stream
+
+        # Caches
+        self._last_price_cache = {}
+        self._eager_cache = {}
+        self._filtered_data_cache = {}
+        self._cache_metadata = {}
+        self._cache_timestamps = {}
+
+        # Live streaming state
+        self._live_client = None
+        self._producer_threads = {}  # Map symbol to producer thread
+        self._consumer_thread = None
+        self._finalizer_thread = None
+        self._stop_streaming = False
+        self._minute_bars = defaultdict(dict)
+        self._bars_lock = threading.Lock()
+        self._finalized_minutes = defaultdict(set)
+        self._subscribed_symbols = set()
+        self._last_trade_time = {}
+        self._last_ts_event = {}  # Track last timestamp per symbol for reconnection
+        self._symbol_mapping = {}  # Maps instrument_id to symbol
+        self._record_queue = queue.Queue(maxsize=10000)
+        self._reconnect_backoff = 1.0
+
+        # Live tick cache
+        self._live_cache_lock = threading.RLock()
+        self._latest_trades: Dict[str, dict] = {}
+        self._latest_quotes: Dict[str, dict] = {}
+        self._max_live_age = timedelta(seconds=2)
+        self._stale_warning_issued: Dict[str, bool] = {}
+
+        # Configuration
+        self._finalize_grace_seconds = 3  # Wait 3 seconds after minute ends to finalize
+        self._prune_older_minutes = 720  # Remove bars older than 12 hours
+        self._resub_overlap_seconds = 5  # Overlap on reconnection
+
+        if self.enable_live_stream:
+            self._init_live_streaming()
+
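For illustration, constructing the source with the arguments above might look like the following sketch (the API key is a placeholder, and the Asset usage assumes lumibot's standard entity signature):

    from lumibot.entities import Asset

    data_source = DataBentoDataPolarsLive(
        api_key="db-XXXX",            # your DataBento API key
        enable_live_stream=True,      # starts the consumer/finalizer threads
        cache_duration_minutes=60,
    )
    es = Asset("ES", asset_type=Asset.AssetType.CONT_FUTURE)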
+    def _should_use_live_api(self, start_dt: datetime, end_dt: datetime) -> bool:
+        """Return True when the requested window should use the live API."""
+        if not self.enable_live_stream:
+            return False
+        if start_dt is None or end_dt is None:
+            return False
+        if start_dt.tzinfo is None:
+            start_dt = start_dt.replace(tzinfo=timezone.utc)
+        if end_dt.tzinfo is None:
+            end_dt = end_dt.replace(tzinfo=timezone.utc)
+        if end_dt < start_dt:
+            start_dt, end_dt = end_dt, start_dt
+        now = datetime.now(timezone.utc)
+        live_window = timedelta(hours=24)
+        return end_dt >= now - live_window
+
+
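The routing rule matches the module docstring: any window that ends within the last 24 hours is considered live. A quick check, continuing the earlier sketch:

    from datetime import datetime, timedelta, timezone

    now = datetime.now(timezone.utc)
    data_source._should_use_live_api(now - timedelta(hours=6), now - timedelta(hours=2))  # True: ends inside 24 h
    data_source._should_use_live_api(now - timedelta(days=5), now - timedelta(days=3))    # False: ends 3 days ago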
+    def _init_live_streaming(self):
+        """Initialize DataBento Live API client for real-time data"""
+        try:
+            self._stop_streaming = False
+
+            # Start consumer thread to process records from queue
+            self._consumer_thread = threading.Thread(target=self._consumer_loop, daemon=True)
+            self._consumer_thread.start()
+
+            # Start finalizer thread to mark old bars as complete
+            self._finalizer_thread = threading.Thread(target=self._finalizer_loop, daemon=True)
+            self._finalizer_thread.start()
+
+            logger.debug("[DATABENTO][LIVE] Live streaming threads initialized")
+
+        except Exception as e:
+            logger.error(f"[DATABENTO][LIVE] Failed to initialize Live streaming: {e}", exc_info=True)
+            self.enable_live_stream = False
+
+    def _live_stream_worker(self, symbol: str, start_time: datetime):
+        """Producer thread that subscribes and iterates in the same context"""
+        logger.debug(f"[DATABENTO][PRODUCER] Starting for {symbol}")
+        reconnect_attempts = 0
+        max_reconnect_attempts = 5
+        backoff_seconds = 1
+
+        while not self._stop_streaming and reconnect_attempts < max_reconnect_attempts:
+            try:
+                # Create a new client for this producer
+                client = db.Live(key=self._api_key)
+
+                logger.debug(f"[DATABENTO][PRODUCER] Subscribing to {symbol} from {start_time.isoformat()}")
+
+                # Subscribe - must happen in same context as iteration
+                client.subscribe(
+                    dataset="GLBX.MDP3",
+                    schema="trades",
+                    stype_in="raw_symbol",
+                    symbols=[symbol],
+                    start=start_time.isoformat(),
+                )
+
+                # Attempt to subscribe to top-of-book quotes for richer data
+                # ("mbp-1" is DataBento's top-of-book schema; there is no "quotes" schema)
+                try:
+                    client.subscribe(
+                        dataset="GLBX.MDP3",
+                        schema="mbp-1",
+                        stype_in="raw_symbol",
+                        symbols=[symbol],
+                        start=start_time.isoformat(),
+                    )
+                except Exception as quote_sub_err:
+                    logger.debug(f"[DATABENTO][PRODUCER] Quote subscription not available for {symbol}: {quote_sub_err}")
+
+                # Immediately iterate in the SAME context
+                record_count = 0
+                error_count = 0
+
+                for record in client:
+                    if self._stop_streaming:
+                        break
+
+                    record_count += 1
+
+                    # Handle ErrorMsg records
+                    if record.__class__.__name__ == 'ErrorMsg':
+                        error_count += 1
+                        err_msg = getattr(record, 'err', 'Unknown error')
+                        logger.error(f"[DATABENTO][PRODUCER] Error from server: {err_msg}")
+                        if error_count > 3:
+                            logger.error("[DATABENTO][PRODUCER] Too many errors, will reconnect")
+                            break
+                        continue
+
+                    # Reset error count on successful records
+                    error_count = 0
+
+                    # Put record in queue for consumer
+                    try:
+                        self._record_queue.put((symbol, record), timeout=0.1)
+
+                        # Track last event timestamp for reconnection
+                        if hasattr(record, 'ts_event'):
+                            self._last_ts_event[symbol] = record.ts_event
+
+                        # Log progress (only first few)
+                        if record_count <= 3:
+                            logger.debug(f"[DATABENTO][PRODUCER] {symbol} record #{record_count}: {record.__class__.__name__}")
+
+                    except queue.Full:
+                        logger.warning("[DATABENTO][PRODUCER] Queue full, dropping record")
+
+                # Clean exit
+                logger.info(f"[DATABENTO][PRODUCER] {symbol} stopped after {record_count} records")
+                break  # Successful completion
+
+            except Exception as e:
+                logger.error(f"[DATABENTO][PRODUCER] {symbol} error: {e}")
+                reconnect_attempts += 1
+
+                if reconnect_attempts < max_reconnect_attempts:
+                    sleep_time = backoff_seconds * (2 ** reconnect_attempts)
+                    logger.info(f"[DATABENTO][PRODUCER] Reconnecting {symbol} in {sleep_time}s (attempt {reconnect_attempts})")
+                    time.sleep(sleep_time)
+
+                    # Update start time for reconnection to avoid duplicate data
+                    if symbol in self._last_ts_event:
+                        # Start from last received timestamp (DataBento timestamps are in nanoseconds)
+                        ts_ns = self._last_ts_event[symbol]
+                        if ts_ns > 0:
+                            start_time = datetime.fromtimestamp(ts_ns / 1e9, tz=timezone.utc)
+                            logger.info(f"[DATABENTO][PRODUCER] Resuming from last event: {start_time.isoformat()}")
+                else:
+                    logger.error(f"[DATABENTO][PRODUCER] {symbol} max reconnection attempts reached")
+
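Since reconnect_attempts is incremented before the sleep is computed, the backoff schedule runs 2 s, 4 s, 8 s, 16 s for attempts 1 through 4; a fifth failure exits the loop. A quick check of the arithmetic:

    backoff_seconds = 1
    for attempt in range(1, 5):
        print(attempt, backoff_seconds * (2 ** attempt))  # 1->2, 2->4, 3->8, 4->16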
+    def _subscribe_to_symbol(self, symbol: str, start_time: datetime = None, min_bars: int = 10):
+        """Start a producer thread for a symbol"""
+        if symbol in self._subscribed_symbols:
+            logger.debug(f"[DATABENTO][LIVE] {symbol} already subscribed")
+            return
+
+        try:
+            # Calculate start time for replay
+            if start_time is None:
+                # Request enough history to build minute bars
+                start_time = datetime.now(timezone.utc) - timedelta(minutes=max(30, min_bars * 2))
+
+            logger.debug(f"[DATABENTO][LIVE] Starting producer for {symbol}")
+            logger.debug(f"[DATABENTO][LIVE] Replay from: {start_time.isoformat()}")
+
+            # Start producer thread for this symbol
+            producer_thread = threading.Thread(
+                target=self._live_stream_worker,
+                args=(symbol, start_time),
+                daemon=True,
+                name=f"databento-producer-{symbol}",
+            )
+            producer_thread.start()
+
+            self._subscribed_symbols.add(symbol)
+            self._producer_threads[symbol] = producer_thread
+            logger.debug(f"[DATABENTO][LIVE] Producer started for {symbol}")
+
+        except Exception as e:
+            logger.error(f"[DATABENTO][LIVE] Failed to start producer for {symbol}: {e}", exc_info=True)
+
+    def _consumer_loop(self):
+        """Consumer thread that processes records from the queue"""
+        logger.debug("[DATABENTO][CONSUMER] Started")
+        trade_count = 0
+
+        while not self._stop_streaming:
+            try:
+                # Get record from queue with timeout
+                symbol, record = self._record_queue.get(timeout=1.0)
+
+                # Handle symbol mappings
+                if record.__class__.__name__ == 'SymbolMappingMsg':
+                    instrument_id = getattr(record, 'instrument_id', None)
+                    if instrument_id:
+                        for attr in ['raw_symbol', 'stype_out_symbol', 'symbol']:
+                            mapped_symbol = getattr(record, attr, None)
+                            if mapped_symbol:
+                                self._symbol_mapping[instrument_id] = mapped_symbol
+                                logger.debug(f"[DATABENTO][CONSUMER] Symbol mapping: {instrument_id} -> {mapped_symbol}")
+                                break
+
+                # Process trade messages
+                elif record.__class__.__name__ == 'TradeMsg':
+                    instrument_id = getattr(record, 'instrument_id', None)
+
+                    # Try to get symbol from mapping or use provided symbol
+                    actual_symbol = self._symbol_mapping.get(instrument_id, symbol)
+
+                    # Process the trade
+                    if actual_symbol:
+                        self._last_trade_time[actual_symbol] = datetime.now(timezone.utc)
+                        trade_count += 1
+
+                        # Convert from fixed-point and decode the nanosecond event timestamp
+                        raw_price = getattr(record, 'price', 0)
+                        price = raw_price / 1e9 if raw_price > 1e10 else raw_price
+                        size = getattr(record, 'size', 0)
+                        ts_event = getattr(record, 'ts_event', 0)
+                        trade_dt = datetime.fromtimestamp(ts_event / 1e9, tz=timezone.utc)
+
+                        # Log only first few trades for verification
+                        if trade_count <= 3:
+                            logger.debug(f"[DATABENTO][CONSUMER] Trade #{trade_count} {actual_symbol} @ {price:.2f} size={size}")
+
+                        # Update live trade cache
+                        self._record_live_trade(actual_symbol, price, size, trade_dt)
+
+                        # Aggregate the trade into minute bars
+                        self._aggregate_trade(actual_symbol, price, size, trade_dt)
+
+                elif record.__class__.__name__ in {"Mbp1Msg", "BboMsg", "QuoteMsg"}:
+                    actual_symbol = getattr(record, 'symbol', symbol)
+                    bid_px = getattr(record, 'bid_px', None)
+                    ask_px = getattr(record, 'ask_px', None)
+                    bid_sz = getattr(record, 'bid_sz', None)
+                    ask_sz = getattr(record, 'ask_sz', None)
+                    ts_event = getattr(record, 'ts_event', None)
+
+                    if bid_px is not None or ask_px is not None:
+                        # Normalize units (DataBento quotes may be scaled by 1e9)
+                        def _normalize(val):
+                            if val is None:
+                                return None
+                            return float(val) / 1e9 if val > 1e10 else float(val)
+
+                        bid_price = _normalize(bid_px)
+                        ask_price = _normalize(ask_px)
+                        bid_size = float(bid_sz) if bid_sz is not None else None
+                        ask_size = float(ask_sz) if ask_sz is not None else None
+                        ts_dt = datetime.fromtimestamp(ts_event / 1e9, tz=timezone.utc) if ts_event else datetime.now(timezone.utc)
+
+                        self._record_live_quote(actual_symbol, bid_price, ask_price, bid_size, ask_size, ts_dt)
+
+            except queue.Empty:
+                continue
+            except Exception as e:
+                logger.error(f"[DATABENTO][CONSUMER] Error processing record: {e}")
+
+        logger.info(f"[DATABENTO][CONSUMER] Stopped after {trade_count} trades")
+
+    def _finalizer_loop(self):
+        """Finalizer thread that marks old bars as complete"""
+        logger.debug("[DATABENTO][FINALIZER] Started")
+
+        while not self._stop_streaming:
+            try:
+                time.sleep(5)  # Check every 5 seconds
+
+                current_time = datetime.now(timezone.utc)
+                cutoff_time = current_time - timedelta(seconds=self._finalize_grace_seconds)
+                cutoff_minute = cutoff_time.replace(second=0, microsecond=0)
+
+                with self._bars_lock:
+                    for symbol in list(self._minute_bars.keys()):
+                        # Finalize minutes that are complete
+                        for minute_dt in list(self._minute_bars[symbol].keys()):
+                            if minute_dt < cutoff_minute and minute_dt not in self._finalized_minutes[symbol]:
+                                self._finalized_minutes[symbol].add(minute_dt)
+                                bar = self._minute_bars[symbol][minute_dt]
+                                logger.debug(f"[DATABENTO][FINALIZER] Finalized {symbol} bar at {minute_dt}: OHLC={bar['open']:.2f}/{bar['high']:.2f}/{bar['low']:.2f}/{bar['close']:.2f} vol={bar['volume']}")
+
+                        # Prune old bars to prevent unlimited memory growth
+                        prune_before = current_time - timedelta(minutes=self._prune_older_minutes)
+                        old_minutes = [dt for dt in self._minute_bars[symbol].keys() if dt < prune_before]
+                        for old_dt in old_minutes:
+                            del self._minute_bars[symbol][old_dt]
+                            self._finalized_minutes[symbol].discard(old_dt)
+
+                        if old_minutes:
+                            logger.debug(f"[DATABENTO][FINALIZER] Pruned {len(old_minutes)} old bars for {symbol}")
+
+            except Exception as e:
+                logger.error(f"[DATABENTO][FINALIZER] Error: {e}")
+
+        logger.info("[DATABENTO][FINALIZER] Stopped")
+
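To make the grace-period arithmetic concrete: with the default _finalize_grace_seconds = 3, a pass at 14:37:02 UTC computes cutoff_time = 14:36:59 and cutoff_minute = 14:36, so every bar stamped 14:35 or earlier is finalized while the still-filling 14:36 bar stays open. The same computation in isolation:

    from datetime import datetime, timedelta, timezone

    now = datetime(2025, 1, 6, 14, 37, 2, tzinfo=timezone.utc)
    cutoff_minute = (now - timedelta(seconds=3)).replace(second=0, microsecond=0)
    assert cutoff_minute == datetime(2025, 1, 6, 14, 36, tzinfo=timezone.utc)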
+    def _aggregate_trade(self, symbol: str, price: float, size: float, trade_time: datetime):
+        """Aggregate a trade into minute bars"""
+        minute = trade_time.replace(second=0, microsecond=0)
+
+        # Get current time to check if bar should be finalized
+        current_time = datetime.now(timezone.utc)
+        current_minute = current_time.replace(second=0, microsecond=0)
+
+        # Hold the bars lock: the finalizer thread iterates these dicts concurrently
+        with self._bars_lock:
+            # Skip if already finalized
+            if minute in self._finalized_minutes[symbol]:
+                return
+
+            # Create or update the minute bar (_minute_bars is a defaultdict)
+            if minute not in self._minute_bars[symbol]:
+                # New minute bar
+                self._minute_bars[symbol][minute] = {
+                    'datetime': minute,
+                    'open': price,
+                    'high': price,
+                    'low': price,
+                    'close': price,
+                    'volume': size,
+                }
+                logger.debug(f"[DATABENTO][LIVE] New minute bar: {symbol} {minute} @ {price:.2f}")
+            else:
+                # Update existing bar
+                bar = self._minute_bars[symbol][minute]
+                bar['high'] = max(bar['high'], price)
+                bar['low'] = min(bar['low'], price)
+                bar['close'] = price
+                bar['volume'] += size
+
+            # Finalize old bars (anything older than current minute)
+            for bar_minute in list(self._minute_bars[symbol].keys()):
+                if bar_minute < current_minute and bar_minute not in self._finalized_minutes[symbol]:
+                    self._finalized_minutes[symbol].add(bar_minute)
+                    logger.debug(f"[DATABENTO][LIVE] Finalized bar: {symbol} {bar_minute}")
+
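The fold above is the standard tick-to-bar reduction: the first trade of the minute seeds all four prices, later trades stretch high/low, overwrite close, and add to volume. A tiny worked example (prices are made up):

    trades = [(5300.25, 2), (5300.50, 1), (5299.75, 3)]  # (price, size) within one minute
    o = trades[0][0]                        # 5300.25
    h = max(p for p, _ in trades)           # 5300.50
    l = min(p for p, _ in trades)           # 5299.75
    c = trades[-1][0]                       # 5299.75
    v = sum(s for _, s in trades)           # 6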
+    def _get_live_tail(self, symbol: str, after_dt: datetime) -> Optional[pl.DataFrame]:
+        """Get finalized live bars newer than after_dt"""
+        current_minute = datetime.now(timezone.utc).replace(second=0, microsecond=0)
+
+        # Snapshot under the lock; the consumer thread mutates these dicts
+        with self._bars_lock:
+            if symbol not in self._minute_bars or not self._minute_bars[symbol]:
+                return None
+
+            # Get finalized bars newer than after_dt
+            tail_bars = []
+            for minute, bar_data in sorted(self._minute_bars[symbol].items()):
+                if after_dt < minute < current_minute:
+                    # Only include core OHLCV data to match historical schema
+                    tail_bars.append({
+                        'datetime': bar_data['datetime'],
+                        'open': bar_data['open'],
+                        'high': bar_data['high'],
+                        'low': bar_data['low'],
+                        'close': bar_data['close'],
+                        'volume': bar_data['volume'],
+                    })
+
+        if not tail_bars:
+            return None
+
+        df = pl.DataFrame(tail_bars).sort('datetime')
+        df = _ensure_polars_tz(df)
+        logger.debug(f"[DATABENTO][LIVE] Collected {len(df)} tail bars after {after_dt}")
+        return df
+
+    def _record_live_trade(self, symbol: str, price: float, size: float, trade_time: datetime):
+        """Cache the latest trade for fast quote/price lookups."""
+        with self._live_cache_lock:
+            self._latest_trades[symbol] = {
+                "price": price,
+                "size": size,
+                "event_time": trade_time,
+                "received_at": datetime.now(timezone.utc),
+            }
+            self._stale_warning_issued.pop(symbol, None)
+
+    def _record_live_quote(
+        self,
+        symbol: str,
+        bid: Optional[float],
+        ask: Optional[float],
+        bid_size: Optional[float],
+        ask_size: Optional[float],
+        quote_time: datetime,
+    ):
+        with self._live_cache_lock:
+            self._latest_quotes[symbol] = {
+                "bid": bid,
+                "ask": ask,
+                "bid_size": bid_size,
+                "ask_size": ask_size,
+                "event_time": quote_time,
+                "received_at": datetime.now(timezone.utc),
+            }
+            self._stale_warning_issued.pop(symbol, None)
+
+    def _get_live_trade(self, symbol: str) -> Optional[dict]:
+        with self._live_cache_lock:
+            return self._latest_trades.get(symbol)
+
+    def _get_live_quote(self, symbol: str) -> Optional[dict]:
+        with self._live_cache_lock:
+            return self._latest_quotes.get(symbol)
+
+    def _is_live_entry_fresh(self, entry: Optional[dict]) -> bool:
+        if not entry:
+            return False
+        received_at = entry.get("received_at")
+        if not received_at:
+            return False
+        return datetime.now(timezone.utc) - received_at <= self._max_live_age
+
+    def _warn_stale(self, symbol: str, context: str):
+        if not self._stale_warning_issued.get(symbol):
+            logger.warning(f"[DATABENTO][LIVE] Falling back to historical data for {symbol} ({context})")
+            self._stale_warning_issued[symbol] = True
+
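Freshness is judged by receipt time, not exchange event time: with _max_live_age = 2 seconds, an entry cached 1.5 s ago passes while one cached 3 s ago fails and triggers the one-shot stale warning. Continuing the earlier sketch (imports as above):

    entry = {"received_at": datetime.now(timezone.utc) - timedelta(seconds=1.5)}
    data_source._is_live_entry_fresh(entry)   # True (1.5 s <= 2 s)
    entry = {"received_at": datetime.now(timezone.utc) - timedelta(seconds=3)}
    data_source._is_live_entry_fresh(entry)   # False -> caller falls back to historical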
+    def _resolve_futures_symbol(self, asset: Asset, reference_date: datetime = None) -> str:
+        """Resolve asset to specific futures contract symbol"""
+        if asset.asset_type in [Asset.AssetType.FUTURE, Asset.AssetType.CONT_FUTURE]:
+            # For continuous futures, resolve to specific contract
+            if asset.asset_type == Asset.AssetType.CONT_FUTURE:
+                if hasattr(asset, 'resolve_continuous_futures_contract'):
+                    return asset.resolve_continuous_futures_contract(
+                        reference_date=reference_date,
+                        year_digits=1,
+                    )
+
+            # Manual resolution for common futures
+            symbol = asset.symbol.upper()
+            month = reference_date.month if reference_date else datetime.now().month
+            year = reference_date.year if reference_date else datetime.now().year
+
+            # Quarterly contracts
+            if month <= 3:
+                month_code = 'H'
+            elif month <= 6:
+                month_code = 'M'
+            elif month <= 9:
+                month_code = 'U'
+            else:
+                month_code = 'Z'
+
+            year_digit = year % 10
+
+            if symbol in ["ES", "NQ", "RTY", "YM", "MES", "MNQ", "MYM", "M2K", "CL", "GC", "SI"]:
+                return f"{symbol}{month_code}{year_digit}"
+
+            return asset.symbol
+
+        return asset.symbol
+
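The manual fallback maps the reference month onto the nearest quarterly code (Jan-Mar -> H, Apr-Jun -> M, Jul-Sep -> U, Oct-Dec -> Z) and appends the final digit of the year; note that CL, GC, and SI actually list monthly contracts, so the quarterly mapping is only an approximation for those symbols. A worked example, assuming the fallback path is taken:

    ref = datetime(2025, 8, 15, tzinfo=timezone.utc)
    data_source._resolve_futures_symbol(
        Asset("ES", asset_type=Asset.AssetType.FUTURE), ref
    )  # -> "ESU5"  (month 8 <= 9 -> 'U'; 2025 % 10 -> 5)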
+    def get_historical_prices(
+        self,
+        asset: Asset,
+        length: int,
+        timestep: str = "minute",
+        timeshift: Optional[timedelta] = None,
+        quote: Optional[Asset] = None,
+        exchange: Optional[str] = None,
+        include_after_hours: bool = True,
+        return_polars: bool = False,
+    ) -> Optional[Bars]:
+        """Get historical prices with live tail merge"""
+
+        # Validate asset type
+        if asset.asset_type not in [Asset.AssetType.FUTURE, Asset.AssetType.CONT_FUTURE]:
+            logger.error(f"DataBento only supports futures. Got: {asset.asset_type}")
+            return None
+
+        # Calculate time range
+        current_time = datetime.now(timezone.utc)
+        if timeshift:
+            current_time = current_time - timeshift
+
+        # Determine time range
+        if timestep == "minute":
+            time_needed = timedelta(minutes=max(length * 3, 30))
+        elif timestep == "hour":
+            time_needed = timedelta(hours=max(length * 2, 12))
+        else:
+            time_needed = timedelta(days=max(length * 2, 10))
+
+        start_time = current_time - time_needed
+
+        # Resolve to specific contract
+        symbol = self._resolve_futures_symbol(asset, current_time)
+        logger.debug(f"Resolved {asset.symbol} to {symbol}")
+
+        # Subscribe to live stream if enabled (only for recent data gap)
+        if self.enable_live_stream and symbol not in self._subscribed_symbols:
+            # Live API can only replay recent data (last ~30 minutes)
+            live_start_time = current_time - timedelta(minutes=30)
+            self._subscribe_to_symbol(symbol, live_start_time, min_bars=30)
+            # Give it a moment to start receiving data
+            time.sleep(0.5)
+
+        # Get historical data
+        logger.debug(f"[DATABENTO][HIST] Fetching {symbol} from {start_time} to {current_time}")
+
+        df = databento_helper_polars.get_price_data_from_databento_polars(
+            api_key=self._api_key,
+            asset=asset,
+            start=start_time,
+            end=current_time,
+            timestep=timestep,
+            venue=exchange,
+            force_cache_update=False,
+        )
+
+        if df is not None and not df.is_empty():
+            # Try to append live tail if available
+            if self.enable_live_stream and 'datetime' in df.columns:
+                try:
+                    hist_last = df['datetime'].max()
+                    # Ensure hist_last is timezone-aware
+                    if not hasattr(hist_last, 'tzinfo') or hist_last.tzinfo is None:
+                        hist_last = hist_last.replace(tzinfo=timezone.utc)
+                    tail_df = self._get_live_tail(symbol, hist_last)
+
+                    # Debug: check live bar status
+                    if symbol in self._minute_bars:
+                        live_bar_count = len(self._minute_bars[symbol])
+                        finalized_count = len(self._finalized_minutes.get(symbol, []))
+                        logger.debug(f"[DATABENTO][DEBUG] {symbol} has {live_bar_count} total bars, {finalized_count} finalized")
+                    else:
+                        logger.debug(f"[DATABENTO][DEBUG] No live bars for {symbol}")
+
+                    if tail_df is not None and not tail_df.is_empty():
+                        # Make sure both dataframes have the same columns and types
+                        try:
+                            # Ensure timezone compatibility
+                            hist_tz_info = df['datetime'].dtype
+                            tail_tz_info = tail_df['datetime'].dtype
+
+                            logger.debug(f"[DATABENTO][MERGE] Historical datetime: {hist_tz_info}, Live datetime: {tail_tz_info}")
+
+                            df = _ensure_polars_tz(df)
+                            tail_df = _ensure_polars_tz(tail_df)
+
+                            # Only keep columns that exist in both dataframes
+                            common_columns = [col for col in df.columns if col in tail_df.columns]
+                            df_subset = df.select(common_columns)
+                            tail_subset = tail_df.select(common_columns)
+
+                            # Ensure numeric columns have compatible types
+                            for col in common_columns:
+                                if col != 'datetime':  # Don't modify datetime
+                                    df_dtype = df_subset[col].dtype
+                                    tail_dtype = tail_subset[col].dtype
+
+                                    # Convert both to Float64 for compatibility
+                                    if df_dtype != tail_dtype:
+                                        logger.debug(f"[DATABENTO][MERGE] Converting {col}: {df_dtype} vs {tail_dtype} -> Float64")
+                                        df_subset = df_subset.with_columns(pl.col(col).cast(pl.Float64))
+                                        tail_subset = tail_subset.with_columns(pl.col(col).cast(pl.Float64))
+
+                            # Merge the data and drop duplicate minutes (keep latest)
+                            merged_df = pl.concat([df_subset, tail_subset]).sort('datetime')
+                            merged_df = merged_df.unique(subset=['datetime'], keep='last').sort('datetime')
+
+                            # If original df had more columns, merge them back
+                            if len(df.columns) > len(common_columns):
+                                extra_cols = [col for col in df.columns if col not in common_columns]
+                                df_extra = df.select(['datetime'] + extra_cols)
+                                merged_df = merged_df.join(df_extra, on='datetime', how='left')
+
+                            df = merged_df
+                            logger.debug(f"[DATABENTO][MERGE] Successfully appended {len(tail_df)} live bars")
+
+                        except Exception as merge_e:
+                            logger.error(f"[DATABENTO][MERGE] All merge attempts failed: {merge_e}")
+                            # Last resort - just log what we have
+                            hist_latest = df['datetime'].max() if 'datetime' in df.columns else None
+                            tail_latest = tail_df['datetime'].max() if 'datetime' in tail_df.columns else None
+                            logger.error(f"[DATABENTO][MERGE] Historical latest: {hist_latest}, Live latest: {tail_latest}")
+                            # Continue with historical data only
+                    else:
+                        lag = (current_time - hist_last).total_seconds()
+                        logger.debug(f"[DATABENTO][MERGE] No live tail bars (lag={lag:.0f}s)")
+
+                except Exception as e:
+                    logger.warning(f"[DATABENTO][MERGE] Failed to merge live tail: {e}")
+
+            # Trim to requested length
+            df = df.tail(length)
+            df = _ensure_polars_tz(df)
+            return Bars(
+                df=df,
+                source=self.SOURCE,
+                asset=asset,
+                quote=quote,
+                return_polars=return_polars,
+                tzinfo=self.tzinfo,
+            )
+
+        return None
+
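Continuing the earlier sketch, a typical call requesting the last 30 one-minute bars for the continuous ES contract might look like this:

    bars = data_source.get_historical_prices(es, length=30, timestep="minute")
    if bars is not None:
        print(bars.df.tail(3))  # most recent bars, including any merged live tail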
+    def get_last_price(self, asset: Asset, quote: Optional[Asset] = None, exchange: Optional[str] = None) -> Optional[float]:
+        """Get the last price for an asset"""
+        symbol = self._resolve_futures_symbol(asset)
+
+        # Try live tick cache
+        if self.enable_live_stream:
+            if symbol not in self._subscribed_symbols:
+                self._subscribe_to_symbol(symbol)
+
+            trade_entry = self._get_live_trade(symbol)
+            if self._is_live_entry_fresh(trade_entry):
+                return float(trade_entry["price"])
+            else:
+                self._warn_stale(symbol, "stale trade cache")
+
+        # Fallback to historical
+        bars = self.get_historical_prices(asset, 1, "minute", exchange=exchange)
+        if bars and len(bars) > 0:
+            return float(bars.df['close'].tail(1).item())
+
+        return None
+
+    def get_quote(self, asset: Asset, quote: Optional[Asset] = None, exchange: Optional[str] = None) -> Quote:
+        symbol = self._resolve_futures_symbol(asset)
+        bid = ask = price = bid_size = ask_size = None
+        event_time = datetime.now(timezone.utc)
+        age_ms = None
+
+        if self.enable_live_stream:
+            if symbol not in self._subscribed_symbols:
+                self._subscribe_to_symbol(symbol)
+
+            quote_entry = self._get_live_quote(symbol)
+            trade_entry = self._get_live_trade(symbol)
+
+            if self._is_live_entry_fresh(quote_entry):
+                bid = quote_entry.get("bid")
+                ask = quote_entry.get("ask")
+                bid_size = quote_entry.get("bid_size")
+                ask_size = quote_entry.get("ask_size")
+                event_time = quote_entry.get("event_time", event_time)
+                age_ms = int((datetime.now(timezone.utc) - quote_entry["received_at"]).total_seconds() * 1000)
+
+                if trade_entry and self._is_live_entry_fresh(trade_entry):
+                    price = trade_entry.get("price")
+                elif bid is not None and ask is not None:
+                    price = (bid + ask) / 2
+            elif self._is_live_entry_fresh(trade_entry):
+                # No fresh quote: synthesize a spread around the last trade
+                price = trade_entry.get("price")
+                event_time = trade_entry.get("event_time", event_time)
+                age_ms = int((datetime.now(timezone.utc) - trade_entry["received_at"]).total_seconds() * 1000)
+
+                tick = 0.25  # assume an ES-style minimum tick
+                if price is not None:
+                    bid = price - tick / 2
+                    ask = price + tick / 2
+            else:
+                self._warn_stale(symbol, "stale quote cache")
+
+        if price is None:
+            last_price = self.get_last_price(asset, quote=quote, exchange=exchange)
+            price = last_price
+            if last_price is not None and bid is None and ask is None:
+                tick = 0.25
+                bid = last_price - tick / 2
+                ask = last_price + tick / 2
+
+        return Quote(
+            asset=asset,
+            price=price,
+            bid=bid,
+            ask=ask,
+            bid_size=bid_size,
+            ask_size=ask_size,
+            timestamp=event_time,
+            quote_time=event_time,
+            raw_data={
+                "source": "databento_live" if self.enable_live_stream else "databento_rest",
+                "age_ms": age_ms,
+            },
+        )
+
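Note the synthesized spread centers the assumed 0.25 tick on the trade price rather than snapping to the price grid: a last trade at 5300.25 yields bid 5300.125 / ask 5300.375, a half-tick on each side. Continuing the earlier sketch, and assuming the Quote entity exposes these fields as attributes:

    q = data_source.get_quote(es)
    print(q.price, q.bid, q.ask, q.raw_data["age_ms"])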
+    def get_chains(self, asset: Asset, quote: Asset = None, exchange: str = None) -> dict:
+        """Get option chains - not supported for futures"""
+        logger.warning("DataBento does not support option chains")
+        return {"Chains": {}, "Multiplier": 1, "Exchange": exchange or ""}
+
+    def __del__(self):
+        """Cleanup on deletion"""
+        if hasattr(self, '_stop_streaming'):
+            self._stop_streaming = True
+
+        # Stop all producer threads
+        if hasattr(self, '_producer_threads'):
+            for symbol, thread in self._producer_threads.items():
+                if thread and thread.is_alive():
+                    thread.join(timeout=1)
+
+        # Stop consumer thread
+        if hasattr(self, '_consumer_thread') and self._consumer_thread:
+            self._consumer_thread.join(timeout=1)
+
+        # Stop finalizer thread
+        if hasattr(self, '_finalizer_thread') and self._finalizer_thread:
+            self._finalizer_thread.join(timeout=1)