lumibot 4.1.3__py3-none-any.whl → 4.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of lumibot might be problematic.

Files changed (163)
  1. lumibot/backtesting/__init__.py +19 -5
  2. lumibot/backtesting/backtesting_broker.py +98 -18
  3. lumibot/backtesting/databento_backtesting.py +5 -686
  4. lumibot/backtesting/databento_backtesting_pandas.py +738 -0
  5. lumibot/backtesting/databento_backtesting_polars.py +860 -546
  6. lumibot/backtesting/fix_debug.py +37 -0
  7. lumibot/backtesting/thetadata_backtesting.py +9 -355
  8. lumibot/backtesting/thetadata_backtesting_pandas.py +1167 -0
  9. lumibot/brokers/alpaca.py +8 -1
  10. lumibot/brokers/schwab.py +12 -2
  11. lumibot/credentials.py +13 -0
  12. lumibot/data_sources/__init__.py +5 -8
  13. lumibot/data_sources/data_source.py +6 -2
  14. lumibot/data_sources/data_source_backtesting.py +30 -0
  15. lumibot/data_sources/databento_data.py +5 -390
  16. lumibot/data_sources/databento_data_pandas.py +440 -0
  17. lumibot/data_sources/databento_data_polars.py +15 -9
  18. lumibot/data_sources/pandas_data.py +30 -17
  19. lumibot/data_sources/polars_data.py +986 -0
  20. lumibot/data_sources/polars_mixin.py +472 -96
  21. lumibot/data_sources/polygon_data_polars.py +5 -0
  22. lumibot/data_sources/yahoo_data.py +9 -2
  23. lumibot/data_sources/yahoo_data_polars.py +5 -0
  24. lumibot/entities/__init__.py +15 -0
  25. lumibot/entities/asset.py +5 -28
  26. lumibot/entities/bars.py +89 -20
  27. lumibot/entities/data.py +29 -6
  28. lumibot/entities/data_polars.py +668 -0
  29. lumibot/entities/position.py +38 -4
  30. lumibot/strategies/_strategy.py +2 -1
  31. lumibot/strategies/strategy.py +61 -49
  32. lumibot/tools/backtest_cache.py +284 -0
  33. lumibot/tools/databento_helper.py +35 -35
  34. lumibot/tools/databento_helper_polars.py +738 -775
  35. lumibot/tools/futures_roll.py +251 -0
  36. lumibot/tools/indicators.py +135 -104
  37. lumibot/tools/polars_utils.py +142 -0
  38. lumibot/tools/thetadata_helper.py +1068 -134
  39. {lumibot-4.1.3.dist-info → lumibot-4.2.1.dist-info}/METADATA +9 -1
  40. {lumibot-4.1.3.dist-info → lumibot-4.2.1.dist-info}/RECORD +71 -147
  41. tests/backtest/test_databento.py +37 -6
  42. tests/backtest/test_databento_comprehensive_trading.py +8 -4
  43. tests/backtest/test_databento_parity.py +4 -2
  44. tests/backtest/test_debug_avg_fill_price.py +1 -1
  45. tests/backtest/test_example_strategies.py +11 -1
  46. tests/backtest/test_futures_edge_cases.py +3 -3
  47. tests/backtest/test_futures_single_trade.py +2 -2
  48. tests/backtest/test_futures_ultra_simple.py +2 -2
  49. tests/backtest/test_polars_lru_eviction.py +470 -0
  50. tests/backtest/test_yahoo.py +42 -0
  51. tests/test_asset.py +4 -4
  52. tests/test_backtest_cache_manager.py +149 -0
  53. tests/test_backtesting_data_source_env.py +6 -0
  54. tests/test_continuous_futures_resolution.py +60 -48
  55. tests/test_data_polars_parity.py +160 -0
  56. tests/test_databento_asset_validation.py +23 -5
  57. tests/test_databento_backtesting.py +1 -1
  58. tests/test_databento_backtesting_polars.py +312 -192
  59. tests/test_databento_data.py +220 -463
  60. tests/test_databento_live.py +10 -10
  61. tests/test_futures_roll.py +38 -0
  62. tests/test_indicator_subplots.py +101 -0
  63. tests/test_market_infinite_loop_bug.py +77 -3
  64. tests/test_polars_resample.py +67 -0
  65. tests/test_polygon_helper.py +46 -0
  66. tests/test_thetadata_backwards_compat.py +97 -0
  67. tests/test_thetadata_helper.py +222 -23
  68. tests/test_thetadata_pandas_verification.py +186 -0
  69. lumibot/__pycache__/__init__.cpython-312.pyc +0 -0
  70. lumibot/__pycache__/constants.cpython-312.pyc +0 -0
  71. lumibot/__pycache__/credentials.cpython-312.pyc +0 -0
  72. lumibot/backtesting/__pycache__/__init__.cpython-312.pyc +0 -0
  73. lumibot/backtesting/__pycache__/alpaca_backtesting.cpython-312.pyc +0 -0
  74. lumibot/backtesting/__pycache__/alpha_vantage_backtesting.cpython-312.pyc +0 -0
  75. lumibot/backtesting/__pycache__/backtesting_broker.cpython-312.pyc +0 -0
  76. lumibot/backtesting/__pycache__/ccxt_backtesting.cpython-312.pyc +0 -0
  77. lumibot/backtesting/__pycache__/databento_backtesting.cpython-312.pyc +0 -0
  78. lumibot/backtesting/__pycache__/interactive_brokers_rest_backtesting.cpython-312.pyc +0 -0
  79. lumibot/backtesting/__pycache__/pandas_backtesting.cpython-312.pyc +0 -0
  80. lumibot/backtesting/__pycache__/polygon_backtesting.cpython-312.pyc +0 -0
  81. lumibot/backtesting/__pycache__/thetadata_backtesting.cpython-312.pyc +0 -0
  82. lumibot/backtesting/__pycache__/yahoo_backtesting.cpython-312.pyc +0 -0
  83. lumibot/brokers/__pycache__/__init__.cpython-312.pyc +0 -0
  84. lumibot/brokers/__pycache__/alpaca.cpython-312.pyc +0 -0
  85. lumibot/brokers/__pycache__/bitunix.cpython-312.pyc +0 -0
  86. lumibot/brokers/__pycache__/broker.cpython-312.pyc +0 -0
  87. lumibot/brokers/__pycache__/ccxt.cpython-312.pyc +0 -0
  88. lumibot/brokers/__pycache__/example_broker.cpython-312.pyc +0 -0
  89. lumibot/brokers/__pycache__/interactive_brokers.cpython-312.pyc +0 -0
  90. lumibot/brokers/__pycache__/interactive_brokers_rest.cpython-312.pyc +0 -0
  91. lumibot/brokers/__pycache__/projectx.cpython-312.pyc +0 -0
  92. lumibot/brokers/__pycache__/schwab.cpython-312.pyc +0 -0
  93. lumibot/brokers/__pycache__/tradier.cpython-312.pyc +0 -0
  94. lumibot/brokers/__pycache__/tradovate.cpython-312.pyc +0 -0
  95. lumibot/data_sources/__pycache__/__init__.cpython-312.pyc +0 -0
  96. lumibot/data_sources/__pycache__/alpaca_data.cpython-312.pyc +0 -0
  97. lumibot/data_sources/__pycache__/alpha_vantage_data.cpython-312.pyc +0 -0
  98. lumibot/data_sources/__pycache__/bitunix_data.cpython-312.pyc +0 -0
  99. lumibot/data_sources/__pycache__/ccxt_backtesting_data.cpython-312.pyc +0 -0
  100. lumibot/data_sources/__pycache__/ccxt_data.cpython-312.pyc +0 -0
  101. lumibot/data_sources/__pycache__/data_source.cpython-312.pyc +0 -0
  102. lumibot/data_sources/__pycache__/data_source_backtesting.cpython-312.pyc +0 -0
  103. lumibot/data_sources/__pycache__/databento_data_polars_backtesting.cpython-312.pyc +0 -0
  104. lumibot/data_sources/__pycache__/databento_data_polars_live.cpython-312.pyc +0 -0
  105. lumibot/data_sources/__pycache__/example_broker_data.cpython-312.pyc +0 -0
  106. lumibot/data_sources/__pycache__/exceptions.cpython-312.pyc +0 -0
  107. lumibot/data_sources/__pycache__/interactive_brokers_data.cpython-312.pyc +0 -0
  108. lumibot/data_sources/__pycache__/interactive_brokers_rest_data.cpython-312.pyc +0 -0
  109. lumibot/data_sources/__pycache__/pandas_data.cpython-312.pyc +0 -0
  110. lumibot/data_sources/__pycache__/polars_mixin.cpython-312.pyc +0 -0
  111. lumibot/data_sources/__pycache__/polygon_data_polars.cpython-312.pyc +0 -0
  112. lumibot/data_sources/__pycache__/projectx_data.cpython-312.pyc +0 -0
  113. lumibot/data_sources/__pycache__/schwab_data.cpython-312.pyc +0 -0
  114. lumibot/data_sources/__pycache__/tradier_data.cpython-312.pyc +0 -0
  115. lumibot/data_sources/__pycache__/tradovate_data.cpython-312.pyc +0 -0
  116. lumibot/data_sources/__pycache__/yahoo_data_polars.cpython-312.pyc +0 -0
  117. lumibot/entities/__pycache__/__init__.cpython-312.pyc +0 -0
  118. lumibot/entities/__pycache__/asset.cpython-312.pyc +0 -0
  119. lumibot/entities/__pycache__/bar.cpython-312.pyc +0 -0
  120. lumibot/entities/__pycache__/bars.cpython-312.pyc +0 -0
  121. lumibot/entities/__pycache__/chains.cpython-312.pyc +0 -0
  122. lumibot/entities/__pycache__/data.cpython-312.pyc +0 -0
  123. lumibot/entities/__pycache__/dataline.cpython-312.pyc +0 -0
  124. lumibot/entities/__pycache__/order.cpython-312.pyc +0 -0
  125. lumibot/entities/__pycache__/position.cpython-312.pyc +0 -0
  126. lumibot/entities/__pycache__/quote.cpython-312.pyc +0 -0
  127. lumibot/entities/__pycache__/trading_fee.cpython-312.pyc +0 -0
  128. lumibot/example_strategies/__pycache__/__init__.cpython-312.pyc +0 -0
  129. lumibot/example_strategies/__pycache__/test_broker_functions.cpython-312-pytest-8.4.1.pyc +0 -0
  130. lumibot/strategies/__pycache__/__init__.cpython-312.pyc +0 -0
  131. lumibot/strategies/__pycache__/_strategy.cpython-312.pyc +0 -0
  132. lumibot/strategies/__pycache__/strategy.cpython-312.pyc +0 -0
  133. lumibot/strategies/__pycache__/strategy_executor.cpython-312.pyc +0 -0
  134. lumibot/tools/__pycache__/__init__.cpython-312.pyc +0 -0
  135. lumibot/tools/__pycache__/alpaca_helpers.cpython-312.pyc +0 -0
  136. lumibot/tools/__pycache__/bitunix_helpers.cpython-312.pyc +0 -0
  137. lumibot/tools/__pycache__/black_scholes.cpython-312.pyc +0 -0
  138. lumibot/tools/__pycache__/ccxt_data_store.cpython-312.pyc +0 -0
  139. lumibot/tools/__pycache__/databento_helper.cpython-312.pyc +0 -0
  140. lumibot/tools/__pycache__/databento_helper_polars.cpython-312.pyc +0 -0
  141. lumibot/tools/__pycache__/debugers.cpython-312.pyc +0 -0
  142. lumibot/tools/__pycache__/decorators.cpython-312.pyc +0 -0
  143. lumibot/tools/__pycache__/helpers.cpython-312.pyc +0 -0
  144. lumibot/tools/__pycache__/indicators.cpython-312.pyc +0 -0
  145. lumibot/tools/__pycache__/lumibot_logger.cpython-312.pyc +0 -0
  146. lumibot/tools/__pycache__/pandas.cpython-312.pyc +0 -0
  147. lumibot/tools/__pycache__/polygon_helper.cpython-312.pyc +0 -0
  148. lumibot/tools/__pycache__/polygon_helper_async.cpython-312.pyc +0 -0
  149. lumibot/tools/__pycache__/polygon_helper_polars_optimized.cpython-312.pyc +0 -0
  150. lumibot/tools/__pycache__/projectx_helpers.cpython-312.pyc +0 -0
  151. lumibot/tools/__pycache__/schwab_helper.cpython-312.pyc +0 -0
  152. lumibot/tools/__pycache__/thetadata_helper.cpython-312.pyc +0 -0
  153. lumibot/tools/__pycache__/types.cpython-312.pyc +0 -0
  154. lumibot/tools/__pycache__/yahoo_helper.cpython-312.pyc +0 -0
  155. lumibot/tools/__pycache__/yahoo_helper_polars_optimized.cpython-312.pyc +0 -0
  156. lumibot/traders/__pycache__/__init__.cpython-312.pyc +0 -0
  157. lumibot/traders/__pycache__/trader.cpython-312.pyc +0 -0
  158. lumibot/trading_builtins/__pycache__/__init__.cpython-312.pyc +0 -0
  159. lumibot/trading_builtins/__pycache__/custom_stream.cpython-312.pyc +0 -0
  160. lumibot/trading_builtins/__pycache__/safe_list.cpython-312.pyc +0 -0
  161. {lumibot-4.1.3.dist-info → lumibot-4.2.1.dist-info}/WHEEL +0 -0
  162. {lumibot-4.1.3.dist-info → lumibot-4.2.1.dist-info}/licenses/LICENSE +0 -0
  163. {lumibot-4.1.3.dist-info → lumibot-4.2.1.dist-info}/top_level.txt +0 -0
@@ -1,7 +1,9 @@
 # This file contains helper functions for getting data from Polygon.io
 import time
 import os
-from datetime import date, datetime, timedelta
+import signal
+from typing import Dict, List, Optional
+from datetime import date, datetime, timedelta, timezone
 from pathlib import Path
 import pytz
 import pandas as pd
@@ -11,6 +13,7 @@ from lumibot import LUMIBOT_CACHE_FOLDER, LUMIBOT_DEFAULT_PYTZ
 from lumibot.tools.lumibot_logger import get_logger
 from lumibot.entities import Asset
 from tqdm import tqdm
+from lumibot.tools.backtest_cache import CacheMode, get_backtest_cache

 logger = get_logger(__name__)

@@ -18,10 +21,172 @@ WAIT_TIME = 60
 MAX_DAYS = 30
 CACHE_SUBFOLDER = "thetadata"
 BASE_URL = "http://127.0.0.1:25510"
+CONNECTION_RETRY_SLEEP = 1.0
+CONNECTION_MAX_RETRIES = 60
+BOOT_GRACE_PERIOD = 5.0
+MAX_RESTART_ATTEMPTS = 3

 # Global process tracking for ThetaTerminal
 THETA_DATA_PROCESS = None
 THETA_DATA_PID = None
+THETA_DATA_LOG_HANDLE = None
+
+def reset_connection_diagnostics():
+    """Reset ThetaData connection counters (useful for tests)."""
+    CONNECTION_DIAGNOSTICS.update({
+        "check_connection_calls": 0,
+        "start_terminal_calls": 0,
+        "network_requests": 0,
+        "placeholder_writes": 0,
+    })
+
+
+def ensure_missing_column(df: Optional[pd.DataFrame]) -> Optional[pd.DataFrame]:
+    """Ensure the dataframe includes a `missing` flag column (True for placeholders)."""
+    if df is None or len(df) == 0:
+        return df
+    if "missing" not in df.columns:
+        df["missing"] = False
+        logger.debug(
+            "[THETA][DEBUG][THETADATA-CACHE] added 'missing' column to frame (rows=%d)",
+            len(df),
+        )
+    return df
+
+
+def restore_numeric_dtypes(df: Optional[pd.DataFrame]) -> Optional[pd.DataFrame]:
+    """Try to convert object columns back to numeric types after placeholder removal."""
+    if df is None or len(df) == 0:
+        return df
+    for column in df.columns:
+        if df[column].dtype == object:
+            try:
+                df[column] = pd.to_numeric(df[column])
+            except (ValueError, TypeError):
+                continue
+    return df
+
+
+def append_missing_markers(
+    df_all: Optional[pd.DataFrame],
+    missing_dates: List[datetime.date],
+) -> Optional[pd.DataFrame]:
+    """Append placeholder rows for dates that returned no data."""
+    if not missing_dates:
+        if df_all is not None and not df_all.empty and "missing" in df_all.columns:
+            df_all = df_all[~df_all["missing"].astype(bool)].drop(columns=["missing"])
+            df_all = restore_numeric_dtypes(df_all)
+        return df_all
+
+    base_columns = ["open", "high", "low", "close", "volume"]
+
+    if df_all is None or len(df_all) == 0:
+        df_all = pd.DataFrame(columns=base_columns + ["missing"])
+        df_all.index = pd.DatetimeIndex([], name="datetime")
+
+    df_all = ensure_missing_column(df_all)
+
+    rows = []
+    for d in missing_dates:
+        dt = datetime(d.year, d.month, d.day, tzinfo=pytz.UTC)
+        row = {col: pd.NA for col in df_all.columns if col != "missing"}
+        row["datetime"] = dt
+        row["missing"] = True
+        rows.append(row)
+
+    if rows:
+        CONNECTION_DIAGNOSTICS["placeholder_writes"] = CONNECTION_DIAGNOSTICS.get("placeholder_writes", 0) + len(rows)
+
+        # DEBUG-LOG: Placeholder injection
+        logger.debug(
+            "[THETA][DEBUG][PLACEHOLDER][INJECT] count=%d dates=%s",
+            len(rows),
+            ", ".join(sorted({d.isoformat() for d in missing_dates}))
+        )
+
+        placeholder_df = pd.DataFrame(rows).set_index("datetime")
+        for col in df_all.columns:
+            if col not in placeholder_df.columns:
+                placeholder_df[col] = pd.NA if col != "missing" else True
+        placeholder_df = placeholder_df[df_all.columns]
+        if len(df_all) == 0:
+            df_all = placeholder_df
+        else:
+            df_all = pd.concat([df_all, placeholder_df]).sort_index()
+            df_all = df_all[~df_all.index.duplicated(keep="last")]
+        logger.debug(
+            "[THETA][DEBUG][THETADATA-CACHE] recorded %d placeholder day(s): %s",
+            len(rows),
+            ", ".join(sorted({d.isoformat() for d in missing_dates})),
+        )
+
+    return df_all
+
+
+def remove_missing_markers(
+    df_all: Optional[pd.DataFrame],
+    available_dates: List[datetime.date],
+) -> Optional[pd.DataFrame]:
+    """Drop placeholder rows when real data becomes available."""
+    if df_all is None or len(df_all) == 0 or not available_dates:
+        return df_all
+
+    df_all = ensure_missing_column(df_all)
+    available_set = set(available_dates)
+
+    mask = df_all["missing"].eq(True) & df_all.index.map(
+        lambda ts: ts.date() in available_set
+    )
+    if mask.any():
+        removed_dates = sorted({ts.date().isoformat() for ts in df_all.index[mask]})
+        df_all = df_all.loc[~mask]
+        logger.debug(
+            "[THETA][DEBUG][THETADATA-CACHE] cleared %d placeholder row(s) for dates: %s",
+            mask.sum(),
+            ", ".join(removed_dates),
+        )
+
+    return df_all
+
+
+def _clamp_option_end(asset: Asset, dt: datetime) -> datetime:
+    """Ensure intraday pulls for options never extend beyond expiration."""
+    if isinstance(dt, datetime):
+        end_dt = dt
+    else:
+        end_dt = datetime.combine(dt, datetime.max.time())
+
+    if end_dt.tzinfo is None:
+        end_dt = end_dt.replace(tzinfo=pytz.UTC)
+
+    if asset.asset_type == "option" and asset.expiration:
+        expiration_dt = datetime.combine(asset.expiration, datetime.max.time())
+        expiration_dt = expiration_dt.replace(tzinfo=end_dt.tzinfo)
+        if end_dt > expiration_dt:
+            return expiration_dt
+
+    return end_dt
+
+
+def reset_theta_terminal_tracking():
+    """Clear cached ThetaTerminal process references."""
+    global THETA_DATA_PROCESS, THETA_DATA_PID, THETA_DATA_LOG_HANDLE
+    THETA_DATA_PROCESS = None
+    THETA_DATA_PID = None
+    if THETA_DATA_LOG_HANDLE is not None:
+        try:
+            THETA_DATA_LOG_HANDLE.close()
+        except Exception:
+            pass
+        THETA_DATA_LOG_HANDLE = None
+
+
+CONNECTION_DIAGNOSTICS = {
+    "check_connection_calls": 0,
+    "start_terminal_calls": 0,
+    "network_requests": 0,
+    "placeholder_writes": 0,
+}


 def get_price_data(
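The placeholder bookkeeping introduced in the hunk above can be summarized with a small, self-contained pandas sketch. This is an illustration of the mechanism only, not lumibot code: the column set, dates, and helper names below are invented for the example.

    # Illustration only: a toy version of the "missing" placeholder mechanism above.
    import pandas as pd
    import pytz
    from datetime import date, datetime

    def add_placeholders(df, missing_dates):
        """Record days that returned no data so they are not re-requested later."""
        rows = [{"datetime": datetime(d.year, d.month, d.day, tzinfo=pytz.UTC),
                 "close": pd.NA, "missing": True} for d in missing_dates]
        placeholders = pd.DataFrame(rows).set_index("datetime")
        return pd.concat([df, placeholders]).sort_index()

    def strip_placeholders(df):
        """Drop placeholder rows before handing data back to the caller."""
        return df[~df["missing"].astype(bool)].drop(columns=["missing"])

    cached = pd.DataFrame(
        {"close": [101.5], "missing": [False]},
        index=pd.DatetimeIndex([datetime(2024, 1, 2, tzinfo=pytz.UTC)], name="datetime"),
    )
    cached = add_placeholders(cached, [date(2024, 1, 3)])  # 2024-01-03 returned no bars
    print(strip_placeholders(cached))  # only the real 2024-01-02 row survives
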
@@ -34,13 +199,17 @@ def get_price_data(
     quote_asset: Asset = None,
     dt=None,
     datastyle: str = "ohlc",
-    include_after_hours: bool = True
-):
+    include_after_hours: bool = True,
+    return_polars: bool = False
+) -> Optional[pd.DataFrame]:
     """
     Queries ThetaData for pricing data for the given asset and returns a DataFrame with the data. Data will be
     cached in the LUMIBOT_CACHE_FOLDER/{CACHE_SUBFOLDER} folder so that it can be reused later and we don't have to query
     ThetaData every time we run a backtest.

+    Returns pandas DataFrames for backwards compatibility. Polars output is not
+    currently supported; callers requesting polars will receive a ValueError.
+
     Parameters
     ----------
     username : str
@@ -62,35 +231,153 @@ def get_price_data(
62
231
  The style of data to retrieve ("ohlc" or "quote")
63
232
  include_after_hours : bool
64
233
  Whether to include after-hours trading data (default True)
234
+ return_polars : bool
235
+ ThetaData currently supports pandas output only. Passing True raises a ValueError.
65
236
 
66
237
  Returns
67
238
  -------
68
- pd.DataFrame
69
- A DataFrame with the pricing data for the asset
239
+ Optional[pd.DataFrame]
240
+ A pandas DataFrame with the pricing data for the asset
70
241
 
71
242
  """
72
243
  import pytz # Import at function level to avoid scope issues in nested calls
73
244
 
245
+ # DEBUG-LOG: Entry point for ThetaData request
246
+ logger.debug(
247
+ "[THETA][DEBUG][REQUEST][ENTRY] asset=%s quote=%s start=%s end=%s dt=%s timespan=%s datastyle=%s include_after_hours=%s return_polars=%s",
248
+ asset,
249
+ quote_asset,
250
+ start.isoformat() if hasattr(start, 'isoformat') else start,
251
+ end.isoformat() if hasattr(end, 'isoformat') else end,
252
+ dt.isoformat() if dt and hasattr(dt, 'isoformat') else dt,
253
+ timespan,
254
+ datastyle,
255
+ include_after_hours,
256
+ return_polars
257
+ )
258
+
259
+ if return_polars:
260
+ raise ValueError("ThetaData polars output is not available; pass return_polars=False.")
261
+
262
+ # Preserve original bounds for final filtering
263
+ requested_start = start
264
+ requested_end = end
265
+
74
266
  # Check if we already have data for this asset in the cache file
75
267
  df_all = None
76
268
  df_cached = None
77
269
  cache_file = build_cache_filename(asset, timespan, datastyle)
270
+ remote_payload = build_remote_cache_payload(asset, timespan, datastyle)
271
+ cache_manager = get_backtest_cache()
272
+
273
+ if cache_manager.enabled:
274
+ try:
275
+ fetched_remote = cache_manager.ensure_local_file(cache_file, payload=remote_payload)
276
+ if fetched_remote:
277
+ logger.debug(
278
+ "[THETA][DEBUG][CACHE][REMOTE_DOWNLOAD] asset=%s timespan=%s datastyle=%s cache_file=%s",
279
+ asset,
280
+ timespan,
281
+ datastyle,
282
+ cache_file,
283
+ )
284
+ except Exception as exc:
285
+ logger.debug(
286
+ "[THETA][DEBUG][CACHE][REMOTE_DOWNLOAD_ERROR] asset=%s cache_file=%s error=%s",
287
+ asset,
288
+ cache_file,
289
+ exc,
290
+ )
291
+
292
+ # DEBUG-LOG: Cache file check
293
+ logger.debug(
294
+ "[THETA][DEBUG][CACHE][CHECK] asset=%s timespan=%s datastyle=%s cache_file=%s exists=%s",
295
+ asset,
296
+ timespan,
297
+ datastyle,
298
+ cache_file,
299
+ cache_file.exists()
300
+ )
301
+
78
302
  if cache_file.exists():
79
303
  logger.info(f"\nLoading '{datastyle}' pricing data for {asset} / {quote_asset} with '{timespan}' timespan from cache file...")
80
304
  df_cached = load_cache(cache_file)
81
305
  if df_cached is not None and not df_cached.empty:
82
306
  df_all = df_cached.copy() # Make a copy so we can check the original later for differences
83
307
 
308
+ cached_rows = 0 if df_all is None else len(df_all)
309
+ placeholder_rows = 0
310
+ if df_all is not None and not df_all.empty and "missing" in df_all.columns:
311
+ placeholder_rows = int(df_all["missing"].sum())
312
+
313
+ # DEBUG-LOG: Cache load result
314
+ logger.debug(
315
+ "[THETA][DEBUG][CACHE][LOADED] asset=%s cached_rows=%d placeholder_rows=%d real_rows=%d",
316
+ asset,
317
+ cached_rows,
318
+ placeholder_rows,
319
+ cached_rows - placeholder_rows
320
+ )
321
+
322
+ logger.debug(
323
+ "[THETA][DEBUG][THETADATA-CACHE] pre-fetch rows=%d placeholders=%d for %s %s %s",
324
+ cached_rows,
325
+ placeholder_rows,
326
+ asset,
327
+ timespan,
328
+ datastyle,
329
+ )
330
+
84
331
  # Check if we need to get more data
332
+ logger.debug(
333
+ "[THETA][DEBUG][CACHE][DECISION_START] asset=%s | "
334
+ "calling get_missing_dates(start=%s, end=%s)",
335
+ asset.symbol if hasattr(asset, 'symbol') else str(asset),
336
+ start.isoformat() if hasattr(start, 'isoformat') else start,
337
+ end.isoformat() if hasattr(end, 'isoformat') else end
338
+ )
339
+
85
340
  missing_dates = get_missing_dates(df_all, asset, start, end)
341
+
342
+ logger.debug(
343
+ "[THETA][DEBUG][CACHE][DECISION_RESULT] asset=%s | "
344
+ "missing_dates=%d | "
345
+ "decision=%s",
346
+ asset.symbol if hasattr(asset, 'symbol') else str(asset),
347
+ len(missing_dates),
348
+ "CACHE_HIT" if not missing_dates else "CACHE_MISS"
349
+ )
350
+
351
+ cache_file = build_cache_filename(asset, timespan, datastyle)
352
+ logger.debug(
353
+ "[THETA][DEBUG][THETADATA-CACHE] asset=%s/%s timespan=%s datastyle=%s cache_file=%s exists=%s missing=%d",
354
+ asset,
355
+ quote_asset.symbol if quote_asset else None,
356
+ timespan,
357
+ datastyle,
358
+ cache_file,
359
+ cache_file.exists(),
360
+ len(missing_dates),
361
+ )
86
362
  if not missing_dates:
363
+ if df_all is not None and not df_all.empty:
364
+ logger.info("ThetaData cache HIT for %s %s %s (%d rows).", asset, timespan, datastyle, len(df_all))
365
+ # DEBUG-LOG: Cache hit
366
+ logger.debug(
367
+ "[THETA][DEBUG][CACHE][HIT] asset=%s timespan=%s datastyle=%s rows=%d start=%s end=%s",
368
+ asset,
369
+ timespan,
370
+ datastyle,
371
+ len(df_all),
372
+ start.isoformat() if hasattr(start, 'isoformat') else start,
373
+ end.isoformat() if hasattr(end, 'isoformat') else end
374
+ )
87
375
  # Filter cached data to requested date range before returning
88
376
  if df_all is not None and not df_all.empty:
89
377
  # For daily data, use date-based filtering (timestamps vary by provider)
90
378
  # For intraday data, use precise datetime filtering
91
379
  if timespan == "day":
92
380
  # Convert index to dates for comparison
93
- import pandas as pd
94
381
  df_dates = pd.to_datetime(df_all.index).date
95
382
  start_date = start.date() if hasattr(start, 'date') else start
96
383
  end_date = end.date() if hasattr(end, 'date') else end
@@ -98,30 +385,119 @@ def get_price_data(
98
385
  df_all = df_all[mask]
99
386
  else:
100
387
  # Intraday: use precise datetime filtering
101
- import datetime as dt
388
+ import datetime as datetime_module # RENAMED to avoid shadowing dt parameter!
389
+
390
+ # DEBUG-LOG: Entry to intraday filter
391
+ rows_before_any_filter = len(df_all)
392
+ max_ts_before_any_filter = df_all.index.max() if len(df_all) > 0 else None
393
+ logger.debug(
394
+ "[THETA][DEBUG][FILTER][INTRADAY_ENTRY] asset=%s | "
395
+ "rows_before=%d max_ts_before=%s | "
396
+ "start_param=%s end_param=%s dt_param=%s dt_type=%s",
397
+ asset.symbol if hasattr(asset, 'symbol') else str(asset),
398
+ rows_before_any_filter,
399
+ max_ts_before_any_filter.isoformat() if max_ts_before_any_filter else None,
400
+ start.isoformat() if hasattr(start, 'isoformat') else start,
401
+ end.isoformat() if hasattr(end, 'isoformat') else end,
402
+ dt.isoformat() if dt and hasattr(dt, 'isoformat') else dt,
403
+ type(dt).__name__ if dt else None
404
+ )
405
+
102
406
  # Convert date to datetime if needed
103
- if isinstance(start, dt.date) and not isinstance(start, dt.datetime):
104
- start = dt.datetime.combine(start, dt.time.min)
105
- if isinstance(end, dt.date) and not isinstance(end, dt.datetime):
106
- end = dt.datetime.combine(end, dt.time.max)
407
+ if isinstance(start, datetime_module.date) and not isinstance(start, datetime_module.datetime):
408
+ start = datetime_module.datetime.combine(start, datetime_module.time.min)
409
+ logger.debug(
410
+ "[THETA][DEBUG][FILTER][DATE_CONVERSION] converted start from date to datetime: %s",
411
+ start.isoformat()
412
+ )
413
+ if isinstance(end, datetime_module.date) and not isinstance(end, datetime_module.datetime):
414
+ end = datetime_module.datetime.combine(end, datetime_module.time.max)
415
+ logger.debug(
416
+ "[THETA][DEBUG][FILTER][DATE_CONVERSION] converted end from date to datetime: %s",
417
+ end.isoformat()
418
+ )
107
419
 
108
420
  # Handle datetime objects with midnight time (users often pass datetime(YYYY, MM, DD))
109
- if isinstance(end, dt.datetime) and end.time() == dt.time.min:
421
+ if isinstance(end, datetime_module.datetime) and end.time() == datetime_module.time.min:
110
422
  # Convert end-of-period midnight to end-of-day
111
- end = dt.datetime.combine(end.date(), dt.time.max)
423
+ end = datetime_module.datetime.combine(end.date(), datetime_module.time.max)
424
+ logger.debug(
425
+ "[THETA][DEBUG][FILTER][MIDNIGHT_FIX] converted end from midnight to end-of-day: %s",
426
+ end.isoformat()
427
+ )
112
428
 
113
429
  if start.tzinfo is None:
114
430
  start = LUMIBOT_DEFAULT_PYTZ.localize(start).astimezone(pytz.UTC)
431
+ logger.debug(
432
+ "[THETA][DEBUG][FILTER][TZ_LOCALIZE] localized start to UTC: %s",
433
+ start.isoformat()
434
+ )
115
435
  if end.tzinfo is None:
116
436
  end = LUMIBOT_DEFAULT_PYTZ.localize(end).astimezone(pytz.UTC)
437
+ logger.debug(
438
+ "[THETA][DEBUG][FILTER][TZ_LOCALIZE] localized end to UTC: %s",
439
+ end.isoformat()
440
+ )
441
+
442
+ # REMOVED: Look-ahead bias protection was too aggressive
443
+ # The dt filtering was breaking negative timeshift (intentional look-ahead for fills)
444
+ # Look-ahead bias protection should happen at get_bars() level, not cache retrieval
445
+ #
446
+ # NEW APPROACH: Always return full [start, end] range from cache
447
+ # Let Data/DataPolars.get_bars() handle look-ahead bias protection
448
+ logger.debug(
449
+ "[THETA][DEBUG][FILTER][NO_DT_FILTER] asset=%s | "
450
+ "using end=%s for upper bound (dt parameter ignored for cache retrieval)",
451
+ asset.symbol if hasattr(asset, 'symbol') else str(asset),
452
+ end.isoformat()
453
+ )
117
454
  df_all = df_all[(df_all.index >= start) & (df_all.index <= end)]
455
+
456
+ # DEBUG-LOG: After date range filtering, before missing removal
457
+ if df_all is not None and not df_all.empty:
458
+ logger.debug(
459
+ "[THETA][DEBUG][FILTER][AFTER] asset=%s rows=%d first_ts=%s last_ts=%s dt_filter=%s",
460
+ asset,
461
+ len(df_all),
462
+ df_all.index.min().isoformat() if len(df_all) > 0 else None,
463
+ df_all.index.max().isoformat() if len(df_all) > 0 else None,
464
+ dt.isoformat() if dt and hasattr(dt, 'isoformat') else dt
465
+ )
466
+
467
+ if df_all is not None and not df_all.empty and "missing" in df_all.columns:
468
+ df_all = df_all[~df_all["missing"].astype(bool)].drop(columns=["missing"])
469
+
470
+
471
+ # DEBUG-LOG: Before pandas return
472
+ if df_all is not None and not df_all.empty:
473
+ logger.debug(
474
+ "[THETA][DEBUG][RETURN][PANDAS] asset=%s rows=%d first_ts=%s last_ts=%s",
475
+ asset,
476
+ len(df_all),
477
+ df_all.index.min().isoformat() if len(df_all) > 0 else None,
478
+ df_all.index.max().isoformat() if len(df_all) > 0 else None
479
+ )
118
480
  return df_all
119
481
 
120
- start = missing_dates[0] # Data will start at 8am UTC (4am EST)
121
- end = missing_dates[-1] # Data will end at 23:59 UTC (7:59pm EST)
482
+ logger.info("ThetaData cache MISS for %s %s %s; fetching %d interval(s) from ThetaTerminal.", asset, timespan, datastyle, len(missing_dates))
483
+
484
+ # DEBUG-LOG: Cache miss
485
+ logger.debug(
486
+ "[THETA][DEBUG][CACHE][MISS] asset=%s timespan=%s datastyle=%s missing_intervals=%d first=%s last=%s",
487
+ asset,
488
+ timespan,
489
+ datastyle,
490
+ len(missing_dates),
491
+ missing_dates[0] if missing_dates else None,
492
+ missing_dates[-1] if missing_dates else None
493
+ )
494
+
495
+
496
+ fetch_start = missing_dates[0] # Data will start at 8am UTC (4am EST)
497
+ fetch_end = missing_dates[-1] # Data will end at 23:59 UTC (7:59pm EST)
122
498
 
123
499
  # Initialize tqdm progress bar
124
- total_days = (end - start).days + 1
500
+ total_days = (fetch_end - fetch_start).days + 1
125
501
  total_queries = (total_days // MAX_DAYS) + 1
126
502
  description = f"\nDownloading '{datastyle}' data for {asset} / {quote_asset} with '{timespan}' from ThetaData..."
127
503
  logger.info(description)
@@ -133,19 +509,134 @@ def get_price_data(
133
509
  # The EOD endpoint includes the 16:00 closing auction and follows SIP sale-condition rules
134
510
  # This matches Polygon and Yahoo Finance EXACTLY (zero tolerance)
135
511
  if timespan == "day":
136
- logger.info(f"Daily bars: using EOD endpoint for official close prices")
512
+ requested_dates = list(missing_dates)
513
+ logger.info("Daily bars: using EOD endpoint for official close prices")
514
+ logger.debug(
515
+ "[THETA][DEBUG][THETADATA-EOD] requesting %d trading day(s) for %s from %s to %s",
516
+ len(requested_dates),
517
+ asset,
518
+ fetch_start,
519
+ fetch_end,
520
+ )
137
521
 
138
522
  # Use EOD endpoint for official daily OHLC
139
523
  result_df = get_historical_eod_data(
140
524
  asset=asset,
141
- start_dt=start,
142
- end_dt=end,
525
+ start_dt=fetch_start,
526
+ end_dt=fetch_end,
143
527
  username=username,
144
528
  password=password,
145
529
  datastyle=datastyle
146
530
  )
531
+ logger.debug(
532
+ "[THETA][DEBUG][THETADATA-EOD] fetched rows=%s for %s",
533
+ 0 if result_df is None else len(result_df),
534
+ asset,
535
+ )
536
+
537
+ if result_df is None or result_df.empty:
538
+ expired_range = (
539
+ asset.asset_type == "option"
540
+ and asset.expiration is not None
541
+ and requested_dates
542
+ and all(day > asset.expiration for day in requested_dates)
543
+ )
544
+ if expired_range:
545
+ logger.debug(
546
+ "[THETA][DEBUG][THETADATA-EOD] Option %s expired on %s; cache reuse for range %s -> %s.",
547
+ asset,
548
+ asset.expiration,
549
+ fetch_start,
550
+ fetch_end,
551
+ )
552
+ else:
553
+ logger.debug(
554
+ "[THETA][DEBUG][THETADATA-EOD] No rows returned for %s between %s and %s; recording placeholders.",
555
+ asset,
556
+ fetch_start,
557
+ fetch_end,
558
+ )
559
+ df_all = append_missing_markers(df_all, requested_dates)
560
+ update_cache(
561
+ cache_file,
562
+ df_all,
563
+ df_cached,
564
+ missing_dates=requested_dates,
565
+ remote_payload=remote_payload,
566
+ )
567
+ df_clean = df_all.copy() if df_all is not None else None
568
+ if df_clean is not None and not df_clean.empty and "missing" in df_clean.columns:
569
+ df_clean = df_clean[~df_clean["missing"].astype(bool)].drop(columns=["missing"])
570
+ df_clean = restore_numeric_dtypes(df_clean)
571
+ logger.info(
572
+ "ThetaData cache updated for %s %s %s with placeholders only (missing=%d).",
573
+ asset,
574
+ timespan,
575
+ datastyle,
576
+ len(requested_dates),
577
+ )
578
+
579
+ if df_clean is not None and not df_clean.empty and timespan == "day":
580
+ start_date = requested_start.date() if hasattr(requested_start, "date") else requested_start
581
+ end_date = requested_end.date() if hasattr(requested_end, "date") else requested_end
582
+ dates = pd.to_datetime(df_clean.index).date
583
+ df_clean = df_clean[(dates >= start_date) & (dates <= end_date)]
584
+
585
+ return df_clean if df_clean is not None else pd.DataFrame()
586
+
587
+ df_all = update_df(df_all, result_df)
588
+ logger.debug(
589
+ "[THETA][DEBUG][THETADATA-EOD] merged cache rows=%d (cached=%d new=%d)",
590
+ 0 if df_all is None else len(df_all),
591
+ 0 if df_cached is None else len(df_cached),
592
+ len(result_df),
593
+ )
594
+
595
+ trading_days = get_trading_dates(asset, fetch_start, fetch_end)
596
+ if "datetime" in result_df.columns:
597
+ covered_index = pd.DatetimeIndex(pd.to_datetime(result_df["datetime"], utc=True))
598
+ else:
599
+ covered_index = pd.DatetimeIndex(result_df.index)
600
+ if covered_index.tz is None:
601
+ covered_index = covered_index.tz_localize(pytz.UTC)
602
+ else:
603
+ covered_index = covered_index.tz_convert(pytz.UTC)
604
+ covered_days = set(covered_index.date)
605
+
606
+ df_all = remove_missing_markers(df_all, list(covered_days))
607
+ missing_within_range = [day for day in trading_days if day not in covered_days]
608
+ placeholder_count = len(missing_within_range)
609
+ df_all = append_missing_markers(df_all, missing_within_range)
610
+
611
+ update_cache(
612
+ cache_file,
613
+ df_all,
614
+ df_cached,
615
+ missing_dates=missing_within_range,
616
+ remote_payload=remote_payload,
617
+ )
147
618
 
148
- return result_df
619
+ df_clean = df_all.copy() if df_all is not None else None
620
+ if df_clean is not None and not df_clean.empty and "missing" in df_clean.columns:
621
+ df_clean = df_clean[~df_clean["missing"].astype(bool)].drop(columns=["missing"])
622
+ df_clean = restore_numeric_dtypes(df_clean)
623
+
624
+ logger.info(
625
+ "ThetaData cache updated for %s %s %s (rows=%d placeholders=%d).",
626
+ asset,
627
+ timespan,
628
+ datastyle,
629
+ 0 if df_all is None else len(df_all),
630
+ placeholder_count,
631
+ )
632
+
633
+ if df_clean is not None and not df_clean.empty and timespan == "day":
634
+ start_date = requested_start.date() if hasattr(requested_start, "date") else requested_start
635
+ end_date = requested_end.date() if hasattr(requested_end, "date") else requested_end
636
+ dates = pd.to_datetime(df_clean.index).date
637
+ df_clean = df_clean[(dates >= start_date) & (dates <= end_date)]
638
+
639
+ return df_clean if df_clean is not None else pd.DataFrame()
149
640
 
150
641
  # Map timespan to milliseconds for intraday intervals
151
642
  TIMESPAN_TO_MS = {
@@ -167,33 +658,84 @@ def get_price_data(
167
658
  f"Supported values: {list(TIMESPAN_TO_MS.keys())} or 'day'"
168
659
  )
169
660
 
170
- while start <= missing_dates[-1]:
661
+ current_start = fetch_start
662
+ current_end = fetch_start + delta
663
+
664
+ while current_start <= fetch_end:
171
665
  # If we don't have a paid subscription, we need to wait 1 minute between requests because of
172
666
  # the rate limit. Wait every other query so that we don't spend too much time waiting.
173
667
 
174
- if end > start + delta:
175
- end = start + delta
668
+ if current_end > fetch_end:
669
+ current_end = fetch_end
670
+ if current_end > current_start + delta:
671
+ current_end = current_start + delta
176
672
 
177
- result_df = get_historical_data(asset, start, end, interval_ms, username, password, datastyle=datastyle, include_after_hours=include_after_hours)
673
+ result_df = get_historical_data(asset, current_start, current_end, interval_ms, username, password, datastyle=datastyle, include_after_hours=include_after_hours)
674
+ chunk_end = _clamp_option_end(asset, current_end)
178
675
 
179
676
  if result_df is None or len(result_df) == 0:
180
- logger.warning(
181
- f"No data returned for {asset} / {quote_asset} with '{timespan}' timespan between {start} and {end}"
677
+ expired_chunk = (
678
+ asset.asset_type == "option"
679
+ and asset.expiration is not None
680
+ and chunk_end.date() >= asset.expiration
182
681
  )
682
+ if expired_chunk:
683
+ logger.debug(
684
+ "[THETA][DEBUG][THETADATA] Option %s considered expired on %s; reusing cached data between %s and %s.",
685
+ asset,
686
+ asset.expiration,
687
+ current_start,
688
+ chunk_end,
689
+ )
690
+ else:
691
+ logger.warning(
692
+ f"No data returned for {asset} / {quote_asset} with '{timespan}' timespan between {current_start} and {current_end}"
693
+ )
694
+ missing_chunk = get_trading_dates(asset, current_start, chunk_end)
695
+ df_all = append_missing_markers(df_all, missing_chunk)
696
+ pbar.update(1)
183
697
 
184
698
  else:
185
699
  df_all = update_df(df_all, result_df)
700
+ available_chunk = get_trading_dates(asset, current_start, chunk_end)
701
+ df_all = remove_missing_markers(df_all, available_chunk)
702
+ if "datetime" in result_df.columns:
703
+ chunk_index = pd.DatetimeIndex(pd.to_datetime(result_df["datetime"], utc=True))
704
+ else:
705
+ chunk_index = pd.DatetimeIndex(result_df.index)
706
+ if chunk_index.tz is None:
707
+ chunk_index = chunk_index.tz_localize(pytz.UTC)
708
+ else:
709
+ chunk_index = chunk_index.tz_convert(pytz.UTC)
710
+ covered_days = {ts.date() for ts in chunk_index}
711
+ missing_within_chunk = [day for day in available_chunk if day not in covered_days]
712
+ if missing_within_chunk:
713
+ df_all = append_missing_markers(df_all, missing_within_chunk)
186
714
  pbar.update(1)
187
715
 
188
- start = end + timedelta(days=1)
189
- end = start + delta
716
+ current_start = current_end + timedelta(days=1)
717
+ current_end = current_start + delta
190
718
 
191
- if asset.expiration and start > asset.expiration:
719
+ if asset.expiration and current_start > asset.expiration:
192
720
  break
193
721
 
194
- update_cache(cache_file, df_all, df_cached)
722
+ update_cache(cache_file, df_all, df_cached, remote_payload=remote_payload)
723
+ if df_all is not None:
724
+ logger.debug("[THETA][DEBUG][THETADATA-CACHE-WRITE] wrote %s rows=%d", cache_file, len(df_all))
725
+ if df_all is not None:
726
+ logger.info("ThetaData cache updated for %s %s %s (%d rows).", asset, timespan, datastyle, len(df_all))
195
727
  # Close the progress bar when done
196
728
  pbar.close()
729
+ if df_all is not None and not df_all.empty and "missing" in df_all.columns:
730
+ df_all = df_all[~df_all["missing"].astype(bool)].drop(columns=["missing"])
731
+ df_all = restore_numeric_dtypes(df_all)
732
+
733
+ if df_all is not None and not df_all.empty and timespan == "day":
734
+ start_date = requested_start.date() if hasattr(requested_start, "date") else requested_start
735
+ end_date = requested_end.date() if hasattr(requested_end, "date") else requested_end
736
+ dates = pd.to_datetime(df_all.index).date
737
+ df_all = df_all[(dates >= start_date) & (dates <= end_date)]
738
+
197
739
  return df_all
198
740
 
199
741
 
@@ -261,6 +803,28 @@ def build_cache_filename(asset: Asset, timespan: str, datastyle: str = "ohlc"):
     return cache_file


+def build_remote_cache_payload(asset: Asset, timespan: str, datastyle: str = "ohlc") -> Dict[str, object]:
+    """Generate metadata describing the cache entry for remote storage."""
+    payload: Dict[str, object] = {
+        "provider": "thetadata",
+        "timespan": timespan,
+        "datastyle": datastyle,
+        "asset_type": getattr(asset, "asset_type", None),
+        "symbol": getattr(asset, "symbol", str(asset)),
+    }
+
+    if getattr(asset, "asset_type", None) == "option":
+        payload.update(
+            {
+                "expiration": getattr(asset, "expiration", None),
+                "strike": getattr(asset, "strike", None),
+                "right": getattr(asset, "right", None),
+            }
+        )
+
+    return payload
+
+
 def get_missing_dates(df_all, asset, start, end):
     """
     Check if we have data for the full range
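For reference, the dictionary returned by build_remote_cache_payload for an option contract would look roughly like this; the field names come from the function above, the values are invented for illustration.

    # Roughly what build_remote_cache_payload returns for an option; values are invented.
    from datetime import date

    payload = {
        "provider": "thetadata",
        "timespan": "minute",
        "datastyle": "ohlc",
        "asset_type": "option",
        "symbol": "SPY",
        "expiration": date(2024, 3, 15),
        "strike": 500.0,
        "right": "CALL",
    }
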
@@ -283,27 +847,116 @@ def get_missing_dates(df_all, asset, start, end):
283
847
  list[datetime.date]
284
848
  A list of dates that we need to get data for
285
849
  """
850
+ # DEBUG-LOG: Entry to get_missing_dates
851
+ logger.debug(
852
+ "[THETA][DEBUG][CACHE][MISSING_DATES_CHECK] asset=%s | "
853
+ "start=%s end=%s | "
854
+ "cache_rows=%d",
855
+ asset.symbol if hasattr(asset, 'symbol') else str(asset),
856
+ start.isoformat() if hasattr(start, 'isoformat') else start,
857
+ end.isoformat() if hasattr(end, 'isoformat') else end,
858
+ 0 if df_all is None else len(df_all)
859
+ )
860
+
286
861
  trading_dates = get_trading_dates(asset, start, end)
862
+
863
+ logger.debug(
864
+ "[THETA][DEBUG][CACHE][TRADING_DATES] asset=%s | "
865
+ "trading_dates_count=%d first=%s last=%s",
866
+ asset.symbol if hasattr(asset, 'symbol') else str(asset),
867
+ len(trading_dates),
868
+ trading_dates[0] if trading_dates else None,
869
+ trading_dates[-1] if trading_dates else None
870
+ )
871
+
287
872
  if df_all is None or not len(df_all):
873
+ logger.debug(
874
+ "[THETA][DEBUG][CACHE][EMPTY] asset=%s | "
875
+ "cache is EMPTY -> all %d trading days are missing",
876
+ asset.symbol if hasattr(asset, 'symbol') else str(asset),
877
+ len(trading_dates)
878
+ )
288
879
  return trading_dates
289
880
 
290
881
  # It is possible to have full day gap in the data if previous queries were far apart
291
882
  # Example: Query for 8/1/2023, then 8/31/2023, then 8/7/2023
292
883
  # Whole days are easy to check for because we can just check the dates in the index
293
884
  dates = pd.Series(df_all.index.date).unique()
885
+ cached_dates_count = len(dates)
886
+ cached_first = min(dates) if len(dates) > 0 else None
887
+ cached_last = max(dates) if len(dates) > 0 else None
888
+
889
+ logger.debug(
890
+ "[THETA][DEBUG][CACHE][CACHED_DATES] asset=%s | "
891
+ "cached_dates_count=%d first=%s last=%s",
892
+ asset.symbol if hasattr(asset, 'symbol') else str(asset),
893
+ cached_dates_count,
894
+ cached_first,
895
+ cached_last
896
+ )
897
+
294
898
  missing_dates = sorted(set(trading_dates) - set(dates))
295
899
 
296
900
  # For Options, don't need any dates passed the expiration date
297
901
  if asset.asset_type == "option":
902
+ before_expiry_filter = len(missing_dates)
298
903
  missing_dates = [x for x in missing_dates if x <= asset.expiration]
904
+ after_expiry_filter = len(missing_dates)
905
+
906
+ if before_expiry_filter != after_expiry_filter:
907
+ logger.debug(
908
+ "[THETA][DEBUG][CACHE][OPTION_EXPIRY_FILTER] asset=%s | "
909
+ "filtered %d dates after expiration=%s | "
910
+ "missing_dates: %d -> %d",
911
+ asset.symbol if hasattr(asset, 'symbol') else str(asset),
912
+ before_expiry_filter - after_expiry_filter,
913
+ asset.expiration,
914
+ before_expiry_filter,
915
+ after_expiry_filter
916
+ )
917
+
918
+ logger.debug(
919
+ "[THETA][DEBUG][CACHE][MISSING_RESULT] asset=%s | "
920
+ "missing_dates_count=%d | "
921
+ "first_missing=%s last_missing=%s",
922
+ asset.symbol if hasattr(asset, 'symbol') else str(asset),
923
+ len(missing_dates),
924
+ missing_dates[0] if missing_dates else None,
925
+ missing_dates[-1] if missing_dates else None
926
+ )
299
927
 
300
928
  return missing_dates
301
929
 
302
930
 
303
931
  def load_cache(cache_file):
304
932
  """Load the data from the cache file and return a DataFrame with a DateTimeIndex"""
933
+ # DEBUG-LOG: Start loading cache
934
+ logger.debug(
935
+ "[THETA][DEBUG][CACHE][LOAD_START] cache_file=%s | "
936
+ "exists=%s size_bytes=%d",
937
+ cache_file.name,
938
+ cache_file.exists(),
939
+ cache_file.stat().st_size if cache_file.exists() else 0
940
+ )
941
+
942
+ if not cache_file.exists():
943
+ logger.debug(
944
+ "[THETA][DEBUG][CACHE][LOAD_MISSING] cache_file=%s | returning=None",
945
+ cache_file.name,
946
+ )
947
+ return None
948
+
305
949
  df = pd.read_parquet(cache_file, engine='pyarrow')
306
950
 
951
+ rows_after_read = len(df)
952
+ logger.debug(
953
+ "[THETA][DEBUG][CACHE][LOAD_READ] cache_file=%s | "
954
+ "rows_read=%d columns=%s",
955
+ cache_file.name,
956
+ rows_after_read,
957
+ list(df.columns)
958
+ )
959
+
307
960
  # Set the 'datetime' column as the index of the DataFrame
308
961
  df.set_index("datetime", inplace=True)
309
962
 
@@ -316,26 +969,135 @@ def load_cache(cache_file):
316
969
  if df.index.tzinfo is None:
317
970
  # Set the timezone to UTC
318
971
  df.index = df.index.tz_localize("UTC")
972
+ logger.debug(
973
+ "[THETA][DEBUG][CACHE][LOAD_TZ] cache_file=%s | "
974
+ "localized index to UTC",
975
+ cache_file.name
976
+ )
977
+
978
+ df = ensure_missing_column(df)
979
+
980
+ min_ts = df.index.min() if len(df) > 0 else None
981
+ max_ts = df.index.max() if len(df) > 0 else None
982
+ placeholder_count = int(df["missing"].sum()) if "missing" in df.columns else 0
983
+
984
+ logger.debug(
985
+ "[THETA][DEBUG][CACHE][LOAD_SUCCESS] cache_file=%s | "
986
+ "total_rows=%d real_rows=%d placeholders=%d | "
987
+ "min_ts=%s max_ts=%s",
988
+ cache_file.name,
989
+ len(df),
990
+ len(df) - placeholder_count,
991
+ placeholder_count,
992
+ min_ts.isoformat() if min_ts else None,
993
+ max_ts.isoformat() if max_ts else None
994
+ )
319
995
 
320
996
  return df
321
997
 
322
998
 
323
- def update_cache(cache_file, df_all, df_cached):
324
- """Update the cache file with the new data"""
325
- # Check if df_all is different from df_cached (if df_cached exists)
326
- if df_all is not None and len(df_all) > 0:
327
- # Check if the dataframes are the same
328
- if df_all.equals(df_cached):
999
+ def update_cache(cache_file, df_all, df_cached, missing_dates=None, remote_payload=None):
1000
+ """Update the cache file with the new data and optional placeholder markers."""
1001
+ # DEBUG-LOG: Entry to update_cache
1002
+ logger.debug(
1003
+ "[THETA][DEBUG][CACHE][UPDATE_ENTRY] cache_file=%s | "
1004
+ "df_all_rows=%d df_cached_rows=%d missing_dates=%d",
1005
+ cache_file.name,
1006
+ 0 if df_all is None else len(df_all),
1007
+ 0 if df_cached is None else len(df_cached),
1008
+ 0 if not missing_dates else len(missing_dates)
1009
+ )
1010
+
1011
+ if df_all is None or len(df_all) == 0:
1012
+ if not missing_dates:
1013
+ logger.debug(
1014
+ "[THETA][DEBUG][CACHE][UPDATE_SKIP] cache_file=%s | "
1015
+ "df_all is empty and no missing_dates, skipping cache update",
1016
+ cache_file.name
1017
+ )
329
1018
  return
1019
+ logger.debug(
1020
+ "[THETA][DEBUG][CACHE][UPDATE_PLACEHOLDERS_ONLY] cache_file=%s | "
1021
+ "df_all is empty, writing %d placeholders",
1022
+ cache_file.name,
1023
+ len(missing_dates)
1024
+ )
1025
+ df_working = append_missing_markers(None, missing_dates)
1026
+ else:
1027
+ df_working = ensure_missing_column(df_all.copy())
1028
+ if missing_dates:
1029
+ logger.debug(
1030
+ "[THETA][DEBUG][CACHE][UPDATE_APPEND_PLACEHOLDERS] cache_file=%s | "
1031
+ "appending %d placeholders to %d existing rows",
1032
+ cache_file.name,
1033
+ len(missing_dates),
1034
+ len(df_working)
1035
+ )
1036
+ df_working = append_missing_markers(df_working, missing_dates)
330
1037
 
331
- # Create the directory if it doesn't exist
332
- cache_file.parent.mkdir(parents=True, exist_ok=True)
1038
+ if df_working is None or len(df_working) == 0:
1039
+ logger.debug(
1040
+ "[THETA][DEBUG][CACHE][UPDATE_SKIP_EMPTY] cache_file=%s | "
1041
+ "df_working is empty after processing, skipping write",
1042
+ cache_file.name
1043
+ )
1044
+ return
1045
+
1046
+ df_cached_cmp = None
1047
+ if df_cached is not None and len(df_cached) > 0:
1048
+ df_cached_cmp = ensure_missing_column(df_cached.copy())
1049
+
1050
+ if df_cached_cmp is not None and df_working.equals(df_cached_cmp):
1051
+ logger.debug(
1052
+ "[THETA][DEBUG][CACHE][UPDATE_NO_CHANGES] cache_file=%s | "
1053
+ "df_working equals df_cached (rows=%d), skipping write",
1054
+ cache_file.name,
1055
+ len(df_working)
1056
+ )
1057
+ return
1058
+
1059
+ cache_file.parent.mkdir(parents=True, exist_ok=True)
1060
+ df_to_save = df_working.reset_index()
1061
+
1062
+ placeholder_count = int(df_working["missing"].sum()) if "missing" in df_working.columns else 0
1063
+ real_rows = len(df_working) - placeholder_count
1064
+ min_ts = df_working.index.min() if len(df_working) > 0 else None
1065
+ max_ts = df_working.index.max() if len(df_working) > 0 else None
1066
+
1067
+ def _format_ts(value):
1068
+ if value is None:
1069
+ return None
1070
+ return value.isoformat() if hasattr(value, "isoformat") else value
1071
+
1072
+ logger.debug(
1073
+ "[THETA][DEBUG][CACHE][UPDATE_WRITE] cache_file=%s | "
1074
+ "total_rows=%d real_rows=%d placeholders=%d | "
1075
+ "min_ts=%s max_ts=%s",
1076
+ cache_file.name,
1077
+ len(df_working),
1078
+ real_rows,
1079
+ placeholder_count,
1080
+ _format_ts(min_ts),
1081
+ _format_ts(max_ts)
1082
+ )
1083
+
1084
+ df_to_save.to_parquet(cache_file, engine="pyarrow", compression="snappy")
333
1085
 
334
- # Reset the index to convert DatetimeIndex to a regular column
335
- df_all_reset = df_all.reset_index()
1086
+ logger.debug(
1087
+ "[THETA][DEBUG][CACHE][UPDATE_SUCCESS] cache_file=%s written successfully",
1088
+ cache_file.name
1089
+ )
336
1090
 
337
- # Save the data to a parquet file
338
- df_all_reset.to_parquet(cache_file, engine='pyarrow', compression='snappy')
1091
+ cache_manager = get_backtest_cache()
1092
+ if cache_manager.mode == CacheMode.S3_READWRITE:
1093
+ try:
1094
+ cache_manager.on_local_update(cache_file, payload=remote_payload)
1095
+ except Exception as exc:
1096
+ logger.debug(
1097
+ "[THETA][DEBUG][CACHE][REMOTE_UPLOAD_ERROR] cache_file=%s error=%s",
1098
+ cache_file,
1099
+ exc,
1100
+ )
339
1101
 
340
1102
 
341
1103
  def update_df(df_all, result):
@@ -366,6 +1128,7 @@ def update_df(df_all, result):
     ny_tz = LUMIBOT_DEFAULT_PYTZ
     df = pd.DataFrame(result)
     if not df.empty:
+        df["missing"] = False
         if "datetime" not in df.index.names:
             # check if df has a column named "datetime", if not raise key error
             if "datetime" not in df.columns:
@@ -398,51 +1161,79 @@ def update_df(df_all, result):
398
1161
  df_all = df
399
1162
  else:
400
1163
  df_all = pd.concat([df_all, df]).sort_index()
401
- df_all = df_all[~df_all.index.duplicated(keep="first")] # Remove any duplicate rows
1164
+ df_all = df_all[~df_all.index.duplicated(keep="last")] # Keep newest data over placeholders
402
1165
 
403
1166
  # NOTE: Timestamp correction is now done in get_historical_data() at line 569
404
1167
  # Do NOT subtract 1 minute here as it would double-correct
405
1168
  # df_all.index = df_all.index - pd.Timedelta(minutes=1)
1169
+ df_all = ensure_missing_column(df_all)
406
1170
  return df_all
407
1171
 
408
1172
 
409
1173
  def is_process_alive():
410
1174
  """Check if ThetaTerminal Java process is still running"""
1175
+ import os
411
1176
  import subprocess
412
- global THETA_DATA_PROCESS
413
1177
 
414
- # First check if we have a process handle and it's still alive
1178
+ global THETA_DATA_PROCESS, THETA_DATA_PID, THETA_DATA_LOG_HANDLE
1179
+
1180
+ # If we have a subprocess handle, trust it first
415
1181
  if THETA_DATA_PROCESS is not None:
416
- # poll() returns None if process is still running, otherwise returns exit code
417
1182
  if THETA_DATA_PROCESS.poll() is None:
418
1183
  return True
1184
+ # Process exited—clear cached handle and PID
1185
+ reset_theta_terminal_tracking()
419
1186
 
420
- # If we don't have a process handle or it died, check if any ThetaTerminal process is running
421
- # This handles cases where the process was started by a previous Python session
422
- try:
423
- result = subprocess.run(
424
- ["pgrep", "-f", "ThetaTerminal.jar"],
425
- capture_output=True,
426
- text=True,
427
- timeout=2
428
- )
429
- # pgrep returns 0 if processes found, 1 if none found
430
- return result.returncode == 0
431
- except Exception:
432
- return False
1187
+ # If we know the PID, probe it directly
1188
+ if THETA_DATA_PID:
1189
+ try:
1190
+ # Sending signal 0 simply tests liveness
1191
+ os.kill(THETA_DATA_PID, 0)
1192
+ return True
1193
+ except OSError:
1194
+ reset_theta_terminal_tracking()
1195
+
1196
+ return False
433
1197
 
434
1198
 
435
1199
  def start_theta_data_client(username: str, password: str):
436
1200
  import subprocess
437
1201
  import shutil
438
1202
  global THETA_DATA_PROCESS, THETA_DATA_PID
1203
+ CONNECTION_DIAGNOSTICS["start_terminal_calls"] += 1
439
1204
 
440
1205
  # First try shutting down any existing connection
1206
+ graceful_shutdown_requested = False
441
1207
  try:
442
- requests.get(f"{BASE_URL}/v2/system/terminal/shutdown")
1208
+ requests.get(f"{BASE_URL}/v2/system/terminal/shutdown", timeout=1)
1209
+ graceful_shutdown_requested = True
443
1210
  except Exception:
444
1211
  pass
445
1212
 
1213
+ shutdown_deadline = time.time() + 15
1214
+ while True:
1215
+ process_alive = is_process_alive()
1216
+ status_alive = False
1217
+ try:
1218
+ status_text = requests.get(f"{BASE_URL}/v2/system/mdds/status", timeout=0.5).text
1219
+ status_alive = status_text in ("CONNECTED", "DISCONNECTED")
1220
+ except Exception:
1221
+ status_alive = False
1222
+
1223
+ if not process_alive and not status_alive:
1224
+ break
1225
+
1226
+ if time.time() >= shutdown_deadline:
1227
+ if process_alive and THETA_DATA_PID:
1228
+ kill_signal = getattr(signal, "SIGKILL", signal.SIGTERM)
1229
+ try:
1230
+ os.kill(THETA_DATA_PID, kill_signal)
1231
+ except Exception as kill_exc:
1232
+ logger.warning("Failed to force kill ThetaTerminal PID %s: %s", THETA_DATA_PID, kill_exc)
1233
+ break
1234
+
1235
+ time.sleep(0.5)
1236
+
446
1237
  # Create creds.txt file to avoid passing password with special characters on command line
447
1238
  # This is the official ThetaData method and avoids shell escaping issues
448
1239
  # Security note: creds.txt with 0o600 permissions is MORE secure than command-line args
@@ -451,37 +1242,43 @@ def start_theta_data_client(username: str, password: str):
     theta_dir.mkdir(parents=True, exist_ok=True)
     creds_file = theta_dir / "creds.txt"
 
-    # IDEMPOTENT WRITE: Only write credentials if file doesn't exist or username changed
-    # This prevents overwriting production credentials with test credentials
-    should_write = False
-    if not creds_file.exists():
-        logger.info(f"Creating new creds.txt file at {creds_file}")
-        should_write = True
-    else:
-        # Check if username changed
+    # Read previous credentials if they exist so we can decide whether to overwrite
+    existing_username = None
+    existing_password = None
+    if creds_file.exists():
         try:
             with open(creds_file, 'r') as f:
-                existing_username = f.readline().strip()
-                if existing_username != username:
-                    logger.info(f"Username changed from {existing_username} to {username}, updating creds.txt")
-                    should_write = True
-                else:
-                    logger.debug(f"Using existing creds.txt for {username}")
-        except Exception as e:
-            logger.warning(f"Could not read existing creds.txt: {e}, will recreate")
-            should_write = True
+                existing_username = (f.readline().strip() or None)
+                existing_password = (f.readline().strip() or None)
+        except Exception as exc:
+            logger.warning(f"Could not read existing creds.txt: {exc}; will recreate the file.")
+            existing_username = None
+            existing_password = None
+
+    if username is None:
+        username = existing_username
+    if password is None:
+        password = existing_password
+
+    if username is None or password is None:
+        raise ValueError(
+            "ThetaData credentials are required to start ThetaTerminal. Provide them via backtest() or configure THETADATA_USERNAME/THETADATA_PASSWORD."
+        )
+
+    should_write = (
+        not creds_file.exists()
+        or existing_username != username
+        or existing_password != password
+    )
 
     if should_write:
-        # Write credentials to creds.txt (format: email on first line, password on second line)
+        logger.info(f"Writing creds.txt file for user: {username}")
         with open(creds_file, 'w') as f:
             f.write(f"{username}\n")
             f.write(f"{password}\n")
-
-        # Set restrictive permissions on creds file (owner read/write only)
-        # This prevents other users on the system from reading the credentials
         os.chmod(creds_file, 0o600)
-
-        logger.info(f"Updated creds.txt file for user: {username}")
+    else:
+        logger.debug(f"Reusing existing creds.txt for {username}")
 
     # Launch ThetaTerminal directly with --creds-file to avoid shell escaping issues
     # We bypass the thetadata library's launcher which doesn't support this option
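Note: the replacement logic above boils down to four steps: read the existing two-line creds.txt, fall back to it when no explicit credentials were passed, fail fast if neither source provides a value, and rewrite the file (with 0o600 permissions) only when the content would actually change. A condensed, hypothetical sketch of that decision (resolve_creds is an illustrative name, not part of the package):

    from pathlib import Path

    def resolve_creds(creds_file: Path, username=None, password=None):
        """Illustrative only: merge explicit credentials with an existing creds.txt."""
        existing_user = existing_pass = None
        if creds_file.exists():
            lines = creds_file.read_text().splitlines()
            existing_user = (lines[0].strip() or None) if len(lines) > 0 else None
            existing_pass = (lines[1].strip() or None) if len(lines) > 1 else None
        username = username or existing_user
        password = password or existing_pass
        if username is None or password is None:
            raise ValueError("ThetaData credentials are required")
        should_write = (username, password) != (existing_user, existing_pass)
        return username, password, should_write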
@@ -526,68 +1323,129 @@ def start_theta_data_client(username: str, password: str):
 
     logger.info(f"Launching ThetaTerminal with creds file: {cmd}")
 
-    # Launch in background and store process handle
-    THETA_DATA_PROCESS = subprocess.Popen(
-        cmd,
-        stdout=subprocess.PIPE,
-        stderr=subprocess.PIPE,
-        cwd=str(theta_dir)
-    )
+    reset_theta_terminal_tracking()
+
+    log_path = theta_dir / "lumibot_launch.log"
+    log_path.parent.mkdir(parents=True, exist_ok=True)
+    log_handle = open(log_path, "ab")
+    launch_ts = datetime.now(timezone.utc)
+    log_handle.write(f"\n---- Launch {launch_ts.isoformat()} ----\n".encode())
+    log_handle.flush()
+
+    global THETA_DATA_LOG_HANDLE
+    THETA_DATA_LOG_HANDLE = log_handle
+
+    try:
+        THETA_DATA_PROCESS = subprocess.Popen(
+            cmd,
+            stdout=log_handle,
+            stderr=subprocess.STDOUT,
+            cwd=str(theta_dir)
+        )
+    except Exception:
+        THETA_DATA_LOG_HANDLE = None
+        log_handle.close()
+        raise
+
     THETA_DATA_PID = THETA_DATA_PROCESS.pid
     logger.info(f"ThetaTerminal started with PID: {THETA_DATA_PID}")
 
-    # Give it a moment to start
-    time.sleep(2)
-
     # We don't return a ThetaClient object since we're launching manually
     # The connection will be established via HTTP/WebSocket to localhost:25510
     return THETA_DATA_PROCESS
 
 
-def check_connection(username: str, password: str):
-    # Do endless while loop and check if connected every 100 milliseconds
-    MAX_RETRIES = 15
-    counter = 0
+def check_connection(username: str, password: str, wait_for_connection: bool = False):
+    """Ensure the local ThetaTerminal is running. Optionally block until it is connected.
+
+    Parameters
+    ----------
+    username : str
+        ThetaData username.
+    password : str
+        ThetaData password.
+    wait_for_connection : bool, optional
+        If True, block and retry until the terminal reports CONNECTED (or retries are exhausted).
+        If False, perform a lightweight liveness check and return immediately.
+    """
+
+    CONNECTION_DIAGNOSTICS["check_connection_calls"] += 1
+
+    max_retries = CONNECTION_MAX_RETRIES
+    sleep_interval = CONNECTION_RETRY_SLEEP
+    restart_attempts = 0
     client = None
-    connected = False
 
-    while True:
-        # FIRST: Check if already connected (most important check!)
-        # This prevents unnecessary restarts that would overwrite creds.txt
+    def probe_status() -> Optional[str]:
         try:
             res = requests.get(f"{BASE_URL}/v2/system/mdds/status", timeout=1)
-            con_text = res.text
+            return res.text
+        except Exception as exc:
+            logger.debug(f"Cannot reach ThetaTerminal status endpoint: {exc}")
+            return None
+
+    if not wait_for_connection:
+        status_text = probe_status()
+        if status_text == "CONNECTED":
+            if THETA_DATA_PROCESS is None and THETA_DATA_PID is None:
+                logger.debug("ThetaTerminal reports CONNECTED but no process is tracked; restarting to capture handle.")
+                client = start_theta_data_client(username=username, password=password)
+                new_client, connected = check_connection(
+                    username=username,
+                    password=password,
+                    wait_for_connection=True,
+                )
+                return client or new_client, connected
+
+            logger.debug("ThetaTerminal already connected.")
+            return None, True
 
-            if con_text == "CONNECTED":
-                logger.debug("Already connected to Theta Data!")
-                connected = True
-                break
-            elif con_text == "DISCONNECTED":
-                logger.debug("Disconnected from Theta Data, will attempt to start...")
-                # Fall through to process check and restart logic
-            else:
-                logger.debug(f"Unknown connection status: {con_text}")
-                # Fall through to process check and restart logic
-        except Exception as e:
-            # Connection endpoint not responding - process might be dead
-            logger.debug(f"Cannot reach ThetaData status endpoint: {e}")
-            # Fall through to process check and restart logic
-
-        # SECOND: Check if the Java process is still alive
         if not is_process_alive():
-            logger.warning("ThetaTerminal process is not running, starting...")
+            logger.debug("ThetaTerminal process not running; launching background restart.")
             client = start_theta_data_client(username=username, password=password)
-            counter += 1
-            time.sleep(0.5)
+            new_client, connected = check_connection(
+                username=username,
+                password=password,
+                wait_for_connection=True,
+            )
+            return client or new_client, connected
+
+        logger.debug("ThetaTerminal running but not yet CONNECTED; waiting for status.")
+        return check_connection(username=username, password=password, wait_for_connection=True)
+
+    counter = 0
+    connected = False
+
+    while counter < max_retries:
+        status_text = probe_status()
+        if status_text == "CONNECTED":
+            if counter:
+                logger.info("ThetaTerminal connected after %s attempt(s).", counter + 1)
+            connected = True
+            break
+        elif status_text == "DISCONNECTED":
+            logger.debug("ThetaTerminal reports DISCONNECTED; will retry.")
+        elif status_text is not None:
+            logger.debug(f"ThetaTerminal returned unexpected status: {status_text}")
+
+        if not is_process_alive():
+            if restart_attempts >= MAX_RESTART_ATTEMPTS:
+                logger.error("ThetaTerminal not running after %s restart attempts.", restart_attempts)
+                break
+            restart_attempts += 1
+            logger.warning("ThetaTerminal process is not running (restart #%s).", restart_attempts)
+            client = start_theta_data_client(username=username, password=password)
+            time.sleep(max(BOOT_GRACE_PERIOD, sleep_interval))
+            counter = 0
             continue
 
-        # THIRD: Process is alive but not connected - wait and retry
-        time.sleep(0.5)
         counter += 1
+        if counter % 10 == 0:
+            logger.info("Waiting for ThetaTerminal connection (attempt %s/%s).", counter, max_retries)
+        time.sleep(sleep_interval)
 
-        if counter > MAX_RETRIES:
-            logger.error("Cannot connect to Theta Data!")
-            break
+    if not connected and counter >= max_retries:
+        logger.error("Cannot connect to Theta Data after %s attempts.", counter)
 
     return client, connected
 
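Note: the reworked check_connection now has two modes, and the rest of this file uses both: a non-blocking liveness probe before each request, and a blocking wait after an error or restart. A hedged usage sketch (credential variables are assumed to be resolved already; the final RuntimeError is illustrative, the helper itself only logs and returns connected=False):

    # Fast path: just confirm the terminal looks alive before issuing a request.
    check_connection(username=username, password=password, wait_for_connection=False)

    # Slow path: after a failure, block until the terminal reports CONNECTED
    # (or the retry/restart budget is exhausted).
    client, connected = check_connection(username=username, password=password, wait_for_connection=True)
    if not connected:
        raise RuntimeError("ThetaTerminal did not reach CONNECTED state")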
@@ -597,6 +1455,9 @@ def get_request(url: str, headers: dict, querystring: dict, username: str, passw
     next_page_url = None
     page_count = 0
 
+    # Lightweight liveness probe before issuing the request
+    check_connection(username=username, password=password, wait_for_connection=False)
+
     while True:
         counter = 0
         # Use next_page URL if available, otherwise use original URL with querystring
@@ -605,18 +1466,44 @@ def get_request(url: str, headers: dict, querystring: dict, username: str, passw
 
         while True:
             try:
+                CONNECTION_DIAGNOSTICS["network_requests"] += 1
+
+                # DEBUG-LOG: API request
+                logger.debug(
+                    "[THETA][DEBUG][API][REQUEST] url=%s params=%s",
+                    request_url if next_page_url else url,
+                    request_params if request_params else querystring
+                )
+
                 response = requests.get(request_url, headers=headers, params=request_params)
                 # Status code 472 means "No data" - this is valid, return None
                 if response.status_code == 472:
                     logger.warning(f"No data available for request: {response.text[:200]}")
+                    # DEBUG-LOG: API response - no data
+                    logger.debug(
+                        "[THETA][DEBUG][API][RESPONSE] status=472 result=NO_DATA"
+                    )
                     return None
                 # If status code is not 200, then we are not connected
                 elif response.status_code != 200:
                     logger.warning(f"Non-200 status code {response.status_code}: {response.text[:200]}")
-                    check_connection(username=username, password=password)
+                    # DEBUG-LOG: API response - error
+                    logger.debug(
+                        "[THETA][DEBUG][API][RESPONSE] status=%d result=ERROR",
+                        response.status_code
+                    )
+                    check_connection(username=username, password=password, wait_for_connection=True)
                 else:
                     json_resp = response.json()
 
+                    # DEBUG-LOG: API response - success
+                    response_rows = len(json_resp.get("response", [])) if isinstance(json_resp.get("response"), list) else 0
+                    logger.debug(
+                        "[THETA][DEBUG][API][RESPONSE] status=200 rows=%d has_next_page=%s",
+                        response_rows,
+                        bool(json_resp.get("header", {}).get("next_page"))
+                    )
+
                     # Check if json_resp has error_type inside of header
                     if "error_type" in json_resp["header"] and json_resp["header"]["error_type"] != "null":
                         # Handle "NO_DATA" error
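Note: the [THETA][DEBUG][...] lines above are emitted at DEBUG level, so they are invisible at the default log level. Assuming the module logger follows the usual logging.getLogger(__name__) pattern (which would give the name shown below), they can be surfaced like this:

    import logging

    logging.basicConfig(level=logging.INFO)  # keep everything else at INFO
    # Assumed logger name, derived from lumibot/tools/thetadata_helper.py
    logging.getLogger("lumibot.tools.thetadata_helper").setLevel(logging.DEBUG)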
@@ -625,18 +1512,19 @@ def get_request(url: str, headers: dict, querystring: dict, username: str, passw
                                 f"No data returned for querystring: {querystring}")
                             return None
                         else:
+                            error_label = json_resp["header"].get("error_type")
                             logger.error(
-                                f"Error getting data from Theta Data: {json_resp['header']['error_type']},\nquerystring: {querystring}")
-                            check_connection(username=username, password=password)
+                                f"Error getting data from Theta Data: {error_label},\nquerystring: {querystring}")
+                            check_connection(username=username, password=password, wait_for_connection=True)
+                            raise ValueError(f"ThetaData returned error_type={error_label}")
                     else:
                         break
 
             except Exception as e:
                 logger.warning(f"Exception during request (attempt {counter + 1}): {e}")
-                check_connection(username=username, password=password)
-                # Give the process time to start after restart
+                check_connection(username=username, password=password, wait_for_connection=True)
                 if counter == 0:
-                    logger.info("Waiting 5 seconds for ThetaTerminal to initialize...")
+                    logger.debug("[THETA][DEBUG][API][WAIT] Allowing ThetaTerminal to initialize for 5s before retry.")
                     time.sleep(5)
 
             counter += 1
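Note: a behavioural change in this hunk is that a non-NO_DATA error_type no longer just logs and reconnects; get_request now raises ValueError after the reconnect attempt, so callers see the failure instead of looping silently. A hedged caller-side sketch (fetch_or_none is an illustrative wrapper name, not part of the package):

    def fetch_or_none(url, headers, querystring, username, password):
        """Illustrative wrapper: treat a hard ThetaData error like 'no data'."""
        try:
            return get_request(url=url, headers=headers, querystring=querystring,
                               username=username, password=password)
        except ValueError as exc:
            # Raised when ThetaData reports an error_type other than NO_DATA.
            logger.error("ThetaData request failed: %s", exc)
            return None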
@@ -720,12 +1608,34 @@ def get_historical_eod_data(asset: Asset, start_dt: datetime, end_dt: datetime,
 
     headers = {"Accept": "application/json"}
 
+    # DEBUG-LOG: EOD data request
+    logger.debug(
+        "[THETA][DEBUG][EOD][REQUEST] asset=%s start=%s end=%s datastyle=%s",
+        asset,
+        start_date,
+        end_date,
+        datastyle
+    )
+
     # Send the request
     json_resp = get_request(url=url, headers=headers, querystring=querystring,
                             username=username, password=password)
     if json_resp is None:
+        # DEBUG-LOG: EOD data response - no data
+        logger.debug(
+            "[THETA][DEBUG][EOD][RESPONSE] asset=%s result=NO_DATA",
+            asset
+        )
         return None
 
+    # DEBUG-LOG: EOD data response - success
+    response_rows = len(json_resp.get("response", [])) if isinstance(json_resp.get("response"), list) else 0
+    logger.debug(
+        "[THETA][DEBUG][EOD][RESPONSE] asset=%s rows=%d",
+        asset,
+        response_rows
+    )
+
     # Convert to pandas dataframe
     df = pd.DataFrame(json_resp["response"], columns=json_resp["header"]["format"])
 
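Note: the pd.DataFrame(...) call above relies on ThetaData's header/response layout, where header.format lists the column names and response holds the rows. A trimmed-down, hypothetical payload in that shape (the field names and values are illustrative, not the exact ThetaData schema):

    import pandas as pd

    json_resp = {
        "header": {"format": ["ms_of_day", "open", "high", "low", "close", "volume", "date"]},
        "response": [
            [34200000, 432.1, 433.0, 431.8, 432.6, 125000, 20240102],
        ],
    }
    df = pd.DataFrame(json_resp["response"], columns=json_resp["header"]["format"])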
@@ -875,13 +1785,37 @@ def get_historical_data(asset: Asset, start_dt: datetime, end_dt: datetime, ivl:
 
     headers = {"Accept": "application/json"}
 
+    # DEBUG-LOG: Intraday data request
+    logger.debug(
+        "[THETA][DEBUG][INTRADAY][REQUEST] asset=%s start=%s end=%s ivl=%d datastyle=%s include_after_hours=%s",
+        asset,
+        start_date,
+        end_date,
+        ivl,
+        datastyle,
+        include_after_hours
+    )
+
     # Send the request
 
     json_resp = get_request(url=url, headers=headers, querystring=querystring,
                             username=username, password=password)
     if json_resp is None:
+        # DEBUG-LOG: Intraday data response - no data
+        logger.debug(
+            "[THETA][DEBUG][INTRADAY][RESPONSE] asset=%s result=NO_DATA",
+            asset
+        )
         return None
 
+    # DEBUG-LOG: Intraday data response - success
+    response_rows = len(json_resp.get("response", [])) if isinstance(json_resp.get("response"), list) else 0
+    logger.debug(
+        "[THETA][DEBUG][INTRADAY][RESPONSE] asset=%s rows=%d",
+        asset,
+        response_rows
+    )
+
     # Convert to pandas dataframe
     df = pd.DataFrame(json_resp["response"], columns=json_resp["header"]["format"])
 
@@ -916,8 +1850,8 @@ def get_historical_data(asset: Asset, start_dt: datetime, end_dt: datetime, ivl:
     # Convert the datetime column to a datetime and localize to Eastern Time
     df["datetime"] = pd.to_datetime(df["datetime"])
 
-    # Localize to Eastern Time (ThetaData returns times in ET)
-    df["datetime"] = df["datetime"].dt.tz_localize("America/New_York")
+    # Localize to LUMIBOT_DEFAULT_PYTZ (ThetaData returns times in ET)
+    df["datetime"] = df["datetime"].dt.tz_localize(LUMIBOT_DEFAULT_PYTZ)
 
     # Set datetime as the index
     df = df.set_index("datetime")
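Note: the only substantive change in this last hunk is that the hard-coded "America/New_York" is replaced by lumibot's LUMIBOT_DEFAULT_PYTZ constant, which defaults to US Eastern, so out-of-the-box behaviour should be unchanged. A small sketch of what tz_localize does to the naive ET timestamps ThetaData returns, using an explicit pytz zone as a stand-in for the constant (sample values are illustrative):

    import pandas as pd
    import pytz

    eastern = pytz.timezone("America/New_York")  # stand-in for LUMIBOT_DEFAULT_PYTZ
    df = pd.DataFrame({"datetime": ["2024-01-02 09:30:00", "2024-01-02 16:00:00"],
                       "close": [432.6, 434.1]})
    df["datetime"] = pd.to_datetime(df["datetime"])          # naive timestamps
    df["datetime"] = df["datetime"].dt.tz_localize(eastern)  # now tz-aware Eastern Time
    df = df.set_index("datetime")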