lumibot 4.0.23__py3-none-any.whl → 4.1.1__py3-none-any.whl

This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (161)
  1. lumibot/__pycache__/__init__.cpython-312.pyc +0 -0
  2. lumibot/__pycache__/constants.cpython-312.pyc +0 -0
  3. lumibot/__pycache__/credentials.cpython-312.pyc +0 -0
  4. lumibot/backtesting/__init__.py +6 -5
  5. lumibot/backtesting/__pycache__/__init__.cpython-312.pyc +0 -0
  6. lumibot/backtesting/__pycache__/alpaca_backtesting.cpython-312.pyc +0 -0
  7. lumibot/backtesting/__pycache__/alpha_vantage_backtesting.cpython-312.pyc +0 -0
  8. lumibot/backtesting/__pycache__/backtesting_broker.cpython-312.pyc +0 -0
  9. lumibot/backtesting/__pycache__/ccxt_backtesting.cpython-312.pyc +0 -0
  10. lumibot/backtesting/__pycache__/databento_backtesting.cpython-312.pyc +0 -0
  11. lumibot/backtesting/__pycache__/interactive_brokers_rest_backtesting.cpython-312.pyc +0 -0
  12. lumibot/backtesting/__pycache__/pandas_backtesting.cpython-312.pyc +0 -0
  13. lumibot/backtesting/__pycache__/polygon_backtesting.cpython-312.pyc +0 -0
  14. lumibot/backtesting/__pycache__/thetadata_backtesting.cpython-312.pyc +0 -0
  15. lumibot/backtesting/__pycache__/yahoo_backtesting.cpython-312.pyc +0 -0
  16. lumibot/backtesting/backtesting_broker.py +209 -9
  17. lumibot/backtesting/databento_backtesting.py +145 -24
  18. lumibot/backtesting/thetadata_backtesting.py +63 -42
  19. lumibot/brokers/__pycache__/__init__.cpython-312.pyc +0 -0
  20. lumibot/brokers/__pycache__/alpaca.cpython-312.pyc +0 -0
  21. lumibot/brokers/__pycache__/bitunix.cpython-312.pyc +0 -0
  22. lumibot/brokers/__pycache__/broker.cpython-312.pyc +0 -0
  23. lumibot/brokers/__pycache__/ccxt.cpython-312.pyc +0 -0
  24. lumibot/brokers/__pycache__/example_broker.cpython-312.pyc +0 -0
  25. lumibot/brokers/__pycache__/interactive_brokers.cpython-312.pyc +0 -0
  26. lumibot/brokers/__pycache__/interactive_brokers_rest.cpython-312.pyc +0 -0
  27. lumibot/brokers/__pycache__/projectx.cpython-312.pyc +0 -0
  28. lumibot/brokers/__pycache__/schwab.cpython-312.pyc +0 -0
  29. lumibot/brokers/__pycache__/tradier.cpython-312.pyc +0 -0
  30. lumibot/brokers/__pycache__/tradovate.cpython-312.pyc +0 -0
  31. lumibot/brokers/alpaca.py +11 -1
  32. lumibot/brokers/tradeovate.py +475 -0
  33. lumibot/components/grok_news_helper.py +284 -0
  34. lumibot/components/options_helper.py +90 -34
  35. lumibot/credentials.py +3 -0
  36. lumibot/data_sources/__pycache__/__init__.cpython-312.pyc +0 -0
  37. lumibot/data_sources/__pycache__/alpaca_data.cpython-312.pyc +0 -0
  38. lumibot/data_sources/__pycache__/alpha_vantage_data.cpython-312.pyc +0 -0
  39. lumibot/data_sources/__pycache__/bitunix_data.cpython-312.pyc +0 -0
  40. lumibot/data_sources/__pycache__/ccxt_backtesting_data.cpython-312.pyc +0 -0
  41. lumibot/data_sources/__pycache__/ccxt_data.cpython-312.pyc +0 -0
  42. lumibot/data_sources/__pycache__/data_source.cpython-312.pyc +0 -0
  43. lumibot/data_sources/__pycache__/data_source_backtesting.cpython-312.pyc +0 -0
  44. lumibot/data_sources/__pycache__/databento_data_polars_backtesting.cpython-312.pyc +0 -0
  45. lumibot/data_sources/__pycache__/databento_data_polars_live.cpython-312.pyc +0 -0
  46. lumibot/data_sources/__pycache__/example_broker_data.cpython-312.pyc +0 -0
  47. lumibot/data_sources/__pycache__/exceptions.cpython-312.pyc +0 -0
  48. lumibot/data_sources/__pycache__/interactive_brokers_data.cpython-312.pyc +0 -0
  49. lumibot/data_sources/__pycache__/interactive_brokers_rest_data.cpython-312.pyc +0 -0
  50. lumibot/data_sources/__pycache__/pandas_data.cpython-312.pyc +0 -0
  51. lumibot/data_sources/__pycache__/polars_mixin.cpython-312.pyc +0 -0
  52. lumibot/data_sources/__pycache__/polygon_data_polars.cpython-312.pyc +0 -0
  53. lumibot/data_sources/__pycache__/projectx_data.cpython-312.pyc +0 -0
  54. lumibot/data_sources/__pycache__/schwab_data.cpython-312.pyc +0 -0
  55. lumibot/data_sources/__pycache__/tradier_data.cpython-312.pyc +0 -0
  56. lumibot/data_sources/__pycache__/tradovate_data.cpython-312.pyc +0 -0
  57. lumibot/data_sources/__pycache__/yahoo_data_polars.cpython-312.pyc +0 -0
  58. lumibot/data_sources/data_source_backtesting.py +3 -5
  59. lumibot/data_sources/databento_data_polars_backtesting.py +194 -48
  60. lumibot/data_sources/pandas_data.py +6 -3
  61. lumibot/data_sources/polars_mixin.py +126 -21
  62. lumibot/data_sources/tradeovate_data.py +80 -0
  63. lumibot/data_sources/tradier_data.py +2 -1
  64. lumibot/entities/__pycache__/__init__.cpython-312.pyc +0 -0
  65. lumibot/entities/__pycache__/asset.cpython-312.pyc +0 -0
  66. lumibot/entities/__pycache__/bar.cpython-312.pyc +0 -0
  67. lumibot/entities/__pycache__/bars.cpython-312.pyc +0 -0
  68. lumibot/entities/__pycache__/chains.cpython-312.pyc +0 -0
  69. lumibot/entities/__pycache__/data.cpython-312.pyc +0 -0
  70. lumibot/entities/__pycache__/dataline.cpython-312.pyc +0 -0
  71. lumibot/entities/__pycache__/order.cpython-312.pyc +0 -0
  72. lumibot/entities/__pycache__/position.cpython-312.pyc +0 -0
  73. lumibot/entities/__pycache__/quote.cpython-312.pyc +0 -0
  74. lumibot/entities/__pycache__/trading_fee.cpython-312.pyc +0 -0
  75. lumibot/entities/asset.py +8 -0
  76. lumibot/entities/order.py +1 -1
  77. lumibot/entities/quote.py +14 -0
  78. lumibot/example_strategies/__pycache__/__init__.cpython-312.pyc +0 -0
  79. lumibot/example_strategies/__pycache__/test_broker_functions.cpython-312-pytest-8.4.1.pyc +0 -0
  80. lumibot/strategies/__pycache__/__init__.cpython-312.pyc +0 -0
  81. lumibot/strategies/__pycache__/_strategy.cpython-312.pyc +0 -0
  82. lumibot/strategies/__pycache__/strategy.cpython-312.pyc +0 -0
  83. lumibot/strategies/__pycache__/strategy_executor.cpython-312.pyc +0 -0
  84. lumibot/strategies/_strategy.py +95 -27
  85. lumibot/strategies/strategy.py +5 -6
  86. lumibot/strategies/strategy_executor.py +2 -2
  87. lumibot/tools/__pycache__/__init__.cpython-312.pyc +0 -0
  88. lumibot/tools/__pycache__/alpaca_helpers.cpython-312.pyc +0 -0
  89. lumibot/tools/__pycache__/bitunix_helpers.cpython-312.pyc +0 -0
  90. lumibot/tools/__pycache__/black_scholes.cpython-312.pyc +0 -0
  91. lumibot/tools/__pycache__/ccxt_data_store.cpython-312.pyc +0 -0
  92. lumibot/tools/__pycache__/databento_helper.cpython-312.pyc +0 -0
  93. lumibot/tools/__pycache__/databento_helper_polars.cpython-312.pyc +0 -0
  94. lumibot/tools/__pycache__/debugers.cpython-312.pyc +0 -0
  95. lumibot/tools/__pycache__/decorators.cpython-312.pyc +0 -0
  96. lumibot/tools/__pycache__/helpers.cpython-312.pyc +0 -0
  97. lumibot/tools/__pycache__/indicators.cpython-312.pyc +0 -0
  98. lumibot/tools/__pycache__/lumibot_logger.cpython-312.pyc +0 -0
  99. lumibot/tools/__pycache__/pandas.cpython-312.pyc +0 -0
  100. lumibot/tools/__pycache__/polygon_helper.cpython-312.pyc +0 -0
  101. lumibot/tools/__pycache__/polygon_helper_async.cpython-312.pyc +0 -0
  102. lumibot/tools/__pycache__/polygon_helper_polars_optimized.cpython-312.pyc +0 -0
  103. lumibot/tools/__pycache__/projectx_helpers.cpython-312.pyc +0 -0
  104. lumibot/tools/__pycache__/schwab_helper.cpython-312.pyc +0 -0
  105. lumibot/tools/__pycache__/thetadata_helper.cpython-312.pyc +0 -0
  106. lumibot/tools/__pycache__/types.cpython-312.pyc +0 -0
  107. lumibot/tools/__pycache__/yahoo_helper.cpython-312.pyc +0 -0
  108. lumibot/tools/__pycache__/yahoo_helper_polars_optimized.cpython-312.pyc +0 -0
  109. lumibot/tools/databento_helper.py +384 -133
  110. lumibot/tools/databento_helper_polars.py +218 -156
  111. lumibot/tools/databento_roll.py +216 -0
  112. lumibot/tools/lumibot_logger.py +32 -17
  113. lumibot/tools/polygon_helper.py +65 -0
  114. lumibot/tools/thetadata_helper.py +588 -70
  115. lumibot/traders/__pycache__/__init__.cpython-312.pyc +0 -0
  116. lumibot/traders/__pycache__/trader.cpython-312.pyc +0 -0
  117. lumibot/traders/trader.py +1 -1
  118. lumibot/trading_builtins/__pycache__/__init__.cpython-312.pyc +0 -0
  119. lumibot/trading_builtins/__pycache__/custom_stream.cpython-312.pyc +0 -0
  120. lumibot/trading_builtins/__pycache__/safe_list.cpython-312.pyc +0 -0
  121. lumibot-4.1.1.data/data/ThetaTerminal.jar +0 -0
  122. {lumibot-4.0.23.dist-info → lumibot-4.1.1.dist-info}/METADATA +1 -2
  123. {lumibot-4.0.23.dist-info → lumibot-4.1.1.dist-info}/RECORD +161 -44
  124. tests/backtest/check_timing_offset.py +198 -0
  125. tests/backtest/check_volume_spike.py +112 -0
  126. tests/backtest/comprehensive_comparison.py +166 -0
  127. tests/backtest/debug_comparison.py +91 -0
  128. tests/backtest/diagnose_price_difference.py +97 -0
  129. tests/backtest/direct_api_comparison.py +203 -0
  130. tests/backtest/profile_thetadata_vs_polygon.py +255 -0
  131. tests/backtest/root_cause_analysis.py +109 -0
  132. tests/backtest/test_accuracy_verification.py +244 -0
  133. tests/backtest/test_daily_data_timestamp_comparison.py +801 -0
  134. tests/backtest/test_databento.py +4 -0
  135. tests/backtest/test_databento_comprehensive_trading.py +564 -0
  136. tests/backtest/test_debug_avg_fill_price.py +112 -0
  137. tests/backtest/test_dividends.py +8 -3
  138. tests/backtest/test_example_strategies.py +54 -47
  139. tests/backtest/test_futures_edge_cases.py +451 -0
  140. tests/backtest/test_futures_single_trade.py +270 -0
  141. tests/backtest/test_futures_ultra_simple.py +191 -0
  142. tests/backtest/test_index_data_verification.py +348 -0
  143. tests/backtest/test_polygon.py +45 -24
  144. tests/backtest/test_thetadata.py +246 -60
  145. tests/backtest/test_thetadata_comprehensive.py +729 -0
  146. tests/backtest/test_thetadata_vs_polygon.py +557 -0
  147. tests/backtest/test_yahoo.py +1 -2
  148. tests/conftest.py +20 -0
  149. tests/test_backtesting_data_source_env.py +249 -0
  150. tests/test_backtesting_quiet_logs_complete.py +10 -11
  151. tests/test_databento_helper.py +76 -90
  152. tests/test_databento_timezone_fixes.py +21 -4
  153. tests/test_get_historical_prices.py +6 -6
  154. tests/test_options_helper.py +162 -40
  155. tests/test_polygon_helper.py +21 -13
  156. tests/test_quiet_logs_requirements.py +5 -5
  157. tests/test_thetadata_helper.py +487 -171
  158. tests/test_yahoo_data.py +125 -0
  159. {lumibot-4.0.23.dist-info → lumibot-4.1.1.dist-info}/LICENSE +0 -0
  160. {lumibot-4.0.23.dist-info → lumibot-4.1.1.dist-info}/WHEEL +0 -0
  161. {lumibot-4.0.23.dist-info → lumibot-4.1.1.dist-info}/top_level.txt +0 -0
lumibot/data_sources/databento_data_polars_backtesting.py:
@@ -20,7 +20,7 @@ import polars as pl
 from lumibot.data_sources import DataSourceBacktesting
 from lumibot.data_sources.polars_mixin import PolarsMixin
 from lumibot.entities import Asset, Bars
-from lumibot.tools import databento_helper_polars
+from lumibot.tools import databento_helper_polars, databento_helper
 from lumibot.tools.lumibot_logger import get_logger
 
 logger = get_logger(__name__)
@@ -71,8 +71,88 @@ class DataBentoDataPolarsBacktesting(PolarsMixin, DataSourceBacktesting):
         self._prefetch_cache: Dict[tuple, bool] = {}
         self._prefetched_assets = set()  # Track which assets have been fully loaded
 
+        # OPTIMIZATION: Iteration-level filtered bars cache (same as Pandas)
+        self._filtered_bars_cache = {}  # {(asset_key, length, timestep, timeshift, dt): DataFrame}
+        self._bars_cache_datetime = None  # Track when to invalidate bars cache
+
+        # Futures multiplier cache - track which assets have had multipliers fetched
+        self._multiplier_fetched_assets = set()
+
         logger.info(f"DataBento backtesting initialized for period: {datetime_start} to {datetime_end}")
 
+    def _ensure_futures_multiplier(self, asset):
+        """
+        Ensure futures asset has correct multiplier set.
+
+        This method is idempotent and cached - safe to call multiple times.
+        Only fetches multiplier once per unique asset.
+
+        Design rationale:
+        - Futures multipliers must be fetched from data provider (e.g., DataBento)
+        - Asset class defaults to multiplier=1
+        - Data source is responsible for updating multiplier on first use
+        - Lazy fetching is more efficient than prefetching all possible assets
+
+        Parameters
+        ----------
+        asset : Asset
+            The asset to ensure has correct multiplier
+        """
+        # Skip if not a futures asset
+        if asset.asset_type not in (Asset.AssetType.FUTURE, Asset.AssetType.CONT_FUTURE):
+            return
+
+        # Skip if multiplier already set to non-default value
+        if asset.multiplier != 1:
+            return
+
+        # Create cache key to track which assets we've already processed
+        # Use symbol + asset_type + expiration to handle different contracts
+        cache_key = (asset.symbol, asset.asset_type, getattr(asset, 'expiration', None))
+
+        # Check if we already tried to fetch for this asset
+        if cache_key in self._multiplier_fetched_assets:
+            return  # Already attempted (even if failed, don't retry every time)
+
+        # Mark as attempted to avoid redundant API calls
+        self._multiplier_fetched_assets.add(cache_key)
+
+        # Fetch and set multiplier from DataBento
+        try:
+            client = databento_helper.DataBentoClient(self._api_key)
+
+            # Resolve symbol based on asset type
+            if asset.asset_type == Asset.AssetType.CONT_FUTURE:
+                resolved_symbol = databento_helper._format_futures_symbol_for_databento(
+                    asset, reference_date=self.datetime_start
+                )
+            else:
+                resolved_symbol = databento_helper._format_futures_symbol_for_databento(asset)
+
+            # Fetch multiplier from DataBento instrument definition
+            databento_helper._fetch_and_update_futures_multiplier(
+                client=client,
+                asset=asset,
+                resolved_symbol=resolved_symbol,
+                dataset="GLBX.MDP3",
+                reference_date=self.datetime_start
+            )
+
+            logger.info(f"Successfully set multiplier for {asset.symbol}: {asset.multiplier}")
+
+        except Exception as e:
+            logger.warning(f"Could not fetch multiplier for {asset.symbol}: {e}")
+
+    def _check_and_clear_bars_cache(self):
+        """
+        OPTIMIZATION: Clear iteration caches when datetime changes.
+        This prevents stale data from being returned across different backtest iterations.
+        """
+        current_dt = self.get_datetime()
+        if self._bars_cache_datetime != current_dt:
+            self._filtered_bars_cache.clear()
+            self._bars_cache_datetime = current_dt
+
     def _enforce_storage_limit(self, data_store: Dict[Asset, pl.LazyFrame]):
         """Enforce storage limit by removing least recently used data."""
         # Use mixin's enforce method
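The multiplier lookup above is a deliberate attempt-once cache: the asset is marked as attempted before the fetch, so a provider error is logged once instead of being retried on every backtest iteration. A minimal standalone sketch of the same pattern (the dict-based asset and the fetch_multiplier callable are hypothetical stand-ins, not lumibot APIs):

    # Attempt-once memoization: failures are recorded and never retried.
    _attempted = set()

    def ensure_multiplier(asset: dict, fetch_multiplier) -> None:
        key = (asset["symbol"], asset.get("expiration"))
        if asset["multiplier"] != 1 or key in _attempted:
            return  # already set, or already tried
        _attempted.add(key)  # mark before fetching so errors are not retried
        try:
            asset["multiplier"] = fetch_multiplier(asset["symbol"])
        except Exception as exc:
            print(f"could not fetch multiplier: {exc}")

    es = {"symbol": "ES", "multiplier": 1}
    ensure_multiplier(es, lambda symbol: 50)
    assert es["multiplier"] == 50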
@@ -216,13 +296,20 @@ class DataBentoDataPolarsBacktesting(PolarsMixin, DataSourceBacktesting):
             self._prefetched_assets.add(search_asset)
             return
 
-        # Get the start datetime and timestep unit
+        # Get the start datetime and timestep unit (includes length*timestep + buffer)
+        # This matches Pandas logic: start_datetime = (start_dt - length*timestep) - START_BUFFER
         start_datetime, ts_unit = self.get_start_datetime_and_ts_unit(
             length, timestep, start_dt, start_buffer=START_BUFFER
         )
 
-        # Fetch data for ENTIRE backtest period (like pandas does)
-        start_datetime = self.datetime_start - START_BUFFER
+        # FIX: Ensure timezone-aware datetime for API call (matches Pandas behavior)
+        # Polars was passing naive datetime, causing DataBento to treat it as UTC instead of ET
+        # This caused fetching wrong data (18 hours off!)
+        start_datetime = self.to_default_timezone(start_datetime)
+
+        # FIX: Don't override start_datetime! Use the calculated value that includes bars + buffer
+        # The old code set start_datetime = self.datetime_start - START_BUFFER which was wrong
+        # It didn't account for the requested bar length, causing missing data
         end_datetime = self.datetime_end + timedelta(days=1)
 
         logger.info(f"Prefetching {asset_separated.symbol} data from {start_datetime.date()} to {end_datetime.date()}")
@@ -244,6 +331,9 @@ class DataBentoDataPolarsBacktesting(PolarsMixin, DataSourceBacktesting):
 
         # Download data from DataBento using polars helper
         try:
+            # CRITICAL FIX: Use start_datetime as reference_date to match Pandas behavior!
+            # Pandas passes reference_date=start (WITH buffer included) - see databento_helper.py line 797
+            # This determines which futures contract is active at that time
             df = databento_helper_polars.get_price_data_from_databento_polars(
                 api_key=self._api_key,
                 asset=asset_separated,
@@ -251,7 +341,8 @@ class DataBentoDataPolarsBacktesting(PolarsMixin, DataSourceBacktesting):
                 end=end_datetime,
                 timestep=timestep,
                 venue=None,
-                force_cache_update=False
+                force_cache_update=False,
+                reference_date=start_datetime  # MUST match Pandas: reference_date=start (WITH buffer)
             )
         except Exception as e:
             # Handle all exceptions
@@ -291,10 +382,31 @@ class DataBentoDataPolarsBacktesting(PolarsMixin, DataSourceBacktesting):
     ) -> Optional[pl.DataFrame]:
         """Pull bars with maximum efficiency using pre-filtered cache."""
 
-        # Build search key
-        search_asset = asset if not isinstance(asset, tuple) else asset
-        if quote:
-            search_asset = (asset, quote)
+        # OPTIMIZATION: Check iteration cache first
+        self._check_and_clear_bars_cache()
+        current_dt = self.get_datetime()
+
+        # Build search key - MUST match _update_data logic!
+        # Default quote to USD forex if not provided (matches _update_data)
+        search_asset = asset
+        quote_asset = quote if quote is not None else Asset("USD", "forex")
+
+        if isinstance(asset, tuple):
+            search_asset, quote_asset = asset
+        else:
+            search_asset = (asset, quote_asset)
+
+        # OPTIMIZATION: Build cache key and check filtered bars cache (same as Pandas)
+        timeshift_key = 0
+        if timeshift:
+            if isinstance(timeshift, int):
+                timeshift_key = timeshift
+            elif hasattr(timeshift, 'total_seconds'):
+                timeshift_key = int(timeshift.total_seconds() / 60)
+
+        bars_cache_key = (search_asset, length, timestep, timeshift_key, current_dt)
+        if bars_cache_key in self._filtered_bars_cache:
+            return self._filtered_bars_cache[bars_cache_key]
 
         # For daily timestep, use optimized caching strategy
         if timestep == "day":
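Because the filtered-bars cache is keyed by a tuple, the timeshift argument (which may be None, an int, or a timedelta) has to be normalized into a hashable, stable value; the code above folds it into whole minutes. A small sketch of that normalization under the same assumptions:

    from datetime import timedelta

    def make_timeshift_key(timeshift) -> int:
        # None/0 -> 0; int is taken as-is; timedelta is folded to whole minutes
        if not timeshift:
            return 0
        if isinstance(timeshift, int):
            return timeshift
        if hasattr(timeshift, "total_seconds"):
            return int(timeshift.total_seconds() / 60)
        return 0

    assert make_timeshift_key(None) == 0
    assert make_timeshift_key(5) == 5
    assert make_timeshift_key(timedelta(hours=2)) == 120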
@@ -307,19 +419,18 @@ class DataBentoDataPolarsBacktesting(PolarsMixin, DataSourceBacktesting):
             if len(result) >= length:
                 return result.tail(length)
 
-        # Get the current datetime and calculate the start datetime
-        current_dt = self.get_datetime()
-        # Get data from DataBento
-        self._update_data(asset, quote, length, timestep, current_dt)
-
-        # Get lazy data
-        search_asset = asset if not isinstance(asset, tuple) else asset
-        if quote:
-            search_asset = (asset, quote)
+        # FIX: Pass None as start_dt to match Pandas behavior
+        # Pandas uses self.datetime_start as reference, not current iteration time
+        # This ensures we fetch enough historical data for all iterations
+        self._update_data(asset, quote, length, timestep, start_dt=None)
 
+        # Get lazy data - use the same search_asset key we already built
         lazy_data = self._get_data_lazy(search_asset)
+        logger.info(f"[POLARS-DEBUG] _get_data_lazy returned: {lazy_data is not None}, search_asset={search_asset}")
+        logger.info(f"[POLARS-DEBUG] Data store keys: {list(self._data_store.keys())}")
 
         if lazy_data is None:
+            logger.warning(f"[POLARS-DEBUG] lazy_data is None for search_asset={search_asset}")
             return None
 
         # Use lazy evaluation and collect only when needed
@@ -336,28 +447,43 @@ class DataBentoDataPolarsBacktesting(PolarsMixin, DataSourceBacktesting):
         # For minute data, collect on demand
         data = lazy_data.collect()
 
+        logger.info(f"[POLARS-DEBUG] After collect: data shape={data.shape if data is not None else 'None'}")
+
         # OPTIMIZATION: Direct filtering on eager DataFrame
         current_dt = self.to_default_timezone(self._datetime)
 
-        # Determine end filter
-        if timestep == "day":
-            dt = self._datetime.replace(hour=23, minute=59, second=59, microsecond=999999)
-            end_filter = dt - timedelta(days=1)
-        else:
-            end_filter = current_dt
+        # Determine end filter - CRITICAL: Must match pandas logic!
+        # For backtesting, we need to exclude the in-progress bar
+        # IMPORTANT: Use the current datetime directly, not minus 1 bar
+        # The filter uses < (not <=) to exclude the current bar
+        use_strict_less_than = False  # Use < instead of <=
 
         if timeshift:
+            # When timeshift is present, use <= with adjusted end_filter
             if isinstance(timeshift, int):
-                timeshift = timedelta(days=timeshift)
-            end_filter = end_filter - timeshift
+                # Match pandas implementation: interpret integer timeshift as minutes
+                timeshift = timedelta(minutes=timeshift)
+            if timestep == "day":
+                dt = self._datetime.replace(hour=23, minute=59, second=59, microsecond=999999)
+                end_filter = dt - timedelta(days=1) - timeshift
+            elif timestep == "hour":
+                end_filter = current_dt - timedelta(hours=1) - timeshift
+            else:
+                end_filter = current_dt - timedelta(minutes=1) - timeshift
+        else:
+            # No timeshift: use current_dt with < operator (matches Pandas behavior)
+            end_filter = current_dt
+            use_strict_less_than = True
 
         logger.debug(f"Filtering {asset.symbol} data: current_dt={current_dt}, end_filter={end_filter}, timestep={timestep}, timeshift={timeshift}")
 
         # Convert to lazy frame for filtering
         lazy_data = data.lazy() if not hasattr(data, 'collect') else data
+        logger.info(f"[POLARS-DEBUG] Before filter: lazy_data type={type(lazy_data)}, end_filter={end_filter}, length={length}, use_strict_less_than={use_strict_less_than}")
 
         # Use mixin's filter method
-        result = self._filter_data_polars(search_asset, lazy_data, end_filter, length, timestep)
+        result = self._filter_data_polars(search_asset, lazy_data, end_filter, length, timestep, use_strict_less_than=use_strict_less_than)
+        logger.info(f"[POLARS-DEBUG] After filter: result shape={result.shape if result is not None else 'None'}")
 
         if result is None:
             return None
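The strict-< rule above is what keeps the backtest from peeking at the bar that is still forming: with bars stamped at their open time, a bar stamped at the current iteration time has not closed yet. A toy illustration of the difference, using only the standard library:

    from datetime import datetime, timedelta

    # Minute bars stamped at their open time
    bars = [datetime(2024, 1, 2, 9, 30) + timedelta(minutes=i) for i in range(5)]
    now = datetime(2024, 1, 2, 9, 32)  # the 09:32 bar is still in progress

    completed = [b for b in bars if b < now]   # strict <: drops the forming bar
    peeking = [b for b in bars if b <= now]    # <= would leak it into results

    assert completed[-1].minute == 31
    assert peeking[-1].minute == 32  # one bar of lookahead bias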
@@ -370,6 +496,12 @@ class DataBentoDataPolarsBacktesting(PolarsMixin, DataSourceBacktesting):
 
         logger.debug(f"Returning {len(result)} bars for {asset.symbol}")
 
+        # OPTIMIZATION: Cache the result before returning (same as Pandas)
+        if result is not None and not result.is_empty():
+            self._filtered_bars_cache[bars_cache_key] = result
+        else:
+            self._filtered_bars_cache[bars_cache_key] = None
+
         return result
 
     def _parse_source_symbol_bars(
@@ -408,6 +540,9 @@ class DataBentoDataPolarsBacktesting(PolarsMixin, DataSourceBacktesting):
         if cached_price is not None:
             return cached_price
 
+        # Ensure futures have correct multiplier set
+        self._ensure_futures_multiplier(asset)
+
         try:
             dt = self.get_datetime()
             self._update_data(asset, quote, 1, timestep, dt)
@@ -417,34 +552,40 @@ class DataBentoDataPolarsBacktesting(PolarsMixin, DataSourceBacktesting):
                 self._cache_last_price_polars(asset, None, current_datetime, timestep)
                 return None
 
-            # Get price efficiently
-            # For daily data, don't apply additional timeshift since _pull_source_symbol_bars
-            # already handles getting the previous day's data
-            # Only request 1 bar for efficiency (matching pandas implementation)
-            timeshift = None if timestep == "day" else timedelta(days=-1)
-            length = 1
-
+            # Request a single completed bar (aligns with pandas implementation)
             bars_data = self._pull_source_symbol_bars(
-                asset, length, timestep=timestep, timeshift=timeshift, quote=quote
+                asset, 1, timestep=timestep, timeshift=None, quote=quote
             )
 
             if bars_data is None or len(bars_data) == 0:
-                logger.warning(f"No bars data for {asset.symbol} at {current_datetime}")
+                logger.warning(f"[POLARS-DEBUG] ✗✗✗ NO BARS DATA for {asset.symbol} at {current_datetime}, timestep={timestep}")
+                logger.warning(f"[POLARS-DEBUG] Data store keys: {list(self._data_store.keys())}")
                 self._cache_last_price_polars(asset, None, current_datetime, timestep)
                 return None
 
-            # Direct column access - since we only request 1 bar, take the first (and only) element
-            open_price = bars_data["open"][0]
+            # Use the close of the most recent completed bar (pandas parity)
+            if "close" not in bars_data.columns:
+                logger.warning(f"[POLARS-DEBUG] ✗✗✗ Close column missing for {asset.symbol}")
+                self._cache_last_price_polars(asset, None, current_datetime, timestep)
+                return None
 
-            # Convert if needed
-            if isinstance(open_price, (np.int64, np.integer)):
-                open_price = Decimal(int(open_price))
-            elif isinstance(open_price, (np.float64, np.floating)):
-                open_price = float(open_price)
+            last_close = bars_data.select(pl.col("close").tail(1)).item()
 
-            # Use mixin's cache method
-            self._cache_last_price_polars(asset, open_price, current_datetime, timestep)
-            return open_price
+            if last_close is None:
+                logger.warning(f"[POLARS-DEBUG] ✗✗✗ Unable to extract close price for {asset.symbol}")
+                self._cache_last_price_polars(asset, None, current_datetime, timestep)
+                return None
+
+            if isinstance(last_close, (np.int64, np.integer)):
+                price_value = Decimal(int(last_close))
+            elif isinstance(last_close, (np.float64, np.floating)):
+                price_value = float(last_close)
+            else:
+                price_value = float(last_close)
+
+            self._cache_last_price_polars(asset, price_value, current_datetime, timestep)
+            logger.info(f"[POLARS-DEBUG] Returning price from bars (close): {price_value}")
+            return price_value
 
     def get_historical_prices(
         self,
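Extracting the close as a Python scalar relies on polars' select/tail/item chain: tail(1) keeps the newest row, and .item() unwraps the resulting 1x1 frame. A quick check of that idiom on a toy frame:

    import polars as pl

    df = pl.DataFrame({"close": [101.0, 102.5, 103.25]})
    last_close = df.select(pl.col("close").tail(1)).item()
    assert last_close == 103.25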
@@ -458,7 +599,7 @@ class DataBentoDataPolarsBacktesting(PolarsMixin, DataSourceBacktesting):
         return_polars: bool = False,
     ) -> Optional[Bars]:
         """Get historical prices using polars."""
-        logger.debug(f"get_historical_prices called for {asset.symbol}")
+        logger.info(f"[POLARS-DEBUG] get_historical_prices called: asset={asset.symbol}, length={length}, timestep={timestep}, datetime={self._datetime}")
         if timestep is None:
             timestep = self.get_timestep()
 
@@ -473,12 +614,17 @@ class DataBentoDataPolarsBacktesting(PolarsMixin, DataSourceBacktesting):
         )
 
         if bars_data is None:
+            logger.warning(f"[POLARS-DEBUG] ✗✗✗ _pull_source_symbol_bars returned None for {asset.symbol}")
             return None
 
+        logger.info(f"[POLARS-DEBUG] _pull_source_symbol_bars returned {len(bars_data)} bars")
+
         # Create and return Bars object
-        return self._parse_source_symbol_bars(
+        result = self._parse_source_symbol_bars(
             bars_data, asset, quote=quote, length=length, return_polars=return_polars
         )
+        logger.info(f"[POLARS-DEBUG] Returning Bars object: {result is not None}")
+        return result
 
     def get_chains(self, asset: Asset, quote: Asset = None, exchange: str = None):
         """Get option chains - not implemented for DataBento."""
lumibot/data_sources/pandas_data.py:
@@ -103,8 +103,8 @@ class PandasData(DataSourceBacktesting):
         df = pd.DataFrame(range(len(dt_index)), index=dt_index)
         df = df.sort_index()
 
-        # Create a column for the date portion only
-        df["dates"] = df.index.date
+        # Create a column for the date portion only (normalize to date, keeping as datetime64 type)
+        df["dates"] = df.index.normalize()
 
         # Merge with the trading calendar on the 'dates' column to get market open/close times.
         # Use a left join to keep all rows from the original index.
  # Use a left join to keep all rows from the original index.
@@ -145,7 +145,8 @@ class PandasData(DataSourceBacktesting):
 
         else:
             pcal.columns = ["datetime"]
-            pcal["date"] = pcal["datetime"].dt.date
+            # Normalize to date but keep as datetime64 type (not date objects)
+            pcal["date"] = pcal["datetime"].dt.normalize()
             result = pcal.groupby("date").agg(
                 market_open=(
                     "datetime",
@@ -290,6 +291,8 @@ class PandasData(DataSourceBacktesting):
                 ask=ohlcv_bid_ask_dict.get('ask'),
                 volume=ohlcv_bid_ask_dict.get('volume'),
                 timestamp=dt,
+                bid_size=ohlcv_bid_ask_dict.get('bid_size'),
+                ask_size=ohlcv_bid_ask_dict.get('ask_size'),
                 raw_data=ohlcv_bid_ask_dict
             )
         else:
lumibot/data_sources/polars_mixin.py:
@@ -72,17 +72,19 @@ class PolarsMixin:
 
     def _get_data_lazy(self, asset: Asset) -> Optional[pl.LazyFrame]:
         """Get lazy frame for asset.
-
+
         Parameters
         ----------
-        asset : Asset
-            The asset to get data for
-
+        asset : Asset or tuple
+            The asset to get data for (can be a tuple of (asset, quote))
+
         Returns
         -------
         Optional[pl.LazyFrame]
             The lazy frame or None if not found
         """
+        # CRITICAL FIX: Handle both Asset and (Asset, quote) tuple keys
+        # The data store uses tuple keys (asset, quote), so we need to look up by that key
         return self._data_store.get(asset)
 
     def _parse_source_symbol_bars_polars(
  def _parse_source_symbol_bars_polars(
@@ -95,7 +97,7 @@ class PolarsMixin:
95
97
  return_polars: bool = False
96
98
  ) -> Bars:
97
99
  """Parse bars from polars DataFrame.
98
-
100
+
99
101
  Parameters
100
102
  ----------
101
103
  response : pl.DataFrame
@@ -108,7 +110,7 @@ class PolarsMixin:
             The quote asset for forex/crypto
         length : Optional[int]
             Limit the number of bars
-
+
         Returns
         -------
         Bars
@@ -121,6 +123,21 @@ class PolarsMixin:
         if length and len(response) > length:
             response = response.tail(length)
 
+        # Filter to only keep OHLCV + datetime columns (remove DataBento metadata like rtype, publisher_id, etc.)
+        # Required columns for strategies
+        required_cols = ['open', 'high', 'low', 'close', 'volume']
+        optional_cols = ['datetime', 'timestamp', 'date', 'time', 'dividend', 'stock_splits', 'symbol']
+
+        # Determine which columns to keep
+        keep_cols = []
+        for col in response.columns:
+            if col in required_cols or col in optional_cols:
+                keep_cols.append(col)
+
+        # Select only the relevant columns
+        if keep_cols:
+            response = response.select(keep_cols)
+
         # Create bars object
         bars = Bars(response, source, asset, raw=response, quote=quote, return_polars=return_polars)
         return bars
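The column pruning above is a plain allow-list select; column order is preserved from the source frame. A self-contained sketch of the same idea (the rtype/publisher_id columns stand in for provider metadata):

    import polars as pl

    df = pl.DataFrame({
        "datetime": ["2024-01-02"], "open": [1.0], "high": [2.0], "low": [0.5],
        "close": [1.5], "volume": [100],
        "rtype": [32], "publisher_id": [1],  # provider metadata to drop
    })

    allowed = {"datetime", "open", "high", "low", "close", "volume"}
    keep = [c for c in df.columns if c in allowed]
    assert df.select(keep).columns == ["datetime", "open", "high", "low", "close", "volume"]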
@@ -209,22 +226,45 @@ class PolarsMixin:
         self._last_price_cache[cache_key] = price
 
     def _convert_datetime_for_filtering(self, dt: Any) -> datetime:
-        """Convert datetime to naive datetime for filtering.
-
+        """Convert datetime to naive UTC datetime for filtering.
+
+        CRITICAL FIX: Must convert to UTC BEFORE stripping timezone!
+        If we strip timezone from ET datetime, we lose 5 hours of data.
+
+        Example:
+        - Input: 2024-01-02 18:00:00-05:00 (ET)
+        - Convert to UTC: 2024-01-02 23:00:00+00:00
+        - Strip timezone: 2024-01-02 23:00:00 (naive UTC)
+
+        OLD BUGGY CODE:
+        - Input: 2024-01-02 18:00:00-05:00 (ET)
+        - Strip timezone: 2024-01-02 18:00:00 (naive, loses timezone!)
+        - Compare to cached data in naive UTC: WRONG by 5 hours!
+
         Parameters
         ----------
         dt : Any
             Datetime-like object
-
+
         Returns
         -------
         datetime
-            Naive datetime object
+            Naive UTC datetime object
         """
-        if hasattr(dt, 'tz_localize'):
-            return dt.tz_localize(None)
+        from datetime import timezone
+
+        # First convert to UTC if timezone-aware
+        if hasattr(dt, 'tzinfo') and dt.tzinfo is not None:
+            # Convert to UTC
+            dt_utc = dt.astimezone(timezone.utc)
+            # Then strip timezone
+            return dt_utc.replace(tzinfo=None)
+        elif hasattr(dt, 'tz_localize'):
+            # Pandas Timestamp
+            return dt.tz_convert('UTC').tz_localize(None)
         elif hasattr(dt, 'replace'):
-            return dt.replace(tzinfo=None)
+            # Already naive
+            return dt
         else:
             return dt
 
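The docstring's 5-hour example can be reproduced with just the standard library; converting to UTC before stripping tzinfo is what keeps the naive value comparable to data cached in naive UTC (a fixed -05:00 offset stands in for ET here):

    from datetime import datetime, timezone, timedelta

    et = timezone(timedelta(hours=-5))
    dt = datetime(2024, 1, 2, 18, 0, tzinfo=et)

    buggy = dt.replace(tzinfo=None)                           # 18:00, silently still "ET"
    fixed = dt.astimezone(timezone.utc).replace(tzinfo=None)  # 23:00 naive UTC

    assert buggy.hour == 18
    assert fixed.hour == 23  # comparable to naive-UTC cache timestamps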
@@ -283,10 +323,11 @@ class PolarsMixin:
         lazy_data: pl.LazyFrame,
         end_filter: datetime,
         length: int,
-        timestep: str = "minute"
+        timestep: str = "minute",
+        use_strict_less_than: bool = False
     ) -> Optional[pl.DataFrame]:
         """Filter data up to end_filter and return last length rows.
-
+
         Parameters
         ----------
         asset : Asset
@@ -299,15 +340,23 @@ class PolarsMixin:
             Number of rows to return
         timestep : str
             Timestep for caching strategy
-
+        use_strict_less_than : bool
+            If True, use < instead of <= for filtering (matches Pandas behavior without timeshift)
+
         Returns
         -------
         Optional[pl.DataFrame]
             Filtered dataframe or None
         """
+        # DEBUG
+        logger.debug(f"[POLARS FILTER] end_filter={end_filter}, tzinfo={end_filter.tzinfo if hasattr(end_filter, 'tzinfo') else 'N/A'}, length={length}")
+
         # Convert end_filter to naive
         end_filter_naive = self._convert_datetime_for_filtering(end_filter)
 
+        # DEBUG
+        logger.debug(f"[POLARS FILTER] end_filter_naive={end_filter_naive}")
+
         # For daily timestep, use caching
         if timestep == "day":
             current_date = end_filter.date() if hasattr(end_filter, 'date') else end_filter
@@ -335,11 +384,37 @@ class PolarsMixin:
                 return None
 
             # Filter and collect
+            # CRITICAL FIX: Keep timezone info! Match the DataFrame's timezone
+            # Get the DataFrame column's timezone from schema
+            dt_dtype = schema[dt_col]
+
+            # Convert filter to match DataFrame's timezone
+            if hasattr(dt_dtype, 'time_zone') and dt_dtype.time_zone:
+                # DataFrame has timezone, convert filter to match
+                import pytz
+                df_tz = pytz.timezone(dt_dtype.time_zone)
+                end_filter_with_tz = pytz.utc.localize(end_filter_naive).astimezone(df_tz)
+            else:
+                # DataFrame is naive, use UTC
+                from datetime import timezone as tz
+                end_filter_with_tz = datetime.combine(
+                    end_filter_naive.date(),
+                    end_filter_naive.time(),
+                    tzinfo=tz.utc
+                )
+
+            # CRITICAL FIX: Deduplicate before caching
+            # Use < or <= based on use_strict_less_than flag
+            if use_strict_less_than:
+                filter_expr = pl.col(dt_col) < end_filter_with_tz
+            else:
+                filter_expr = pl.col(dt_col) <= end_filter_with_tz
+
             result = (
                 lazy_data
-                .with_columns(pl.col(dt_col).cast(pl.Datetime("us")))
-                .filter(pl.col(dt_col) <= end_filter_naive)
+                .filter(filter_expr)
                 .sort(dt_col)
+                .unique(subset=[dt_col], keep='last', maintain_order=True)
                 .tail(fetch_length)
                 .collect()
             )
@@ -362,11 +437,41 @@ class PolarsMixin:
             logger.error("No datetime column found")
             return None
 
-        return (
+        # CRITICAL FIX: Keep timezone info during filtering!
+        # Match the DataFrame's timezone to avoid comparison errors
+        # Get the DataFrame column's timezone from schema
+        dt_dtype = schema[dt_col]
+
+        # Convert filter to match DataFrame's timezone
+        if hasattr(dt_dtype, 'time_zone') and dt_dtype.time_zone:
+            # DataFrame has timezone, convert filter to match
+            import pytz
+            df_tz = pytz.timezone(dt_dtype.time_zone)
+            end_filter_with_tz = pytz.utc.localize(end_filter_naive).astimezone(df_tz)
+        else:
+            # DataFrame is naive, use UTC
+            from datetime import timezone as tz
+            end_filter_with_tz = datetime.combine(
+                end_filter_naive.date(),
+                end_filter_naive.time(),
+                tzinfo=tz.utc
+            )
+
+        # CRITICAL FIX: Deduplicate before returning
+        # Sometimes lazy operations can create duplicates
+        # Use < or <= based on use_strict_less_than flag
+        if use_strict_less_than:
+            filter_expr = pl.col(dt_col) < end_filter_with_tz
+        else:
+            filter_expr = pl.col(dt_col) <= end_filter_with_tz
+
+        result = (
             lazy_data
-            .with_columns(pl.col(dt_col).cast(pl.Datetime("us")))
-            .filter(pl.col(dt_col) <= end_filter_naive)
+            .filter(filter_expr)
             .sort(dt_col)
+            .unique(subset=[dt_col], keep='last', maintain_order=True)
             .tail(length)
             .collect()
         )
+
+        return result
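The final pipeline composes cleanly in polars' lazy API; a self-contained sketch of the filter / sort / unique / tail chain, including the duplicate-timestamp case the unique() call guards against:

    import polars as pl
    from datetime import datetime

    lf = pl.DataFrame({
        "datetime": [datetime(2024, 1, 2, 9, 30), datetime(2024, 1, 2, 9, 30),
                     datetime(2024, 1, 2, 9, 31), datetime(2024, 1, 2, 9, 32)],
        "close": [1.0, 1.1, 1.2, 1.3],
    }).lazy()

    cutoff = datetime(2024, 1, 2, 9, 32)
    out = (
        lf.filter(pl.col("datetime") < cutoff)  # strict <: excludes the forming bar
          .sort("datetime")
          .unique(subset=["datetime"], keep="last", maintain_order=True)
          .tail(2)
          .collect()
    )
    assert out["close"].to_list() == [1.1, 1.2]  # later row wins the 09:30 duplicate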