PyPI - zipline_polygon_bundle - Versions diffs - 0.2.0.dev1__py3-none-any.whl → 0.2.3__py3-none-any.whl - Mend

zipline_polygon_bundle 0.2.0.dev1-py3-none-any.whl → 0.2.3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12)

zipline_polygon_bundle/__init__.py +7 -9
zipline_polygon_bundle/adjustments.py +27 -32
zipline_polygon_bundle/bundle.py +157 -312
zipline_polygon_bundle/compute_signals.py +261 -0
zipline_polygon_bundle/concat_all_aggs.py +130 -25
zipline_polygon_bundle/config.py +70 -45
zipline_polygon_bundle/trades.py +197 -606
{zipline_polygon_bundle-0.2.0.dev1.dist-info → zipline_polygon_bundle-0.2.3.dist-info}/METADATA +90 -8
zipline_polygon_bundle-0.2.3.dist-info/RECORD +18 -0
{zipline_polygon_bundle-0.2.0.dev1.dist-info → zipline_polygon_bundle-0.2.3.dist-info}/WHEEL +1 -1
zipline_polygon_bundle-0.2.0.dev1.dist-info/RECORD +0 -17
{zipline_polygon_bundle-0.2.0.dev1.dist-info → zipline_polygon_bundle-0.2.3.dist-info}/LICENSE +0 -0

zipline_polygon_bundle/config.py CHANGED Viewed

@@ -1,16 +1,20 @@
 from exchange_calendars.calendar_helpers import Date, parse_date
-from exchange_calendars.calendar_utils import get_calendar
+from zipline.utils.calendar_utils import get_calendar
 from .nyse_all_hours_calendar import NYSE_ALL_HOURS
-from typing import Iterator, Tuple
+from typing import Iterator, Mapping, Tuple
 import pandas as pd
 from pyarrow.fs import LocalFileSystem
 import os
+import datetime
 import re
 import fnmatch
+AGG_TIME_DAY = "day"
+AGG_TIME_MINUTE = "minute"
+AGG_TIME_TRADES = "1minute"
 PARTITION_COLUMN_NAME = "part"
 PARTITION_KEY_LENGTH = 2
@@ -34,16 +38,16 @@ def to_partition_key(s: str) -> str:
 class PolygonConfig:
     def __init__(
         self,
-        environ: dict,
+        environ: Mapping[str, str],
         calendar_name: str,
         start_date: Date,
         end_date: Date,
-        agg_time: str = "day",
-        custom_aggs_format: str = "{config.agg_timedelta.seconds}sec_aggs",
+        agg_time: str = AGG_TIME_DAY,
     ):
         self.calendar_name = calendar_name
         self.start_date = start_date
         self.end_date = end_date
+        # TODO: We can't use PolygonConfig.calendar because it gets these properties for start/end session.
         self.start_timestamp = (
             parse_date(start_date, calendar=self.calendar)
             if start_date
@@ -54,6 +58,8 @@ class PolygonConfig:
             if end_date
             else self.calendar.last_session
         )
+        self.agg_time = agg_time
         self.max_workers = None
         if environ.get("POLYGON_MAX_WORKERS", "").strip() != "":
             self.max_workers = int(environ.get("POLYGON_MAX_WORKERS"))
@@ -65,17 +71,6 @@ class PolygonConfig:
         )
         self.market = environ.get("POLYGON_MARKET", "stocks")
         self.asset_subdir = environ.get("POLYGON_ASSET_SUBDIR", "us_stocks_sip")
-        self.tickers_dir = environ.get(
-            "POLYGON_TICKERS_DIR",
-            os.path.join(os.path.join(self.data_dir, "tickers"), self.asset_subdir),
-        )
-        self.tickers_csv_path = environ.get(
-            "POLYGON_TICKERS_CSV",
-            os.path.join(
-                self.tickers_dir,
-                f"tickers_{self.start_timestamp.date().isoformat()}_{self.end_timestamp.date().isoformat()}.csv",
-            ),
-        )
         self.flat_files_dir = environ.get(
             "POLYGON_FLAT_FILES_DIR", os.path.join(self.data_dir, "flatfiles")
         )
@@ -92,54 +87,78 @@ class PolygonConfig:
         self.quotes_dir = os.path.join(self.asset_files_dir, "quotes_v1")
         # TODO: The "by ticker" files are temporary/intermediate and should/could be in the zipline data dir.
+        self.custom_asset_files_dir = environ.get(
+            "CUSTOM_ASSET_FILES_DIR", self.asset_files_dir
+        )
+        self.tickers_dir = environ.get(
+            "POLYGON_TICKERS_DIR",
+            os.path.join(self.custom_asset_files_dir, "tickers"),
+        )
+        self.tickers_csv_path = environ.get(
+            "POLYGON_TICKERS_CSV",
+            os.path.join(
+                self.tickers_dir,
+                f"tickers_{self.start_timestamp.date().isoformat()}_{self.end_timestamp.date().isoformat()}.csv",
+            ),
+        )
+        self.cache_dir = os.path.join(self.custom_asset_files_dir, "api_cache")
+        self.lock_file_path = os.path.join(self.custom_asset_files_dir, "ingest.lock")
+        self.custom_aggs_dates_path = os.path.join(self.custom_asset_files_dir, "aggs_dates.json")
+        self.by_ticker_dates_path = os.path.join(self.custom_asset_files_dir, "by_ticker_dates.json")
         self.minute_by_ticker_dir = os.path.join(
-            self.asset_files_dir, "minute_by_ticker_v1"
+            self.custom_asset_files_dir, "minute_by_ticker_v1"
+        )
+        self.day_by_ticker_dir = os.path.join(
+            self.custom_asset_files_dir, "day_by_ticker_v1"
         )
-        self.day_by_ticker_dir = os.path.join(self.asset_files_dir, "day_by_ticker_v1")
+        # If agg_time begins with a digit, it is a timedelta string and we're using custom aggs from trades.
         if bool(re.match(r"^\d", agg_time)):
             self.agg_timedelta = pd.to_timedelta(agg_time)
-            self.custom_asset_files_dir = environ.get(
-                "CUSTOM_ASSET_FILES_DIR", self.asset_files_dir
+            self.csv_files_dir = self.trades_dir
+            self.custom_aggs_name_format = environ.get(
+                "CUSTOM_AGGS_NAME_FORMAT", "{config.agg_timedelta.seconds}sec_aggs"
             )
-            self.custom_aggs_dir = os.path.join(
-                self.custom_asset_files_dir, custom_aggs_format.format(config=self)
+            self.aggs_dir = os.path.join(
+                self.custom_asset_files_dir,
+                self.custom_aggs_name_format.format(config=self),
             )
-            self.custom_aggs_by_ticker_dir = os.path.join(
+            self.by_ticker_dir = os.path.join(
                 self.custom_asset_files_dir,
-                (custom_aggs_format + "_by_ticker").format(config=self),
+                (self.custom_aggs_name_format + "_by_ticker").format(config=self),
             )
-            self.aggs_dir = self.custom_aggs_dir
-            self.by_ticker_dir = self.custom_aggs_by_ticker_dir
-        elif agg_time == "minute":
+        elif agg_time == AGG_TIME_MINUTE:
             self.agg_timedelta = pd.to_timedelta("1minute")
             self.aggs_dir = self.minute_aggs_dir
+            self.csv_files_dir = self.aggs_dir
             self.by_ticker_dir = self.minute_by_ticker_dir
-        elif agg_time == "day":
+        elif agg_time == AGG_TIME_DAY:
             self.agg_timedelta = pd.to_timedelta("1day")
             self.aggs_dir = self.day_aggs_dir
+            self.csv_files_dir = self.aggs_dir
             self.by_ticker_dir = self.day_by_ticker_dir
         else:
             raise ValueError(
                 f"agg_time must be 'minute', 'day', or a timedelta string; got '{agg_time=}'"
             )
-        self.agg_time = agg_time
         self.arrow_format = environ.get(
-            "POLYGON_ARROW_FORMAT", "parquet" if self.agg_time == "day" else "hive"
+            "POLYGON_ARROW_FORMAT", "parquet" if self.agg_time == AGG_TIME_DAY else "hive"
         )
         # self.by_ticker_hive_dir = os.path.join(
         #     self.by_ticker_dir,
         #     f"{self.agg_time}_{self.start_timestamp.date().isoformat()}_{self.end_timestamp.date().isoformat()}.hive",
         # )
-        self.cache_dir = os.path.join(self.asset_files_dir, "api_cache")
     @property
     def calendar(self):
-        # If you don't give a start date you'll only get 20 years from today.
-        if self.calendar_name in [NYSE_ALL_HOURS, "us_futures", "CMES", "XNYS", "NYSE"]:
-            return get_calendar(self.calendar_name, side="right", start=pd.Timestamp("1990-01-01"))
-        return get_calendar(self.calendar_name, side="right")
+        # print call stack
+        # import traceback
+        # traceback.print_stack()
+        return get_calendar(self.calendar_name, start_session=self.start_date, end_session=self.end_date)
     def ticker_file_path(self, date: pd.Timestamp):
         ticker_year_dir = os.path.join(
@@ -154,6 +173,9 @@ class PolygonConfig:
         # TODO: Use csv_paths_pattern to remove the suffixes
         return os.path.basename(path).removesuffix(".gz").removesuffix(".csv")
+    def date_to_csv_file_path(self, date: datetime.date, ext=".csv.gz"):
+        return f"{self.csv_files_dir}/{date.strftime('%Y/%m/%Y-%m-%d') + ext}"
     @property
     def by_ticker_aggs_arrow_dir(self):
         # TODO: Don't split these up by ingestion range.  They're already time indexed.
@@ -161,19 +183,20 @@ class PolygonConfig:
         # This scattering is really slow and is usually gonna be redundant.
         # This wasn't a problem when start/end dates were the calendar bounds when omitted.
         # Can't just drop this because concat_all_aggs_from_csv will skip if it exists.
-        return os.path.join(
-            self.by_ticker_dir,
-            f"{self.start_timestamp.date().isoformat()}_{self.end_timestamp.date().isoformat()}.arrow",
-            # "aggs.arrow",
-        )
+        # return os.path.join(
+        #     self.by_ticker_dir,
+        #     f"{self.start_timestamp.date().isoformat()}_{self.end_timestamp.date().isoformat()}.arrow",
+        #     # "aggs.arrow",
+        # )
+        return self.by_ticker_dir
     def api_cache_path(
-        self, start_date: Date, end_date: Date, filename: str, extension=".parquet"
+        self, first_day: pd.Timestamp, last_day: pd.Timestamp, filename: str, extension=".parquet"
     ):
-        start_str = parse_date(start_date, calendar=self.calendar).date().isoformat()
-        end_str = parse_date(end_date, calendar=self.calendar).date().isoformat()
+        first_day_str = first_day.date().isoformat()
+        last_day_str = last_day.date().isoformat()
         return os.path.join(
-            self.cache_dir, f"{start_str}_{end_str}/{filename}{extension}"
+            self.cache_dir, f"{first_day_str}_{last_day_str}/{filename}{extension}"
         )
     def csv_paths(self) -> Iterator[str]:
@@ -186,7 +209,9 @@ class PolygonConfig:
                 for filename in sorted(filenames):
                     yield os.path.join(root, filename)
-    def find_first_and_last_aggs(self, aggs_dir, file_pattern) -> Tuple[str | None, str | None]:
+    def find_first_and_last_aggs(
+        self, aggs_dir, file_pattern
+    ) -> Tuple[str | None, str | None]:
         # Find the path to the lexically first and last paths in aggs_dir that matches csv_paths_pattern.
         # Would like to use Path.walk(top_down=True) but it is only availble in Python 3.12+.
         # This needs to be efficient because it is called on every init, even though we only need it for ingest.

zipline_polygon_bundle 0.2.0.dev1__py3-none-any.whl → 0.2.3__py3-none-any.whl

zipline_polygon_bundle 0.2.0.dev1-py3-none-any.whl → 0.2.3-py3-none-any.whl