zipline_polygon_bundle 0.2.0.dev1__py3-none-any.whl → 0.2.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- zipline_polygon_bundle/__init__.py +7 -9
- zipline_polygon_bundle/adjustments.py +27 -32
- zipline_polygon_bundle/bundle.py +157 -312
- zipline_polygon_bundle/compute_signals.py +261 -0
- zipline_polygon_bundle/concat_all_aggs.py +130 -25
- zipline_polygon_bundle/config.py +70 -45
- zipline_polygon_bundle/trades.py +197 -606
- {zipline_polygon_bundle-0.2.0.dev1.dist-info → zipline_polygon_bundle-0.2.3.dist-info}/METADATA +90 -8
- zipline_polygon_bundle-0.2.3.dist-info/RECORD +18 -0
- {zipline_polygon_bundle-0.2.0.dev1.dist-info → zipline_polygon_bundle-0.2.3.dist-info}/WHEEL +1 -1
- zipline_polygon_bundle-0.2.0.dev1.dist-info/RECORD +0 -17
- {zipline_polygon_bundle-0.2.0.dev1.dist-info → zipline_polygon_bundle-0.2.3.dist-info}/LICENSE +0 -0
    
        zipline_polygon_bundle/config.py
    CHANGED
    
    | @@ -1,16 +1,20 @@ | |
| 1 1 | 
             
            from exchange_calendars.calendar_helpers import Date, parse_date
         | 
| 2 | 
            -
            from  | 
| 2 | 
            +
            from zipline.utils.calendar_utils import get_calendar
         | 
| 3 3 |  | 
| 4 4 | 
             
            from .nyse_all_hours_calendar import NYSE_ALL_HOURS
         | 
| 5 5 |  | 
| 6 | 
            -
            from typing import Iterator, Tuple
         | 
| 6 | 
            +
            from typing import Iterator, Mapping, Tuple
         | 
| 7 7 |  | 
| 8 8 | 
             
            import pandas as pd
         | 
| 9 9 | 
             
            from pyarrow.fs import LocalFileSystem
         | 
| 10 10 | 
             
            import os
         | 
| 11 | 
            +
            import datetime
         | 
| 11 12 | 
             
            import re
         | 
| 12 13 | 
             
            import fnmatch
         | 
| 13 14 |  | 
| 15 | 
            +
            AGG_TIME_DAY = "day"
         | 
| 16 | 
            +
            AGG_TIME_MINUTE = "minute"
         | 
| 17 | 
            +
            AGG_TIME_TRADES = "1minute"
         | 
| 14 18 |  | 
| 15 19 | 
             
            PARTITION_COLUMN_NAME = "part"
         | 
| 16 20 | 
             
            PARTITION_KEY_LENGTH = 2
         | 
| @@ -34,16 +38,16 @@ def to_partition_key(s: str) -> str: | |
| 34 38 | 
             
            class PolygonConfig:
         | 
| 35 39 | 
             
                def __init__(
         | 
| 36 40 | 
             
                    self,
         | 
| 37 | 
            -
                    environ:  | 
| 41 | 
            +
                    environ: Mapping[str, str],
         | 
| 38 42 | 
             
                    calendar_name: str,
         | 
| 39 43 | 
             
                    start_date: Date,
         | 
| 40 44 | 
             
                    end_date: Date,
         | 
| 41 | 
            -
                    agg_time: str =  | 
| 42 | 
            -
                    custom_aggs_format: str = "{config.agg_timedelta.seconds}sec_aggs",
         | 
| 45 | 
            +
                    agg_time: str = AGG_TIME_DAY,
         | 
| 43 46 | 
             
                ):
         | 
| 44 47 | 
             
                    self.calendar_name = calendar_name
         | 
| 45 48 | 
             
                    self.start_date = start_date
         | 
| 46 49 | 
             
                    self.end_date = end_date
         | 
| 50 | 
            +
                    # TODO: We can't use PolygonConfig.calendar because it gets these properties for start/end session.
         | 
| 47 51 | 
             
                    self.start_timestamp = (
         | 
| 48 52 | 
             
                        parse_date(start_date, calendar=self.calendar)
         | 
| 49 53 | 
             
                        if start_date
         | 
| @@ -54,6 +58,8 @@ class PolygonConfig: | |
| 54 58 | 
             
                        if end_date
         | 
| 55 59 | 
             
                        else self.calendar.last_session
         | 
| 56 60 | 
             
                    )
         | 
| 61 | 
            +
                    self.agg_time = agg_time
         | 
| 62 | 
            +
             | 
| 57 63 | 
             
                    self.max_workers = None
         | 
| 58 64 | 
             
                    if environ.get("POLYGON_MAX_WORKERS", "").strip() != "":
         | 
| 59 65 | 
             
                        self.max_workers = int(environ.get("POLYGON_MAX_WORKERS"))
         | 
| @@ -65,17 +71,6 @@ class PolygonConfig: | |
| 65 71 | 
             
                    )
         | 
| 66 72 | 
             
                    self.market = environ.get("POLYGON_MARKET", "stocks")
         | 
| 67 73 | 
             
                    self.asset_subdir = environ.get("POLYGON_ASSET_SUBDIR", "us_stocks_sip")
         | 
| 68 | 
            -
                    self.tickers_dir = environ.get(
         | 
| 69 | 
            -
                        "POLYGON_TICKERS_DIR",
         | 
| 70 | 
            -
                        os.path.join(os.path.join(self.data_dir, "tickers"), self.asset_subdir),
         | 
| 71 | 
            -
                    )
         | 
| 72 | 
            -
                    self.tickers_csv_path = environ.get(
         | 
| 73 | 
            -
                        "POLYGON_TICKERS_CSV",
         | 
| 74 | 
            -
                        os.path.join(
         | 
| 75 | 
            -
                            self.tickers_dir,
         | 
| 76 | 
            -
                            f"tickers_{self.start_timestamp.date().isoformat()}_{self.end_timestamp.date().isoformat()}.csv",
         | 
| 77 | 
            -
                        ),
         | 
| 78 | 
            -
                    )
         | 
| 79 74 | 
             
                    self.flat_files_dir = environ.get(
         | 
| 80 75 | 
             
                        "POLYGON_FLAT_FILES_DIR", os.path.join(self.data_dir, "flatfiles")
         | 
| 81 76 | 
             
                    )
         | 
| @@ -92,54 +87,78 @@ class PolygonConfig: | |
| 92 87 | 
             
                    self.quotes_dir = os.path.join(self.asset_files_dir, "quotes_v1")
         | 
| 93 88 |  | 
| 94 89 | 
             
                    # TODO: The "by ticker" files are temporary/intermediate and should/could be in the zipline data dir.
         | 
| 90 | 
            +
                    self.custom_asset_files_dir = environ.get(
         | 
| 91 | 
            +
                        "CUSTOM_ASSET_FILES_DIR", self.asset_files_dir
         | 
| 92 | 
            +
                    )
         | 
| 93 | 
            +
                    self.tickers_dir = environ.get(
         | 
| 94 | 
            +
                        "POLYGON_TICKERS_DIR",
         | 
| 95 | 
            +
                        os.path.join(self.custom_asset_files_dir, "tickers"),
         | 
| 96 | 
            +
                    )
         | 
| 97 | 
            +
                    self.tickers_csv_path = environ.get(
         | 
| 98 | 
            +
                        "POLYGON_TICKERS_CSV",
         | 
| 99 | 
            +
                        os.path.join(
         | 
| 100 | 
            +
                            self.tickers_dir,
         | 
| 101 | 
            +
                            f"tickers_{self.start_timestamp.date().isoformat()}_{self.end_timestamp.date().isoformat()}.csv",
         | 
| 102 | 
            +
                        ),
         | 
| 103 | 
            +
                    )
         | 
| 104 | 
            +
             | 
| 105 | 
            +
                    self.cache_dir = os.path.join(self.custom_asset_files_dir, "api_cache")
         | 
| 106 | 
            +
             | 
| 107 | 
            +
                    self.lock_file_path = os.path.join(self.custom_asset_files_dir, "ingest.lock")
         | 
| 108 | 
            +
                    self.custom_aggs_dates_path = os.path.join(self.custom_asset_files_dir, "aggs_dates.json")
         | 
| 109 | 
            +
                    self.by_ticker_dates_path = os.path.join(self.custom_asset_files_dir, "by_ticker_dates.json")
         | 
| 110 | 
            +
             | 
| 95 111 | 
             
                    self.minute_by_ticker_dir = os.path.join(
         | 
| 96 | 
            -
                        self. | 
| 112 | 
            +
                        self.custom_asset_files_dir, "minute_by_ticker_v1"
         | 
| 113 | 
            +
                    )
         | 
| 114 | 
            +
                    self.day_by_ticker_dir = os.path.join(
         | 
| 115 | 
            +
                        self.custom_asset_files_dir, "day_by_ticker_v1"
         | 
| 97 116 | 
             
                    )
         | 
| 98 | 
            -
                    self.day_by_ticker_dir = os.path.join(self.asset_files_dir, "day_by_ticker_v1")
         | 
| 99 117 |  | 
| 118 | 
            +
                    # If agg_time begins with a digit, it is a timedelta string and we're using custom aggs from trades.
         | 
| 100 119 | 
             
                    if bool(re.match(r"^\d", agg_time)):
         | 
| 101 120 | 
             
                        self.agg_timedelta = pd.to_timedelta(agg_time)
         | 
| 102 | 
            -
                        self. | 
| 103 | 
            -
             | 
| 121 | 
            +
                        self.csv_files_dir = self.trades_dir
         | 
| 122 | 
            +
                        self.custom_aggs_name_format = environ.get(
         | 
| 123 | 
            +
                            "CUSTOM_AGGS_NAME_FORMAT", "{config.agg_timedelta.seconds}sec_aggs"
         | 
| 104 124 | 
             
                        )
         | 
| 105 | 
            -
                        self. | 
| 106 | 
            -
                            self.custom_asset_files_dir, | 
| 125 | 
            +
                        self.aggs_dir = os.path.join(
         | 
| 126 | 
            +
                            self.custom_asset_files_dir,
         | 
| 127 | 
            +
                            self.custom_aggs_name_format.format(config=self),
         | 
| 107 128 | 
             
                        )
         | 
| 108 | 
            -
                        self. | 
| 129 | 
            +
                        self.by_ticker_dir = os.path.join(
         | 
| 109 130 | 
             
                            self.custom_asset_files_dir,
         | 
| 110 | 
            -
                            ( | 
| 131 | 
            +
                            (self.custom_aggs_name_format + "_by_ticker").format(config=self),
         | 
| 111 132 | 
             
                        )
         | 
| 112 | 
            -
             | 
| 113 | 
            -
                        self.by_ticker_dir = self.custom_aggs_by_ticker_dir
         | 
| 114 | 
            -
                    elif agg_time == "minute":
         | 
| 133 | 
            +
                    elif agg_time == AGG_TIME_MINUTE:
         | 
| 115 134 | 
             
                        self.agg_timedelta = pd.to_timedelta("1minute")
         | 
| 116 135 | 
             
                        self.aggs_dir = self.minute_aggs_dir
         | 
| 136 | 
            +
                        self.csv_files_dir = self.aggs_dir
         | 
| 117 137 | 
             
                        self.by_ticker_dir = self.minute_by_ticker_dir
         | 
| 118 | 
            -
                    elif agg_time ==  | 
| 138 | 
            +
                    elif agg_time == AGG_TIME_DAY:
         | 
| 119 139 | 
             
                        self.agg_timedelta = pd.to_timedelta("1day")
         | 
| 120 140 | 
             
                        self.aggs_dir = self.day_aggs_dir
         | 
| 141 | 
            +
                        self.csv_files_dir = self.aggs_dir
         | 
| 121 142 | 
             
                        self.by_ticker_dir = self.day_by_ticker_dir
         | 
| 122 143 | 
             
                    else:
         | 
| 123 144 | 
             
                        raise ValueError(
         | 
| 124 145 | 
             
                            f"agg_time must be 'minute', 'day', or a timedelta string; got '{agg_time=}'"
         | 
| 125 146 | 
             
                        )
         | 
| 126 | 
            -
                    self.agg_time = agg_time
         | 
| 127 147 |  | 
| 128 148 | 
             
                    self.arrow_format = environ.get(
         | 
| 129 | 
            -
                        "POLYGON_ARROW_FORMAT", "parquet" if self.agg_time ==  | 
| 149 | 
            +
                        "POLYGON_ARROW_FORMAT", "parquet" if self.agg_time == AGG_TIME_DAY else "hive"
         | 
| 130 150 | 
             
                    )
         | 
| 131 151 | 
             
                    # self.by_ticker_hive_dir = os.path.join(
         | 
| 132 152 | 
             
                    #     self.by_ticker_dir,
         | 
| 133 153 | 
             
                    #     f"{self.agg_time}_{self.start_timestamp.date().isoformat()}_{self.end_timestamp.date().isoformat()}.hive",
         | 
| 134 154 | 
             
                    # )
         | 
| 135 | 
            -
                    self.cache_dir = os.path.join(self.asset_files_dir, "api_cache")
         | 
| 136 155 |  | 
| 137 156 | 
             
                @property
         | 
| 138 157 | 
             
                def calendar(self):
         | 
| 139 | 
            -
                    #  | 
| 140 | 
            -
                     | 
| 141 | 
            -
             | 
| 142 | 
            -
                    return get_calendar(self.calendar_name,  | 
| 158 | 
            +
                    # print call stack
         | 
| 159 | 
            +
                    # import traceback
         | 
| 160 | 
            +
                    # traceback.print_stack()
         | 
| 161 | 
            +
                    return get_calendar(self.calendar_name, start_session=self.start_date, end_session=self.end_date)
         | 
| 143 162 |  | 
| 144 163 | 
             
                def ticker_file_path(self, date: pd.Timestamp):
         | 
| 145 164 | 
             
                    ticker_year_dir = os.path.join(
         | 
| @@ -154,6 +173,9 @@ class PolygonConfig: | |
| 154 173 | 
             
                    # TODO: Use csv_paths_pattern to remove the suffixes
         | 
| 155 174 | 
             
                    return os.path.basename(path).removesuffix(".gz").removesuffix(".csv")
         | 
| 156 175 |  | 
| 176 | 
            +
                def date_to_csv_file_path(self, date: datetime.date, ext=".csv.gz"):
         | 
| 177 | 
            +
                    return f"{self.csv_files_dir}/{date.strftime('%Y/%m/%Y-%m-%d') + ext}"
         | 
| 178 | 
            +
             | 
| 157 179 | 
             
                @property
         | 
| 158 180 | 
             
                def by_ticker_aggs_arrow_dir(self):
         | 
| 159 181 | 
             
                    # TODO: Don't split these up by ingestion range.  They're already time indexed.
         | 
| @@ -161,19 +183,20 @@ class PolygonConfig: | |
| 161 183 | 
             
                    # This scattering is really slow and is usually gonna be redundant.
         | 
| 162 184 | 
             
                    # This wasn't a problem when start/end dates were the calendar bounds when omitted.
         | 
| 163 185 | 
             
                    # Can't just drop this because concat_all_aggs_from_csv will skip if it exists.
         | 
| 164 | 
            -
                    return os.path.join(
         | 
| 165 | 
            -
             | 
| 166 | 
            -
             | 
| 167 | 
            -
             | 
| 168 | 
            -
                    )
         | 
| 186 | 
            +
                    # return os.path.join(
         | 
| 187 | 
            +
                    #     self.by_ticker_dir,
         | 
| 188 | 
            +
                    #     f"{self.start_timestamp.date().isoformat()}_{self.end_timestamp.date().isoformat()}.arrow",
         | 
| 189 | 
            +
                    #     # "aggs.arrow",
         | 
| 190 | 
            +
                    # )
         | 
| 191 | 
            +
                    return self.by_ticker_dir
         | 
| 169 192 |  | 
| 170 193 | 
             
                def api_cache_path(
         | 
| 171 | 
            -
                    self,  | 
| 194 | 
            +
                    self, first_day: pd.Timestamp, last_day: pd.Timestamp, filename: str, extension=".parquet"
         | 
| 172 195 | 
             
                ):
         | 
| 173 | 
            -
                     | 
| 174 | 
            -
                     | 
| 196 | 
            +
                    first_day_str = first_day.date().isoformat()
         | 
| 197 | 
            +
                    last_day_str = last_day.date().isoformat()
         | 
| 175 198 | 
             
                    return os.path.join(
         | 
| 176 | 
            -
                        self.cache_dir, f"{ | 
| 199 | 
            +
                        self.cache_dir, f"{first_day_str}_{last_day_str}/{filename}{extension}"
         | 
| 177 200 | 
             
                    )
         | 
| 178 201 |  | 
| 179 202 | 
             
                def csv_paths(self) -> Iterator[str]:
         | 
| @@ -186,7 +209,9 @@ class PolygonConfig: | |
| 186 209 | 
             
                            for filename in sorted(filenames):
         | 
| 187 210 | 
             
                                yield os.path.join(root, filename)
         | 
| 188 211 |  | 
| 189 | 
            -
                def find_first_and_last_aggs( | 
| 212 | 
            +
                def find_first_and_last_aggs(
         | 
| 213 | 
            +
                    self, aggs_dir, file_pattern
         | 
| 214 | 
            +
                ) -> Tuple[str | None, str | None]:
         | 
| 190 215 | 
             
                    # Find the path to the lexically first and last paths in aggs_dir that matches csv_paths_pattern.
         | 
| 191 216 | 
             
                    # Would like to use Path.walk(top_down=True) but it is only availble in Python 3.12+.
         | 
| 192 217 | 
             
                    # This needs to be efficient because it is called on every init, even though we only need it for ingest.
         |