zipline_polygon_bundle 0.2.0.dev1__py3-none-any.whl → 0.2.1__py3-none-any.whl
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
- zipline_polygon_bundle/__init__.py +7 -9
 - zipline_polygon_bundle/adjustments.py +27 -32
 - zipline_polygon_bundle/bundle.py +157 -312
 - zipline_polygon_bundle/compute_signals.py +261 -0
 - zipline_polygon_bundle/concat_all_aggs.py +130 -25
 - zipline_polygon_bundle/config.py +57 -32
 - zipline_polygon_bundle/trades.py +196 -607
 - {zipline_polygon_bundle-0.2.0.dev1.dist-info → zipline_polygon_bundle-0.2.1.dist-info}/METADATA +8 -6
 - zipline_polygon_bundle-0.2.1.dist-info/RECORD +18 -0
 - {zipline_polygon_bundle-0.2.0.dev1.dist-info → zipline_polygon_bundle-0.2.1.dist-info}/WHEEL +1 -1
 - zipline_polygon_bundle-0.2.0.dev1.dist-info/RECORD +0 -17
 - {zipline_polygon_bundle-0.2.0.dev1.dist-info → zipline_polygon_bundle-0.2.1.dist-info}/LICENSE +0 -0
 
    
        zipline_polygon_bundle/trades.py
    CHANGED
    
    | 
         @@ -1,29 +1,29 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            from .config import PolygonConfig, PARTITION_COLUMN_NAME, to_partition_key
         
     | 
| 
       2 
2 
     | 
    
         | 
| 
       3 
     | 
    
         
            -
            from typing import Iterator, Tuple 
     | 
| 
      
 3 
     | 
    
         
            +
            from typing import Iterator, Tuple
         
     | 
| 
       4 
4 
     | 
    
         | 
| 
       5 
5 
     | 
    
         
             
            import pyarrow as pa
         
     | 
| 
       6 
     | 
    
         
            -
            import pyarrow.dataset as pa_ds
         
     | 
| 
       7 
6 
     | 
    
         
             
            import pyarrow.compute as pa_compute
         
     | 
| 
       8 
7 
     | 
    
         
             
            import pyarrow.csv as pa_csv
         
     | 
| 
      
 8 
     | 
    
         
            +
            import pyarrow.dataset as pa_ds
         
     | 
| 
       9 
9 
     | 
    
         
             
            import pyarrow.fs as pa_fs
         
     | 
| 
       10 
10 
     | 
    
         | 
| 
       11 
11 
     | 
    
         
             
            from fsspec.implementations.arrow import ArrowFSWrapper
         
     | 
| 
       12 
12 
     | 
    
         | 
| 
       13 
13 
     | 
    
         
             
            import os
         
     | 
| 
       14 
14 
     | 
    
         
             
            import datetime
         
     | 
| 
       15 
     | 
    
         
            -
            import shutil
         
     | 
| 
       16 
15 
     | 
    
         | 
| 
       17 
16 
     | 
    
         
             
            import numpy as np
         
     | 
| 
       18 
17 
     | 
    
         
             
            import pandas as pd
         
     | 
| 
       19 
     | 
    
         
            -
            import pandas_ta as ta
         
     | 
| 
       20 
18 
     | 
    
         | 
| 
       21 
19 
     | 
    
         | 
| 
       22 
20 
     | 
    
         
             
            def trades_schema(raw: bool = False) -> pa.Schema:
         
     | 
| 
       23 
21 
     | 
    
         
             
                # There is some problem reading the timestamps as timestamps so we have to read as integer then change the schema.
         
     | 
| 
       24 
22 
     | 
    
         
             
                # Polygon Aggregate flatfile timestamps are in nanoseconds (like trades), not milliseconds as the docs say.
         
     | 
| 
       25 
23 
     | 
    
         
             
                # I make the timestamp timezone-aware because that's how Unix timestamps work and it may help avoid mistakes.
         
     | 
| 
       26 
     | 
    
         
            -
                #  
     | 
| 
      
 24 
     | 
    
         
            +
                # The timezone is America/New_York because that's the US exchanges timezone and the date is a trading day.
         
     | 
| 
      
 25 
     | 
    
         
            +
                # timestamp_type = pa.timestamp("ns", tz="America/New_York")
         
     | 
| 
      
 26 
     | 
    
         
            +
                # timestamp_type = pa.int64() if raw else pa.timestamp("ns", tz=tz)
         
     | 
| 
       27 
27 
     | 
    
         
             
                timestamp_type = pa.int64() if raw else pa.timestamp("ns", tz="UTC")
         
     | 
| 
       28 
28 
     | 
    
         | 
| 
       29 
29 
     | 
    
         
             
                # Polygon price scale is 4 decimal places (i.e. hundredths of a penny), but we'll use 10 because we have precision to spare.
         
     | 
| 
         @@ -94,7 +94,7 @@ def cast_strings_to_list( 
     | 
|
| 
       94 
94 
     | 
    
         
             
                return int_list_array
         
     | 
| 
       95 
95 
     | 
    
         | 
| 
       96 
96 
     | 
    
         | 
| 
       97 
     | 
    
         
            -
            def cast_trades(trades):
         
     | 
| 
      
 97 
     | 
    
         
            +
            def cast_trades(trades) -> pa.Table:
         
     | 
| 
       98 
98 
     | 
    
         
             
                trades = trades.cast(trades_schema())
         
     | 
| 
       99 
99 
     | 
    
         
             
                condition_values = cast_strings_to_list(
         
     | 
| 
       100 
100 
     | 
    
         
             
                    trades.column("conditions").combine_chunks()
         
     | 
| 
         @@ -102,220 +102,8 @@ def cast_trades(trades): 
     | 
|
| 
       102 
102 
     | 
    
         
             
                return trades.append_column("condition_values", condition_values)
         
     | 
| 
       103 
103 
     | 
    
         | 
| 
       104 
104 
     | 
    
         | 
| 
       105 
     | 
    
         
            -
            def date_to_path(date, ext=".csv.gz"):
         
     | 
| 
       106 
     | 
    
         
            -
                # return f"{date.year}/{date.month:02}/{date.isoformat()}{ext}"
         
     | 
| 
       107 
     | 
    
         
            -
                return date.strftime("%Y/%m/%Y-%m-%d") + ext
         
     | 
| 
       108 
     | 
    
         
            -
             
     | 
| 
       109 
     | 
    
         
            -
             
     | 
| 
       110 
     | 
    
         
            -
            # def convert_to_custom_aggs_file(
         
     | 
| 
       111 
     | 
    
         
            -
            #     config: PolygonConfig,
         
     | 
| 
       112 
     | 
    
         
            -
            #     overwrite: bool,
         
     | 
| 
       113 
     | 
    
         
            -
            #     timestamp: pd.Timestamp,
         
     | 
| 
       114 
     | 
    
         
            -
            #     start_session: pd.Timestamp,
         
     | 
| 
       115 
     | 
    
         
            -
            #     end_session: pd.Timestamp,
         
     | 
| 
       116 
     | 
    
         
            -
            # ):
         
     | 
| 
       117 
     | 
    
         
            -
            #     date = timestamp.to_pydatetime().date()
         
     | 
| 
       118 
     | 
    
         
            -
            #     aggs_date_path = date_to_path(date, ext=".parquet")
         
     | 
| 
       119 
     | 
    
         
            -
            #     aggs_path = f"{config.custom_aggs_dir}/{aggs_date_path}"
         
     | 
| 
       120 
     | 
    
         
            -
            #     # aggs_by_ticker_path = f"{config.custom_aggs_by_ticker_dir}/{aggs_date_path}"
         
     | 
| 
       121 
     | 
    
         
            -
            #     fsspec = ArrowFSWrapper(config.filesystem)
         
     | 
| 
       122 
     | 
    
         
            -
            #     if fsspec.exists(aggs_path) or fsspec.exists(aggs_by_ticker_path):
         
     | 
| 
       123 
     | 
    
         
            -
            #         if overwrite:
         
     | 
| 
       124 
     | 
    
         
            -
            #             if fsspec.exists(aggs_path):
         
     | 
| 
       125 
     | 
    
         
            -
            #                 config.filesystem.delete_file(aggs_path)
         
     | 
| 
       126 
     | 
    
         
            -
            #             if fsspec.exists(aggs_by_ticker_path):
         
     | 
| 
       127 
     | 
    
         
            -
            #                 config.filesystem.delete_file(aggs_by_ticker_path)
         
     | 
| 
       128 
     | 
    
         
            -
            #         else:
         
     | 
| 
       129 
     | 
    
         
            -
            #             if fsspec.exists(aggs_path):
         
     | 
| 
       130 
     | 
    
         
            -
            #                 print(f"SKIPPING: {date=} File exists {aggs_path=}")
         
     | 
| 
       131 
     | 
    
         
            -
            #             if fsspec.exists(aggs_by_ticker_path):
         
     | 
| 
       132 
     | 
    
         
            -
            #                 print(f"SKIPPING: {date=} File exists {aggs_by_ticker_path=}")
         
     | 
| 
       133 
     | 
    
         
            -
            #             return
         
     | 
| 
       134 
     | 
    
         
            -
            #     fsspec.mkdir(fsspec._parent(aggs_path))
         
     | 
| 
       135 
     | 
    
         
            -
            #     fsspec.mkdir(fsspec._parent(aggs_by_ticker_path))
         
     | 
| 
       136 
     | 
    
         
            -
            #     trades_path = f"{config.trades_dir}/{date_to_path(date)}"
         
     | 
| 
       137 
     | 
    
         
            -
            #     if not fsspec.exists(trades_path):
         
     | 
| 
       138 
     | 
    
         
            -
            #         print(f"ERROR: Trades file missing.  Skipping {date=}.  {trades_path=}")
         
     | 
| 
       139 
     | 
    
         
            -
            #         return
         
     | 
| 
       140 
     | 
    
         
            -
            #     print(f"{trades_path=}")
         
     | 
| 
       141 
     | 
    
         
            -
            #     format = pa_ds.CsvFileFormat()
         
     | 
| 
       142 
     | 
    
         
            -
            #     trades_ds = pa_ds.FileSystemDataset.from_paths(
         
     | 
| 
       143 
     | 
    
         
            -
            #         [trades_path],
         
     | 
| 
       144 
     | 
    
         
            -
            #         format=format,
         
     | 
| 
       145 
     | 
    
         
            -
            #         schema=trades_schema(raw=True),
         
     | 
| 
       146 
     | 
    
         
            -
            #         filesystem=config.filesystem,
         
     | 
| 
       147 
     | 
    
         
            -
            #     )
         
     | 
| 
       148 
     | 
    
         
            -
            #     fragments = trades_ds.get_fragments()
         
     | 
| 
       149 
     | 
    
         
            -
            #     fragment = next(fragments)
         
     | 
| 
       150 
     | 
    
         
            -
            #     try:
         
     | 
| 
       151 
     | 
    
         
            -
            #         next(fragments)
         
     | 
| 
       152 
     | 
    
         
            -
            #         print("ERROR: More than one fragment for {path=}")
         
     | 
| 
       153 
     | 
    
         
            -
            #     except StopIteration:
         
     | 
| 
       154 
     | 
    
         
            -
            #         pass
         
     | 
| 
       155 
     | 
    
         
            -
            #     trades = fragment.to_table(schema=trades_ds.schema)
         
     | 
| 
       156 
     | 
    
         
            -
            #     trades = trades.cast(trades_schema())
         
     | 
| 
       157 
     | 
    
         
            -
            #     min_timestamp = pa.compute.min(trades.column("sip_timestamp")).as_py()
         
     | 
| 
       158 
     | 
    
         
            -
            #     max_timestamp = pa.compute.max(trades.column("sip_timestamp")).as_py()
         
     | 
| 
       159 
     | 
    
         
            -
            #     if min_timestamp < start_session:
         
     | 
| 
       160 
     | 
    
         
            -
            #         print(f"ERROR: {min_timestamp=} < {start_session=}")
         
     | 
| 
       161 
     | 
    
         
            -
            #     if max_timestamp >= end_session:
         
     | 
| 
       162 
     | 
    
         
            -
            #         print(f"ERROR: {max_timestamp=} >= {end_session=}")
         
     | 
| 
       163 
     | 
    
         
            -
            #     trades_df = trades.to_pandas()
         
     | 
| 
       164 
     | 
    
         
            -
            #     trades_df["window_start"] = trades_df["sip_timestamp"].dt.floor(aggregate_timedelta)
         
     | 
| 
       165 
     | 
    
         
            -
            #     aggs_df = trades_df.groupby(["ticker", "window_start"]).agg(
         
     | 
| 
       166 
     | 
    
         
            -
            #         open=("price", "first"),
         
     | 
| 
       167 
     | 
    
         
            -
            #         high=("price", "max"),
         
     | 
| 
       168 
     | 
    
         
            -
            #         low=("price", "min"),
         
     | 
| 
       169 
     | 
    
         
            -
            #         close=("price", "last"),
         
     | 
| 
       170 
     | 
    
         
            -
            #         volume=("size", "sum"),
         
     | 
| 
       171 
     | 
    
         
            -
            #     )
         
     | 
| 
       172 
     | 
    
         
            -
            #     aggs_df["transactions"] = trades_df.groupby(["ticker", "window_start"]).size()
         
     | 
| 
       173 
     | 
    
         
            -
            #     aggs_df.reset_index(inplace=True)
         
     | 
| 
       174 
     | 
    
         
            -
            #     aggs_table = pa.Table.from_pandas(aggs_df).select(
         
     | 
| 
       175 
     | 
    
         
            -
            #         [
         
     | 
| 
       176 
     | 
    
         
            -
            #             "ticker",
         
     | 
| 
       177 
     | 
    
         
            -
            #             "volume",
         
     | 
| 
       178 
     | 
    
         
            -
            #             "open",
         
     | 
| 
       179 
     | 
    
         
            -
            #             "close",
         
     | 
| 
       180 
     | 
    
         
            -
            #             "high",
         
     | 
| 
       181 
     | 
    
         
            -
            #             "low",
         
     | 
| 
       182 
     | 
    
         
            -
            #             "window_start",
         
     | 
| 
       183 
     | 
    
         
            -
            #             "transactions",
         
     | 
| 
       184 
     | 
    
         
            -
            #         ]
         
     | 
| 
       185 
     | 
    
         
            -
            #     )
         
     | 
| 
       186 
     | 
    
         
            -
            #     aggs_table = aggs_table.sort_by(
         
     | 
| 
       187 
     | 
    
         
            -
            #         [("ticker", "ascending"), ("window_start", "ascending")]
         
     | 
| 
       188 
     | 
    
         
            -
            #     )
         
     | 
| 
       189 
     | 
    
         
            -
            #     print(f"{aggs_by_ticker_path=}")
         
     | 
| 
       190 
     | 
    
         
            -
            #     pa_parquet.write_table(
         
     | 
| 
       191 
     | 
    
         
            -
            #         table=aggs_table, where=aggs_by_ticker_path, filesystem=to_config.filesystem
         
     | 
| 
       192 
     | 
    
         
            -
            #     )
         
     | 
| 
       193 
     | 
    
         
            -
            #     aggs_table = aggs_table.sort_by(
         
     | 
| 
       194 
     | 
    
         
            -
            #         [("window_start", "ascending"), ("ticker", "ascending")]
         
     | 
| 
       195 
     | 
    
         
            -
            #     )
         
     | 
| 
       196 
     | 
    
         
            -
            #     print(f"{aggs_path=}")
         
     | 
| 
       197 
     | 
    
         
            -
            #     pa_parquet.write_table(
         
     | 
| 
       198 
     | 
    
         
            -
            #         table=aggs_table, where=aggs_path, filesystem=to_config.filesystem
         
     | 
| 
       199 
     | 
    
         
            -
            #     )
         
     | 
| 
       200 
     | 
    
         
            -
             
     | 
| 
       201 
     | 
    
         
            -
             
     | 
| 
       202 
     | 
    
         
            -
            # def convert_to_custom_aggs(config: PolygonConfig,
         
     | 
| 
       203 
     | 
    
         
            -
            #                             overwrite: bool,
         
     | 
| 
       204 
     | 
    
         
            -
            #                             timestamp: pd.Timestamp,
         
     | 
| 
       205 
     | 
    
         
            -
            #                             start_session: pd.Timestamp,
         
     | 
| 
       206 
     | 
    
         
            -
            #                             end_session: pd.Timestamp):
         
     | 
| 
       207 
     | 
    
         
            -
            #     date = timestamp.to_pydatetime().date()
         
     | 
| 
       208 
     | 
    
         
            -
            #     aggs_date_path = date_to_path(date, ext=".parquet")
         
     | 
| 
       209 
     | 
    
         
            -
            #     aggs_path = f"{config.custom_aggs_dir}/{aggs_date_path}"
         
     | 
| 
       210 
     | 
    
         
            -
            #     # aggs_by_ticker_path = f"{config.custom_aggs_by_ticker_dir}/{aggs_date_path}"
         
     | 
| 
       211 
     | 
    
         
            -
            #     fsspec = ArrowFSWrapper(config.filesystem)
         
     | 
| 
       212 
     | 
    
         
            -
            #     if fsspec.exists(aggs_path) or fsspec.exists(aggs_by_ticker_path):
         
     | 
| 
       213 
     | 
    
         
            -
            #         if overwrite:
         
     | 
| 
       214 
     | 
    
         
            -
            #             if fsspec.exists(aggs_path):
         
     | 
| 
       215 
     | 
    
         
            -
            #                 config.filesystem.delete_file(aggs_path)
         
     | 
| 
       216 
     | 
    
         
            -
            #             if fsspec.exists(aggs_by_ticker_path):
         
     | 
| 
       217 
     | 
    
         
            -
            #                 config.filesystem.delete_file(aggs_by_ticker_path)
         
     | 
| 
       218 
     | 
    
         
            -
            #         else:
         
     | 
| 
       219 
     | 
    
         
            -
            #             if fsspec.exists(aggs_path):
         
     | 
| 
       220 
     | 
    
         
            -
            #                 print(f"SKIPPING: {date=} File exists {aggs_path=}")
         
     | 
| 
       221 
     | 
    
         
            -
            #             if fsspec.exists(aggs_by_ticker_path):
         
     | 
| 
       222 
     | 
    
         
            -
            #                 print(f"SKIPPING: {date=} File exists {aggs_by_ticker_path=}")
         
     | 
| 
       223 
     | 
    
         
            -
            #             return
         
     | 
| 
       224 
     | 
    
         
            -
            #     fsspec.mkdir(fsspec._parent(aggs_path))
         
     | 
| 
       225 
     | 
    
         
            -
            #     fsspec.mkdir(fsspec._parent(aggs_by_ticker_path))
         
     | 
| 
       226 
     | 
    
         
            -
            #     trades_path = f"{config.trades_dir}/{date_to_path(date)}"
         
     | 
| 
       227 
     | 
    
         
            -
            #     if not fsspec.exists(trades_path):
         
     | 
| 
       228 
     | 
    
         
            -
            #         print(f"ERROR: Trades file missing.  Skipping {date=}.  {trades_path=}")
         
     | 
| 
       229 
     | 
    
         
            -
            #         return
         
     | 
| 
       230 
     | 
    
         
            -
            #     print(f"{trades_path=}")
         
     | 
| 
       231 
     | 
    
         
            -
            #     format = pa_ds.CsvFileFormat()
         
     | 
| 
       232 
     | 
    
         
            -
            #     trades_ds = pa_ds.FileSystemDataset.from_paths([trades_path], format=format, schema=trades_schema(raw=True), filesystem=config.filesystem)
         
     | 
| 
       233 
     | 
    
         
            -
            #     fragments = trades_ds.get_fragments()
         
     | 
| 
       234 
     | 
    
         
            -
            #     fragment = next(fragments)
         
     | 
| 
       235 
     | 
    
         
            -
            #     try:
         
     | 
| 
       236 
     | 
    
         
            -
            #         next(fragments)
         
     | 
| 
       237 
     | 
    
         
            -
            #         print("ERROR: More than one fragment for {path=}")
         
     | 
| 
       238 
     | 
    
         
            -
            #     except StopIteration:
         
     | 
| 
       239 
     | 
    
         
            -
            #         pass
         
     | 
| 
       240 
     | 
    
         
            -
            #     trades = fragment.to_table(schema=trades_ds.schema)
         
     | 
| 
       241 
     | 
    
         
            -
            #     trades = trades.cast(trades_schema())
         
     | 
| 
       242 
     | 
    
         
            -
            #     min_timestamp = pa.compute.min(trades.column('sip_timestamp')).as_py()
         
     | 
| 
       243 
     | 
    
         
            -
            #     max_timestamp = pa.compute.max(trades.column('sip_timestamp')).as_py()
         
     | 
| 
       244 
     | 
    
         
            -
            #     if min_timestamp < start_session:
         
     | 
| 
       245 
     | 
    
         
            -
            #         print(f"ERROR: {min_timestamp=} < {start_session=}")
         
     | 
| 
       246 
     | 
    
         
            -
            #     if max_timestamp >= end_session:
         
     | 
| 
       247 
     | 
    
         
            -
            #         print(f"ERROR: {max_timestamp=} >= {end_session=}")
         
     | 
| 
       248 
     | 
    
         
            -
            #     trades_df = trades.to_pandas()
         
     | 
| 
       249 
     | 
    
         
            -
            #     trades_df["window_start"] = trades_df["sip_timestamp"].dt.floor(aggregate_timedelta)
         
     | 
| 
       250 
     | 
    
         
            -
            #     aggs_df = trades_df.groupby(["ticker", "window_start"]).agg(
         
     | 
| 
       251 
     | 
    
         
            -
            #         open=('price', 'first'),
         
     | 
| 
       252 
     | 
    
         
            -
            #         high=('price', 'max'),
         
     | 
| 
       253 
     | 
    
         
            -
            #         low=('price', 'min'),
         
     | 
| 
       254 
     | 
    
         
            -
            #         close=('price', 'last'),
         
     | 
| 
       255 
     | 
    
         
            -
            #         volume=('size', 'sum'),
         
     | 
| 
       256 
     | 
    
         
            -
            #     )
         
     | 
| 
       257 
     | 
    
         
            -
            #     aggs_df['transactions'] = trades_df.groupby(["ticker", "window_start"]).size()
         
     | 
| 
       258 
     | 
    
         
            -
            #     aggs_df.reset_index(inplace=True)
         
     | 
| 
       259 
     | 
    
         
            -
            #     aggs_table = pa.Table.from_pandas(aggs_df).select(['ticker', 'volume', 'open', 'close', 'high', 'low', 'window_start', 'transactions'])
         
     | 
| 
       260 
     | 
    
         
            -
            #     aggs_table = aggs_table.sort_by([('ticker', 'ascending'), ('window_start', 'ascending')])
         
     | 
| 
       261 
     | 
    
         
            -
            #     print(f"{aggs_by_ticker_path=}")
         
     | 
| 
       262 
     | 
    
         
            -
            #     pa_parquet.write_table(table=aggs_table,
         
     | 
| 
       263 
     | 
    
         
            -
            #                            where=aggs_by_ticker_path, filesystem=to_config.filesystem)
         
     | 
| 
       264 
     | 
    
         
            -
            #     aggs_table = aggs_table.sort_by([('window_start', 'ascending'), ('ticker', 'ascending')])
         
     | 
| 
       265 
     | 
    
         
            -
            #     print(f"{aggs_path=}")
         
     | 
| 
       266 
     | 
    
         
            -
            #     pa_parquet.write_table(table=aggs_table,
         
     | 
| 
       267 
     | 
    
         
            -
            #                            where=aggs_path, filesystem=to_config.filesystem)
         
     | 
| 
       268 
     | 
    
         
            -
            #     pa_ds.write_dataset(
         
     | 
| 
       269 
     | 
    
         
            -
            #         generate_batches_from_tables(tables),
         
     | 
| 
       270 
     | 
    
         
            -
            #         schema=schema,
         
     | 
| 
       271 
     | 
    
         
            -
            #         base_dir=by_ticker_aggs_arrow_dir,
         
     | 
| 
       272 
     | 
    
         
            -
            #         partitioning=partitioning,
         
     | 
| 
       273 
     | 
    
         
            -
            #         format="parquet",
         
     | 
| 
       274 
     | 
    
         
            -
            #         existing_data_behavior="overwrite_or_ignore",
         
     | 
| 
       275 
     | 
    
         
            -
            #     )
         
     | 
| 
       276 
     | 
    
         
            -
             
     | 
| 
       277 
     | 
    
         
            -
             
     | 
| 
       278 
     | 
    
         
            -
            # def generate_csv_trades_tables(
         
     | 
| 
       279 
     | 
    
         
            -
            #     config: PolygonConfig,
         
     | 
| 
       280 
     | 
    
         
            -
            # ) -> Tuple[datetime.date, Iterator[pa.Table]]:
         
     | 
| 
       281 
     | 
    
         
            -
            #     """Generator for trades tables from flatfile CSVs."""
         
     | 
| 
       282 
     | 
    
         
            -
            #     # Use pandas_market_calendars so we can get extended hours.
         
     | 
| 
       283 
     | 
    
         
            -
            #     # NYSE and NASDAQ have extended hours but XNYS does not.
         
     | 
| 
       284 
     | 
    
         
            -
            #     calendar = pandas_market_calendars.get_calendar(config.calendar_name)
         
     | 
| 
       285 
     | 
    
         
            -
            #     schedule = calendar.schedule(start_date=config.start_timestamp, end_date=config.end_timestamp, start="pre", end="post")
         
     | 
| 
       286 
     | 
    
         
            -
            #     for timestamp, session in schedule.iterrows():
         
     | 
| 
       287 
     | 
    
         
            -
            #         date = timestamp.to_pydatetime().date()
         
     | 
| 
       288 
     | 
    
         
            -
            #         trades_csv_path = f"{config.trades_dir}/{date_to_path(date)}"
         
     | 
| 
       289 
     | 
    
         
            -
            #         format = pa_ds.CsvFileFormat()
         
     | 
| 
       290 
     | 
    
         
            -
            #         trades_ds = pa_ds.FileSystemDataset.from_paths([trades_csv_path], format=format, schema=trades_schema(raw=True), filesystem=config.filesystem)
         
     | 
| 
       291 
     | 
    
         
            -
            #         fragments = trades_ds.get_fragments()
         
     | 
| 
       292 
     | 
    
         
            -
            #         fragment = next(fragments)
         
     | 
| 
       293 
     | 
    
         
            -
            #         try:
         
     | 
| 
       294 
     | 
    
         
            -
            #             next(fragments)
         
     | 
| 
       295 
     | 
    
         
            -
            #             print("ERROR: More than one fragment for {path=}")
         
     | 
| 
       296 
     | 
    
         
            -
            #         except StopIteration:
         
     | 
| 
       297 
     | 
    
         
            -
            #             pass
         
     | 
| 
       298 
     | 
    
         
            -
            #         trades = fragment.to_table(schema=trades_ds.schema)
         
     | 
| 
       299 
     | 
    
         
            -
            #         trades = trades.cast(trades_schema())
         
     | 
| 
       300 
     | 
    
         
            -
            #         min_timestamp = pa.compute.min(trades.column('sip_timestamp')).as_py()
         
     | 
| 
       301 
     | 
    
         
            -
            #         max_timestamp = pa.compute.max(trades.column('sip_timestamp')).as_py()
         
     | 
| 
       302 
     | 
    
         
            -
            #         start_session = session['pre']
         
     | 
| 
       303 
     | 
    
         
            -
            #         end_session = session['post']
         
     | 
| 
       304 
     | 
    
         
            -
            #         # print(f"{start_session=} {end_session=}")
         
     | 
| 
       305 
     | 
    
         
            -
            #         # print(f"{min_timestamp=} {max_timestamp=}")
         
     | 
| 
       306 
     | 
    
         
            -
            #         if min_timestamp < start_session:
         
     | 
| 
       307 
     | 
    
         
            -
            #             print(f"ERROR: {min_timestamp=} < {start_session=}")
         
     | 
| 
       308 
     | 
    
         
            -
            #         # The end_session is supposed to be a limit but there are many with trades at that second.
         
     | 
| 
       309 
     | 
    
         
            -
            #         if max_timestamp >= (end_session + pd.Timedelta(seconds=1)):
         
     | 
| 
       310 
     | 
    
         
            -
            #             # print(f"ERROR: {max_timestamp=} >= {end_session=}")
         
     | 
| 
       311 
     | 
    
         
            -
            #             print(f"ERROR: {max_timestamp=} > {end_session+pd.Timedelta(seconds=1)=}")
         
     | 
| 
       312 
     | 
    
         
            -
            #         yield date, trades
         
     | 
| 
       313 
     | 
    
         
            -
            #         del fragment
         
     | 
| 
       314 
     | 
    
         
            -
            #         del fragments
         
     | 
| 
       315 
     | 
    
         
            -
            #         del trades_ds
         
     | 
| 
       316 
     | 
    
         
            -
             
     | 
| 
       317 
     | 
    
         
            -
             
     | 
| 
       318 
105 
     | 
    
         
             
            def custom_aggs_schema(raw: bool = False) -> pa.Schema:
         
     | 
| 
      
 106 
     | 
    
         
            +
                # timestamp_type = pa.int64() if raw else pa.timestamp("ns", tz=tz)
         
     | 
| 
       319 
107 
     | 
    
         
             
                timestamp_type = pa.int64() if raw else pa.timestamp("ns", tz="UTC")
         
     | 
| 
       320 
108 
     | 
    
         
             
                price_type = pa.float64()
         
     | 
| 
       321 
109 
     | 
    
         
             
                return pa.schema(
         
     | 
| 
         @@ -331,6 +119,7 @@ def custom_aggs_schema(raw: bool = False) -> pa.Schema: 
     | 
|
| 
       331 
119 
     | 
    
         
             
                        pa.field("date", pa.date32(), nullable=False),
         
     | 
| 
       332 
120 
     | 
    
         
             
                        pa.field("year", pa.uint16(), nullable=False),
         
     | 
| 
       333 
121 
     | 
    
         
             
                        pa.field("month", pa.uint8(), nullable=False),
         
     | 
| 
      
 122 
     | 
    
         
            +
                        pa.field(PARTITION_COLUMN_NAME, pa.string(), nullable=False),
         
     | 
| 
       334 
123 
     | 
    
         
             
                    ]
         
     | 
| 
       335 
124 
     | 
    
         
             
                )
         
     | 
| 
       336 
125 
     | 
    
         | 
| 
         @@ -344,12 +133,12 @@ def custom_aggs_partitioning() -> pa.Schema: 
     | 
|
| 
       344 
133 
     | 
    
         
             
                )
         
     | 
| 
       345 
134 
     | 
    
         | 
| 
       346 
135 
     | 
    
         | 
| 
       347 
     | 
    
         
            -
            def  
     | 
| 
       348 
     | 
    
         
            -
                file_info = config.filesystem.get_file_info(config. 
     | 
| 
      
 136 
     | 
    
         
            +
            def get_aggs_dates(config: PolygonConfig) -> set[datetime.date]:
         
     | 
| 
      
 137 
     | 
    
         
            +
                file_info = config.filesystem.get_file_info(config.aggs_dir)
         
     | 
| 
       349 
138 
     | 
    
         
             
                if file_info.type == pa_fs.FileType.NotFound:
         
     | 
| 
       350 
139 
     | 
    
         
             
                    return set()
         
     | 
| 
       351 
140 
     | 
    
         
             
                aggs_ds = pa_ds.dataset(
         
     | 
| 
       352 
     | 
    
         
            -
                    config. 
     | 
| 
      
 141 
     | 
    
         
            +
                    config.aggs_dir,
         
     | 
| 
       353 
142 
     | 
    
         
             
                    format="parquet",
         
     | 
| 
       354 
143 
     | 
    
         
             
                    schema=custom_aggs_schema(),
         
     | 
| 
       355 
144 
     | 
    
         
             
                    partitioning=custom_aggs_partitioning(),
         
     | 
| 
         @@ -366,17 +155,17 @@ def generate_csv_trades_tables( 
     | 
|
| 
       366 
155 
     | 
    
         
             
                config: PolygonConfig, overwrite: bool = False
         
     | 
| 
       367 
156 
     | 
    
         
             
            ) -> Iterator[Tuple[datetime.date, pa.Table]]:
         
     | 
| 
       368 
157 
     | 
    
         
             
                """Generator for trades tables from flatfile CSVs."""
         
     | 
| 
       369 
     | 
    
         
            -
                 
     | 
| 
      
 158 
     | 
    
         
            +
                existing_aggs_dates = set()
         
     | 
| 
       370 
159 
     | 
    
         
             
                if not overwrite:
         
     | 
| 
       371 
     | 
    
         
            -
                     
     | 
| 
      
 160 
     | 
    
         
            +
                    existing_aggs_dates = get_aggs_dates(config)
         
     | 
| 
       372 
161 
     | 
    
         
             
                schedule = config.calendar.trading_index(
         
     | 
| 
       373 
162 
     | 
    
         
             
                    start=config.start_timestamp, end=config.end_timestamp, period="1D"
         
     | 
| 
       374 
163 
     | 
    
         
             
                )
         
     | 
| 
       375 
164 
     | 
    
         
             
                for timestamp in schedule:
         
     | 
| 
       376 
     | 
    
         
            -
                    date = timestamp.to_pydatetime().date()
         
     | 
| 
       377 
     | 
    
         
            -
                    if date in  
     | 
| 
      
 165 
     | 
    
         
            +
                    date: datetime.date = timestamp.to_pydatetime().date()
         
     | 
| 
      
 166 
     | 
    
         
            +
                    if date in existing_aggs_dates:
         
     | 
| 
       378 
167 
     | 
    
         
             
                        continue
         
     | 
| 
       379 
     | 
    
         
            -
                    trades_csv_path =  
     | 
| 
      
 168 
     | 
    
         
            +
                    trades_csv_path = config.date_to_csv_file_path(date)
         
     | 
| 
       380 
169 
     | 
    
         
             
                    convert_options = pa_csv.ConvertOptions(column_types=trades_schema(raw=True))
         
     | 
| 
       381 
170 
     | 
    
         
             
                    trades = pa_csv.read_csv(trades_csv_path, convert_options=convert_options)
         
     | 
| 
       382 
171 
     | 
    
         
             
                    trades = trades.cast(trades_schema())
         
     | 
| 
         @@ -402,7 +191,8 @@ def trades_to_custom_aggs( 
     | 
|
| 
       402 
191 
     | 
    
         
             
                table: pa.Table,
         
     | 
| 
       403 
192 
     | 
    
         
             
                include_trf: bool = False,
         
     | 
| 
       404 
193 
     | 
    
         
             
            ) -> pa.Table:
         
     | 
| 
       405 
     | 
    
         
            -
                print(f"{ 
     | 
| 
      
 194 
     | 
    
         
            +
                print(f"{date=} {pa.default_memory_pool()=}")
         
     | 
| 
      
 195 
     | 
    
         
            +
                # print(f"{datetime.datetime.now()=} {date=} {pa.default_memory_pool()=}")
         
     | 
| 
       406 
196 
     | 
    
         
             
                # print(f"{resource.getrusage(resource.RUSAGE_SELF).ru_maxrss=}")
         
     | 
| 
       407 
197 
     | 
    
         
             
                table = table.filter(pa_compute.greater(table["size"], 0))
         
     | 
| 
       408 
198 
     | 
    
         
             
                table = table.filter(pa_compute.equal(table["correction"], "0"))
         
     | 
| 
         @@ -452,37 +242,25 @@ def trades_to_custom_aggs( 
     | 
|
| 
       452 
242 
     | 
    
         
             
                table = table.append_column(
         
     | 
| 
       453 
243 
     | 
    
         
             
                    "month", pa.array(np.full(len(table), date.month), type=pa.uint8())
         
     | 
| 
       454 
244 
     | 
    
         
             
                )
         
     | 
| 
      
 245 
     | 
    
         
            +
                table = table.append_column(
         
     | 
| 
      
 246 
     | 
    
         
            +
                    PARTITION_COLUMN_NAME,
         
     | 
| 
      
 247 
     | 
    
         
            +
                    pa.array(
         
     | 
| 
      
 248 
     | 
    
         
            +
                        [to_partition_key(ticker) for ticker in table.column("ticker").to_pylist()]
         
     | 
| 
      
 249 
     | 
    
         
            +
                    ),
         
     | 
| 
      
 250 
     | 
    
         
            +
                )
         
     | 
| 
       455 
251 
     | 
    
         
             
                table = table.sort_by([("window_start", "ascending"), ("ticker", "ascending")])
         
     | 
| 
      
 252 
     | 
    
         
            +
                # print(f"aggs {date=} {table.to_pandas().head()=}")
         
     | 
| 
       456 
253 
     | 
    
         
             
                return table
         
     | 
| 
       457 
254 
     | 
    
         | 
| 
       458 
255 
     | 
    
         | 
| 
       459 
     | 
    
         
            -
            # def generate_custom_agg_batches_from_tables(config: PolygonConfig) 
     | 
| 
      
 256 
     | 
    
         
            +
            # def generate_custom_agg_batches_from_tables(config: PolygonConfig):
         
     | 
| 
       460 
257 
     | 
    
         
             
            #     for date, trades_table in generate_csv_trades_tables(config):
         
     | 
| 
       461 
     | 
    
         
            -
            #          
     | 
| 
       462 
     | 
    
         
            -
            # 
     | 
| 
      
 258 
     | 
    
         
            +
            #         aggs_table = trades_to_custom_aggs(config, date, trades_table)
         
     | 
| 
      
 259 
     | 
    
         
            +
            #         yield aggs_table
         
     | 
| 
      
 260 
     | 
    
         
            +
            #         del aggs_table
         
     | 
| 
       463 
261 
     | 
    
         
             
            #         del trades_table
         
     | 
| 
       464 
262 
     | 
    
         | 
| 
       465 
263 
     | 
    
         | 
| 
       466 
     | 
    
         
            -
            # def generate_custom_agg_tables(config: PolygonConfig) -> pa.Table:
         
     | 
| 
       467 
     | 
    
         
            -
            #     for date, trades_table in generate_csv_trades_tables(config):
         
     | 
| 
       468 
     | 
    
         
            -
            #         yield trades_to_custom_aggs(config, date, trades_table)
         
     | 
| 
       469 
     | 
    
         
            -
             
     | 
| 
       470 
     | 
    
         
            -
             
     | 
| 
       471 
     | 
    
         
            -
            # def configure_write_custom_aggs_to_dataset(config: PolygonConfig):
         
     | 
| 
       472 
     | 
    
         
            -
            #     def write_custom_aggs_to_dataset(args: Tuple[datetime.date, pa.Table]):
         
     | 
| 
       473 
     | 
    
         
            -
            #         date, table = args
         
     | 
| 
       474 
     | 
    
         
            -
            #         pa_ds.write_dataset(
         
     | 
| 
       475 
     | 
    
         
            -
            #             trades_to_custom_aggs(config, date, table),
         
     | 
| 
       476 
     | 
    
         
            -
            #             filesystem=config.filesystem,
         
     | 
| 
       477 
     | 
    
         
            -
            #             base_dir=config.custom_aggs_dir,
         
     | 
| 
       478 
     | 
    
         
            -
            #             partitioning=custom_aggs_partitioning(),
         
     | 
| 
       479 
     | 
    
         
            -
            #             format="parquet",
         
     | 
| 
       480 
     | 
    
         
            -
            #             existing_data_behavior="overwrite_or_ignore",
         
     | 
| 
       481 
     | 
    
         
            -
            #         )
         
     | 
| 
       482 
     | 
    
         
            -
             
     | 
| 
       483 
     | 
    
         
            -
            #     return write_custom_aggs_to_dataset
         
     | 
| 
       484 
     | 
    
         
            -
             
     | 
| 
       485 
     | 
    
         
            -
             
     | 
| 
       486 
264 
     | 
    
         
             
            def file_visitor(written_file):
         
     | 
| 
       487 
265 
     | 
    
         
             
                print(f"{written_file.path=}")
         
     | 
| 
       488 
266 
     | 
    
         | 
| 
         @@ -504,26 +282,25 @@ def convert_trades_to_custom_aggs( 
     | 
|
| 
       504 
282 
     | 
    
         
             
                #     generate_custom_agg_batches_from_tables(config),
         
     | 
| 
       505 
283 
     | 
    
         
             
                #     schema=custom_aggs_schema(),
         
     | 
| 
       506 
284 
     | 
    
         
             
                #     filesystem=config.filesystem,
         
     | 
| 
       507 
     | 
    
         
            -
                #     base_dir=config. 
     | 
| 
      
 285 
     | 
    
         
            +
                #     base_dir=config.aggs_dir,
         
     | 
| 
       508 
286 
     | 
    
         
             
                #     partitioning=custom_aggs_partitioning(),
         
     | 
| 
       509 
287 
     | 
    
         
             
                #     format="parquet",
         
     | 
| 
       510 
288 
     | 
    
         
             
                #     existing_data_behavior="overwrite_or_ignore",
         
     | 
| 
       511 
     | 
    
         
            -
                #     max_open_files = MAX_FILES_OPEN,
         
     | 
| 
       512 
     | 
    
         
            -
                #     min_rows_per_group = MIN_ROWS_PER_GROUP,
         
     | 
| 
      
 289 
     | 
    
         
            +
                #     # max_open_files = MAX_FILES_OPEN,
         
     | 
| 
      
 290 
     | 
    
         
            +
                #     # min_rows_per_group = MIN_ROWS_PER_GROUP,
         
     | 
| 
       513 
291 
     | 
    
         
             
                # )
         
     | 
| 
       514 
292 
     | 
    
         | 
| 
       515 
293 
     | 
    
         
             
                for date, trades_table in generate_csv_trades_tables(config):
         
     | 
| 
       516 
294 
     | 
    
         
             
                    aggs_table = trades_to_custom_aggs(config, date, trades_table)
         
     | 
| 
       517 
295 
     | 
    
         
             
                    pa_ds.write_dataset(
         
     | 
| 
       518 
296 
     | 
    
         
             
                        aggs_table,
         
     | 
| 
       519 
     | 
    
         
            -
                        # schema=custom_aggs_schema(),
         
     | 
| 
       520 
297 
     | 
    
         
             
                        filesystem=config.filesystem,
         
     | 
| 
       521 
298 
     | 
    
         
             
                        base_dir=config.aggs_dir,
         
     | 
| 
       522 
299 
     | 
    
         
             
                        partitioning=custom_aggs_partitioning(),
         
     | 
| 
       523 
300 
     | 
    
         
             
                        format="parquet",
         
     | 
| 
       524 
301 
     | 
    
         
             
                        existing_data_behavior="overwrite_or_ignore",
         
     | 
| 
       525 
302 
     | 
    
         
             
                        file_visitor=file_visitor,
         
     | 
| 
       526 
     | 
    
         
            -
                        # max_open_files= 
     | 
| 
      
 303 
     | 
    
         
            +
                        # max_open_files=10,
         
     | 
| 
       527 
304 
     | 
    
         
             
                        # min_rows_per_group=MIN_ROWS_PER_GROUP,
         
     | 
| 
       528 
305 
     | 
    
         
             
                    )
         
     | 
| 
       529 
306 
     | 
    
         
             
                    del aggs_table
         
     | 
| 
         @@ -559,386 +336,198 @@ def convert_trades_to_custom_aggs( 
     | 
|
| 
       559 
336 
     | 
    
         
             
            #     return mfi
         
     | 
| 
       560 
337 
     | 
    
         | 
| 
       561 
338 
     | 
    
         | 
| 
       562 
     | 
    
         
            -
             
     | 
| 
       563 
     | 
    
         
            -
             
     | 
| 
       564 
     | 
    
         
            -
             
     | 
| 
       565 
     | 
    
         
            -
             
     | 
| 
       566 
     | 
    
         
            -
             
     | 
| 
       567 
     | 
    
         
            -
             
     | 
| 
       568 
     | 
    
         
            -
             
     | 
| 
       569 
     | 
    
         
            -
             
     | 
| 
       570 
     | 
    
         
            -
             
     | 
| 
       571 
     | 
    
         
            -
             
     | 
| 
       572 
     | 
    
         
            -
             
     | 
| 
       573 
     | 
    
         
            -
             
     | 
| 
       574 
     | 
    
         
            -
             
     | 
| 
       575 
     | 
    
         
            -
             
     | 
| 
       576 
     | 
    
         
            -
             
     | 
| 
       577 
     | 
    
         
            -
             
     | 
| 
       578 
     | 
    
         
            -
            #     # https://github.com/apache/arrow/issues/39839#issuecomment-1915981816
         
     | 
| 
       579 
     | 
    
         
            -
            #     # Also I don't think you can use those in a format string without a separator.
         
     | 
| 
       580 
     | 
    
         
            -
             
     | 
| 
       581 
     | 
    
         
            -
            #     # Polygon price scale is 4 decimal places (i.e. hundredths of a penny), but we'll use 10 because we have precision to spare.
         
     | 
| 
       582 
     | 
    
         
            -
            #     # price_type = pa.decimal128(precision=38, scale=10)
         
     | 
| 
       583 
     | 
    
         
            -
            #     # 64bit float a little overkill but avoids any plausible truncation error.
         
     | 
| 
       584 
     | 
    
         
            -
            #     price_type = pa.float64()
         
     | 
| 
       585 
     | 
    
         
            -
             
     | 
| 
       586 
     | 
    
         
            -
            #     custom_aggs_schema = pa.schema(
         
     | 
| 
       587 
     | 
    
         
            -
            #         [
         
     | 
| 
       588 
     | 
    
         
            -
            #             pa.field("ticker", pa.string(), nullable=False),
         
     | 
| 
       589 
     | 
    
         
            -
            #             pa.field("volume", pa.int64(), nullable=False),
         
     | 
| 
       590 
     | 
    
         
            -
            #             pa.field("open", price_type, nullable=False),
         
     | 
| 
       591 
     | 
    
         
            -
            #             pa.field("close", price_type, nullable=False),
         
     | 
| 
       592 
     | 
    
         
            -
            #             pa.field("high", price_type, nullable=False),
         
     | 
| 
       593 
     | 
    
         
            -
            #             pa.field("low", price_type, nullable=False),
         
     | 
| 
       594 
     | 
    
         
            -
            #             pa.field("window_start", timestamp_type, nullable=False),
         
     | 
| 
       595 
     | 
    
         
            -
            #             pa.field("transactions", pa.int64(), nullable=False),
         
     | 
| 
       596 
     | 
    
         
            -
            #             pa.field(PARTITION_COLUMN_NAME, pa.string(), nullable=False),
         
     | 
| 
       597 
     | 
    
         
            -
            #         ]
         
     | 
| 
       598 
     | 
    
         
            -
            #     )
         
     | 
| 
       599 
     | 
    
         
            -
             
     | 
| 
       600 
     | 
    
         
            -
            #     # TODO: Use generator like os.walk for paths.
         
     | 
| 
       601 
     | 
    
         
            -
            #     return (
         
     | 
| 
       602 
     | 
    
         
            -
            #         custom_aggs_schema,
         
     | 
| 
       603 
     | 
    
         
            -
            #         generate_tables_from_custom_aggs(
         
     | 
| 
       604 
     | 
    
         
            -
            #             paths=config.csv_paths(),
         
     | 
| 
       605 
     | 
    
         
            -
            #             schema=custom_aggs_schema,
         
     | 
| 
       606 
     | 
    
         
            -
            #             start_timestamp=config.start_timestamp,
         
     | 
| 
       607 
     | 
    
         
            -
            #             limit_timestamp=config.end_timestamp + pd.to_timedelta(1, unit="day"),
         
     | 
| 
       608 
     | 
    
         
            -
            #         ),
         
     | 
| 
       609 
     | 
    
         
            -
            #     )
         
     | 
| 
      
 339 
     | 
    
         
            +
            def get_by_ticker_aggs_dates(config: PolygonConfig) -> set[datetime.date]:
         
     | 
| 
      
 340 
     | 
    
         
            +
                file_info = config.filesystem.get_file_info(config.by_ticker_aggs_arrow_dir)
         
     | 
| 
      
 341 
     | 
    
         
            +
                if file_info.type == pa_fs.FileType.NotFound:
         
     | 
| 
      
 342 
     | 
    
         
            +
                    return set()
         
     | 
| 
      
 343 
     | 
    
         
            +
                by_ticker_aggs_ds = pa_ds.dataset(
         
     | 
| 
      
 344 
     | 
    
         
            +
                    config.by_ticker_aggs_arrow_dir,
         
     | 
| 
      
 345 
     | 
    
         
            +
                    format="parquet",
         
     | 
| 
      
 346 
     | 
    
         
            +
                    schema=custom_aggs_schema(),
         
     | 
| 
      
 347 
     | 
    
         
            +
                    partitioning=custom_aggs_partitioning(),
         
     | 
| 
      
 348 
     | 
    
         
            +
                )
         
     | 
| 
      
 349 
     | 
    
         
            +
                return set(
         
     | 
| 
      
 350 
     | 
    
         
            +
                    [
         
     | 
| 
      
 351 
     | 
    
         
            +
                        pa_ds.get_partition_keys(fragment.partition_expression).get("date")
         
     | 
| 
      
 352 
     | 
    
         
            +
                        for fragment in by_ticker_aggs_ds.get_fragments()
         
     | 
| 
      
 353 
     | 
    
         
            +
                    ]
         
     | 
| 
      
 354 
     | 
    
         
            +
                )
         
     | 
| 
       610 
355 
     | 
    
         | 
| 
       611 
     | 
    
         
            -
            # def get_custom_aggs_dates(config: PolygonConfig) -> set[datetime.date]:
         
     | 
| 
       612 
     | 
    
         
            -
            #     file_info = config.filesystem.get_file_info(config.custom_aggs_dir)
         
     | 
| 
       613 
     | 
    
         
            -
            #     if file_info.type == pa_fs.FileType.NotFound:
         
     | 
| 
       614 
     | 
    
         
            -
            #         return set()
         
     | 
| 
       615 
     | 
    
         
            -
            #     aggs_ds = pa_ds.dataset(
         
     | 
| 
       616 
     | 
    
         
            -
            #         config.custom_aggs_dir,
         
     | 
| 
       617 
     | 
    
         
            -
            #         format="parquet",
         
     | 
| 
       618 
     | 
    
         
            -
            #         schema=custom_aggs_schema(),
         
     | 
| 
       619 
     | 
    
         
            -
            #         partitioning=custom_aggs_partitioning(),
         
     | 
| 
       620 
     | 
    
         
            -
            #     )
         
     | 
| 
       621 
     | 
    
         
            -
            #     return set(
         
     | 
| 
       622 
     | 
    
         
            -
            #         [
         
     | 
| 
       623 
     | 
    
         
            -
            #             pa_ds.get_partition_keys(fragment.partition_expression).get("date")
         
     | 
| 
       624 
     | 
    
         
            -
            #             for fragment in aggs_ds.get_fragments()
         
     | 
| 
       625 
     | 
    
         
            -
            #         ]
         
     | 
| 
       626 
     | 
    
         
            -
            #     )
         
     | 
| 
       627 
356 
     | 
    
         | 
| 
      
 357 
     | 
    
         
            +
            def batches_for_date(aggs_ds: pa_ds.Dataset, date: pd.Timestamp):
         
     | 
| 
      
 358 
     | 
    
         
            +
                date_filter_expr = (
         
     | 
| 
      
 359 
     | 
    
         
            +
                    (pa_compute.field("year") == date.year)
         
     | 
| 
      
 360 
     | 
    
         
            +
                    & (pa_compute.field("month") == date.month)
         
     | 
| 
      
 361 
     | 
    
         
            +
                    & (pa_compute.field("date") == date.date())
         
     | 
| 
      
 362 
     | 
    
         
            +
                )
         
     | 
| 
      
 363 
     | 
    
         
            +
                print(f"table for {date=}")
         
     | 
| 
      
 364 
     | 
    
         
            +
                # return aggs_ds.scanner(filter=date_filter_expr).to_batches()
         
     | 
| 
      
 365 
     | 
    
         
            +
                table = aggs_ds.scanner(filter=date_filter_expr).to_table()
         
     | 
| 
      
 366 
     | 
    
         
            +
                table = table.sort_by([("part", "ascending"), ("ticker", "ascending"), ("window_start", "ascending"), ])
         
     | 
| 
      
 367 
     | 
    
         
            +
                return table.to_batches()
         
     | 
| 
       628 
368 
     | 
    
         | 
| 
       629 
     | 
    
         
            -
            def  
     | 
| 
       630 
     | 
    
         
            -
                 
     | 
| 
       631 
     | 
    
         
            -
             
     | 
| 
      
 369 
     | 
    
         
            +
            def generate_batches_for_schedule(config, aggs_ds):
         
     | 
| 
      
 370 
     | 
    
         
            +
                schedule = config.calendar.trading_index(
         
     | 
| 
      
 371 
     | 
    
         
            +
                    start=config.start_timestamp, end=config.end_timestamp, period="1D"
         
     | 
| 
      
 372 
     | 
    
         
            +
                )
         
     | 
| 
       632 
373 
     | 
    
         
             
                for timestamp in schedule:
         
     | 
| 
       633 
     | 
    
         
            -
                     
     | 
| 
       634 
     | 
    
         
            -
                     
     | 
| 
       635 
     | 
    
         
            -
                        (pa_compute.field("year") == date.year)
         
     | 
| 
       636 
     | 
    
         
            -
                        & (pa_compute.field("month") == date.month)
         
     | 
| 
       637 
     | 
    
         
            -
                        & (pa_compute.field("date") == date)
         
     | 
| 
       638 
     | 
    
         
            -
                    )
         
     | 
| 
       639 
     | 
    
         
            -
                    for batch in aggs_ds.to_batches(filter=date_filter_expr):
         
     | 
| 
       640 
     | 
    
         
            -
                        # TODO: Check that these rows are within range for this file's date (not just the whole session).
         
     | 
| 
       641 
     | 
    
         
            -
                        # And if we're doing that (figuring date for each file), we can just skip reading the file.
         
     | 
| 
       642 
     | 
    
         
            -
                        # Might able to do a single comparison using compute.days_between.
         
     | 
| 
       643 
     | 
    
         
            -
                        # https://arrow.apache.org/docs/python/generated/pyarrow.compute.days_between.html
         
     | 
| 
       644 
     | 
    
         
            -
                        batch = batch.append_column(
         
     | 
| 
       645 
     | 
    
         
            -
                            PARTITION_COLUMN_NAME,
         
     | 
| 
       646 
     | 
    
         
            -
                            pa.array(
         
     | 
| 
       647 
     | 
    
         
            -
                                [
         
     | 
| 
       648 
     | 
    
         
            -
                                    to_partition_key(ticker)
         
     | 
| 
       649 
     | 
    
         
            -
                                    for ticker in batch.column("ticker").to_pylist()
         
     | 
| 
       650 
     | 
    
         
            -
                                ]
         
     | 
| 
       651 
     | 
    
         
            -
                            ),
         
     | 
| 
       652 
     | 
    
         
            -
                        )
         
     | 
| 
       653 
     | 
    
         
            -
                        yield batch
         
     | 
| 
      
 374 
     | 
    
         
            +
                    # print(f"{timestamp=}")
         
     | 
| 
      
 375 
     | 
    
         
            +
                    yield from batches_for_date(aggs_ds=aggs_ds, date=timestamp)
         
     | 
| 
       654 
376 
     | 
    
         | 
| 
       655 
377 
     | 
    
         | 
| 
       656 
     | 
    
         
            -
            def scatter_custom_aggs_to_by_ticker(
         
     | 
| 
      
 378 
     | 
    
         
            +
            # def scatter_custom_aggs_to_by_ticker(
         
     | 
| 
      
 379 
     | 
    
         
            +
            #     config: PolygonConfig,
         
     | 
| 
      
 380 
     | 
    
         
            +
            #     overwrite: bool = False,
         
     | 
| 
      
 381 
     | 
    
         
            +
            # ) -> str:
         
     | 
| 
      
 382 
     | 
    
         
            +
            #     lock = FileLock(config.lock_file_path, blocking=False)
         
     | 
| 
      
 383 
     | 
    
         
            +
            #     with lock:
         
     | 
| 
      
 384 
     | 
    
         
            +
            #         if not lock.is_locked:
         
     | 
| 
      
 385 
     | 
    
         
            +
            #             raise IOError("Failed to acquire lock for updating custom assets.")
         
     | 
| 
      
 386 
     | 
    
         
            +
            #         with open(config.by_ticker_dates_path, "a") as f:
         
     | 
| 
      
 387 
     | 
    
         
            +
            #             f.write("I have a bad feeling about this.")
         
     | 
| 
      
 388 
     | 
    
         
            +
            #             by_ticker_aggs_arrow_dir = scatter_custom_aggs_to_by_ticker_(config, overwrite)
         
     | 
| 
      
 389 
     | 
    
         
            +
             
     | 
| 
      
 390 
     | 
    
         
            +
            #             print(f"Scattered custom aggregates by ticker to {by_ticker_aggs_arrow_dir=}")
         
     | 
| 
      
 391 
     | 
    
         
            +
            #             return by_ticker_aggs_arrow_dir
         
     | 
| 
      
 392 
     | 
    
         
            +
             
     | 
| 
      
 393 
     | 
    
         
            +
             
     | 
| 
      
 394 
     | 
    
         
            +
            def filter_by_date(config: PolygonConfig) -> pa_compute.Expression:
         
     | 
| 
      
 395 
     | 
    
         
            +
                start_date = config.start_timestamp.tz_localize(config.calendar.tz.key).date()
         
     | 
| 
      
 396 
     | 
    
         
            +
                limit_date = (
         
     | 
| 
      
 397 
     | 
    
         
            +
                    (config.end_timestamp + pd.Timedelta(days=1))
         
     | 
| 
      
 398 
     | 
    
         
            +
                    .tz_localize(config.calendar.tz.key)
         
     | 
| 
      
 399 
     | 
    
         
            +
                    .date()
         
     | 
| 
      
 400 
     | 
    
         
            +
                )
         
     | 
| 
      
 401 
     | 
    
         
            +
                return (pa_compute.field("date") >= start_date) & (
         
     | 
| 
      
 402 
     | 
    
         
            +
                    pa_compute.field("date") <= limit_date
         
     | 
| 
      
 403 
     | 
    
         
            +
                )
         
     | 
| 
      
 404 
     | 
    
         
            +
             
     | 
| 
      
 405 
     | 
    
         
            +
             
     | 
| 
      
 406 
     | 
    
         
            +
            # def generate_batches_with_partition(
         
     | 
| 
      
 407 
     | 
    
         
            +
            #     config: PolygonConfig,
         
     | 
| 
      
 408 
     | 
    
         
            +
            #     aggs_ds: pa_ds.Dataset,
         
     | 
| 
      
 409 
     | 
    
         
            +
            # ) -> Iterator[pa.Table]:
         
     | 
| 
      
 410 
     | 
    
         
            +
            #     for fragment in aggs_ds.sort_by("date").get_fragments(
         
     | 
| 
      
 411 
     | 
    
         
            +
            #         filter=filter_by_date(config),
         
     | 
| 
      
 412 
     | 
    
         
            +
            #     ):
         
     | 
| 
      
 413 
     | 
    
         
            +
            #         for batch in fragment.to_batches():
         
     | 
| 
      
 414 
     | 
    
         
            +
            #             # batch = batch.append_column(
         
     | 
| 
      
 415 
     | 
    
         
            +
            #             #     PARTITION_COLUMN_NAME,
         
     | 
| 
      
 416 
     | 
    
         
            +
            #             #     pa.array(
         
     | 
| 
      
 417 
     | 
    
         
            +
            #             #         [
         
     | 
| 
      
 418 
     | 
    
         
            +
            #             #             to_partition_key(ticker)
         
     | 
| 
      
 419 
     | 
    
         
            +
            #             #             for ticker in batch.column("ticker").to_pylist()
         
     | 
| 
      
 420 
     | 
    
         
            +
            #             #         ]
         
     | 
| 
      
 421 
     | 
    
         
            +
            #             #     ),
         
     | 
| 
      
 422 
     | 
    
         
            +
            #             # )
         
     | 
| 
      
 423 
     | 
    
         
            +
            #             yield batch.sort_by(
         
     | 
| 
      
 424 
     | 
    
         
            +
            #                 [("ticker", "ascending"), ("window_start", "ascending")]
         
     | 
| 
      
 425 
     | 
    
         
            +
            #             )
         
     | 
| 
      
 426 
     | 
    
         
            +
            #             del batch
         
     | 
| 
      
 427 
     | 
    
         
            +
            #         del fragment
         
     | 
| 
      
 428 
     | 
    
         
            +
             
     | 
| 
      
 429 
     | 
    
         
            +
             
     | 
| 
      
 430 
     | 
    
         
            +
            def generate_batches_with_partition(
         
     | 
| 
       657 
431 
     | 
    
         
             
                config: PolygonConfig,
         
     | 
| 
       658 
     | 
    
         
            -
                 
     | 
| 
       659 
     | 
    
         
            -
            ) ->  
     | 
| 
       660 
     | 
    
         
            -
                 
     | 
| 
       661 
     | 
    
         
            -
             
     | 
| 
       662 
     | 
    
         
            -
                     
     | 
| 
      
 432 
     | 
    
         
            +
                aggs_ds: pa_ds.Dataset,
         
     | 
| 
      
 433 
     | 
    
         
            +
            ) -> Iterator[pa.Table]:
         
     | 
| 
      
 434 
     | 
    
         
            +
                for fragment in (
         
     | 
| 
      
 435 
     | 
    
         
            +
                    aggs_ds.filter(filter_by_date(config))
         
     | 
| 
      
 436 
     | 
    
         
            +
                    .sort_by([(PARTITION_COLUMN_NAME, "ascending"), ("date", "ascending")])
         
     | 
| 
      
 437 
     | 
    
         
            +
                    .get_fragments()
         
     | 
| 
      
 438 
     | 
    
         
            +
                ):
         
     | 
| 
      
 439 
     | 
    
         
            +
                    for batch in fragment.to_batches():
         
     | 
| 
      
 440 
     | 
    
         
            +
                        yield batch.sort_by(
         
     | 
| 
      
 441 
     | 
    
         
            +
                            [("ticker", "ascending"), ("window_start", "ascending")]
         
     | 
| 
      
 442 
     | 
    
         
            +
                        )
         
     | 
| 
      
 443 
     | 
    
         
            +
                        del batch
         
     | 
| 
      
 444 
     | 
    
         
            +
                    del fragment
         
     | 
| 
       663 
445 
     | 
    
         | 
| 
       664 
     | 
    
         
            -
                by_ticker_aggs_arrow_dir = config.by_ticker_aggs_arrow_dir
         
     | 
| 
       665 
     | 
    
         
            -
                if os.path.exists(by_ticker_aggs_arrow_dir):
         
     | 
| 
       666 
     | 
    
         
            -
                    if overwrite:
         
     | 
| 
       667 
     | 
    
         
            -
                        print(f"Removing {by_ticker_aggs_arrow_dir=}")
         
     | 
| 
       668 
     | 
    
         
            -
                        shutil.rmtree(by_ticker_aggs_arrow_dir)
         
     | 
| 
       669 
     | 
    
         
            -
                    else:
         
     | 
| 
       670 
     | 
    
         
            -
                        print(f"Found existing {by_ticker_aggs_arrow_dir=}")
         
     | 
| 
       671 
     | 
    
         
            -
                        return by_ticker_aggs_arrow_dir
         
     | 
| 
       672 
446 
     | 
    
         | 
| 
      
 447 
     | 
    
         
            +
            def scatter_custom_aggs_to_by_ticker(config, overwrite=False) -> str:
         
     | 
| 
       673 
448 
     | 
    
         
             
                aggs_ds = pa_ds.dataset(
         
     | 
| 
       674 
     | 
    
         
            -
                    config. 
     | 
| 
      
 449 
     | 
    
         
            +
                    config.aggs_dir,
         
     | 
| 
       675 
450 
     | 
    
         
             
                    format="parquet",
         
     | 
| 
       676 
451 
     | 
    
         
             
                    schema=custom_aggs_schema(),
         
     | 
| 
       677 
452 
     | 
    
         
             
                    partitioning=custom_aggs_partitioning(),
         
     | 
| 
       678 
453 
     | 
    
         
             
                )
         
     | 
| 
       679 
     | 
    
         
            -
                 
     | 
| 
       680 
     | 
    
         
            -
                    start=config.start_timestamp, end=config.end_timestamp, period="1D"
         
     | 
| 
       681 
     | 
    
         
            -
                )
         
     | 
| 
       682 
     | 
    
         
            -
                assert type(schedule) is pd.DatetimeIndex
         
     | 
| 
      
 454 
     | 
    
         
            +
                by_ticker_schema = aggs_ds.schema
         
     | 
| 
       683 
455 
     | 
    
         
             
                partitioning = pa_ds.partitioning(
         
     | 
| 
       684 
     | 
    
         
            -
                    pa.schema([(PARTITION_COLUMN_NAME, pa.string())]), 
     | 
| 
      
 456 
     | 
    
         
            +
                    pa.schema([(PARTITION_COLUMN_NAME, pa.string())]),
         
     | 
| 
      
 457 
     | 
    
         
            +
                    flavor="hive",
         
     | 
| 
       685 
458 
     | 
    
         
             
                )
         
     | 
| 
       686 
     | 
    
         
            -
                 
     | 
| 
       687 
     | 
    
         
            -
                 
     | 
| 
       688 
     | 
    
         
            -
             
     | 
| 
      
 459 
     | 
    
         
            +
                by_ticker_aggs_arrow_dir = config.by_ticker_aggs_arrow_dir
         
     | 
| 
      
 460 
     | 
    
         
            +
                print(f"Scattering custom aggregates by ticker to {by_ticker_aggs_arrow_dir=}")
         
     | 
| 
       689 
461 
     | 
    
         
             
                pa_ds.write_dataset(
         
     | 
| 
       690 
     | 
    
         
            -
                     
     | 
| 
       691 
     | 
    
         
            -
                     
     | 
| 
      
 462 
     | 
    
         
            +
                    # generate_batches_with_partition(config=config, aggs_ds=aggs_ds),
         
     | 
| 
      
 463 
     | 
    
         
            +
                    generate_batches_for_schedule(config=config, aggs_ds=aggs_ds),
         
     | 
| 
      
 464 
     | 
    
         
            +
                    schema=by_ticker_schema,
         
     | 
| 
       692 
465 
     | 
    
         
             
                    base_dir=by_ticker_aggs_arrow_dir,
         
     | 
| 
       693 
466 
     | 
    
         
             
                    partitioning=partitioning,
         
     | 
| 
       694 
467 
     | 
    
         
             
                    format="parquet",
         
     | 
| 
       695 
468 
     | 
    
         
             
                    existing_data_behavior="overwrite_or_ignore",
         
     | 
| 
       696 
469 
     | 
    
         
             
                )
         
     | 
| 
       697 
     | 
    
         
            -
                print(f"Scattered  
     | 
| 
      
 470 
     | 
    
         
            +
                print(f"Scattered aggregates by ticker to {by_ticker_aggs_arrow_dir=}")
         
     | 
| 
       698 
471 
     | 
    
         
             
                return by_ticker_aggs_arrow_dir
         
     | 
| 
       699 
472 
     | 
    
         | 
| 
       700 
473 
     | 
    
         | 
| 
       701 
     | 
    
         
            -
            def  
     | 
| 
       702 
     | 
    
         
            -
             
     | 
| 
       703 
     | 
    
         
            -
             
     | 
| 
       704 
     | 
    
         
            -
             
     | 
| 
       705 
     | 
    
         
            -
                # Calculate gain and loss using vectorized operations
         
     | 
| 
       706 
     | 
    
         
            -
                positive_mf = np.maximum(signed_mf, 0)
         
     | 
| 
       707 
     | 
    
         
            -
                negative_mf = np.maximum(-signed_mf, 0)
         
     | 
| 
       708 
     | 
    
         
            -
             
     | 
| 
       709 
     | 
    
         
            -
                mf_avg_gain = (
         
     | 
| 
       710 
     | 
    
         
            -
                    np.convolve(positive_mf, np.ones(period), mode="full")[: len(positive_mf)]
         
     | 
| 
       711 
     | 
    
         
            -
                    / period
         
     | 
| 
       712 
     | 
    
         
            -
                )
         
     | 
| 
       713 
     | 
    
         
            -
                mf_avg_loss = (
         
     | 
| 
       714 
     | 
    
         
            -
                    np.convolve(negative_mf, np.ones(period), mode="full")[: len(negative_mf)]
         
     | 
| 
       715 
     | 
    
         
            -
                    / period
         
     | 
| 
       716 
     | 
    
         
            -
                )
         
     | 
| 
       717 
     | 
    
         
            -
             
     | 
| 
       718 
     | 
    
         
            -
                epsilon = 1e-10  # Small epsilon value to avoid division by zero
         
     | 
| 
       719 
     | 
    
         
            -
                mfi = 100 - (100 / (1 + mf_avg_gain / (mf_avg_loss + epsilon)))
         
     | 
| 
       720 
     | 
    
         
            -
                return mfi
         
     | 
| 
       721 
     | 
    
         
            -
             
     | 
| 
       722 
     | 
    
         
            -
             
     | 
| 
       723 
     | 
    
         
            -
            # https://github.com/twopirllc/pandas-ta/blob/main/pandas_ta/momentum/stoch.py
         
     | 
| 
       724 
     | 
    
         
            -
            # https://github.com/twopirllc/pandas-ta/blob/development/pandas_ta/momentum/stoch.py
         
     | 
| 
       725 
     | 
    
         
            -
            # `k` vs `fast_k` arg names.
         
     | 
| 
       726 
     | 
    
         
            -
            # https://github.com/twopirllc/pandas-ta/issues/726
         
     | 
| 
       727 
     | 
    
         
            -
            # Results affected by values outside range
         
     | 
| 
       728 
     | 
    
         
            -
            # https://github.com/twopirllc/pandas-ta/issues/535
         
     | 
| 
       729 
     | 
    
         
            -
             
     | 
| 
       730 
     | 
    
         
            -
             
     | 
| 
       731 
     | 
    
         
            -
            def calculate_stoch(
         
     | 
| 
       732 
     | 
    
         
            -
                high: pd.Series,
         
     | 
| 
       733 
     | 
    
         
            -
                low: pd.Series,
         
     | 
| 
       734 
     | 
    
         
            -
                close: pd.Series,
         
     | 
| 
       735 
     | 
    
         
            -
                k: int = 14,
         
     | 
| 
       736 
     | 
    
         
            -
                d: int = 3,
         
     | 
| 
       737 
     | 
    
         
            -
                smooth_k: int = 3,
         
     | 
| 
       738 
     | 
    
         
            -
                mamode: str = "sma",
         
     | 
| 
       739 
     | 
    
         
            -
            ):
         
     | 
| 
       740 
     | 
    
         
            -
                """Indicator: Stochastic Oscillator (STOCH)"""
         
     | 
| 
       741 
     | 
    
         
            -
                lowest_low = low.rolling(k).min()
         
     | 
| 
       742 
     | 
    
         
            -
                highest_high = high.rolling(k).max()
         
     | 
| 
      
 474 
     | 
    
         
            +
            # def scatter_custom_aggs_to_by_ticker(config, overwrite=False) -> str:
         
     | 
| 
      
 475 
     | 
    
         
            +
            #     file_info = config.filesystem.get_file_info(config.aggs_dir)
         
     | 
| 
      
 476 
     | 
    
         
            +
            #     if file_info.type == pa_fs.FileType.NotFound:
         
     | 
| 
      
 477 
     | 
    
         
            +
            #         raise FileNotFoundError(f"{config.aggs_dir=} not found.")
         
     | 
| 
       743 
478 
     | 
    
         | 
| 
       744 
     | 
    
         
            -
             
     | 
| 
       745 
     | 
    
         
            -
             
     | 
| 
      
 479 
     | 
    
         
            +
            #     by_ticker_aggs_arrow_dir = config.by_ticker_aggs_arrow_dir
         
     | 
| 
      
 480 
     | 
    
         
            +
            #     if os.path.exists(by_ticker_aggs_arrow_dir):
         
     | 
| 
      
 481 
     | 
    
         
            +
            #         if overwrite:
         
     | 
| 
      
 482 
     | 
    
         
            +
            #             print(f"Removing {by_ticker_aggs_arrow_dir=}")
         
     | 
| 
      
 483 
     | 
    
         
            +
            #             shutil.rmtree(by_ticker_aggs_arrow_dir)
         
     | 
| 
       746 
484 
     | 
    
         | 
| 
       747 
     | 
    
         
            -
             
     | 
| 
       748 
     | 
    
         
            -
             
     | 
| 
       749 
     | 
    
         
            -
             
     | 
| 
       750 
     | 
    
         
            -
             
     | 
| 
       751 
     | 
    
         
            -
                    ta.overlap.ma(mamode, stoch_k.loc[stoch_k.first_valid_index() :,], length=d)
         
     | 
| 
       752 
     | 
    
         
            -
                    if stoch_k is not None
         
     | 
| 
       753 
     | 
    
         
            -
                    else None
         
     | 
| 
       754 
     | 
    
         
            -
                )
         
     | 
| 
       755 
     | 
    
         
            -
                # Histogram
         
     | 
| 
       756 
     | 
    
         
            -
                stoch_h = stoch_k - stoch_d if stoch_d is not None else None
         
     | 
| 
      
 485 
     | 
    
         
            +
            #     schedule = config.calendar.trading_index(
         
     | 
| 
      
 486 
     | 
    
         
            +
            #         start=config.start_timestamp, end=config.end_timestamp, period="1D"
         
     | 
| 
      
 487 
     | 
    
         
            +
            #     )
         
     | 
| 
      
 488 
     | 
    
         
            +
            #     assert type(schedule) is pd.DatetimeIndex
         
     | 
| 
       757 
489 
     | 
    
         | 
| 
       758 
     | 
    
         
            -
             
     | 
| 
      
 490 
     | 
    
         
            +
            #     print(f"Scattering custom aggregates by ticker to {by_ticker_aggs_arrow_dir=}")
         
     | 
| 
      
 491 
     | 
    
         
            +
            #     aggs_ds = pa_ds.dataset(
         
     | 
| 
      
 492 
     | 
    
         
            +
            #         config.aggs_dir,
         
     | 
| 
      
 493 
     | 
    
         
            +
            #         format="parquet",
         
     | 
| 
      
 494 
     | 
    
         
            +
            #         schema=custom_aggs_schema(),
         
     | 
| 
      
 495 
     | 
    
         
            +
            #         partitioning=custom_aggs_partitioning(),
         
     | 
| 
      
 496 
     | 
    
         
            +
            #     )
         
     | 
| 
      
 497 
     | 
    
         
            +
            #     by_ticker_partitioning = pa_ds.partitioning(
         
     | 
| 
      
 498 
     | 
    
         
            +
            #         pa.schema([(PARTITION_COLUMN_NAME, pa.string())]),
         
     | 
| 
      
 499 
     | 
    
         
            +
            #         # pa.schema(
         
     | 
| 
      
 500 
     | 
    
         
            +
            #         #     [
         
     | 
| 
      
 501 
     | 
    
         
            +
            #         #         (PARTITION_COLUMN_NAME, pa.string()),
         
     | 
| 
      
 502 
     | 
    
         
            +
            #         #         ("year", pa.uint16()),
         
     | 
| 
      
 503 
     | 
    
         
            +
            #         #         ("month", pa.uint8()),
         
     | 
| 
      
 504 
     | 
    
         
            +
            #         #         ("date", pa.date32()),
         
     | 
| 
      
 505 
     | 
    
         
            +
            #         #     ]
         
     | 
| 
      
 506 
     | 
    
         
            +
            #         # ),
         
     | 
| 
      
 507 
     | 
    
         
            +
            #         flavor="hive",
         
     | 
| 
      
 508 
     | 
    
         
            +
            #     )
         
     | 
| 
      
 509 
     | 
    
         
            +
            #     by_ticker_schema = custom_aggs_schema()
         
     | 
| 
      
 510 
     | 
    
         
            +
            #     by_ticker_schema = by_ticker_schema.append(
         
     | 
| 
      
 511 
     | 
    
         
            +
            #         pa.field(PARTITION_COLUMN_NAME, pa.string(), nullable=False),
         
     | 
| 
      
 512 
     | 
    
         
            +
            #     )
         
     | 
| 
       759 
513 
     | 
    
         | 
| 
      
 514 
     | 
    
         
            +
            #     # TODO: Collect the dates we've scattered and write a special partition key with them.
         
     | 
| 
      
 515 
     | 
    
         
            +
            #     pa_ds.write_dataset(
         
     | 
| 
      
 516 
     | 
    
         
            +
            #         generate_batches_for_schedule(schedule, aggs_ds),
         
     | 
| 
      
 517 
     | 
    
         
            +
            #         schema=by_ticker_schema,
         
     | 
| 
      
 518 
     | 
    
         
            +
            #         base_dir=by_ticker_aggs_arrow_dir,
         
     | 
| 
      
 519 
     | 
    
         
            +
            #         partitioning=by_ticker_partitioning,
         
     | 
| 
      
 520 
     | 
    
         
            +
            #         format="parquet",
         
     | 
| 
      
 521 
     | 
    
         
            +
            #         existing_data_behavior="overwrite_or_ignore",
         
     | 
| 
      
 522 
     | 
    
         
            +
            #         # max_open_files=250,
         
     | 
| 
      
 523 
     | 
    
         
            +
            #         # file_visitor=file_visitor,
         
     | 
| 
      
 524 
     | 
    
         
            +
            #     )
         
     | 
| 
       760 
525 
     | 
    
         | 
| 
       761 
     | 
    
         
            -
             
     | 
| 
       762 
     | 
    
         
            -
                df = df.set_index("window_start").sort_index()
         
     | 
| 
       763 
     | 
    
         
            -
                session_index = pd.date_range(
         
     | 
| 
       764 
     | 
    
         
            -
                    start=df.index[0], end=df.index[-1], freq=pd.Timedelta(seconds=60)
         
     | 
| 
       765 
     | 
    
         
            -
                )
         
     | 
| 
       766 
     | 
    
         
            -
                df = df.reindex(session_index)
         
     | 
| 
       767 
     | 
    
         
            -
                df.index.rename("window_start", inplace=True)
         
     | 
| 
       768 
     | 
    
         
            -
             
     | 
| 
       769 
     | 
    
         
            -
                # df["minute_of_day"] = (df.index.hour * 60) + df.index.minute
         
     | 
| 
       770 
     | 
    
         
            -
                # df["day_of_week"] = df.index.day_of_week
         
     | 
| 
       771 
     | 
    
         
            -
             
     | 
| 
       772 
     | 
    
         
            -
                df.transactions = df.transactions.fillna(0)
         
     | 
| 
       773 
     | 
    
         
            -
                df.volume = df.volume.fillna(0)
         
     | 
| 
       774 
     | 
    
         
            -
                df.total = df.total.fillna(0)
         
     | 
| 
       775 
     | 
    
         
            -
                df.close = df.close.ffill()
         
     | 
| 
       776 
     | 
    
         
            -
                close = df.close
         
     | 
| 
       777 
     | 
    
         
            -
                df.vwap = df.vwap.fillna(close)
         
     | 
| 
       778 
     | 
    
         
            -
                df.high = df.high.fillna(close)
         
     | 
| 
       779 
     | 
    
         
            -
                df.low = df.low.fillna(close)
         
     | 
| 
       780 
     | 
    
         
            -
                df.open = df.open.fillna(close)
         
     | 
| 
       781 
     | 
    
         
            -
                price_open = df.open
         
     | 
| 
       782 
     | 
    
         
            -
                high = df.high
         
     | 
| 
       783 
     | 
    
         
            -
                low = df.low
         
     | 
| 
       784 
     | 
    
         
            -
                vwap = df.vwap
         
     | 
| 
       785 
     | 
    
         
            -
                # volume = df.volume
         
     | 
| 
       786 
     | 
    
         
            -
                total = df.total
         
     | 
| 
       787 
     | 
    
         
            -
                next_close = close.shift()
         
     | 
| 
       788 
     | 
    
         
            -
             
     | 
| 
       789 
     | 
    
         
            -
                # TODO: Odometer rollover signal.  Relative difference to nearest power of 10.
         
     | 
| 
       790 
     | 
    
         
            -
                # Something about log10 being a whole number?  When is $50 the rollover vs $100 or $10?
         
     | 
| 
       791 
     | 
    
         
            -
             
     | 
| 
       792 
     | 
    
         
            -
                # "True (Typical?) Price" which I think is an approximation of VWAP.
         
     | 
| 
       793 
     | 
    
         
            -
                # Trouble with both is that if there are no trades in a bar we get NaN.
         
     | 
| 
       794 
     | 
    
         
            -
                # That then means we get NaN for averages for the next period-1 bars too.
         
     | 
| 
       795 
     | 
    
         
            -
                # Question is whether to ffill the price for these calculations.
         
     | 
| 
       796 
     | 
    
         
            -
                df["TP"] = (high + low + close) / 3
         
     | 
| 
       797 
     | 
    
         
            -
             
     | 
| 
       798 
     | 
    
         
            -
                # Gain/loss in this bar.
         
     | 
| 
       799 
     | 
    
         
            -
                df["ret1bar"] = close.div(price_open).sub(1)
         
     | 
| 
       800 
     | 
    
         
            -
             
     | 
| 
       801 
     | 
    
         
            -
                for t in range(2, period):
         
     | 
| 
       802 
     | 
    
         
            -
                    df[f"ret{t}bar"] = close.div(price_open.shift(t - 1)).sub(1)
         
     | 
| 
       803 
     | 
    
         
            -
             
     | 
| 
       804 
     | 
    
         
            -
                # Average True Range (ATR)
         
     | 
| 
       805 
     | 
    
         
            -
                true_range = pd.concat(
         
     | 
| 
       806 
     | 
    
         
            -
                    [high.sub(low), high.sub(next_close).abs(), low.sub(next_close).abs()], axis=1
         
     | 
| 
       807 
     | 
    
         
            -
                ).max(1)
         
     | 
| 
       808 
     | 
    
         
            -
                # Normalized ATR (NATR) or Average of Normalized TR.
         
     | 
| 
       809 
     | 
    
         
            -
                # Choice of NATR operations ordering discussion: https://www.macroption.com/normalized-atr/
         
     | 
| 
       810 
     | 
    
         
            -
                # He doesn't talk about VWAP but I think that is a better normalizing price for a bar.
         
     | 
| 
       811 
     | 
    
         
            -
                # atr = true_range.ewm(span=period).mean()
         
     | 
| 
       812 
     | 
    
         
            -
                # df["natr_c"] = atr / close
         
     | 
| 
       813 
     | 
    
         
            -
                # df["antr_c"] = (true_range / close).ewm(span=period).mean()
         
     | 
| 
       814 
     | 
    
         
            -
                # df["natr_v"] = atr / vwap
         
     | 
| 
       815 
     | 
    
         
            -
                # df["antr_v"] = (true_range / vwap).ewm(span=period).mean()
         
     | 
| 
       816 
     | 
    
         
            -
                df["NATR"] = (true_range / vwap).ewm(span=period).mean()
         
     | 
| 
       817 
     | 
    
         
            -
             
     | 
| 
       818 
     | 
    
         
            -
                # True Price as HLC average VS VWAP.
         
     | 
| 
       819 
     | 
    
         
            -
                # VWAP is better I think but is quite different than standard CCI.
         
     | 
| 
       820 
     | 
    
         
            -
                # Three ways to compute CCI, all give the same value using TP.
         
     | 
| 
       821 
     | 
    
         
            -
                # tp = (high + low + close) / 3
         
     | 
| 
       822 
     | 
    
         
            -
                # df['SMA'] = ta.sma(tp, length=period)
         
     | 
| 
       823 
     | 
    
         
            -
                # df['sma_r'] = tp.rolling(period).mean()
         
     | 
| 
       824 
     | 
    
         
            -
                # df['MAD'] = ta.mad(tp, length=period)
         
     | 
| 
       825 
     | 
    
         
            -
                # # Series.mad deprecated. mad = (s - s.mean()).abs().mean()
         
     | 
| 
       826 
     | 
    
         
            -
                # df['mad_r'] = tp.rolling(period).apply(lambda x: (pd.Series(x) - pd.Series(x).mean()).abs().mean())
         
     | 
| 
       827 
     | 
    
         
            -
             
     | 
| 
       828 
     | 
    
         
            -
                # df['cci_r'] = (tp - df['sma_r']) / (0.015 * df['mad_r'])
         
     | 
| 
       829 
     | 
    
         
            -
                # df['CCI'] = (tp - df['SMA']) / (0.015 * df['MAD'])
         
     | 
| 
       830 
     | 
    
         
            -
                # df['cci_ta'] = ta.cci(high=high, low=low, close=close, length=period)
         
     | 
| 
       831 
     | 
    
         
            -
             
     | 
| 
       832 
     | 
    
         
            -
                df["taCCI"] = ta.cci(high=high, low=low, close=close, length=period)
         
     | 
| 
       833 
     | 
    
         
            -
             
     | 
| 
       834 
     | 
    
         
            -
                # https://gist.github.com/quantra-go-algo/1b37bfb74d69148f0dfbdb5a2c7bdb25
         
     | 
| 
       835 
     | 
    
         
            -
                # https://medium.com/@huzaifazahoor654/how-to-calculate-cci-in-python-a-step-by-step-guide-9a3f61698be6
         
     | 
| 
       836 
     | 
    
         
            -
                sma = pd.Series(ta.sma(vwap, length=period))
         
     | 
| 
       837 
     | 
    
         
            -
                mad = pd.Series(ta.mad(vwap, length=period))
         
     | 
| 
       838 
     | 
    
         
            -
                df["CCI"] = (vwap - sma) / (0.015 * mad)
         
     | 
| 
       839 
     | 
    
         
            -
             
     | 
| 
       840 
     | 
    
         
            -
                # df['MFI'] = calculate_mfi(high=high, low=low, close=close, volume=volume, period=period)
         
     | 
| 
       841 
     | 
    
         
            -
                df["MFI"] = calculate_mfi(typical_price=vwap, money_flow=total, period=period)
         
     | 
| 
       842 
     | 
    
         
            -
             
     | 
| 
       843 
     | 
    
         
            -
                # We use Stochastic (rather than MACD because we need a ticker independent indicator.
         
     | 
| 
       844 
     | 
    
         
            -
                # IOW a percentage price oscillator (PPO) rather than absolute price oscillator (APO).
         
     | 
| 
       845 
     | 
    
         
            -
                # https://www.alpharithms.com/moving-average-convergence-divergence-macd-031217/
         
     | 
| 
       846 
     | 
    
         
            -
                # We're using 14/3 currently rather than the usual 26/12 popular for MACD though.
         
     | 
| 
       847 
     | 
    
         
            -
                stoch_k, stoch_d, stoch_h = calculate_stoch(high, low, close, k=period)
         
     | 
| 
       848 
     | 
    
         
            -
                df["STOCHk"] = stoch_k
         
     | 
| 
       849 
     | 
    
         
            -
                df["STOCHd"] = stoch_d
         
     | 
| 
       850 
     | 
    
         
            -
                df["STOCHh"] = stoch_h
         
     | 
| 
       851 
     | 
    
         
            -
             
     | 
| 
       852 
     | 
    
         
            -
                return df
         
     | 
| 
       853 
     | 
    
         
            -
             
     | 
| 
       854 
     | 
    
         
            -
             
     | 
| 
       855 
     | 
    
         
            -
            def iterate_all_aggs_tables(
                config: PolygonConfig,
                valid_tickers: pa.Array,
            ) -> Iterator[pa.Table]:
                """Yield the non-empty custom aggregate tables for each scheduled date.

                Walks ``config.calendar.trading_index`` from ``config.start_timestamp``
                to ``config.end_timestamp`` with a ``"1D"`` period.  For every entry,
                the fragments of the parquet dataset at ``config.custom_aggs_dir`` that
                match that date's ``year``/``month``/``date`` fields are read, keeping
                only rows whose ``ticker`` is in ``valid_tickers`` and whose
                ``window_start`` lies in ``[start_dt, end_dt)``.

                Args:
                    config: Supplies the calendar, the date range and the dataset
                        directory.
                    valid_tickers: Tickers to keep; rows for other tickers are dropped.

                Yields:
                    One ``pa.Table`` per fragment that still has rows after filtering.
                    Each table's schema metadata carries a ``"date"`` entry holding
                    the ISO-formatted date it was read for.
                """
                schedule = config.calendar.trading_index(
                    start=config.start_timestamp, end=config.end_timestamp, period="1D"
                )
                # None of the dataset arguments depend on the date, so open the
                # dataset once rather than once per entry in the schedule.
                aggs_ds = pa_ds.dataset(
                    config.custom_aggs_dir,
                    format="parquet",
                    schema=custom_aggs_schema(),
                    partitioning=custom_aggs_partitioning(),
                )
                for timestamp in schedule:
                    date = timestamp.to_pydatetime().date()
                    date_filter_expr = (
                        (pa_compute.field("year") == date.year)
                        & (pa_compute.field("month") == date.month)
                        & (pa_compute.field("date") == date)
                    )
                    for fragment in aggs_ds.get_fragments(filter=date_filter_expr):
                        # NOTE(review): start_dt and end_dt are not assigned anywhere
                        # in this function.  Unless they exist at module scope this
                        # raises NameError on the first fragment -- confirm the
                        # intended session bounds.
                        session_filter = (
                            (pa_compute.field("window_start") >= start_dt)
                            & (pa_compute.field("window_start") < end_dt)
                            & pa_compute.is_in(pa_compute.field("ticker"), valid_tickers)
                        )
                        table = fragment.to_table(filter=session_filter)
                        if table.num_rows == 0:
                            continue
                        # Copy the existing metadata (if any) so the date can be
                        # added without mutating the schema's own mapping.
                        metadata = dict(table.schema.metadata or {})
                        metadata["date"] = date.isoformat()
                        yield table.replace_schema_metadata(metadata)
         
     | 
| 
       892 
     | 
    
         
            -
             
     | 
| 
       893 
     | 
    
         
            -
             
     | 
| 
       894 
     | 
    
         
            -
            # def iterate_all_aggs_with_signals(config: PolygonConfig):
         
     | 
| 
       895 
     | 
    
         
            -
            #     for table in iterate_all_aggs_tables(config):
         
     | 
| 
       896 
     | 
    
         
            -
            #         df = table.to_pandas()
         
     | 
| 
       897 
     | 
    
         
            -
            #         df = df.groupby("ticker").apply(
         
     | 
| 
       898 
     | 
    
         
            -
            #             compute_per_ticker_signals, include_groups=False
         
     | 
| 
       899 
     | 
    
         
            -
            #         )
         
     | 
| 
       900 
     | 
    
         
            -
            #         yield pa.Table.from_pandas(df)
         
     | 
| 
       901 
     | 
    
         
            -
             
     | 
| 
       902 
     | 
    
         
            -
             
     | 
| 
       903 
     | 
    
         
            -
            def compute_signals_for_all_custom_aggs(
         
     | 
| 
       904 
     | 
    
         
            -
                from_config: PolygonConfig,
         
     | 
| 
       905 
     | 
    
         
            -
                to_config: PolygonConfig,
         
     | 
| 
       906 
     | 
    
         
            -
                valid_tickers: pa.Array,
         
     | 
| 
       907 
     | 
    
         
            -
                overwrite: bool = False,
         
     | 
| 
       908 
     | 
    
         
            -
            ) -> str:
         
     | 
| 
       909 
     | 
    
         
            -
                if overwrite:
         
     | 
| 
       910 
     | 
    
         
            -
                    print("WARNING: overwrite not implemented/ignored.")
         
     | 
| 
      
 526 
     | 
    
         
            +
            #     return by_ticker_aggs_arrow_dir
         
     | 
| 
       911 
527 
     | 
    
         | 
| 
       912 
     | 
    
         
            -
                print(f"{to_config.custom_aggs_dir=}")
         
     | 
| 
       913 
528 
     | 
    
         | 
| 
       914 
     | 
    
         
            -
             
     | 
| 
       915 
     | 
    
         
            -
             
     | 
| 
       916 
     | 
    
         
            -
             
     | 
| 
       917 
     | 
    
         
            -
             
     | 
| 
       918 
     | 
    
         
            -
             
     | 
| 
       919 
     | 
    
         
            -
                    df = df.groupby("ticker").apply(
         
     | 
| 
       920 
     | 
    
         
            -
                        compute_per_ticker_signals, include_groups=False
         
     | 
| 
       921 
     | 
    
         
            -
                    )
         
     | 
| 
       922 
     | 
    
         
            -
                    table = pa.Table.from_pandas(df)
         
     | 
| 
       923 
     | 
    
         
            -
                    if table.num_rows > 0:
         
     | 
| 
       924 
     | 
    
         
            -
                        table = table.replace_schema_metadata(metadata)
         
     | 
| 
       925 
     | 
    
         
            -
                        table = table.append_column("date", pa.array(np.full(len(table), date)))
         
     | 
| 
       926 
     | 
    
         
            -
                        table = table.append_column(
         
     | 
| 
       927 
     | 
    
         
            -
                            "year", pa.array(np.full(len(table), date.year), type=pa.uint16())
         
     | 
| 
       928 
     | 
    
         
            -
                        )
         
     | 
| 
       929 
     | 
    
         
            -
                        table = table.append_column(
         
     | 
| 
       930 
     | 
    
         
            -
                            "month", pa.array(np.full(len(table), date.month), type=pa.uint8())
         
     | 
| 
       931 
     | 
    
         
            -
                        )
         
     | 
| 
       932 
     | 
    
         
            -
                        table = table.sort_by(
         
     | 
| 
       933 
     | 
    
         
            -
                            [("ticker", "ascending"), ("window_start", "ascending")]
         
     | 
| 
       934 
     | 
    
         
            -
                        )
         
     | 
| 
       935 
     | 
    
         
            -
                        pa_ds.write_dataset(
         
     | 
| 
       936 
     | 
    
         
            -
                            table,
         
     | 
| 
       937 
     | 
    
         
            -
                            filesystem=to_config.filesystem,
         
     | 
| 
       938 
     | 
    
         
            -
                            base_dir=to_config.custom_aggs_dir,
         
     | 
| 
       939 
     | 
    
         
            -
                            partitioning=custom_aggs_partitioning(),
         
     | 
| 
       940 
     | 
    
         
            -
                            format="parquet",
         
     | 
| 
       941 
     | 
    
         
            -
                            existing_data_behavior="overwrite_or_ignore",
         
     | 
| 
       942 
     | 
    
         
            -
                            file_visitor=file_visitor,
         
     | 
| 
       943 
     | 
    
         
            -
                        )
         
     | 
| 
       944 
     | 
    
         
            -
                return to_config.custom_aggs_dir
         
     | 
| 
      
 529 
     | 
    
         
            +
            # def generate_tables_from_custom_aggs_ds(
         
     | 
| 
      
 530 
     | 
    
         
            +
            #     aggs_ds: pa_ds.Dataset, schedule: pd.DatetimeIndex
         
     | 
| 
      
 531 
     | 
    
         
            +
            # ):
         
     | 
| 
      
 532 
     | 
    
         
            +
            #     for timestamp in schedule:
         
     | 
| 
      
 533 
     | 
    
         
            +
            #         yield table_for_date(aggs_ds=aggs_ds, date=timestamp.to_pydatetime().date())
         
     |