zipline_polygon_bundle 0.1.7__py3-none-any.whl → 0.2.0.dev1__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to their public registry. It is provided for informational purposes only.
- zipline_polygon_bundle/__init__.py +31 -1
- zipline_polygon_bundle/adjustments.py +34 -0
- zipline_polygon_bundle/bundle.py +183 -34
- zipline_polygon_bundle/concat_all_aggs.py +18 -53
- zipline_polygon_bundle/concat_all_aggs_partitioned.py +6 -6
- zipline_polygon_bundle/config.py +132 -26
- zipline_polygon_bundle/nyse_all_hours_calendar.py +25 -0
- zipline_polygon_bundle/polygon_file_reader.py +1 -1
- zipline_polygon_bundle/process_all_aggs.py +2 -2
- zipline_polygon_bundle/quotes.py +101 -0
- zipline_polygon_bundle/tickers_and_names.py +5 -38
- zipline_polygon_bundle/trades.py +944 -0
- {zipline_polygon_bundle-0.1.7.dist-info → zipline_polygon_bundle-0.2.0.dev1.dist-info}/METADATA +6 -3
- zipline_polygon_bundle-0.2.0.dev1.dist-info/RECORD +17 -0
- zipline_polygon_bundle-0.1.7.dist-info/RECORD +0 -14
- {zipline_polygon_bundle-0.1.7.dist-info → zipline_polygon_bundle-0.2.0.dev1.dist-info}/LICENSE +0 -0
- {zipline_polygon_bundle-0.1.7.dist-info → zipline_polygon_bundle-0.2.0.dev1.dist-info}/WHEEL +0 -0
zipline_polygon_bundle/__init__.py
CHANGED

```diff
@@ -6,11 +6,21 @@ from .bundle import (
 )
 
 from .config import PolygonConfig
+from .nyse_all_hours_calendar import NYSE_ALL_HOURS, register_nyse_all_hours_calendar
 from .concat_all_aggs import concat_all_aggs_from_csv, generate_csv_agg_tables
-from .adjustments import load_splits, load_dividends
+from .adjustments import load_splits, load_dividends, load_conditions
+from .trades import trades_schema, trades_dataset, cast_trades, date_to_path
+from .trades import custom_aggs_partitioning, custom_aggs_schema, trades_to_custom_aggs, convert_trades_to_custom_aggs
+from .trades import get_custom_aggs_dates, generate_csv_trades_tables, compute_signals_for_all_custom_aggs
+from .quotes import quotes_schema, quotes_dataset, cast_quotes
+# from .tickers_and_names import load_all_tickers, merge_tickers, ticker_names_from_merged_tickers, get_ticker_universe
+from .tickers_and_names import PolygonAssets, get_ticker_universe
+
 
 __all__ = [
     "register_polygon_equities_bundle",
+    "register_nyse_all_hours_calendar",
+    "NYSE_ALL_HOURS",
     "symbol_to_upper",
     "polygon_equities_bundle_day",
     "polygon_equities_bundle_minute",
@@ -19,4 +29,24 @@ __all__ = [
     "generate_csv_agg_tables",
     "load_splits",
     "load_dividends",
+    "load_conditions",
+    "trades_schema",
+    "trades_dataset",
+    "cast_trades",
+    "date_to_path",
+    "get_custom_aggs_dates",
+    "generate_csv_trades_tables",
+    "custom_aggs_partitioning",
+    "custom_aggs_schema",
+    "trades_to_custom_aggs",
+    "convert_trades_to_custom_aggs",
+    "compute_signals_for_all_custom_aggs",
+    "quotes_schema",
+    "quotes_dataset",
+    "cast_quotes",
+    # "load_all_tickers",
+    # "merge_tickers",
+    # "ticker_names_from_merged_tickers",
+    "PolygonAssets",
+    "get_ticker_universe",
 ]
```
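Note: the new exports can be smoke-tested directly. A minimal sketch using only names from the new `__all__`, assuming the all-hours calendar registers under the `NYSE_ALL_HOURS` key (per `nyse_all_hours_calendar.py`):

```python
from exchange_calendars.calendar_utils import get_calendar

from zipline_polygon_bundle import NYSE_ALL_HOURS, register_nyse_all_hours_calendar

# Register the 24-hour NYSE calendar, then look it up like any other calendar.
register_nyse_all_hours_calendar()
calendar = get_calendar(NYSE_ALL_HOURS)
print(calendar.sessions_in_range("2023-01-03", "2023-01-06"))
```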
zipline_polygon_bundle/adjustments.py
CHANGED

```diff
@@ -153,3 +153,37 @@ def load_dividends(
     return dividends[
         ["sid", "ex_date", "declared_date", "record_date", "pay_date", "amount"]
     ]
+
+
+def load_conditions(config: PolygonConfig) -> pd.DataFrame:
+    # The API doesn't use dates for the condition codes, but this is a way to provide control over caching.
+    # The main thing is to get the current conditions list, but we don't want to call more than once a day.
+    conditions_path = config.api_cache_path(
+        start_date=config.start_timestamp.date(), end_date=config.end_timestamp.date(), filename="conditions"
+    )
+    expected_conditions_count = 100
+    if not os.path.exists(conditions_path):
+        client = polygon.RESTClient(api_key=config.api_key)
+        conditions_response = client.list_conditions(
+            limit=1000,
+        )
+        if isinstance(conditions_response, HTTPResponse):
+            raise ValueError(f"Polygon list_conditions bad HTTPResponse: {conditions_response}")
+        conditions = pd.DataFrame(conditions_response)
+        print(f"Got {len(conditions)=} from Polygon list_conditions.")
+        os.makedirs(os.path.dirname(conditions_path), exist_ok=True)
+        conditions.to_parquet(conditions_path)
+        if len(conditions) < expected_conditions_count:
+            logging.warning(
+                f"Only got {len(conditions)=} from Polygon list_conditions (expected {expected_conditions_count=})."
+            )
+    # We will always load from the file to avoid any chance of weird errors.
+    if os.path.exists(conditions_path):
+        conditions = pd.read_parquet(conditions_path)
+        print(f"Loaded {len(conditions)=} from {conditions_path}")
+        if len(conditions) < expected_conditions_count:
+            logging.warning(
+                f"Only got {len(conditions)=} from cached conditions (expected {expected_conditions_count=})."
+            )
+        return conditions
+    raise ValueError(f"Failed to load conditions from {conditions_path}")
```
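Note: the pattern here — fetch once, persist to parquet, then always read back from the file — keeps every caller on the same validated data. A generic sketch of that pattern (the `cached_frame` helper and `fetch` callable are illustrations, not library code):

```python
import os

import pandas as pd


def cached_frame(path: str, fetch) -> pd.DataFrame:
    """Fetch once, cache as parquet, and always serve from the cache file."""
    if not os.path.exists(path):
        df = pd.DataFrame(fetch())
        os.makedirs(os.path.dirname(path) or ".", exist_ok=True)
        df.to_parquet(path)
    # Reading back what was written catches serialization surprises early.
    return pd.read_parquet(path)
```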
    
        zipline_polygon_bundle/bundle.py
    CHANGED
    
```diff
@@ -1,18 +1,23 @@
+import os
 from zipline.data.bundles import register
 from zipline.data.resample import minute_frame_to_session_frame
 
-from .config import PolygonConfig
+from exchange_calendars.calendar_helpers import parse_date
+from exchange_calendars.calendar_utils import get_calendar
+
 from .concat_all_aggs import concat_all_aggs_from_csv, generate_csv_agg_tables
 from .adjustments import load_splits, load_dividends
+from .config import PolygonConfig
+from .nyse_all_hours_calendar import register_nyse_all_hours_calendar
+from .trades import convert_trades_to_custom_aggs, scatter_custom_aggs_to_by_ticker
 
 import pyarrow
 import pyarrow.compute
+import pyarrow.dataset
 
 import pandas as pd
 import logging
 
-import concurrent.futures
-
 
 # TODO: Change warnings to be relative to number of days in the range.
 
```
```diff
@@ -26,7 +31,7 @@ def symbol_to_upper(s: str) -> str:
 def generate_all_agg_tables_from_csv(
     config: PolygonConfig,
 ):
-    paths, schema, tables = generate_csv_agg_tables(config)
+    schema, tables = generate_csv_agg_tables(config)
     for table in tables:
         table = table.sort_by([("ticker", "ascending"), ("window_start", "ascending")])
         yield table
```
```diff
@@ -175,8 +180,8 @@ def polygon_equities_bundle_day(
     daily_bar_writer,
     adjustment_writer,
     calendar,
-    start_session,
-    end_session,
+    start_date,
+    end_date,
     cache,
     show_progress,
     output_dir,
@@ -184,8 +189,8 @@ def polygon_equities_bundle_day(
     config = PolygonConfig(
         environ=environ,
         calendar_name=calendar.name,
-        start_session=start_session,
-        end_session=end_session,
+        start_date=start_date,
+        end_date=end_date,
         agg_time="day",
     )
 
```
```diff
@@ -206,7 +211,19 @@ def polygon_equities_bundle_day(
         )
     )
 
-    table = aggregates.to_table()
+    # Only get the columns Zipline allows.
+    table = aggregates.to_table(
+        columns=[
+            "ticker",
+            "window_start",
+            "open",
+            "high",
+            "low",
+            "close",
+            "volume",
+            "transactions",
+        ]
+    )
     table = rename_polygon_to_zipline(table, "day")
     # Get all the symbols in the table by using value_counts to tabulate the unique values.
     # pyarrow.Table.column returns a pyarrow.ChunkedArray.
```
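Note: passing `columns=` to a pyarrow dataset's `to_table` is a projection pushdown — only the listed columns are decoded and read from disk instead of the full flat-file schema. The same pattern stands alone (directory name hypothetical):

```python
import pyarrow.dataset

aggregates = pyarrow.dataset.dataset("by_ticker_aggs_arrow")  # hypothetical path
# Only these columns are decoded and loaded; everything else is skipped.
table = aggregates.to_table(columns=["ticker", "window_start", "close"])
```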
```diff
@@ -219,7 +236,7 @@ def polygon_equities_bundle_day(
     daily_bar_writer.write(
         process_day_aggregates(
             table=table,
-            sessions=calendar.sessions_in_range(start_session, end_session),
+            sessions=calendar.sessions_in_range(start_date, end_date),
             metadata=metadata,
             calendar=calendar,
             symbol_to_sid=symbol_to_sid,
```
```diff
@@ -251,7 +268,19 @@ def process_minute_fragment(
     dates_with_data: set,
     agg_time: str,
 ):
-    table = fragment.to_table()
+    # Only get the columns Zipline allows.
+    table = fragment.to_table(
+        columns=[
+            "ticker",
+            "window_start",
+            "open",
+            "high",
+            "low",
+            "close",
+            "volume",
+            "transactions",
+        ]
+    )
     print(f" {table.num_rows=}")
     table = rename_polygon_to_zipline(table, "timestamp")
     table = table.sort_by([("symbol", "ascending"), ("timestamp", "ascending")])
```
```diff
@@ -400,8 +429,8 @@ def polygon_equities_bundle_minute(
     daily_bar_writer,
     adjustment_writer,
     calendar,
-    start_session,
-    end_session,
+    start_date,
+    end_date,
     cache,
     show_progress,
     output_dir,
@@ -409,8 +438,8 @@ def polygon_equities_bundle_minute(
     config = PolygonConfig(
         environ=environ,
         calendar_name=calendar.name,
-        start_session=start_session,
-        end_session=end_session,
+        start_date=start_date,
+        end_date=end_date,
         agg_time="minute",
     )
 
```
```diff
@@ -444,8 +473,99 @@ def polygon_equities_bundle_minute(
     daily_bar_writer.write(
         process_minute_aggregates(
             fragments=aggregates.get_fragments(),
-            sessions=calendar.sessions_in_range(start_session, end_session),
-            minutes=calendar.sessions_minutes(start_session, end_session),
+            sessions=calendar.sessions_in_range(start_date, end_date),
+            minutes=calendar.sessions_minutes(start_date, end_date),
+            metadata=metadata,
+            calendar=calendar,
+            symbol_to_sid=symbol_to_sid,
+            dates_with_data=dates_with_data,
+            agg_time="day",
+        ),
+        show_progress=show_progress,
+    )
+
+    # Get data for all stocks and write to Zipline
+    minute_bar_writer.write(
+        process_minute_aggregates(
+            fragments=aggregates.get_fragments(),
+            sessions=calendar.sessions_in_range(start_date, end_date),
+            minutes=calendar.sessions_minutes(start_date, end_date),
+            metadata=metadata,
+            calendar=calendar,
+            symbol_to_sid=symbol_to_sid,
+            dates_with_data=dates_with_data,
+            agg_time="minute",
+        ),
+        show_progress=show_progress,
+    )
+
+    # Write the metadata
+    asset_db_writer.write(equities=metadata)
+
+    # Load splits and dividends
+    first_start_end = min(dates_with_data)
+    last_end_date = max(dates_with_data)
+    splits = load_splits(config, first_start_end, last_end_date, symbol_to_sid)
+    dividends = load_dividends(config, first_start_end, last_end_date, symbol_to_sid)
+
+    # Write splits and dividends
+    adjustment_writer.write(splits=splits, dividends=dividends)
+
+
+def polygon_equities_bundle_trades(
+    environ,
+    asset_db_writer,
+    minute_bar_writer,
+    daily_bar_writer,
+    adjustment_writer,
+    calendar,
+    start_date,
+    end_date,
+    cache,
+    show_progress,
+    output_dir,
+):
+    # TODO: Support agg durations other than `1min`.
+    config = PolygonConfig(
+        environ=environ,
+        calendar_name=calendar.name,
+        start_date=start_date,
+        end_date=end_date,
+        agg_time="1min",
+    )
+
+    convert_trades_to_custom_aggs(config, overwrite=False)
+    by_ticker_aggs_arrow_dir = scatter_custom_aggs_to_by_ticker(config)
+    aggregates = pyarrow.dataset.dataset(by_ticker_aggs_arrow_dir)
+    # 3.5 billion rows for 10 years of minute data.
+    # print(f"{aggregates.count_rows()=}")
+    # Can't sort the dataset because that reads it all into memory.
+    # aggregates = aggregates.sort_by([("ticker", "ascending"), ("window_start", "ascending")])
+    # print("Sorted")
+
+    # Zipline uses case-insensitive symbols, so we need to convert them to uppercase with a ^ prefix when lowercase.
+    # This is because the SQL schema zipline uses for symbols ignores case.
+    # We put the original symbol in the asset_name field.
+    metadata = pd.DataFrame(
+        columns=(
+            "start_date",
+            "end_date",
+            "auto_close_date",
+            "symbol",
+            "exchange",
+            "asset_name",
+        )
+    )
+
+    symbol_to_sid = {}
+    dates_with_data = set()
+
+    # Get data for all stocks and write to Zipline
+    daily_bar_writer.write(
+        process_minute_aggregates(
+            fragments=aggregates.get_fragments(),
+            sessions=calendar.sessions_in_range(start_date, end_date),
+            minutes=calendar.sessions_minutes(start_date, end_date),
             metadata=metadata,
             calendar=calendar,
             symbol_to_sid=symbol_to_sid,
```
```diff
@@ -459,8 +579,8 @@ def polygon_equities_bundle_minute(
     minute_bar_writer.write(
         process_minute_aggregates(
             fragments=aggregates.get_fragments(),
-            sessions=calendar.sessions_in_range(start_session, end_session),
-            minutes=calendar.sessions_minutes(start_session, end_session),
+            sessions=calendar.sessions_in_range(start_date, end_date),
+            minutes=calendar.sessions_minutes(start_date, end_date),
             metadata=metadata,
             calendar=calendar,
             symbol_to_sid=symbol_to_sid,
```
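Note: since the by-ticker dataset is too large to sort or load whole (about 3.5 billion rows per the comment above), per-ticker work has to go through pushed-down filters or fragment-at-a-time processing. A sketch of a filtered read (path and ticker are placeholders):

```python
import pyarrow.compute
import pyarrow.dataset

aggs = pyarrow.dataset.dataset("by_ticker_aggs_arrow")  # hypothetical path
# The filter is pushed down to the scan, so only matching data is read.
aapl = aggs.to_table(
    columns=["ticker", "window_start", "close", "volume"],
    filter=pyarrow.compute.field("ticker") == "AAPL",
)
print(aapl.num_rows)
```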
```diff
@@ -485,25 +605,54 @@ def polygon_equities_bundle_minute(
 
 def register_polygon_equities_bundle(
     bundlename,
-    start_session=None,
-    end_session=None,
+    start_date=None,
+    end_date=None,
     calendar_name="XNYS",
     agg_time="day",
     # ticker_list=None,
     # watchlists=None,
     # include_asset_types=None,
 ):
-    if agg_time not in ["day", "minute"]:
-        raise ValueError(f"agg_time must be 'day' or 'minute', not '{agg_time}'")
+    register_nyse_all_hours_calendar()
+
+    if agg_time not in ["day", "minute", "1min"]:
+        raise ValueError(
+            f"agg_time must be 'day', 'minute' (aggs), or '1min' (trades), not '{agg_time}'"
+        )
+
+    # We need to know the start and end dates of the session before the bundle is
+    # registered because even though we only need it for ingest, the metadata in
+    # the writer is initialized and written before our ingest function is called.
+    if start_date is None or end_date is None:
+        config = PolygonConfig(
+            environ=os.environ,
+            calendar_name=calendar_name,
+            start_date=start_date,
+            end_date=end_date,
+            agg_time=agg_time,
+        )
+        first_aggs_date, last_aggs_date = config.find_first_and_last_aggs(
+            config.aggs_dir if agg_time in ["day", "minute"] else config.trades_dir,
+            config.csv_paths_pattern,
+        )
+        if start_date is None:
+            start_date = first_aggs_date
+        if end_date is None:
+            end_date = last_aggs_date
+
     register(
         bundlename,
         (
-            polygon_equities_bundle_minute
-            if agg_time == "minute"
-            else polygon_equities_bundle_day
+            polygon_equities_bundle_day
+            if agg_time == "day"
+            else (
+                polygon_equities_bundle_minute
+                if agg_time == "minute"
+                else polygon_equities_bundle_trades
+            )
         ),
-        start_session=start_session,
-        end_session=end_session,
+        start_session=parse_date(start_date, raise_oob=False) if start_date else None,
+        end_session=parse_date(end_date, raise_oob=False) if end_date else None,
         calendar_name=calendar_name,
         # minutes_per_day=390,
         # create_writers=True,
```
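Note: registration now works without explicit dates — when `start_date`/`end_date` are omitted, the range is derived from the first and last flat files on disk. A usage sketch for a Zipline `extension.py` (the bundle name is arbitrary):

```python
# ~/.zipline/extension.py
from zipline_polygon_bundle import register_polygon_equities_bundle

register_polygon_equities_bundle(
    "polygon",            # arbitrary bundle name
    calendar_name="XNYS",
    agg_time="day",       # "minute" for flat-file minute aggs, "1min" for aggs built from trades
)
```

After that, `zipline ingest -b polygon` runs the matching ingest function.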
```diff
@@ -517,12 +666,12 @@ def register_polygon_equities_bundle(
 #     config = PolygonConfig(
 #         environ=os.environ,
 #         calendar_name="XNYS",
-#         # start_session="2003-10-01",
-#         # start_session="2018-01-01",
-#         start_session="2023-01-01",
-#         # end_session="2023-01-12",
-#         end_session="2023-12-31",
-#         # end_session="2024-06-30",
+#         # start_date="2003-10-01",
+#         # start_date="2018-01-01",
+#         start_date="2023-01-01",
+#         # end_date="2023-01-12",
+#         end_date="2023-12-31",
+#         # end_date="2024-06-30",
 #     )
 #     splits = load_polygon_splits(config)
 #     splits.info()
```
zipline_polygon_bundle/concat_all_aggs.py
CHANGED

```diff
@@ -1,40 +1,21 @@
-from .config import PolygonConfig
+from .config import PolygonConfig, PARTITION_COLUMN_NAME, to_partition_key
 
 import shutil
-from typing import Iterator, Tuple
+from typing import Iterator, Tuple, List, Union
 
 import argparse
-import glob
 import os
 
 import pyarrow as pa
 from pyarrow import dataset as pa_ds
 from pyarrow import csv as pa_csv
+from pyarrow import compute as pa_compute
 
 import pandas as pd
 
 
-PARTITION_COLUMN_NAME = "part"
-PARTITION_KEY_LENGTH = 2
-
-
-def to_partition_key(s: str) -> str:
-    """
-    Partition key is low cardinality and must be filesystem-safe.
-    The reason for partitioning is to keep the parquet files from getting too big.
-    10 years of minute aggs for US stocks is 83GB gzipped.  A single parquet would be 62GB on disk.
-    Currently the first two characters so files stay under 1GB.  Weird characters are replaced with "A".
-    """
-    k = (s + "A")[0:PARTITION_KEY_LENGTH].upper()
-    if k.isalpha():
-        return k
-    # Replace non-alpha characters with "A".
-    k = "".join([c if c.isalpha() else "A" for c in k])
-    return k
-
-
 def generate_tables_from_csv_files(
-    paths: list,
+    paths: Iterator[Union[str, os.PathLike]],
     schema: pa.Schema,
     start_timestamp: pd.Timestamp,
     limit_timestamp: pd.Timestamp,
```
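Note: `to_partition_key` and `PARTITION_COLUMN_NAME` moved to `config.py` (see the new import at the top of this file). Assuming the moved copy is unchanged, the deleted body above implies this behavior:

```python
from zipline_polygon_bundle.config import to_partition_key

# First two characters, uppercased; non-alphabetic characters become "A".
assert to_partition_key("AAPL") == "AA"
assert to_partition_key("brk.a") == "BR"
assert to_partition_key("C") == "CA"   # padded with "A"
assert to_partition_key("3M") == "AM"  # "3" is not alphabetic
```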
```diff
@@ -57,7 +38,7 @@ def generate_tables_from_csv_files(
             quoted_strings_can_be_null=False,
         )
 
-        table = pa.csv.read_csv(path, convert_options=convert_options)
+        table = pa_csv.read_csv(path, convert_options=convert_options)
         tables_read_count += 1
         table = table.set_column(
             table.column_names.index("window_start"),
```
```diff
@@ -75,10 +56,10 @@ def generate_tables_from_csv_files(
             ),
         )
         expr = (
-            pa.compute.field("window_start")
+            pa_compute.field("window_start")
             >= pa.scalar(start_timestamp, type=schema.field("window_start").type)
         ) & (
-            pa.compute.field("window_start")
+            pa_compute.field("window_start")
             < pa.scalar(
                 limit_timestamp,
                 type=schema.field("window_start").type,
```
```diff
@@ -101,22 +82,8 @@ def generate_tables_from_csv_files(
 
 def generate_csv_agg_tables(
     config: PolygonConfig,
-) -> Tuple[list, pa.Schema, Iterator[pa.Table]]:
+) -> Tuple[pa.Schema, Iterator[pa.Table]]:
     """zipline does bundle ingestion one ticker at a time."""
-    # We sort by path because they have the year and month in the dir names and the date in the filename.
-    paths = sorted(
-        list(
-            glob.glob(
-                os.path.join(config.aggs_dir, config.csv_paths_pattern),
-                recursive="**" in config.csv_paths_pattern,
-            )
-        )
-    )
-
-    print(f"{len(paths)=}")
-    if len(paths) > 0:
-        print(f"{paths[0]=}")
-        print(f"{paths[-1]=}")
 
     # Polygon Aggregate flatfile timestamps are in nanoseconds (like trades), not milliseconds as the docs say.
     # I make the timestamp timezone-aware because that's how Unix timestamps work and it may help avoid mistakes.
```
```diff
@@ -154,11 +121,11 @@ def generate_csv_agg_tables(
         pa.field(PARTITION_COLUMN_NAME, pa.string(), nullable=False)
     )
 
+    # TODO: Use generator like os.walk for paths.
     return (
-        paths,
         polygon_aggs_schema,
         generate_tables_from_csv_files(
-            paths=paths,
+            paths=config.csv_paths(),
            schema=polygon_aggs_schema,
             start_timestamp=config.start_timestamp,
             limit_timestamp=config.end_timestamp + pd.to_timedelta(1, unit="day"),
```
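Note: `config.csv_paths()` replaces the eager `glob.glob` list, and the TODO suggests generator-based traversal. A sketch of what such a generator could look like (an illustration, not the library's implementation):

```python
import os
from typing import Iterator


def walk_csv_paths(root: str, suffix: str = ".csv.gz") -> Iterator[str]:
    # Yield flat-file paths in sorted order: dir names carry year/month and
    # filenames carry the date, so lexicographic order is date order.
    for dirpath, dirnames, filenames in os.walk(root):
        dirnames.sort()  # makes os.walk visit year/month dirs in order
        for name in sorted(filenames):
            if name.endswith(suffix):
                yield os.path.join(dirpath, name)
```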
```diff
@@ -176,11 +143,9 @@ def concat_all_aggs_from_csv(
     config: PolygonConfig,
     overwrite: bool = False,
 ) -> str:
-    paths, schema, tables = generate_csv_agg_tables(config)
+    schema, tables = generate_csv_agg_tables(config)
 
-    if len(paths) < 1:
-        raise ValueError(f"No Polygon CSV flat files found in {config.aggs_dir=}")
-    by_ticker_aggs_arrow_dir = config.by_ticker_aggs_arrow_dir(paths[0], paths[-1])
+    by_ticker_aggs_arrow_dir = config.by_ticker_aggs_arrow_dir
     if os.path.exists(by_ticker_aggs_arrow_dir):
         if overwrite:
             print(f"Removing {by_ticker_aggs_arrow_dir=}")
```
```diff
@@ -212,10 +177,10 @@ if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("--calendar_name", default="XNYS")
 
-    parser.add_argument("--start_session", default="2014-06-16")
-    parser.add_argument("--end_session", default="2024-09-06")
-    # parser.add_argument("--start_session", default="2020-01-01")
-    # parser.add_argument("--end_session", default="2020-12-31")
+    parser.add_argument("--start_date", default="2014-06-16")
+    parser.add_argument("--end_date", default="2024-09-06")
+    # parser.add_argument("--start_date", default="2020-01-01")
+    # parser.add_argument("--end_date", default="2020-12-31")
 
     parser.add_argument("--agg_time", default="day")
 
```
```diff
@@ -235,8 +200,8 @@ if __name__ == "__main__":
     config = PolygonConfig(
         environ=os.environ,
         calendar_name=args.calendar_name,
-        start_session=args.start_session,
-        end_session=args.end_session,
+        start_date=args.start_date,
+        end_date=args.end_date,
         agg_time=args.agg_time,
     )
 
```
| 242 207 |  | 
zipline_polygon_bundle/concat_all_aggs_partitioned.py
CHANGED

```diff
@@ -138,10 +138,10 @@ if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("--calendar_name", default="XNYS")
 
-    parser.add_argument("--start_session", default="2014-06-16")
-    parser.add_argument("--end_session", default="2024-09-06")
-    # parser.add_argument("--start_session", default="2020-10-07")
-    # parser.add_argument("--end_session", default="2020-10-15")
+    parser.add_argument("--start_date", default="2014-06-16")
+    parser.add_argument("--end_date", default="2024-09-06")
+    # parser.add_argument("--start_date", default="2020-10-07")
+    # parser.add_argument("--end_date", default="2020-10-15")
     # parser.add_argument("--aggs_pattern", default="2020/10/**/*.csv.gz")
     parser.add_argument("--aggs_pattern", default="**/*.csv.gz")
 
@@ -163,8 +163,8 @@ if __name__ == "__main__":
     config = PolygonConfig(
         environ=os.environ,
         calendar_name=args.calendar_name,
-        start_session=args.start_session,
-        end_session=args.end_session,
+        start_date=args.start_date,
+        end_date=args.end_date,
     )
 
     concat_all_aggs_from_csv(
```