zipline_polygon_bundle 0.1.7__py3-none-any.whl → 0.1.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,101 @@
1
+ from .config import PolygonConfig
2
+ from .trades import cast_strings_to_list
3
+
4
+ import os
5
+
6
+ import pyarrow as pa
7
+ from pyarrow import dataset as pa_ds
8
+ from pyarrow import compute as pa_compute
9
+ from pyarrow import fs as pa_fs
10
+ from fsspec.implementations.arrow import ArrowFSWrapper
11
+ from pyarrow import csv as pa_csv
12
+
13
+
14
def quotes_schema(raw: bool = False) -> pa.Schema:
    """Return the pyarrow schema for Polygon quotes (NBBO) flat files.

    When ``raw`` is True the timestamp columns are typed as plain int64,
    matching how the CSVs are read (they cannot be parsed directly as
    timestamps); otherwise they are tz-aware UTC nanosecond timestamps.
    """
    # Unix epoch nanos are inherently UTC; making the type tz-aware helps
    # avoid naive-timestamp mistakes downstream.
    ts_type = pa.timestamp("ns", tz="UTC") if not raw else pa.int64()
    # Polygon price scale is 4 decimal places (hundredths of a penny);
    # float64 is a little overkill but avoids any plausible truncation error.
    px_type = pa.float64()
    field_types = [
        ("ticker", pa.string()),
        ("ask_exchange", pa.int8()),
        ("ask_price", px_type),
        ("ask_size", pa.int64()),
        ("bid_exchange", pa.int8()),
        ("bid_price", px_type),
        ("bid_size", pa.int64()),
        ("conditions", pa.string()),
        ("indicators", pa.string()),
        ("participant_timestamp", ts_type),
        ("sequence_number", pa.int64()),
        ("sip_timestamp", ts_type),
        ("tape", pa.int8()),
        ("trf_timestamp", ts_type),
    ]
    # Every column is declared non-nullable.
    return pa.schema(
        [pa.field(name, typ, nullable=False) for name, typ in field_types]
    )
58
+
59
+
60
def quotes_dataset(config: PolygonConfig) -> pa_ds.Dataset:
    """Build a pyarrow Dataset over the quotes CSV files under ``config.quotes_dir``.

    Files are globbed through an fsspec wrapper around the configured Arrow
    filesystem and sorted by path, which orders them chronologically because
    the year/month are in the directory names and the date is in the filename.
    """
    # https://arrow.apache.org/docs/python/filesystems.html#using-arrow-filesystems-with-fsspec
    # https://filesystem-spec.readthedocs.io/en/latest/_modules/fsspec/spec.html#AbstractFileSystem.glob.
    wrapped_fs = ArrowFSWrapper(config.filesystem)
    glob_pattern = os.path.join(config.quotes_dir, config.csv_paths_pattern)
    csv_paths = sorted(wrapped_fs.glob(glob_pattern))
    # Read with the raw (int64-timestamp) schema; cast_quotes applies the
    # typed schema afterwards.
    return pa_ds.FileSystemDataset.from_paths(
        csv_paths,
        format=pa_ds.CsvFileFormat(),
        schema=quotes_schema(raw=True),
        filesystem=config.filesystem,
    )
78
+
79
+
80
def cast_strings_to_list(string_array, separator=",", default="0", value_type=pa.uint8()):
    """Cast a PyArrow StringArray of separator-delimited numbers to a ListArray.

    Rows that are empty (or whitespace-only) are first replaced with
    ``default`` so every row splits into at least one value.

    NOTE(review): a function of this same name is imported from ``.trades``
    at the top of this module; this definition shadows it — confirm which
    one is intended.
    """
    # Mask rows that are empty after trimming surrounding whitespace.
    trimmed = pa_compute.utf8_trim_whitespace(string_array)
    empty_mask = pa_compute.equal(trimmed, "")

    # Substitute the default (e.g. "0") wherever the mask is set.
    defaulted = pa_compute.replace_with_mask(
        string_array, empty_mask, pa.scalar(default)
    )

    # Split each string on the separator, then cast the resulting string
    # lists to lists of the requested numeric type.
    pieces = pa_compute.split_pattern(defaulted, pattern=separator)
    return pa_compute.cast(pieces, pa.list_(value_type))
96
+
97
+
98
def cast_quotes(quotes):
    """Cast a raw quotes table to the typed schema and append parsed conditions.

    Returns the table with an extra ``condition_values`` column holding the
    numeric-list form of the comma-separated ``conditions`` strings.
    """
    typed = quotes.cast(quotes_schema())
    conditions = typed.column("conditions").combine_chunks()
    return typed.append_column("condition_values", cast_strings_to_list(conditions))
@@ -3,6 +3,7 @@ from .config import PolygonConfig
3
3
  import datetime
4
4
  import os
5
5
  import pandas as pd
6
+ import csv
6
7
  import polygon
7
8
  import logging
8
9
  from concurrent.futures import ProcessPoolExecutor
@@ -383,40 +384,3 @@ def get_ticker_universe(config: PolygonConfig, fetch_missing: bool = False):
383
384
  merged_tickers = pd.read_parquet(parquet_path)
384
385
  merged_tickers.info()
385
386
  return merged_tickers
386
-
387
-
388
- # Initialize ticker files in __main__. Use CLI args to specify start and end dates.
389
- if __name__ == "__main__":
390
- import argparse
391
-
392
- parser = argparse.ArgumentParser(description="Initialize ticker files.")
393
- parser.add_argument(
394
- "--start-date",
395
- type=str,
396
- help="Start date in ISO format (YYYY-MM-DD)",
397
- default="2014-05-01",
398
- )
399
- parser.add_argument(
400
- "--end-date",
401
- type=str,
402
- help="End date in ISO format (YYYY-MM-DD)",
403
- default="2024-04-01",
404
- )
405
- args = parser.parse_args()
406
-
407
- start_date = (
408
- datetime.datetime.strptime(args.start_date, "%Y-%m-%d").date()
409
- if args.start_date
410
- else datetime.date.today()
411
- )
412
- end_date = (
413
- datetime.datetime.strptime(args.end_date, "%Y-%m-%d").date()
414
- if args.end_date
415
- else datetime.date.today()
416
- )
417
-
418
- all_tickers = load_all_tickers(start_date, end_date, fetch_missing=True)
419
- merged_tickers = merge_tickers(all_tickers)
420
- merged_tickers.to_csv(f"data/tickers/us_tickers_{start_date}-{end_date}.csv")
421
- ticker_names = ticker_names_from_merged_tickers(merged_tickers)
422
- print(ticker_names)