zipline_polygon_bundle 0.1.7__py3-none-any.whl → 0.1.8__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
- zipline_polygon_bundle/__init__.py +28 -1
- zipline_polygon_bundle/adjustments.py +34 -0
- zipline_polygon_bundle/bundle.py +47 -25
- zipline_polygon_bundle/concat_all_aggs.py +17 -32
- zipline_polygon_bundle/concat_all_aggs_partitioned.py +6 -6
- zipline_polygon_bundle/config.py +99 -26
- zipline_polygon_bundle/polygon_file_reader.py +1 -1
- zipline_polygon_bundle/process_all_aggs.py +2 -2
- zipline_polygon_bundle/quotes.py +101 -0
- zipline_polygon_bundle/tickers_and_names.py +1 -37
- zipline_polygon_bundle/trades.py +707 -0
- {zipline_polygon_bundle-0.1.7.dist-info → zipline_polygon_bundle-0.1.8.dist-info}/METADATA +6 -3
- zipline_polygon_bundle-0.1.8.dist-info/RECORD +16 -0
- zipline_polygon_bundle-0.1.7.dist-info/RECORD +0 -14
- {zipline_polygon_bundle-0.1.7.dist-info → zipline_polygon_bundle-0.1.8.dist-info}/LICENSE +0 -0
- {zipline_polygon_bundle-0.1.7.dist-info → zipline_polygon_bundle-0.1.8.dist-info}/WHEEL +0 -0
zipline_polygon_bundle/quotes.py (new file)
@@ -0,0 +1,101 @@
+from .config import PolygonConfig
+from .trades import cast_strings_to_list
+
+import os
+
+import pyarrow as pa
+from pyarrow import dataset as pa_ds
+from pyarrow import compute as pa_compute
+from pyarrow import fs as pa_fs
+from fsspec.implementations.arrow import ArrowFSWrapper
+from pyarrow import csv as pa_csv
+
+
+def quotes_schema(raw: bool = False) -> pa.Schema:
+    # There is some problem reading the timestamps as timestamps so we have to read as integer then change the schema.
+    # I make the timestamp timezone-aware because that's how Unix timestamps work and it may help avoid mistakes.
+    # timestamp_type = pa.timestamp("ns", tz="UTC")
+    timestamp_type = pa.int64() if raw else pa.timestamp("ns", tz="UTC")
+
+    # Polygon price scale is 4 decimal places (i.e. hundredths of a penny), but we'll use 10 because we have precision to spare.
+    # price_type = pa.decimal128(precision=38, scale=10)
+    # 64bit float a little overkill but avoids any plausible truncation error.
+    price_type = pa.float64()
+
+    # ticker: string
+    # ask_exchange: int64
+    # ask_price: double
+    # ask_size: int64
+    # bid_exchange: int64
+    # bid_price: double
+    # bid_size: int64
+    # conditions: string
+    # indicators: int64
+    # participant_timestamp: int64
+    # sequence_number: int64
+    # sip_timestamp: int64
+    # tape: int64
+    # trf_timestamp: int64
+
+    return pa.schema(
+        [
+            pa.field("ticker", pa.string(), nullable=False),
+            pa.field("ask_exchange", pa.int8(), nullable=False),
+            pa.field("ask_price", price_type, nullable=False),
+            pa.field("ask_size", pa.int64(), nullable=False),
+            pa.field("bid_exchange", pa.int8(), nullable=False),
+            pa.field("bid_price", price_type, nullable=False),
+            pa.field("bid_size", pa.int64(), nullable=False),
+            pa.field("conditions", pa.string(), nullable=False),
+            pa.field("indicators", pa.string(), nullable=False),
+            pa.field("participant_timestamp", timestamp_type, nullable=False),
+            pa.field("sequence_number", pa.int64(), nullable=False),
+            pa.field("sip_timestamp", timestamp_type, nullable=False),
+            pa.field("tape", pa.int8(), nullable=False),
+            pa.field("trf_timestamp", timestamp_type, nullable=False),
+        ]
+    )
+
+
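The raw flag exists because the CSVs are first read with epoch-nanosecond integer timestamps and only cast to tz-aware timestamps afterward. A minimal sketch of that contract (assuming quotes.py is importable from the installed package):

    import pyarrow as pa
    from zipline_polygon_bundle.quotes import quotes_schema

    # raw=True keeps the timestamp columns as int64 epoch nanoseconds (as on disk);
    # raw=False is the post-cast schema with tz-aware nanosecond timestamps.
    assert quotes_schema(raw=True).field("sip_timestamp").type == pa.int64()
    assert quotes_schema().field("sip_timestamp").type == pa.timestamp("ns", tz="UTC")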
+def quotes_dataset(config: PolygonConfig) -> pa_ds.Dataset:
+    """
+    Create a pyarrow dataset from the quotes files.
+    """
+
+    # https://arrow.apache.org/docs/python/filesystems.html#using-arrow-filesystems-with-fsspec
+    # https://filesystem-spec.readthedocs.io/en/latest/_modules/fsspec/spec.html#AbstractFileSystem.glob.
+    fsspec = ArrowFSWrapper(config.filesystem)
+
+    # We sort by path because they have the year and month in the dir names and the date in the filename.
+    paths = sorted(
+        fsspec.glob(os.path.join(config.quotes_dir, config.csv_paths_pattern))
+    )
+
+    return pa_ds.FileSystemDataset.from_paths(paths,
+                                              format=pa_ds.CsvFileFormat(),
+                                              schema=quotes_schema(raw=True),
+                                              filesystem=config.filesystem)
+
+
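A hedged usage sketch: given an already-built PolygonConfig (construction lives in config.py and is not shown in this diff), the dataset scans the daily CSVs lazily, so projection and filtering happen per file:

    import pyarrow.compute as pa_compute

    # config is assumed to be a PolygonConfig instance built elsewhere (see config.py).
    dataset = quotes_dataset(config)
    table = dataset.to_table(
        columns=["ticker", "sip_timestamp", "bid_price", "ask_price"],
        filter=pa_compute.field("ticker") == "AAPL",
    )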
+def cast_strings_to_list(string_array, separator=",", default="0", value_type=pa.uint8()):
+    """Cast a PyArrow StringArray of comma-separated numbers to a ListArray of values."""
+
+    # Create a mask to identify empty strings
+    is_empty = pa_compute.equal(pa_compute.utf8_trim_whitespace(string_array), "")
+
+    # Use replace_with_mask to replace empty strings with the default ("0")
+    filled_column = pa_compute.replace_with_mask(string_array, is_empty, pa.scalar(default))
+
+    # Split the strings by comma
+    split_array = pa_compute.split_pattern(filled_column, pattern=separator)
+
+    # Cast each element in the resulting lists to integers
+    int_list_array = pa_compute.cast(split_array, pa.list_(value_type))
+
+    return int_list_array
+
+
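Note that quotes.py both imports cast_strings_to_list from .trades and redefines it here; the local definition shadows the import. The helper turns a column of comma-separated condition codes into a list column. Expected behavior on a toy array (values are illustrative):

    import pyarrow as pa

    conditions = pa.array(["1,33", " ", "2"])
    print(cast_strings_to_list(conditions).to_pylist())
    # [[1, 33], [0], [2]] -- blank/whitespace entries fall back to the "0" default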
+def cast_quotes(quotes):
+    quotes = quotes.cast(quotes_schema())
+    condition_values = cast_strings_to_list(quotes.column("conditions").combine_chunks())
+    return quotes.append_column('condition_values', condition_values)
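Putting the pieces together, one plausible end-to-end flow (the file path is hypothetical; real files live under config.quotes_dir and match config.csv_paths_pattern):

    from pyarrow import csv as pa_csv

    # Read one day's quotes with the raw (int64-timestamp) schema, then normalize.
    convert = pa_csv.ConvertOptions(column_types=quotes_schema(raw=True))
    raw_table = pa_csv.read_csv("2024-03-07.csv.gz", convert_options=convert)  # hypothetical path
    quotes = cast_quotes(raw_table)  # tz-aware timestamps plus a condition_values list column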
zipline_polygon_bundle/tickers_and_names.py
@@ -3,6 +3,7 @@ from .config import PolygonConfig
 import datetime
 import os
 import pandas as pd
+import csv
 import polygon
 import logging
 from concurrent.futures import ProcessPoolExecutor
@@ -383,40 +384,3 @@ def get_ticker_universe(config: PolygonConfig, fetch_missing: bool = False):
     merged_tickers = pd.read_parquet(parquet_path)
     merged_tickers.info()
     return merged_tickers
-
-
-# Initialize ticker files in __main__. Use CLI args to specify start and end dates.
-if __name__ == "__main__":
-    import argparse
-
-    parser = argparse.ArgumentParser(description="Initialize ticker files.")
-    parser.add_argument(
-        "--start-date",
-        type=str,
-        help="Start date in ISO format (YYYY-MM-DD)",
-        default="2014-05-01",
-    )
-    parser.add_argument(
-        "--end-date",
-        type=str,
-        help="End date in ISO format (YYYY-MM-DD)",
-        default="2024-04-01",
-    )
-    args = parser.parse_args()
-
-    start_date = (
-        datetime.datetime.strptime(args.start_date, "%Y-%m-%d").date()
-        if args.start_date
-        else datetime.date.today()
-    )
-    end_date = (
-        datetime.datetime.strptime(args.end_date, "%Y-%m-%d").date()
-        if args.end_date
-        else datetime.date.today()
-    )
-
-    all_tickers = load_all_tickers(start_date, end_date, fetch_missing=True)
-    merged_tickers = merge_tickers(all_tickers)
-    merged_tickers.to_csv(f"data/tickers/us_tickers_{start_date}-{end_date}.csv")
-    ticker_names = ticker_names_from_merged_tickers(merged_tickers)
-    print(ticker_names)
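With the __main__ block gone, this module no longer runs as a script. The same behavior is still reachable through the functions the deleted code called (a sketch reusing its names and defaults; verify these helpers are still exported in 0.1.8):

    import datetime
    from zipline_polygon_bundle.tickers_and_names import load_all_tickers, merge_tickers

    start_date = datetime.date(2014, 5, 1)
    end_date = datetime.date(2024, 4, 1)
    all_tickers = load_all_tickers(start_date, end_date, fetch_missing=True)
    merged_tickers = merge_tickers(all_tickers)
    merged_tickers.to_csv(f"data/tickers/us_tickers_{start_date}-{end_date}.csv")

For parquet-cached access, get_ticker_universe(config, fetch_missing=True), visible in the hunk context above, remains the higher-level entry point.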