zipline_polygon_bundle 0.1.8__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- zipline_polygon_bundle/__init__.py +12 -11
- zipline_polygon_bundle/adjustments.py +27 -32
- zipline_polygon_bundle/bundle.py +172 -200
- zipline_polygon_bundle/compute_signals.py +261 -0
- zipline_polygon_bundle/concat_all_aggs.py +129 -44
- zipline_polygon_bundle/config.py +90 -32
- zipline_polygon_bundle/nyse_all_hours_calendar.py +25 -0
- zipline_polygon_bundle/tickers_and_names.py +4 -1
- zipline_polygon_bundle/trades.py +352 -526
- {zipline_polygon_bundle-0.1.8.dist-info → zipline_polygon_bundle-0.2.0.dist-info}/METADATA +7 -5
- zipline_polygon_bundle-0.2.0.dist-info/RECORD +18 -0
- zipline_polygon_bundle-0.1.8.dist-info/RECORD +0 -16
- {zipline_polygon_bundle-0.1.8.dist-info → zipline_polygon_bundle-0.2.0.dist-info}/LICENSE +0 -0
- {zipline_polygon_bundle-0.1.8.dist-info → zipline_polygon_bundle-0.2.0.dist-info}/WHEEL +0 -0
zipline_polygon_bundle/config.py
CHANGED
@@ -1,14 +1,39 @@
|
|
1
|
-
from exchange_calendars.calendar_helpers import Date, parse_date
|
1
|
+
from exchange_calendars.calendar_helpers import Date, parse_date
|
2
2
|
from zipline.utils.calendar_utils import get_calendar
|
3
3
|
|
4
|
-
from
|
4
|
+
from .nyse_all_hours_calendar import NYSE_ALL_HOURS
|
5
|
+
|
6
|
+
from typing import Iterator, Tuple
|
5
7
|
|
6
8
|
import pandas as pd
|
7
9
|
from pyarrow.fs import LocalFileSystem
|
8
10
|
import os
|
11
|
+
import datetime
|
9
12
|
import re
|
10
13
|
import fnmatch
|
11
14
|
|
15
|
+
# Recognised values for PolygonConfig's ``agg_time`` argument.
AGG_TIME_DAY = "day"  # daily aggregates
AGG_TIME_MINUTE = "minute"  # one-minute aggregates
# Starts with a digit, so PolygonConfig treats it as a timedelta string
# (custom aggregates computed from trades) rather than as AGG_TIME_MINUTE.
AGG_TIME_TRADES = "1minute"
|
18
|
+
|
19
|
+
PARTITION_COLUMN_NAME = "part"
PARTITION_KEY_LENGTH = 2


def to_partition_key(s: str) -> str:
    """
    Return the partition key for ``s``.

    Partition key is low cardinality and must be filesystem-safe.
    The reason for partitioning is to keep the parquet files from getting too big.
    10 years of minute aggs for US stocks is 83GB gzipped. A single parquet would be 62GB on disk.
    Currently the first two characters so files stay under 1GB.

    The key is always exactly ``PARTITION_KEY_LENGTH`` upper-case ASCII letters:
    the input is upper-cased, right-padded with "A" when it is too short
    (including the empty string), and every character outside ``A``-``Z`` is
    replaced with "A".
    """
    # Upper-case before slicing: some characters grow when upper-cased
    # (e.g. "ß" -> "SS"), which would otherwise make the key too long.
    # Pad with a full key's worth of "A" so short/empty input still yields a
    # fixed-length key, whatever PARTITION_KEY_LENGTH is set to.
    padded = (s.upper() + "A" * PARTITION_KEY_LENGTH)[:PARTITION_KEY_LENGTH]
    # Restrict to ASCII letters so the key stays low cardinality and safe on
    # any filesystem; everything else collapses to "A".
    return "".join(c if "A" <= c <= "Z" else "A" for c in padded)
|
36
|
+
|
12
37
|
|
13
38
|
class PolygonConfig:
|
14
39
|
def __init__(
|
@@ -17,12 +42,12 @@ class PolygonConfig:
|
|
17
42
|
calendar_name: str,
|
18
43
|
start_date: Date,
|
19
44
|
end_date: Date,
|
20
|
-
agg_time: str =
|
21
|
-
custom_aggs_format: str = "{config.agg_timedelta.seconds}sec_aggs",
|
45
|
+
agg_time: str = AGG_TIME_DAY,
|
22
46
|
):
|
23
47
|
self.calendar_name = calendar_name
|
24
48
|
self.start_date = start_date
|
25
49
|
self.end_date = end_date
|
50
|
+
# TODO: We can't use PolygonConfig.calendar because it gets these properties for start/end session.
|
26
51
|
self.start_timestamp = (
|
27
52
|
parse_date(start_date, calendar=self.calendar)
|
28
53
|
if start_date
|
@@ -33,6 +58,8 @@ class PolygonConfig:
|
|
33
58
|
if end_date
|
34
59
|
else self.calendar.last_session
|
35
60
|
)
|
61
|
+
self.agg_time = agg_time
|
62
|
+
|
36
63
|
self.max_workers = None
|
37
64
|
if environ.get("POLYGON_MAX_WORKERS", "").strip() != "":
|
38
65
|
self.max_workers = int(environ.get("POLYGON_MAX_WORKERS"))
|
@@ -61,7 +88,8 @@ class PolygonConfig:
|
|
61
88
|
# TODO: Restore non-recursive option. Always recursive for now.
|
62
89
|
self.csv_paths_pattern = environ.get(
|
63
90
|
# "POLYGON_FLAT_FILES_CSV_PATTERN", "**/*.csv.gz"
|
64
|
-
"POLYGON_FLAT_FILES_CSV_PATTERN",
|
91
|
+
"POLYGON_FLAT_FILES_CSV_PATTERN",
|
92
|
+
"*.csv.gz",
|
65
93
|
)
|
66
94
|
self.asset_files_dir = os.path.join(self.flat_files_dir, self.asset_subdir)
|
67
95
|
self.minute_aggs_dir = os.path.join(self.asset_files_dir, "minute_aggs_v1")
|
@@ -70,51 +98,67 @@ class PolygonConfig:
|
|
70
98
|
self.quotes_dir = os.path.join(self.asset_files_dir, "quotes_v1")
|
71
99
|
|
72
100
|
# TODO: The "by ticker" files are temporary/intermediate and should/could be in the zipline data dir.
|
101
|
+
self.custom_asset_files_dir = environ.get(
|
102
|
+
"CUSTOM_ASSET_FILES_DIR", self.asset_files_dir
|
103
|
+
)
|
104
|
+
|
105
|
+
self.cache_dir = os.path.join(self.custom_asset_files_dir, "api_cache")
|
106
|
+
|
107
|
+
self.lock_file_path = os.path.join(self.custom_asset_files_dir, "ingest.lock")
|
108
|
+
self.custom_aggs_dates_path = os.path.join(self.custom_asset_files_dir, "aggs_dates.json")
|
109
|
+
self.by_ticker_dates_path = os.path.join(self.custom_asset_files_dir, "by_ticker_dates.json")
|
110
|
+
|
73
111
|
self.minute_by_ticker_dir = os.path.join(
|
74
|
-
self.
|
112
|
+
self.custom_asset_files_dir, "minute_by_ticker_v1"
|
113
|
+
)
|
114
|
+
self.day_by_ticker_dir = os.path.join(
|
115
|
+
self.custom_asset_files_dir, "day_by_ticker_v1"
|
75
116
|
)
|
76
|
-
self.day_by_ticker_dir = os.path.join(self.asset_files_dir, "day_by_ticker_v1")
|
77
117
|
|
118
|
+
# If agg_time begins with a digit, it is a timedelta string and we're using custom aggs from trades.
|
78
119
|
if bool(re.match(r"^\d", agg_time)):
|
79
120
|
self.agg_timedelta = pd.to_timedelta(agg_time)
|
80
|
-
self.
|
81
|
-
|
121
|
+
self.csv_files_dir = self.trades_dir
|
122
|
+
self.custom_aggs_name_format = environ.get(
|
123
|
+
"CUSTOM_AGGS_NAME_FORMAT", "{config.agg_timedelta.seconds}sec_aggs"
|
82
124
|
)
|
83
|
-
self.
|
84
|
-
self.custom_asset_files_dir,
|
125
|
+
self.aggs_dir = os.path.join(
|
126
|
+
self.custom_asset_files_dir,
|
127
|
+
self.custom_aggs_name_format.format(config=self),
|
85
128
|
)
|
86
|
-
self.
|
129
|
+
self.by_ticker_dir = os.path.join(
|
87
130
|
self.custom_asset_files_dir,
|
88
|
-
(
|
131
|
+
(self.custom_aggs_name_format + "_by_ticker").format(config=self),
|
89
132
|
)
|
90
|
-
|
91
|
-
self.by_ticker_dir = self.custom_aggs_by_ticker_dir
|
92
|
-
elif agg_time == "minute":
|
133
|
+
elif agg_time == AGG_TIME_MINUTE:
|
93
134
|
self.agg_timedelta = pd.to_timedelta("1minute")
|
94
135
|
self.aggs_dir = self.minute_aggs_dir
|
136
|
+
self.csv_files_dir = self.aggs_dir
|
95
137
|
self.by_ticker_dir = self.minute_by_ticker_dir
|
96
|
-
elif agg_time ==
|
138
|
+
elif agg_time == AGG_TIME_DAY:
|
97
139
|
self.agg_timedelta = pd.to_timedelta("1day")
|
98
140
|
self.aggs_dir = self.day_aggs_dir
|
141
|
+
self.csv_files_dir = self.aggs_dir
|
99
142
|
self.by_ticker_dir = self.day_by_ticker_dir
|
100
143
|
else:
|
101
144
|
raise ValueError(
|
102
145
|
f"agg_time must be 'minute', 'day', or a timedelta string; got '{agg_time=}'"
|
103
146
|
)
|
104
|
-
self.agg_time = agg_time
|
105
147
|
|
106
148
|
self.arrow_format = environ.get(
|
107
|
-
"POLYGON_ARROW_FORMAT", "parquet" if self.agg_time ==
|
149
|
+
"POLYGON_ARROW_FORMAT", "parquet" if self.agg_time == AGG_TIME_DAY else "hive"
|
108
150
|
)
|
109
151
|
# self.by_ticker_hive_dir = os.path.join(
|
110
152
|
# self.by_ticker_dir,
|
111
153
|
# f"{self.agg_time}_{self.start_timestamp.date().isoformat()}_{self.end_timestamp.date().isoformat()}.hive",
|
112
154
|
# )
|
113
|
-
self.cache_dir = os.path.join(self.asset_files_dir, "api_cache")
|
114
155
|
|
115
156
|
@property
def calendar(self):
    """
    Look up the calendar named ``calendar_name``, bounded by the configured dates.

    ``start_date`` and ``end_date`` are passed through unchanged as the
    calendar's ``start_session`` and ``end_session``.
    """
    session_bounds = {
        "start_session": self.start_date,
        "end_session": self.end_date,
    }
    return get_calendar(self.calendar_name, **session_bounds)
|
118
162
|
|
119
163
|
def ticker_file_path(self, date: pd.Timestamp):
|
120
164
|
ticker_year_dir = os.path.join(
|
@@ -129,18 +173,30 @@ class PolygonConfig:
|
|
129
173
|
# TODO: Use csv_paths_pattern to remove the suffixes
|
130
174
|
return os.path.basename(path).removesuffix(".gz").removesuffix(".csv")
|
131
175
|
|
176
|
+
def date_to_csv_file_path(self, date: datetime.date, ext=".csv.gz"):
    """
    Return the path of the flat file for ``date`` under ``csv_files_dir``.

    The layout is ``<csv_files_dir>/<YYYY>/<MM>/<YYYY-MM-DD><ext>``.

    Args:
        date: The day whose file path is wanted.
        ext: File name suffix appended after the ISO date.

    Returns:
        The path as a string, joined with the platform's path separator.
    """
    # Use os.path.join like the rest of the class instead of a hard-coded "/",
    # so the result matches paths produced by os.walk and does not contain a
    # doubled separator when csv_files_dir already ends with one.
    return os.path.join(
        self.csv_files_dir,
        date.strftime("%Y"),
        date.strftime("%m"),
        date.strftime("%Y-%m-%d") + ext,
    )
|
178
|
+
|
132
179
|
@property
def by_ticker_aggs_arrow_dir(self):
    """Return the directory of the by-ticker aggregates dataset (``by_ticker_dir``)."""
    # TODO: Don't split these up by ingestion range. They're already time indexed.
    # Only reason to separate them is if we're worried about (or want) data being
    # different across ingestions.
    # This scattering is really slow and is usually gonna be redundant.
    # This wasn't a problem when start/end dates were the calendar bounds when omitted.
    # Can't just drop this because concat_all_aggs_from_csv will skip if it exists.
    # The per-range alternative was a "<start date>_<end date>.arrow"
    # subdirectory of by_ticker_dir.
    arrow_dir = self.by_ticker_dir
    return arrow_dir
|
136
192
|
|
137
193
|
def api_cache_path(
    self,
    first_day: pd.Timestamp,
    last_day: pd.Timestamp,
    filename: str,
    extension=".parquet",
):
    """
    Return the cache file path for an API result covering a date range.

    The layout is ``<cache_dir>/<first>_<last>/<filename><extension>`` where
    ``first`` and ``last`` are the ISO dates of the two timestamps.

    Args:
        first_day: First day of the cached range; only its date part is used.
        last_day: Last day of the cached range; only its date part is used.
        filename: Base name of the cache file, without extension.
        extension: Suffix appended to ``filename``.

    Returns:
        The path as a string, joined with the platform's path separator.
    """
    first_day_str = first_day.date().isoformat()
    last_day_str = last_day.date().isoformat()
    # Pass the range directory and the file name as separate components rather
    # than embedding "/" in one, so the separator is consistent on all platforms.
    return os.path.join(
        self.cache_dir,
        f"{first_day_str}_{last_day_str}",
        f"{filename}{extension}",
    )
|
145
201
|
|
146
202
|
def csv_paths(self) -> Iterator[str]:
|
@@ -153,13 +209,15 @@ class PolygonConfig:
|
|
153
209
|
for filename in sorted(filenames):
|
154
210
|
yield os.path.join(root, filename)
|
155
211
|
|
156
|
-
def find_first_and_last_aggs(
|
212
|
+
def find_first_and_last_aggs(
|
213
|
+
self, aggs_dir, file_pattern
|
214
|
+
) -> Tuple[str | None, str | None]:
|
157
215
|
# Find the path to the lexically first and last paths in aggs_dir that matches csv_paths_pattern.
|
158
216
|
# Would like to use Path.walk(top_down=True) but it is only available in Python 3.12+.
|
159
217
|
# This needs to be efficient because it is called on every init, even though we only need it for ingest.
|
160
218
|
# But we can't call it in ingest because the writer initializes and writes the metadata before it is called.
|
161
219
|
paths = []
|
162
|
-
for root, dirnames, filenames in os.walk(
|
220
|
+
for root, dirnames, filenames in os.walk(aggs_dir, topdown=True):
|
163
221
|
if dirnames:
|
164
222
|
# We only want first and last in each directory.
|
165
223
|
sorted_dirs = sorted(dirnames)
|
@@ -169,15 +227,15 @@ class PolygonConfig:
|
|
169
227
|
else sorted_dirs
|
170
228
|
)
|
171
229
|
# Filter out filenames that don't match the pattern.
|
172
|
-
filenames = fnmatch.filter(filenames,
|
230
|
+
filenames = fnmatch.filter(filenames, file_pattern)
|
173
231
|
if filenames:
|
174
232
|
filenames = sorted(filenames)
|
175
233
|
paths.append(os.path.join(root, filenames[0]))
|
176
234
|
if len(filenames) > 1:
|
177
235
|
paths.append(os.path.join(root, filenames[-1]))
|
236
|
+
if not paths:
|
237
|
+
return None, None
|
178
238
|
paths = sorted(paths)
|
179
|
-
if len(paths) < 2:
|
180
|
-
raise ValueError(f"Need more than one aggs file but found {len(paths)} paths in {self.aggs_dir}")
|
181
239
|
return self.file_path_to_name(paths[0]), self.file_path_to_name(paths[-1])
|
182
240
|
|
183
241
|
|
@@ -0,0 +1,25 @@
|
|
1
|
+
import datetime
|
2
|
+
from exchange_calendars.calendar_utils import get_calendar_names, register_calendar_type
|
3
|
+
from exchange_calendars.exchange_calendar_xnys import XNYSExchangeCalendar
|
4
|
+
|
5
|
+
|
6
|
+
NYSE_ALL_HOURS = "NYSE_ALL_HOURS"
|
7
|
+
|
8
|
+
|
9
|
+
class USExtendedHoursExchangeCalendar(XNYSExchangeCalendar):
    """
    XNYS-derived calendar for extended hours, which run from 4 AM to 8 PM.

    Only the name, open/close times and early-close time are overridden here;
    everything else comes from ``XNYSExchangeCalendar``.
    """

    name = NYSE_ALL_HOURS

    # Each entry pairs a start date with a time; ``None`` presumably means
    # "no start date", i.e. the time has always applied — confirm against
    # the exchange_calendars documentation.
    open_times = ((None, datetime.time(hour=4)),)
    close_times = ((None, datetime.time(hour=20)),)

    # NOTE(review): 13:00 matches the regular-session early close; confirm it
    # is also the intended close for extended-hours sessions on those days.
    regular_early_close = datetime.time(hour=13)
|
21
|
+
|
22
|
+
|
23
|
+
def register_nyse_all_hours_calendar():
    """Register the NYSE_ALL_HOURS calendar type unless that name is already known."""
    already_registered = NYSE_ALL_HOURS in get_calendar_names()
    if already_registered:
        # Skip so that calling this more than once does not register twice.
        return
    register_calendar_type(NYSE_ALL_HOURS, USExtendedHoursExchangeCalendar)
|
@@ -48,7 +48,10 @@ class PolygonAssets:
|
|
48
48
|
active: bool = True,
|
49
49
|
):
|
50
50
|
response = self.polygon_client.list_tickers(
|
51
|
-
market=self.config.market,
|
51
|
+
market=self.config.market,
|
52
|
+
active=active,
|
53
|
+
date=date.date().isoformat(),
|
54
|
+
limit=500,
|
52
55
|
)
|
53
56
|
tickers_df = pd.DataFrame(list(response))
|
54
57
|
# The currency info is for crypto. The source_feed is always NA.
|