zipline_polygon_bundle 0.2.0.dev1__py3-none-any.whl → 0.2.3__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package as published to their public registry. It is provided for informational purposes only.
- zipline_polygon_bundle/__init__.py +7 -9
- zipline_polygon_bundle/adjustments.py +27 -32
- zipline_polygon_bundle/bundle.py +157 -312
- zipline_polygon_bundle/compute_signals.py +261 -0
- zipline_polygon_bundle/concat_all_aggs.py +130 -25
- zipline_polygon_bundle/config.py +70 -45
- zipline_polygon_bundle/trades.py +197 -606
- {zipline_polygon_bundle-0.2.0.dev1.dist-info → zipline_polygon_bundle-0.2.3.dist-info}/METADATA +90 -8
- zipline_polygon_bundle-0.2.3.dist-info/RECORD +18 -0
- {zipline_polygon_bundle-0.2.0.dev1.dist-info → zipline_polygon_bundle-0.2.3.dist-info}/WHEEL +1 -1
- zipline_polygon_bundle-0.2.0.dev1.dist-info/RECORD +0 -17
- {zipline_polygon_bundle-0.2.0.dev1.dist-info → zipline_polygon_bundle-0.2.3.dist-info}/LICENSE +0 -0
zipline_polygon_bundle/bundle.py
CHANGED
@@ -1,13 +1,11 @@
-import os
 from zipline.data.bundles import register
 from zipline.data.resample import minute_frame_to_session_frame

 from exchange_calendars.calendar_helpers import parse_date
-from exchange_calendars.calendar_utils import get_calendar

 from .concat_all_aggs import concat_all_aggs_from_csv, generate_csv_agg_tables
 from .adjustments import load_splits, load_dividends
-from .config import PolygonConfig
+from .config import PolygonConfig, AGG_TIME_DAY, AGG_TIME_MINUTE, AGG_TIME_TRADES
 from .nyse_all_hours_calendar import register_nyse_all_hours_calendar
 from .trades import convert_trades_to_custom_aggs, scatter_custom_aggs_to_by_ticker

@@ -16,6 +14,9 @@ import pyarrow.compute
 import pyarrow.dataset

 import pandas as pd
+
+import os
+from filelock import FileLock
 import logging


@@ -37,61 +38,34 @@ def generate_all_agg_tables_from_csv(
         yield table


-
-
-
-
-
-
-
-
-
-
-# )
-# df = df[~duplicated_index_with_zero_activity]
-# duplicated_index = df.index.duplicated(keep=False)
-# if not duplicated_index.any():
-# return df
-# print(f" WARNING: Dropping dupes {df[duplicated_index]=}")
-# df = df[df.index.duplicated(keep="first")]
-# return df
-
-
-def aggregate_multiple_aggs_per_date(df: pd.DataFrame) -> pd.DataFrame:
-    duplicated_index = df.index.duplicated(keep=False)
-    if not duplicated_index.any():
-        return df
-    duplicates = df[duplicated_index]
-    duplicate_index_values = duplicates.index.values
-    print()
-    if duplicates["symbol"].nunique() != 1:
-        logging.error(f"{duplicates['symbol'].unique()=} {duplicate_index_values=}")
-    logging.warning(
-        f"Aggregating dupes df[df.index.duplicated(keep=False)]=\n{duplicates}"
-    )
-    df = df.groupby(df.index).agg(
-        {
-            "symbol": "first",
-            "volume": "sum",
-            "open": "first",
-            "close": "last",
-            "high": "max",
-            "low": "min",
-            "transactions": "sum",
-        }
+def rename_polygon_to_zipline(table: pyarrow.Table, time_name: str) -> pyarrow.Table:
+    table = table.rename_columns(
+        [
+            (
+                "symbol"
+                if name == "ticker"
+                else time_name if name == "window_start" else name
+            )
+            for name in table.column_names
+        ]
     )
-
-    return df
+    return table


-def process_day_aggregates(
+def process_day_table(
     table,
     sessions,
+    minutes,
     metadata,
     calendar,
     symbol_to_sid: dict[str, int],
-    dates_with_data: set,
+    dates_with_data: set[pd.Timestamp],
+    agg_time: str,
 ):
+    table = rename_polygon_to_zipline(table, "day")
+    symbols = table.column("symbol").unique().to_pylist()
+    for sid, symbol in enumerate(symbols):
+        symbol_to_sid[symbol] = sid
     for symbol, sid in symbol_to_sid.items():
         df = table.filter(
             pyarrow.compute.field("symbol") == pyarrow.scalar(symbol)
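The hunk above replaces the old duplicate-aggregation helper with `rename_polygon_to_zipline` and the new `process_day_table`. A minimal sketch (not package code; the sample table and its values are invented) of what the column rename does to a Polygon-style table:

    import pyarrow

    # Illustrative only: Polygon-style columns on a tiny in-memory table.
    table = pyarrow.table(
        {
            "ticker": ["AAPL", "AAPL"],
            "window_start": [1672718400000000000, 1672804800000000000],  # epoch nanoseconds
            "close": [125.07, 126.36],
        }
    )
    # Same rename rule as the new helper: "ticker" -> "symbol",
    # "window_start" -> the caller-chosen time column name ("day" here).
    renamed = table.rename_columns(
        [
            "symbol" if name == "ticker" else "day" if name == "window_start" else name
            for name in table.column_names
        ]
    )
    print(renamed.column_names)  # ['symbol', 'day', 'close']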
@@ -99,24 +73,25 @@ def process_day_aggregates(
         # The SQL schema zipline uses for symbols ignores case
         sql_symbol = symbol_to_upper(symbol)
         df["symbol"] = sql_symbol
-        df["day"] = pd.to_datetime(df["day"].dt.date)
+        df["day"] = pd.to_datetime(df["day"].dt.tz_convert(calendar.tz.key).dt.date)
         df = df.set_index("day")
         if not df.index.is_monotonic_increasing:
-            print(f" INFO: {symbol=} {sid=} not monotonic increasing")
+            print(f" INFO: {symbol=} {sid=} not monotonic increasing: {df.index.min()=} {df.index.max()=}")
             df.sort_index(inplace=True)
         # Remove duplicates
         df = df[~df.index.duplicated(keep="first")]
         # Take days as per calendar
         df = df[df.index.isin(sessions)]
         # 2019-08-13 has a bunch of tickers with multiple day aggs per date
-
+        # TODO: Actually they're for different days so if the filtering doesn't work then do something about it.
+        # df = aggregate_multiple_aggs_per_date(df)
         if len(df) < 1:
             continue
         # Check first and last date.
         start_date = df.index[0]
-        dates_with_data.add(start_date
+        dates_with_data.add(start_date)
         end_date = df.index[-1]
-        dates_with_data.add(end_date
+        dates_with_data.add(end_date)
         try:
             duplicated_index = df.index.duplicated(keep=False)
             df_with_duplicates = df[duplicated_index]
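The `df["day"]` change above converts the UTC `window_start` to the exchange calendar's time zone before taking the date. A small pandas sketch (illustrative, not package code) of why that matters for timestamps that cross midnight UTC:

    import pandas as pd

    # 2023-01-04 01:00 UTC is still 2023-01-03 8:00 PM in New York.
    ts = pd.Series(pd.to_datetime(["2023-01-04 01:00:00"], utc=True))
    print(ts.dt.date[0])                                    # 2023-01-04 (UTC date)
    print(ts.dt.tz_convert("America/New_York").dt.date[0])  # 2023-01-03 (local date)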
@@ -159,133 +134,22 @@ def process_day_aggregates(
     return


-def rename_polygon_to_zipline(table: pyarrow.Table, time_name: str) -> pyarrow.Table:
-    table = table.rename_columns(
-        [
-            (
-                "symbol"
-                if name == "ticker"
-                else time_name if name == "window_start" else name
-            )
-            for name in table.column_names
-        ]
-    )
-    return table
-
-
-def polygon_equities_bundle_day(
-    environ,
-    asset_db_writer,
-    minute_bar_writer,
-    daily_bar_writer,
-    adjustment_writer,
-    calendar,
-    start_date,
-    end_date,
-    cache,
-    show_progress,
-    output_dir,
-):
-    config = PolygonConfig(
-        environ=environ,
-        calendar_name=calendar.name,
-        start_date=start_date,
-        end_date=end_date,
-        agg_time="day",
-    )
-
-    by_ticker_aggs_arrow_dir = concat_all_aggs_from_csv(config)
-    aggregates = pyarrow.dataset.dataset(by_ticker_aggs_arrow_dir)
-
-    # Zipline uses case-insensitive symbols, so we need to convert them to uppercase with a ^ prefix when lowercase.
-    # This is because the SQL schema zipline uses for symbols ignores case.
-    # We put the original symbol in the asset_name field.
-    metadata = pd.DataFrame(
-        columns=(
-            "start_date",
-            "end_date",
-            "auto_close_date",
-            "symbol",
-            "exchange",
-            "asset_name",
-        )
-    )
-
-    # Only get the columns Zipline allows.
-    table = aggregates.to_table(
-        columns=[
-            "ticker",
-            "window_start",
-            "open",
-            "high",
-            "low",
-            "close",
-            "volume",
-            "transactions",
-        ]
-    )
-    table = rename_polygon_to_zipline(table, "day")
-    # Get all the symbols in the table by using value_counts to tabulate the unique values.
-    # pyarrow.Table.column returns a pyarrow.ChunkedArray.
-    # https://arrow.apache.org/docs/python/generated/pyarrow.ChunkedArray.html#pyarrow.ChunkedArray.value_counts
-    symbols = sorted(table.column("symbol").value_counts().field(0).to_pylist())
-    symbol_to_sid = {symbol: sid for sid, symbol in enumerate(symbols)}
-    dates_with_data = set()
-
-    # Get data for all stocks and write to Zipline
-    daily_bar_writer.write(
-        process_day_aggregates(
-            table=table,
-            sessions=calendar.sessions_in_range(start_date, end_date),
-            metadata=metadata,
-            calendar=calendar,
-            symbol_to_sid=symbol_to_sid,
-            dates_with_data=dates_with_data,
-        ),
-        show_progress=show_progress,
-    )
-
-    # Write the metadata
-    asset_db_writer.write(equities=metadata)
-
-    # Load splits and dividends
-    first_start_end = min(dates_with_data)
-    last_end_date = max(dates_with_data)
-    splits = load_splits(config, first_start_end, last_end_date, symbol_to_sid)
-    dividends = load_dividends(config, first_start_end, last_end_date, symbol_to_sid)
-
-    # Write splits and dividends
-    adjustment_writer.write(splits=splits, dividends=dividends)
-
-
-def process_minute_fragment(
-    fragment,
+def process_minute_table(
+    table,
     sessions,
     minutes,
     metadata,
     calendar,
     symbol_to_sid: dict[str, int],
-    dates_with_data: set,
+    dates_with_data: set[pd.Timestamp],
     agg_time: str,
 ):
-    # Only get the columns Zipline allows.
-    table = fragment.to_table(
-        columns=[
-            "ticker",
-            "window_start",
-            "open",
-            "high",
-            "low",
-            "close",
-            "volume",
-            "transactions",
-        ]
-    )
-    print(f" {table.num_rows=}")
     table = rename_polygon_to_zipline(table, "timestamp")
-
+    # print(f"{minutes[:5]=}\n{minutes[-5:]=}")
     table = table.filter(pyarrow.compute.field("timestamp").isin(minutes))
+    # print(f"filtered {table.num_rows=}")
     table_df = table.to_pandas()
+    # print(f"{table_df.head()=}")
     for symbol, df in table_df.groupby("symbol"):
         # print(f"\n{symbol=} {len(df)=} {df['timestamp'].min()} {df['timestamp'].max()}")
         if symbol not in symbol_to_sid:
@@ -295,29 +159,35 @@ def process_minute_fragment(
         sql_symbol = symbol_to_upper(symbol)
         df["symbol"] = sql_symbol
         df = df.set_index("timestamp")
-
+        # Shouldn't need to do this because the table is sorted.
+        if not df.index.is_monotonic_increasing:
+            print(f" INFO: {symbol=} {sid=} not monotonic increasing")
+            df.sort_index(inplace=True)
+        if agg_time == AGG_TIME_DAY:
             df.drop(columns=["symbol", "transactions"], inplace=True)
-            #
-            start_date = df.index[0].
-            start_timestamp = df.index[0]
+            # Remember first and last date.
+            start_date = df.index[0].tz_convert(calendar.tz.key).normalize()
             dates_with_data.add(start_date)
-            end_date = df.index[-1].
-            end_timestamp = df.index[-1]
+            end_date = df.index[-1].tz_convert(calendar.tz.key).normalize()
             dates_with_data.add(end_date)
             df = df[df.index.isin(minutes)]
             len_before = len(df)
+            # print(f"{start_date=} {end_date=} {dates_with_data=}")
+            # print(f"day pre {df.head()=}")
             if len(df) < 1:
                 # TODO: Move sid assignment until after this check for no data.
                 print(
-                    f" WARNING: No data for {symbol=} {sid=} {len_before=} {
+                    f" WARNING: No data for {symbol=} {sid=} {len_before=} {start_date=} {end_date=}"
                 )
                 continue
             df = minute_frame_to_session_frame(df, calendar)
-
+            # print(f"day sess {df.head()=}")
+            # df["symbol"] = sql_symbol
             df = df[df.index.isin(sessions)]

             # The auto_close date is the day after the last trade.
-            auto_close_date = end_date + pd.Timedelta(days=1)
+            # auto_close_date = end_date + pd.Timedelta(days=1)
+            auto_close_date = None

             # If metadata already has this sid, just extend the end_date and ac_date.
             if sid in metadata.index:
@@ -337,12 +207,12 @@ def process_minute_fragment(
                     start_date,
                     end_date,
                     auto_close_date,
-
+                    sql_symbol,
                     calendar.name,
                     symbol,
                 )
-            df = df.reindex(sessions.tz_localize(None))
-
+            # df = df.reindex(sessions.tz_localize(None))
+            df = df.reindex(sessions)
             # Missing volume and transactions are zero
             df["volume"] = df["volume"].fillna(0)
             # df["transactions"] = df["transactions"].fillna(0)
@@ -350,13 +220,14 @@ def process_minute_fragment(
             # TODO: These fills should have the same price for OHLC (open for backfill, close for forward fill)
             df.ffill(inplace=True)
             # Back fill missing data (maybe necessary for before the first day bar)
+            # TODO: Don't want to backfill future values. What's better here?
             df.bfill(inplace=True)
             if len(df) > 0:
                 # print(f"\n{symbol=} {sid=} {len_before=} {start_timestamp=} {end_date=} {end_timestamp=} {len(df)=}")
                 yield sid, df
             else:
                 print(
-                    f" WARNING: No day bars for {symbol=} {sid=} {len_before=} {start_date=} {
+                    f" WARNING: No day bars for {symbol=} {sid=} {len_before=} {start_date=} {start_date=} {end_date=} {end_date=}"
                 )
         else:
             len_before = len(df)
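The hunk above keeps the forward-fill-then-back-fill treatment of reindexed session frames, with the TODOs noting the open questions. A toy illustration of that fill order (illustrative values, not package code):

    import pandas as pd

    idx = pd.date_range("2023-01-03", periods=4, freq="D")
    df = pd.DataFrame(
        {"close": [10.0, None, None, 11.0], "volume": [100, None, None, 50]}, index=idx
    )
    df["volume"] = df["volume"].fillna(0)  # missing volume means no trading
    df = df.ffill().bfill()                # forward fill, then back fill leading gaps
    print(df)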
@@ -378,20 +249,35 @@ def process_minute_fragment(
     return


-def process_minute_aggregates(
+def process_aggregates(
+    process_table_func,
     fragments,
     sessions,
     minutes,
     metadata,
     calendar,
     symbol_to_sid: dict[str, int],
-    dates_with_data: set,
+    dates_with_data: set[pd.Timestamp],
     agg_time: str,
 ):
-    # We
+    # We do this by Hive partition at a time because each ticker will be complete.
     for fragment in fragments:
-
-
+        # Only get the columns Zipline allows.
+        table = fragment.to_table(
+            columns=[
+                "ticker",
+                "window_start",
+                "open",
+                "high",
+                "low",
+                "close",
+                "volume",
+                "transactions",
+            ]
+        )
+        table = table.sort_by([("ticker", "ascending"), ("window_start", "ascending")])
+        yield from process_table_func(
+            table=table,
             sessions=sessions,
             minutes=minutes,
             metadata=metadata,
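`process_aggregates` now drives the fragment iteration and delegates per-table work to a callback (`process_day_table` or `process_minute_table`). A self-contained sketch of that dispatch pattern with invented names and data (not package code):

    from typing import Callable, Iterable, Iterator, Tuple
    import pandas as pd

    def drive(tables: Iterable[pd.DataFrame],
              process_table: Callable[[pd.DataFrame], Iterator[Tuple[int, pd.DataFrame]]]):
        # One driver loop; the callback decides how a table becomes (sid, df) pairs.
        for table in tables:
            yield from process_table(table)

    def per_symbol(table: pd.DataFrame):
        for sid, (symbol, df) in enumerate(table.groupby("symbol")):
            yield sid, df

    bars = pd.DataFrame({"symbol": ["A", "A", "B"], "close": [1.0, 1.1, 2.0]})
    for sid, df in drive([bars], per_symbol):
        print(sid, len(df))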
@@ -400,6 +286,7 @@ def process_minute_aggregates(
             dates_with_data=dates_with_data,
             agg_time=agg_time,
         )
+        del table

     # This doesn't seem to be hardly any faster than the above, something with the GIL?
     # Also to use this we'd need to make sure the symbol_to_sid and dates_with_data are thread safe.
@@ -422,7 +309,8 @@ def process_minute_aggregates(
     # yield from future.result()


-def polygon_equities_bundle_minute(
+def ingest_polygon_equities_bundle(
+    agg_time: str,
     environ,
     asset_db_writer,
     minute_bar_writer,
@@ -440,10 +328,20 @@ def polygon_equities_bundle_minute(
         calendar_name=calendar.name,
         start_date=start_date,
         end_date=end_date,
-        agg_time=
+        agg_time=agg_time,
     )

-
+    print(f"{calendar.name=} {start_date=} {end_date=}")
+    print(f"{calendar.sessions_in_range(start_date, end_date)[:4]}")
+    print(f"{calendar.sessions_in_range(start_date, end_date)[-4:]}")
+    print(f"{calendar.sessions_minutes(start_date, end_date)[:4]}")
+    print(f"{calendar.sessions_minutes(start_date, end_date)[-4:]}")
+
+    if agg_time in [AGG_TIME_TRADES, "1min", "1minute"]:
+        convert_trades_to_custom_aggs(config, overwrite=False)
+        by_ticker_aggs_arrow_dir = scatter_custom_aggs_to_by_ticker(config)
+    else:
+        by_ticker_aggs_arrow_dir = concat_all_aggs_from_csv(config)
     aggregates = pyarrow.dataset.dataset(by_ticker_aggs_arrow_dir)
     # print(f"{aggregates.schema=}")
     # 3.5 billion rows for 10 years of minute data.
@@ -467,11 +365,13 @@ def polygon_equities_bundle_minute(
     )

     symbol_to_sid = {}
+    # Keep track of earliest and latest dates with data across all symbols.
     dates_with_data = set()

     # Get data for all stocks and write to Zipline
     daily_bar_writer.write(
-
+        process_aggregates(
+            process_day_table if config.agg_time == AGG_TIME_DAY else process_minute_table,
             fragments=aggregates.get_fragments(),
             sessions=calendar.sessions_in_range(start_date, end_date),
             minutes=calendar.sessions_minutes(start_date, end_date),
@@ -479,25 +379,26 @@ def polygon_equities_bundle_minute(
             calendar=calendar,
             symbol_to_sid=symbol_to_sid,
             dates_with_data=dates_with_data,
-            agg_time=
+            agg_time=AGG_TIME_DAY,
         ),
         show_progress=show_progress,
     )

-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    if config.agg_time != AGG_TIME_DAY:
+        minute_bar_writer.write(
+            process_aggregates(
+                process_minute_table,
+                fragments=aggregates.get_fragments(),
+                sessions=calendar.sessions_in_range(start_date, end_date),
+                minutes=calendar.sessions_minutes(start_date, end_date),
+                metadata=metadata,
+                calendar=calendar,
+                symbol_to_sid=symbol_to_sid,
+                dates_with_data=dates_with_data,
+                agg_time=AGG_TIME_MINUTE,
+            ),
+            show_progress=show_progress,
+        )

     # Write the metadata
     asset_db_writer.write(equities=metadata)
@@ -512,95 +413,36 @@ def polygon_equities_bundle_minute(
     adjustment_writer.write(splits=splits, dividends=dividends)


-def
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    convert_trades_to_custom_aggs(config, overwrite=False)
-    by_ticker_aggs_arrow_dir = scatter_custom_aggs_to_by_ticker(config)
-    aggregates = pyarrow.dataset.dataset(by_ticker_aggs_arrow_dir)
-    # 3.5 billion rows for 10 years of minute data.
-    # print(f"{aggregates.count_rows()=}")
-    # Can't sort the dataset because that reads it all into memory.
-    # aggregates = aggregates.sort_by([("ticker", "ascending"), ("window_start", "ascending")])
-    # print("Sorted")
-
-    # Zipline uses case-insensitive symbols, so we need to convert them to uppercase with a ^ prefix when lowercase.
-    # This is because the SQL schema zipline uses for symbols ignores case.
-    # We put the original symbol in the asset_name field.
-    metadata = pd.DataFrame(
-        columns=(
-            "start_date",
-            "end_date",
-            "auto_close_date",
-            "symbol",
-            "exchange",
-            "asset_name",
-        )
-    )
-
-    symbol_to_sid = {}
-    dates_with_data = set()
-
-    # Get data for all stocks and write to Zipline
-    daily_bar_writer.write(
-        process_minute_aggregates(
-            fragments=aggregates.get_fragments(),
-            sessions=calendar.sessions_in_range(start_date, end_date),
-            minutes=calendar.sessions_minutes(start_date, end_date),
-            metadata=metadata,
-            calendar=calendar,
-            symbol_to_sid=symbol_to_sid,
-            dates_with_data=dates_with_data,
-            agg_time="day",
-        ),
-        show_progress=show_progress,
-    )
-
-    # Get data for all stocks and write to Zipline
-    minute_bar_writer.write(
-        process_minute_aggregates(
-            fragments=aggregates.get_fragments(),
-            sessions=calendar.sessions_in_range(start_date, end_date),
-            minutes=calendar.sessions_minutes(start_date, end_date),
-            metadata=metadata,
+def ingest_polygon_equities_bundle_for_agg_time(agg_time: str):
+    def ingest_polygon_equities_bundle_inner(
+        environ,
+        asset_db_writer,
+        minute_bar_writer,
+        daily_bar_writer,
+        adjustment_writer,
+        calendar,
+        start_date,
+        end_date,
+        cache,
+        show_progress,
+        output_dir,
+    ):
+        return ingest_polygon_equities_bundle(
+            agg_time=agg_time,
+            environ=environ,
+            asset_db_writer=asset_db_writer,
+            minute_bar_writer=minute_bar_writer,
+            daily_bar_writer=daily_bar_writer,
+            adjustment_writer=adjustment_writer,
             calendar=calendar,
-
-
-
-
-
-
-
-    # Write the metadata
-    asset_db_writer.write(equities=metadata)
-
-    # Load splits and dividends
-    first_start_end = min(dates_with_data)
-    last_end_date = max(dates_with_data)
-    splits = load_splits(config, first_start_end, last_end_date, symbol_to_sid)
-    dividends = load_dividends(config, first_start_end, last_end_date, symbol_to_sid)
+            start_date=start_date,
+            end_date=end_date,
+            cache=cache,
+            show_progress=show_progress,
+            output_dir=output_dir,
+        )

-
-    adjustment_writer.write(splits=splits, dividends=dividends)
+    return ingest_polygon_equities_bundle_inner


 def register_polygon_equities_bundle(
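The removed day/minute/trades bundle functions are folded into one `ingest_polygon_equities_bundle`, and `ingest_polygon_equities_bundle_for_agg_time` bakes the chosen `agg_time` into a callable with the positional signature Zipline's `register` expects. The closure-factory pattern, reduced to a sketch (illustrative, not package code):

    def make_ingest(agg_time: str):
        # Returns a function whose positional signature matches what
        # zipline.data.bundles.register expects for an ingest callable.
        def ingest(environ, asset_db_writer, minute_bar_writer, daily_bar_writer,
                   adjustment_writer, calendar, start_date, end_date, cache,
                   show_progress, output_dir):
            print(f"would ingest with {agg_time=}")
        return ingest

    daily_ingest = make_ingest("day")
    minute_ingest = make_ingest("minute")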
@@ -608,16 +450,22 @@ def register_polygon_equities_bundle(
     start_date=None,
     end_date=None,
     calendar_name="XNYS",
-    agg_time=
+    agg_time=AGG_TIME_DAY,
+    minutes_per_day=390,
+    environ=os.environ,
     # ticker_list=None,
     # watchlists=None,
     # include_asset_types=None,
 ):
     register_nyse_all_hours_calendar()

-
+    # pd.set_option("display.max_columns", None)
+    # pd.set_option("display.width", 500)
+
+    # Note that "minute" is the Polygon minute aggs and "1minute" is the trades.
+    if agg_time not in [AGG_TIME_DAY, AGG_TIME_MINUTE, AGG_TIME_TRADES, "1min", "1minute"]:
         raise ValueError(
-            f"agg_time must be 'day', 'minute' (aggs), or '
+            f"agg_time must be 'day', 'minute' (aggs), or '1minute' (trades), not '{agg_time}'"
         )

     # We need to know the start and end dates of the session before the bundle is
@@ -625,36 +473,33 @@ def register_polygon_equities_bundle(
     # the writer is initialized and written before our ingest function is called.
     if start_date is None or end_date is None:
         config = PolygonConfig(
-            environ=
+            environ=environ,
             calendar_name=calendar_name,
             start_date=start_date,
             end_date=end_date,
             agg_time=agg_time,
         )
         first_aggs_date, last_aggs_date = config.find_first_and_last_aggs(
-            config.aggs_dir if agg_time in [
+            config.aggs_dir if agg_time in [AGG_TIME_DAY, AGG_TIME_MINUTE] else config.trades_dir,
             config.csv_paths_pattern,
         )
+        # print(f"{bundlename=} {first_aggs_date=} {last_aggs_date=}")
         if start_date is None:
             start_date = first_aggs_date
         if end_date is None:
             end_date = last_aggs_date

+    start_session = parse_date(start_date, raise_oob=False) if start_date else None
+    end_session = parse_date(end_date, raise_oob=False) if end_date else None
+    # print(f"Registered {bundlename=} {agg_time=} {start_session=} {end_session=}")
+
     register(
         bundlename,
-        (
-
-
-            else (
-                polygon_equities_bundle_minute
-                if agg_time == "minute"
-                else polygon_equities_bundle_trades
-            )
-        ),
-        start_session=parse_date(start_date, raise_oob=False) if start_date else None,
-        end_session=parse_date(end_date, raise_oob=False) if end_date else None,
+        ingest_polygon_equities_bundle_for_agg_time(agg_time),
+        start_session=start_session,
+        end_session=end_session,
         calendar_name=calendar_name,
-
+        minutes_per_day=minutes_per_day,
         # create_writers=True,
     )

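Based only on the signature visible in this diff (bundle name, calendar_name, agg_time, minutes_per_day), a possible registration sketch for a Zipline extension.py. The bundle names are hypothetical, the assumption that the function is re-exported from the package top level is not confirmed by this diff, and data-directory environment variables are not shown here:

    from zipline_polygon_bundle import register_polygon_equities_bundle

    # Hypothetical bundle names; agg_time selects day aggs, minute aggs, or trades.
    register_polygon_equities_bundle("polygon", calendar_name="XNYS", agg_time="day")
    register_polygon_equities_bundle("polygon-minute", calendar_name="XNYS", agg_time="minute")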