zipline_polygon_bundle 0.1.8__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff compares the contents of two publicly released package versions as they appear in their public registries, and is provided for informational purposes only.
- zipline_polygon_bundle/__init__.py +12 -11
- zipline_polygon_bundle/adjustments.py +27 -32
- zipline_polygon_bundle/bundle.py +172 -200
- zipline_polygon_bundle/compute_signals.py +261 -0
- zipline_polygon_bundle/concat_all_aggs.py +129 -44
- zipline_polygon_bundle/config.py +90 -32
- zipline_polygon_bundle/nyse_all_hours_calendar.py +25 -0
- zipline_polygon_bundle/tickers_and_names.py +4 -1
- zipline_polygon_bundle/trades.py +352 -526
- {zipline_polygon_bundle-0.1.8.dist-info → zipline_polygon_bundle-0.2.0.dist-info}/METADATA +7 -5
- zipline_polygon_bundle-0.2.0.dist-info/RECORD +18 -0
- zipline_polygon_bundle-0.1.8.dist-info/RECORD +0 -16
- {zipline_polygon_bundle-0.1.8.dist-info → zipline_polygon_bundle-0.2.0.dist-info}/LICENSE +0 -0
- {zipline_polygon_bundle-0.1.8.dist-info → zipline_polygon_bundle-0.2.0.dist-info}/WHEEL +0 -0
zipline_polygon_bundle/bundle.py
CHANGED
```diff
@@ -1,19 +1,22 @@
-import os
 from zipline.data.bundles import register
 from zipline.data.resample import minute_frame_to_session_frame
 
 from exchange_calendars.calendar_helpers import parse_date
-from zipline.utils.calendar_utils import get_calendar
 
-from .config import PolygonConfig
 from .concat_all_aggs import concat_all_aggs_from_csv, generate_csv_agg_tables
 from .adjustments import load_splits, load_dividends
+from .config import PolygonConfig, AGG_TIME_DAY, AGG_TIME_MINUTE, AGG_TIME_TRADES
+from .nyse_all_hours_calendar import register_nyse_all_hours_calendar
+from .trades import convert_trades_to_custom_aggs, scatter_custom_aggs_to_by_ticker
 
 import pyarrow
 import pyarrow.compute
 import pyarrow.dataset
 
 import pandas as pd
+
+import os
+from filelock import FileLock
 import logging
 
 
@@ -29,67 +32,40 @@ def symbol_to_upper(s: str) -> str:
 def generate_all_agg_tables_from_csv(
     config: PolygonConfig,
 ):
-
+    schema, tables = generate_csv_agg_tables(config)
     for table in tables:
         table = table.sort_by([("ticker", "ascending"), ("window_start", "ascending")])
         yield table
 
 
-
-
-
-
-
-
-
-
-
-# )
-# df = df[~duplicated_index_with_zero_activity]
-# duplicated_index = df.index.duplicated(keep=False)
-# if not duplicated_index.any():
-#     return df
-# print(f" WARNING: Dropping dupes {df[duplicated_index]=}")
-# df = df[df.index.duplicated(keep="first")]
-# return df
-
-
-def aggregate_multiple_aggs_per_date(df: pd.DataFrame) -> pd.DataFrame:
-    duplicated_index = df.index.duplicated(keep=False)
-    if not duplicated_index.any():
-        return df
-    duplicates = df[duplicated_index]
-    duplicate_index_values = duplicates.index.values
-    print()
-    if duplicates["symbol"].nunique() != 1:
-        logging.error(f"{duplicates['symbol'].unique()=} {duplicate_index_values=}")
-    logging.warning(
-        f"Aggregating dupes df[df.index.duplicated(keep=False)]=\n{duplicates}"
-    )
-    df = df.groupby(df.index).agg(
-        {
-            "symbol": "first",
-            "volume": "sum",
-            "open": "first",
-            "close": "last",
-            "high": "max",
-            "low": "min",
-            "transactions": "sum",
-        }
+def rename_polygon_to_zipline(table: pyarrow.Table, time_name: str) -> pyarrow.Table:
+    table = table.rename_columns(
+        [
+            (
+                "symbol"
+                if name == "ticker"
+                else time_name if name == "window_start" else name
+            )
+            for name in table.column_names
+        ]
     )
-
-    return df
+    return table
 
 
-def process_day_aggregates(
+def process_day_table(
     table,
     sessions,
+    minutes,
     metadata,
     calendar,
     symbol_to_sid: dict[str, int],
-    dates_with_data: set,
+    dates_with_data: set[pd.Timestamp],
+    agg_time: str,
 ):
+    table = rename_polygon_to_zipline(table, "day")
+    symbols = table.column("symbol").unique().to_pylist()
+    for sid, symbol in enumerate(symbols):
+        symbol_to_sid[symbol] = sid
     for symbol, sid in symbol_to_sid.items():
         df = table.filter(
             pyarrow.compute.field("symbol") == pyarrow.scalar(symbol)
```
```diff
@@ -97,24 +73,25 @@ def process_day_aggregates(
         # The SQL schema zipline uses for symbols ignores case
         sql_symbol = symbol_to_upper(symbol)
         df["symbol"] = sql_symbol
-        df["day"] = pd.to_datetime(df["day"].dt.date)
+        df["day"] = pd.to_datetime(df["day"].dt.tz_convert(calendar.tz.key).dt.date)
         df = df.set_index("day")
         if not df.index.is_monotonic_increasing:
-            print(f" INFO: {symbol=} {sid=} not monotonic increasing")
+            print(f" INFO: {symbol=} {sid=} not monotonic increasing: {df.index.min()=} {df.index.max()=}")
         df.sort_index(inplace=True)
         # Remove duplicates
         df = df[~df.index.duplicated(keep="first")]
         # Take days as per calendar
         df = df[df.index.isin(sessions)]
         # 2019-08-13 has a bunch of tickers with multiple day aggs per date
-        df = aggregate_multiple_aggs_per_date(df)
+        # TODO: Actually they're for different days so if the filtering doesn't work then do something about it.
+        # df = aggregate_multiple_aggs_per_date(df)
         if len(df) < 1:
             continue
         # Check first and last date.
         start_date = df.index[0]
-        dates_with_data.add(start_date.date())
+        dates_with_data.add(start_date)
         end_date = df.index[-1]
-        dates_with_data.add(end_date.date())
+        dates_with_data.add(end_date)
         try:
             duplicated_index = df.index.duplicated(keep=False)
             df_with_duplicates = df[duplicated_index]
@@ -157,109 +134,22 @@ def process_day_aggregates(
     return
 
 
-def rename_polygon_to_zipline(table, time_name: str):
-    table = table.rename_columns(
-        [
-            (
-                "symbol"
-                if name == "ticker"
-                else time_name if name == "window_start" else name
-            )
-            for name in table.column_names
-        ]
-    )
-    return table
-
-
-def polygon_equities_bundle_day(
-    environ,
-    asset_db_writer,
-    minute_bar_writer,
-    daily_bar_writer,
-    adjustment_writer,
-    calendar,
-    start_date,
-    end_date,
-    cache,
-    show_progress,
-    output_dir,
-):
-    config = PolygonConfig(
-        environ=environ,
-        calendar_name=calendar.name,
-        start_date=start_date,
-        end_date=end_date,
-        agg_time="day",
-    )
-
-    by_ticker_aggs_arrow_dir = concat_all_aggs_from_csv(config)
-    aggregates = pyarrow.dataset.dataset(by_ticker_aggs_arrow_dir)
-
-    # Zipline uses case-insensitive symbols, so we need to convert them to uppercase with a ^ prefix when lowercase.
-    # This is because the SQL schema zipline uses for symbols ignores case.
-    # We put the original symbol in the asset_name field.
-    metadata = pd.DataFrame(
-        columns=(
-            "start_date",
-            "end_date",
-            "auto_close_date",
-            "symbol",
-            "exchange",
-            "asset_name",
-        )
-    )
-
-    table = aggregates.to_table()
-    table = rename_polygon_to_zipline(table, "day")
-    # Get all the symbols in the table by using value_counts to tabulate the unique values.
-    # pyarrow.Table.column returns a pyarrow.ChunkedArray.
-    # https://arrow.apache.org/docs/python/generated/pyarrow.ChunkedArray.html#pyarrow.ChunkedArray.value_counts
-    symbols = sorted(table.column("symbol").value_counts().field(0).to_pylist())
-    symbol_to_sid = {symbol: sid for sid, symbol in enumerate(symbols)}
-    dates_with_data = set()
-
-    # Get data for all stocks and write to Zipline
-    daily_bar_writer.write(
-        process_day_aggregates(
-            table=table,
-            sessions=calendar.sessions_in_range(start_date, end_date),
-            metadata=metadata,
-            calendar=calendar,
-            symbol_to_sid=symbol_to_sid,
-            dates_with_data=dates_with_data,
-        ),
-        show_progress=show_progress,
-    )
-
-    # Write the metadata
-    asset_db_writer.write(equities=metadata)
-
-    # Load splits and dividends
-    first_start_end = min(dates_with_data)
-    last_end_date = max(dates_with_data)
-    splits = load_splits(config, first_start_end, last_end_date, symbol_to_sid)
-    dividends = load_dividends(config, first_start_end, last_end_date, symbol_to_sid)
-
-    # Write splits and dividends
-    adjustment_writer.write(splits=splits, dividends=dividends)
-
-
-def process_minute_fragment(
-    fragment,
+def process_minute_table(
+    table,
     sessions,
     minutes,
     metadata,
     calendar,
     symbol_to_sid: dict[str, int],
-    dates_with_data: set,
+    dates_with_data: set[pd.Timestamp],
     agg_time: str,
 ):
-    table = fragment.to_table()
-    print(f" {table.num_rows=}")
     table = rename_polygon_to_zipline(table, "timestamp")
-
+    # print(f"{minutes[:5]=}\n{minutes[-5:]=}")
     table = table.filter(pyarrow.compute.field("timestamp").isin(minutes))
+    # print(f"filtered {table.num_rows=}")
     table_df = table.to_pandas()
+    # print(f"{table_df.head()=}")
     for symbol, df in table_df.groupby("symbol"):
         # print(f"\n{symbol=} {len(df)=} {df['timestamp'].min()} {df['timestamp'].max()}")
         if symbol not in symbol_to_sid:
```
```diff
@@ -269,29 +159,35 @@ def process_minute_fragment(
         sql_symbol = symbol_to_upper(symbol)
         df["symbol"] = sql_symbol
         df = df.set_index("timestamp")
-        if agg_time == "day":
+        # Shouldn't need to do this because the table is sorted.
+        if not df.index.is_monotonic_increasing:
+            print(f" INFO: {symbol=} {sid=} not monotonic increasing")
+            df.sort_index(inplace=True)
+        if agg_time == AGG_TIME_DAY:
             df.drop(columns=["symbol", "transactions"], inplace=True)
-            #
-            start_date = df.index[0].date()
-            start_timestamp = df.index[0]
+            # Remember first and last date.
+            start_date = df.index[0].tz_convert(calendar.tz.key).normalize()
             dates_with_data.add(start_date)
-            end_date = df.index[-1].date()
-            end_timestamp = df.index[-1]
+            end_date = df.index[-1].tz_convert(calendar.tz.key).normalize()
             dates_with_data.add(end_date)
             df = df[df.index.isin(minutes)]
             len_before = len(df)
+            # print(f"{start_date=} {end_date=} {dates_with_data=}")
+            # print(f"day pre {df.head()=}")
             if len(df) < 1:
                 # TODO: Move sid assignment until after this check for no data.
                 print(
-                    f" WARNING: No data for {symbol=} {sid=} {len_before=} {
+                    f" WARNING: No data for {symbol=} {sid=} {len_before=} {start_date=} {end_date=}"
                 )
                 continue
             df = minute_frame_to_session_frame(df, calendar)
-
+            # print(f"day sess {df.head()=}")
+            # df["symbol"] = sql_symbol
             df = df[df.index.isin(sessions)]
 
             # The auto_close date is the day after the last trade.
-            auto_close_date = end_date + pd.Timedelta(days=1)
+            # auto_close_date = end_date + pd.Timedelta(days=1)
+            auto_close_date = None
 
             # If metadata already has this sid, just extend the end_date and ac_date.
             if sid in metadata.index:
```
```diff
@@ -311,12 +207,12 @@ def process_minute_fragment(
                     start_date,
                     end_date,
                     auto_close_date,
-
+                    sql_symbol,
                     calendar.name,
                     symbol,
                 )
-            df = df.reindex(sessions.tz_localize(None))
-
+            # df = df.reindex(sessions.tz_localize(None))
+            df = df.reindex(sessions)
             # Missing volume and transactions are zero
             df["volume"] = df["volume"].fillna(0)
             # df["transactions"] = df["transactions"].fillna(0)
@@ -324,13 +220,14 @@ def process_minute_fragment(
             # TODO: These fills should have the same price for OHLC (open for backfill, close for forward fill)
             df.ffill(inplace=True)
             # Back fill missing data (maybe necessary for before the first day bar)
+            # TODO: Don't want to backfill future values. What's better here?
             df.bfill(inplace=True)
             if len(df) > 0:
                 # print(f"\n{symbol=} {sid=} {len_before=} {start_timestamp=} {end_date=} {end_timestamp=} {len(df)=}")
                 yield sid, df
             else:
                 print(
-                    f" WARNING: No day bars for {symbol=} {sid=} {len_before=} {start_date=} {
+                    f" WARNING: No day bars for {symbol=} {sid=} {len_before=} {start_date=} {start_date=} {end_date=} {end_date=}"
                 )
         else:
             len_before = len(df)
```
```diff
@@ -352,20 +249,35 @@ def process_minute_fragment(
     return
 
 
-def process_minute_aggregates(
+def process_aggregates(
+    process_table_func,
     fragments,
     sessions,
     minutes,
     metadata,
     calendar,
     symbol_to_sid: dict[str, int],
-    dates_with_data: set,
+    dates_with_data: set[pd.Timestamp],
     agg_time: str,
 ):
-    # We
+    # We do this by Hive partition at a time because each ticker will be complete.
     for fragment in fragments:
-        yield from process_minute_fragment(
-            fragment=fragment,
+        # Only get the columns Zipline allows.
+        table = fragment.to_table(
+            columns=[
+                "ticker",
+                "window_start",
+                "open",
+                "high",
+                "low",
+                "close",
+                "volume",
+                "transactions",
+            ]
+        )
+        table = table.sort_by([("ticker", "ascending"), ("window_start", "ascending")])
+        yield from process_table_func(
+            table=table,
             sessions=sessions,
             minutes=minutes,
             metadata=metadata,
@@ -374,6 +286,7 @@ def process_minute_aggregates(
             dates_with_data=dates_with_data,
             agg_time=agg_time,
         )
+        del table
 
     # This doesn't seem to be hardly any faster than the above, something with the GIL?
     # Also to use this we'd need to make sure the symbol_to_sid and dates_with_data are thread safe.
```
```diff
@@ -396,7 +309,8 @@ def process_minute_aggregates(
     # yield from future.result()
 
 
-def polygon_equities_bundle_minute(
+def ingest_polygon_equities_bundle(
+    agg_time: str,
     environ,
     asset_db_writer,
     minute_bar_writer,
@@ -414,10 +328,20 @@ def polygon_equities_bundle_minute(
         calendar_name=calendar.name,
         start_date=start_date,
         end_date=end_date,
-        agg_time="minute",
+        agg_time=agg_time,
     )
 
-    by_ticker_aggs_arrow_dir = concat_all_aggs_from_csv(config)
+    print(f"{calendar.name=} {start_date=} {end_date=}")
+    print(f"{calendar.sessions_in_range(start_date, end_date)[:4]}")
+    print(f"{calendar.sessions_in_range(start_date, end_date)[-4:]}")
+    print(f"{calendar.sessions_minutes(start_date, end_date)[:4]}")
+    print(f"{calendar.sessions_minutes(start_date, end_date)[-4:]}")
+
+    if agg_time in [AGG_TIME_TRADES, "1min", "1minute"]:
+        convert_trades_to_custom_aggs(config, overwrite=False)
+        by_ticker_aggs_arrow_dir = scatter_custom_aggs_to_by_ticker(config)
+    else:
+        by_ticker_aggs_arrow_dir = concat_all_aggs_from_csv(config)
     aggregates = pyarrow.dataset.dataset(by_ticker_aggs_arrow_dir)
     # print(f"{aggregates.schema=}")
     # 3.5 billion rows for 10 years of minute data.
@@ -441,11 +365,13 @@ def polygon_equities_bundle_minute(
     )
 
     symbol_to_sid = {}
+    # Keep track of earliest and latest dates with data across all symbols.
     dates_with_data = set()
 
     # Get data for all stocks and write to Zipline
     daily_bar_writer.write(
-        process_minute_aggregates(
+        process_aggregates(
+            process_day_table if config.agg_time == AGG_TIME_DAY else process_minute_table,
             fragments=aggregates.get_fragments(),
             sessions=calendar.sessions_in_range(start_date, end_date),
             minutes=calendar.sessions_minutes(start_date, end_date),
@@ -453,25 +379,26 @@ def polygon_equities_bundle_minute(
             calendar=calendar,
             symbol_to_sid=symbol_to_sid,
             dates_with_data=dates_with_data,
-            agg_time="day",
+            agg_time=AGG_TIME_DAY,
         ),
         show_progress=show_progress,
     )
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    if config.agg_time != AGG_TIME_DAY:
+        minute_bar_writer.write(
+            process_aggregates(
+                process_minute_table,
+                fragments=aggregates.get_fragments(),
+                sessions=calendar.sessions_in_range(start_date, end_date),
+                minutes=calendar.sessions_minutes(start_date, end_date),
+                metadata=metadata,
+                calendar=calendar,
+                symbol_to_sid=symbol_to_sid,
+                dates_with_data=dates_with_data,
+                agg_time=AGG_TIME_MINUTE,
+            ),
+            show_progress=show_progress,
+        )
 
     # Write the metadata
     asset_db_writer.write(equities=metadata)
```
```diff
@@ -486,48 +413,93 @@ def polygon_equities_bundle_minute(
     adjustment_writer.write(splits=splits, dividends=dividends)
 
 
+def ingest_polygon_equities_bundle_for_agg_time(agg_time: str):
+    def ingest_polygon_equities_bundle_inner(
+        environ,
+        asset_db_writer,
+        minute_bar_writer,
+        daily_bar_writer,
+        adjustment_writer,
+        calendar,
+        start_date,
+        end_date,
+        cache,
+        show_progress,
+        output_dir,
+    ):
+        return ingest_polygon_equities_bundle(
+            agg_time=agg_time,
+            environ=environ,
+            asset_db_writer=asset_db_writer,
+            minute_bar_writer=minute_bar_writer,
+            daily_bar_writer=daily_bar_writer,
+            adjustment_writer=adjustment_writer,
+            calendar=calendar,
+            start_date=start_date,
+            end_date=end_date,
+            cache=cache,
+            show_progress=show_progress,
+            output_dir=output_dir,
+        )
+
+    return ingest_polygon_equities_bundle_inner
+
+
 def register_polygon_equities_bundle(
     bundlename,
     start_date=None,
     end_date=None,
     calendar_name="XNYS",
-    agg_time="day",
+    agg_time=AGG_TIME_DAY,
+    minutes_per_day=390,
+    environ=os.environ,
     # ticker_list=None,
     # watchlists=None,
     # include_asset_types=None,
 ):
-    calendar = get_calendar(calendar_name)
-
+    register_nyse_all_hours_calendar()
+
+    # pd.set_option("display.max_columns", None)
+    # pd.set_option("display.width", 500)
+
+    # Note that "minute" is the Polygon minute aggs and "1minute" is the trades.
+    if agg_time not in [AGG_TIME_DAY, AGG_TIME_MINUTE, AGG_TIME_TRADES, "1min", "1minute"]:
+        raise ValueError(
+            f"agg_time must be 'day', 'minute' (aggs), or '1minute' (trades), not '{agg_time}'"
+        )
+
     # We need to know the start and end dates of the session before the bundle is
-    # registered because even though we only need it for ingest, the metadata in
+    # registered because even though we only need it for ingest, the metadata in
     # the writer is initialized and written before our ingest function is called.
     if start_date is None or end_date is None:
         config = PolygonConfig(
-            environ=os.environ,
+            environ=environ,
             calendar_name=calendar_name,
             start_date=start_date,
             end_date=end_date,
             agg_time=agg_time,
         )
-        first_aggs_date, last_aggs_date = config.find_first_and_last_aggs(
+        first_aggs_date, last_aggs_date = config.find_first_and_last_aggs(
+            config.aggs_dir if agg_time in [AGG_TIME_DAY, AGG_TIME_MINUTE] else config.trades_dir,
+            config.csv_paths_pattern,
+        )
+        # print(f"{bundlename=} {first_aggs_date=} {last_aggs_date=}")
         if start_date is None:
             start_date = first_aggs_date
         if end_date is None:
             end_date = last_aggs_date
 
-
+    start_session = parse_date(start_date, raise_oob=False) if start_date else None
+    end_session = parse_date(end_date, raise_oob=False) if end_date else None
+    # print(f"Registered {bundlename=} {agg_time=} {start_session=} {end_session=}")
 
     register(
         bundlename,
-        (
-            polygon_equities_bundle_minute
-            if agg_time == "minute"
-            else polygon_equities_bundle_day
-        ),
-        start_session=parse_date(start_date, calendar=calendar),
-        end_session=parse_date(end_date, calendar=calendar),
+        ingest_polygon_equities_bundle_for_agg_time(agg_time),
+        start_session=start_session,
+        end_session=end_session,
         calendar_name=calendar_name,
-
+        minutes_per_day=minutes_per_day,
         # create_writers=True,
     )
 
```
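Taken together, the rewrite replaces the separate day/minute bundle functions with a single `ingest_polygon_equities_bundle_for_agg_time` closure that `register_polygon_equities_bundle` wires up for any `agg_time`. Based on the new signature, registration in a Zipline `extension.py` might look like the sketch below; the bundle names are arbitrary, and the `NYSE_ALL_HOURS` calendar name is an assumption drawn from the new `nyse_all_hours_calendar` module:

```python
# Hypothetical ~/.zipline/extension.py
from zipline_polygon_bundle import register_polygon_equities_bundle

# Daily bars from the Polygon day aggs flat files.
register_polygon_equities_bundle("polygon-day", agg_time="day")

# Minute bars from the Polygon minute aggs flat files.
register_polygon_equities_bundle("polygon-minute", agg_time="minute")

# Minute bars built from the trades flat files (new in 0.2.0), on the
# all-hours calendar registered by register_nyse_all_hours_calendar().
register_polygon_equities_bundle(
    "polygon-trades",
    calendar_name="NYSE_ALL_HOURS",  # assumed name of the new calendar
    agg_time="1minute",
)
```

After that, `zipline ingest -b polygon-day` (or either of the other bundle names) runs the corresponding ingest.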