zipline_polygon_bundle 0.1.7__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- zipline_polygon_bundle/__init__.py +33 -5
- zipline_polygon_bundle/adjustments.py +60 -31
- zipline_polygon_bundle/bundle.py +202 -208
- zipline_polygon_bundle/compute_signals.py +261 -0
- zipline_polygon_bundle/concat_all_aggs.py +140 -70
- zipline_polygon_bundle/concat_all_aggs_partitioned.py +6 -6
- zipline_polygon_bundle/config.py +167 -36
- zipline_polygon_bundle/nyse_all_hours_calendar.py +25 -0
- zipline_polygon_bundle/polygon_file_reader.py +1 -1
- zipline_polygon_bundle/process_all_aggs.py +2 -2
- zipline_polygon_bundle/quotes.py +101 -0
- zipline_polygon_bundle/tickers_and_names.py +5 -38
- zipline_polygon_bundle/trades.py +533 -0
- {zipline_polygon_bundle-0.1.7.dist-info → zipline_polygon_bundle-0.2.0.dist-info}/METADATA +10 -5
- zipline_polygon_bundle-0.2.0.dist-info/RECORD +18 -0
- zipline_polygon_bundle-0.1.7.dist-info/RECORD +0 -14
- {zipline_polygon_bundle-0.1.7.dist-info → zipline_polygon_bundle-0.2.0.dist-info}/LICENSE +0 -0
- {zipline_polygon_bundle-0.1.7.dist-info → zipline_polygon_bundle-0.2.0.dist-info}/WHEEL +0 -0
zipline_polygon_bundle/bundle.py
CHANGED
@@ -1,17 +1,23 @@
 from zipline.data.bundles import register
 from zipline.data.resample import minute_frame_to_session_frame
 
-from .
+from exchange_calendars.calendar_helpers import parse_date
+
 from .concat_all_aggs import concat_all_aggs_from_csv, generate_csv_agg_tables
 from .adjustments import load_splits, load_dividends
+from .config import PolygonConfig, AGG_TIME_DAY, AGG_TIME_MINUTE, AGG_TIME_TRADES
+from .nyse_all_hours_calendar import register_nyse_all_hours_calendar
+from .trades import convert_trades_to_custom_aggs, scatter_custom_aggs_to_by_ticker
 
 import pyarrow
 import pyarrow.compute
+import pyarrow.dataset
 
 import pandas as pd
-import logging
 
-import
+import os
+from filelock import FileLock
+import logging
 
 
 # TODO: Change warnings to be relative to number of days in the range.
@@ -26,67 +32,40 @@ def symbol_to_upper(s: str) -> str:
 def generate_all_agg_tables_from_csv(
     config: PolygonConfig,
 ):
-
+    schema, tables = generate_csv_agg_tables(config)
     for table in tables:
         table = table.sort_by([("ticker", "ascending"), ("window_start", "ascending")])
         yield table
 
 
-
-
-
-
-
-
-
-
-
-
-# )
-# df = df[~duplicated_index_with_zero_activity]
-# duplicated_index = df.index.duplicated(keep=False)
-# if not duplicated_index.any():
-#     return df
-# print(f" WARNING: Dropping dupes {df[duplicated_index]=}")
-# df = df[df.index.duplicated(keep="first")]
-# return df
-
-
-def aggregate_multiple_aggs_per_date(df: pd.DataFrame) -> pd.DataFrame:
-    duplicated_index = df.index.duplicated(keep=False)
-    if not duplicated_index.any():
-        return df
-    duplicates = df[duplicated_index]
-    duplicate_index_values = duplicates.index.values
-    print()
-    if duplicates["symbol"].nunique() != 1:
-        logging.error(f"{duplicates['symbol'].unique()=} {duplicate_index_values=}")
-    logging.warning(
-        f"Aggregating dupes df[df.index.duplicated(keep=False)]=\n{duplicates}"
-    )
-    df = df.groupby(df.index).agg(
-        {
-            "symbol": "first",
-            "volume": "sum",
-            "open": "first",
-            "close": "last",
-            "high": "max",
-            "low": "min",
-            "transactions": "sum",
-        }
+def rename_polygon_to_zipline(table: pyarrow.Table, time_name: str) -> pyarrow.Table:
+    table = table.rename_columns(
+        [
+            (
+                "symbol"
+                if name == "ticker"
+                else time_name if name == "window_start" else name
+            )
+            for name in table.column_names
+        ]
     )
-
-    return df
+    return table
 
 
-def
+def process_day_table(
     table,
     sessions,
+    minutes,
     metadata,
     calendar,
     symbol_to_sid: dict[str, int],
-    dates_with_data: set,
+    dates_with_data: set[pd.Timestamp],
+    agg_time: str,
 ):
+    table = rename_polygon_to_zipline(table, "day")
+    symbols = table.column("symbol").unique().to_pylist()
+    for sid, symbol in enumerate(symbols):
+        symbol_to_sid[symbol] = sid
     for symbol, sid in symbol_to_sid.items():
         df = table.filter(
             pyarrow.compute.field("symbol") == pyarrow.scalar(symbol)
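For reference, the new rename_polygon_to_zipline helper above simply maps Polygon's column names onto the ones Zipline expects. Here is a minimal standalone sketch: the rename logic is taken from the diff, while the sample table and its values are made up for illustration.

    import pyarrow as pa

    def rename_polygon_to_zipline(table: pa.Table, time_name: str) -> pa.Table:
        # "ticker" becomes "symbol"; "window_start" becomes the requested time column.
        return table.rename_columns(
            [
                "symbol" if name == "ticker"
                else time_name if name == "window_start" else name
                for name in table.column_names
            ]
        )

    table = pa.table({"ticker": ["AAPL"], "window_start": [1700000000000], "close": [189.91]})
    print(rename_polygon_to_zipline(table, "day").column_names)  # ['symbol', 'day', 'close']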
@@ -94,24 +73,25 @@ def process_day_aggregates(
         # The SQL schema zipline uses for symbols ignores case
         sql_symbol = symbol_to_upper(symbol)
         df["symbol"] = sql_symbol
-        df["day"] = pd.to_datetime(df["day"].dt.date)
+        df["day"] = pd.to_datetime(df["day"].dt.tz_convert(calendar.tz.key).dt.date)
         df = df.set_index("day")
         if not df.index.is_monotonic_increasing:
-            print(f" INFO: {symbol=} {sid=} not monotonic increasing")
+            print(f" INFO: {symbol=} {sid=} not monotonic increasing: {df.index.min()=} {df.index.max()=}")
             df.sort_index(inplace=True)
         # Remove duplicates
         df = df[~df.index.duplicated(keep="first")]
         # Take days as per calendar
         df = df[df.index.isin(sessions)]
         # 2019-08-13 has a bunch of tickers with multiple day aggs per date
-
+        # TODO: Actually they're for different days so if the filtering doesn't work then do something about it.
+        # df = aggregate_multiple_aggs_per_date(df)
         if len(df) < 1:
             continue
         # Check first and last date.
         start_date = df.index[0]
-        dates_with_data.add(start_date
+        dates_with_data.add(start_date)
         end_date = df.index[-1]
-        dates_with_data.add(end_date
+        dates_with_data.add(end_date)
         try:
             duplicated_index = df.index.duplicated(keep=False)
             df_with_duplicates = df[duplicated_index]
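The tz_convert added to the day-table path matters because Polygon window_start timestamps are in UTC; taking .dt.date without first converting to the calendar's timezone can shift a bar onto the wrong session. A small illustration, with the timezone hard-coded here (the code above uses calendar.tz.key):

    import pandas as pd

    ts = pd.Series(pd.to_datetime(["2023-06-01 00:00:00+00:00"]))
    print(ts.dt.date[0])                                     # 2023-06-01 (UTC date)
    print(ts.dt.tz_convert("America/New_York").dt.date[0])   # 2023-05-31 (exchange-local date)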
@@ -154,109 +134,22 @@ def process_day_aggregates(
     return
 
 
-def
-    table
-        [
-            (
-                "symbol"
-                if name == "ticker"
-                else time_name if name == "window_start" else name
-            )
-            for name in table.column_names
-        ]
-    )
-    return table
-
-
-def polygon_equities_bundle_day(
-    environ,
-    asset_db_writer,
-    minute_bar_writer,
-    daily_bar_writer,
-    adjustment_writer,
-    calendar,
-    start_session,
-    end_session,
-    cache,
-    show_progress,
-    output_dir,
-):
-    config = PolygonConfig(
-        environ=environ,
-        calendar_name=calendar.name,
-        start_session=start_session,
-        end_session=end_session,
-        agg_time="day",
-    )
-
-    by_ticker_aggs_arrow_dir = concat_all_aggs_from_csv(config)
-    aggregates = pyarrow.dataset.dataset(by_ticker_aggs_arrow_dir)
-
-    # Zipline uses case-insensitive symbols, so we need to convert them to uppercase with a ^ prefix when lowercase.
-    # This is because the SQL schema zipline uses for symbols ignores case.
-    # We put the original symbol in the asset_name field.
-    metadata = pd.DataFrame(
-        columns=(
-            "start_date",
-            "end_date",
-            "auto_close_date",
-            "symbol",
-            "exchange",
-            "asset_name",
-        )
-    )
-
-    table = aggregates.to_table()
-    table = rename_polygon_to_zipline(table, "day")
-    # Get all the symbols in the table by using value_counts to tabulate the unique values.
-    # pyarrow.Table.column returns a pyarrow.ChunkedArray.
-    # https://arrow.apache.org/docs/python/generated/pyarrow.ChunkedArray.html#pyarrow.ChunkedArray.value_counts
-    symbols = sorted(table.column("symbol").value_counts().field(0).to_pylist())
-    symbol_to_sid = {symbol: sid for sid, symbol in enumerate(symbols)}
-    dates_with_data = set()
-
-    # Get data for all stocks and write to Zipline
-    daily_bar_writer.write(
-        process_day_aggregates(
-            table=table,
-            sessions=calendar.sessions_in_range(start_session, end_session),
-            metadata=metadata,
-            calendar=calendar,
-            symbol_to_sid=symbol_to_sid,
-            dates_with_data=dates_with_data,
-        ),
-        show_progress=show_progress,
-    )
-
-    # Write the metadata
-    asset_db_writer.write(equities=metadata)
-
-    # Load splits and dividends
-    first_start_end = min(dates_with_data)
-    last_end_date = max(dates_with_data)
-    splits = load_splits(config, first_start_end, last_end_date, symbol_to_sid)
-    dividends = load_dividends(config, first_start_end, last_end_date, symbol_to_sid)
-
-    # Write splits and dividends
-    adjustment_writer.write(splits=splits, dividends=dividends)
-
-
-def process_minute_fragment(
-    fragment,
+def process_minute_table(
+    table,
     sessions,
     minutes,
     metadata,
     calendar,
     symbol_to_sid: dict[str, int],
-    dates_with_data: set,
+    dates_with_data: set[pd.Timestamp],
     agg_time: str,
 ):
-    table = fragment.to_table()
-    print(f" {table.num_rows=}")
     table = rename_polygon_to_zipline(table, "timestamp")
-
+    # print(f"{minutes[:5]=}\n{minutes[-5:]=}")
     table = table.filter(pyarrow.compute.field("timestamp").isin(minutes))
+    # print(f"filtered {table.num_rows=}")
     table_df = table.to_pandas()
+    # print(f"{table_df.head()=}")
     for symbol, df in table_df.groupby("symbol"):
         # print(f"\n{symbol=} {len(df)=} {df['timestamp'].min()} {df['timestamp'].max()}")
         if symbol not in symbol_to_sid:
@@ -266,29 +159,35 @@ def process_minute_fragment(
         sql_symbol = symbol_to_upper(symbol)
         df["symbol"] = sql_symbol
         df = df.set_index("timestamp")
-
+        # Shouldn't need to do this because the table is sorted.
+        if not df.index.is_monotonic_increasing:
+            print(f" INFO: {symbol=} {sid=} not monotonic increasing")
+            df.sort_index(inplace=True)
+        if agg_time == AGG_TIME_DAY:
             df.drop(columns=["symbol", "transactions"], inplace=True)
-            #
-            start_date = df.index[0].
-            start_timestamp = df.index[0]
+            # Remember first and last date.
+            start_date = df.index[0].tz_convert(calendar.tz.key).normalize()
             dates_with_data.add(start_date)
-            end_date = df.index[-1].
-            end_timestamp = df.index[-1]
+            end_date = df.index[-1].tz_convert(calendar.tz.key).normalize()
             dates_with_data.add(end_date)
             df = df[df.index.isin(minutes)]
             len_before = len(df)
+            # print(f"{start_date=} {end_date=} {dates_with_data=}")
+            # print(f"day pre {df.head()=}")
             if len(df) < 1:
                 # TODO: Move sid assignment until after this check for no data.
                 print(
-                    f" WARNING: No data for {symbol=} {sid=} {len_before=} {
+                    f" WARNING: No data for {symbol=} {sid=} {len_before=} {start_date=} {end_date=}"
                 )
                 continue
             df = minute_frame_to_session_frame(df, calendar)
-
+            # print(f"day sess {df.head()=}")
+            # df["symbol"] = sql_symbol
             df = df[df.index.isin(sessions)]
 
             # The auto_close date is the day after the last trade.
-            auto_close_date = end_date + pd.Timedelta(days=1)
+            # auto_close_date = end_date + pd.Timedelta(days=1)
+            auto_close_date = None
 
             # If metadata already has this sid, just extend the end_date and ac_date.
             if sid in metadata.index:
@@ -308,12 +207,12 @@ def process_minute_fragment(
                     start_date,
                     end_date,
                     auto_close_date,
-
+                    sql_symbol,
                     calendar.name,
                     symbol,
                 )
-                df = df.reindex(sessions.tz_localize(None))
-
+                # df = df.reindex(sessions.tz_localize(None))
+                df = df.reindex(sessions)
                 # Missing volume and transactions are zero
                 df["volume"] = df["volume"].fillna(0)
                 # df["transactions"] = df["transactions"].fillna(0)
@@ -321,13 +220,14 @@ def process_minute_fragment(
             # TODO: These fills should have the same price for OHLC (open for backfill, close for forward fill)
             df.ffill(inplace=True)
             # Back fill missing data (maybe necessary for before the first day bar)
+            # TODO: Don't want to backfill future values. What's better here?
             df.bfill(inplace=True)
             if len(df) > 0:
                 # print(f"\n{symbol=} {sid=} {len_before=} {start_timestamp=} {end_date=} {end_timestamp=} {len(df)=}")
                 yield sid, df
             else:
                 print(
-                    f" WARNING: No day bars for {symbol=} {sid=} {len_before=} {start_date=} {
+                    f" WARNING: No day bars for {symbol=} {sid=} {len_before=} {start_date=} {start_date=} {end_date=} {end_date=}"
                 )
         else:
             len_before = len(df)
@@ -349,20 +249,35 @@ def process_minute_fragment(
     return
 
 
-def
+def process_aggregates(
+    process_table_func,
     fragments,
     sessions,
     minutes,
     metadata,
     calendar,
     symbol_to_sid: dict[str, int],
-    dates_with_data: set,
+    dates_with_data: set[pd.Timestamp],
     agg_time: str,
 ):
-    # We
+    # We do this by Hive partition at a time because each ticker will be complete.
     for fragment in fragments:
-
-
+        # Only get the columns Zipline allows.
+        table = fragment.to_table(
+            columns=[
+                "ticker",
+                "window_start",
+                "open",
+                "high",
+                "low",
+                "close",
+                "volume",
+                "transactions",
+            ]
+        )
+        table = table.sort_by([("ticker", "ascending"), ("window_start", "ascending")])
+        yield from process_table_func(
+            table=table,
             sessions=sessions,
             minutes=minutes,
             metadata=metadata,
@@ -371,6 +286,7 @@ def process_minute_aggregates(
             dates_with_data=dates_with_data,
             agg_time=agg_time,
         )
+        del table
 
     # This doesn't seem to be hardly any faster than the above, something with the GIL?
     # Also to use this we'd need to make sure the symbol_to_sid and dates_with_data are thread safe.
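The new process_aggregates generator walks the by-ticker dataset one fragment (Hive partition) at a time, projects only the columns Zipline can use, and sorts before handing each table to process_day_table or process_minute_table. A simplified sketch of that read pattern, with a hypothetical dataset path:

    import pyarrow.dataset as ds

    # Hypothetical path to the by-ticker Arrow/Parquet output directory.
    dataset = ds.dataset("data/by_ticker_aggs")
    for fragment in dataset.get_fragments():
        table = fragment.to_table(
            columns=["ticker", "window_start", "open", "high", "low", "close", "volume", "transactions"]
        )
        table = table.sort_by([("ticker", "ascending"), ("window_start", "ascending")])
        # ...hand the sorted table to a process_*_table generator here...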
@@ -393,15 +309,16 @@ def process_minute_aggregates(
     # yield from future.result()
 
 
-def
+def ingest_polygon_equities_bundle(
+    agg_time: str,
     environ,
     asset_db_writer,
     minute_bar_writer,
     daily_bar_writer,
     adjustment_writer,
     calendar,
-
-
+    start_date,
+    end_date,
     cache,
     show_progress,
     output_dir,
@@ -409,12 +326,22 @@ def polygon_equities_bundle_minute(
     config = PolygonConfig(
         environ=environ,
         calendar_name=calendar.name,
-
-
-        agg_time=
+        start_date=start_date,
+        end_date=end_date,
+        agg_time=agg_time,
     )
 
-
+    print(f"{calendar.name=} {start_date=} {end_date=}")
+    print(f"{calendar.sessions_in_range(start_date, end_date)[:4]}")
+    print(f"{calendar.sessions_in_range(start_date, end_date)[-4:]}")
+    print(f"{calendar.sessions_minutes(start_date, end_date)[:4]}")
+    print(f"{calendar.sessions_minutes(start_date, end_date)[-4:]}")
+
+    if agg_time in [AGG_TIME_TRADES, "1min", "1minute"]:
+        convert_trades_to_custom_aggs(config, overwrite=False)
+        by_ticker_aggs_arrow_dir = scatter_custom_aggs_to_by_ticker(config)
+    else:
+        by_ticker_aggs_arrow_dir = concat_all_aggs_from_csv(config)
     aggregates = pyarrow.dataset.dataset(by_ticker_aggs_arrow_dir)
     # print(f"{aggregates.schema=}")
     # 3.5 billion rows for 10 years of minute data.
@@ -438,37 +365,40 @@ def polygon_equities_bundle_minute(
     )
 
     symbol_to_sid = {}
+    # Keep track of earliest and latest dates with data across all symbols.
    dates_with_data = set()
 
     # Get data for all stocks and write to Zipline
     daily_bar_writer.write(
-
+        process_aggregates(
+            process_day_table if config.agg_time == AGG_TIME_DAY else process_minute_table,
             fragments=aggregates.get_fragments(),
-            sessions=calendar.sessions_in_range(
-            minutes=calendar.sessions_minutes(
+            sessions=calendar.sessions_in_range(start_date, end_date),
+            minutes=calendar.sessions_minutes(start_date, end_date),
             metadata=metadata,
             calendar=calendar,
             symbol_to_sid=symbol_to_sid,
             dates_with_data=dates_with_data,
-            agg_time=
+            agg_time=AGG_TIME_DAY,
         ),
         show_progress=show_progress,
     )
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    if config.agg_time != AGG_TIME_DAY:
+        minute_bar_writer.write(
+            process_aggregates(
+                process_minute_table,
+                fragments=aggregates.get_fragments(),
+                sessions=calendar.sessions_in_range(start_date, end_date),
+                minutes=calendar.sessions_minutes(start_date, end_date),
+                metadata=metadata,
+                calendar=calendar,
+                symbol_to_sid=symbol_to_sid,
+                dates_with_data=dates_with_data,
+                agg_time=AGG_TIME_MINUTE,
+            ),
+            show_progress=show_progress,
+        )
 
     # Write the metadata
     asset_db_writer.write(equities=metadata)
@@ -483,29 +413,93 @@ def polygon_equities_bundle_minute(
     adjustment_writer.write(splits=splits, dividends=dividends)
 
 
+def ingest_polygon_equities_bundle_for_agg_time(agg_time: str):
+    def ingest_polygon_equities_bundle_inner(
+        environ,
+        asset_db_writer,
+        minute_bar_writer,
+        daily_bar_writer,
+        adjustment_writer,
+        calendar,
+        start_date,
+        end_date,
+        cache,
+        show_progress,
+        output_dir,
+    ):
+        return ingest_polygon_equities_bundle(
+            agg_time=agg_time,
+            environ=environ,
+            asset_db_writer=asset_db_writer,
+            minute_bar_writer=minute_bar_writer,
+            daily_bar_writer=daily_bar_writer,
+            adjustment_writer=adjustment_writer,
+            calendar=calendar,
+            start_date=start_date,
+            end_date=end_date,
+            cache=cache,
+            show_progress=show_progress,
+            output_dir=output_dir,
+        )
+
+    return ingest_polygon_equities_bundle_inner
+
+
 def register_polygon_equities_bundle(
     bundlename,
-
-
+    start_date=None,
+    end_date=None,
     calendar_name="XNYS",
-    agg_time=
+    agg_time=AGG_TIME_DAY,
+    minutes_per_day=390,
+    environ=os.environ,
     # ticker_list=None,
     # watchlists=None,
     # include_asset_types=None,
 ):
-
-
+    register_nyse_all_hours_calendar()
+
+    # pd.set_option("display.max_columns", None)
+    # pd.set_option("display.width", 500)
+
+    # Note that "minute" is the Polygon minute aggs and "1minute" is the trades.
+    if agg_time not in [AGG_TIME_DAY, AGG_TIME_MINUTE, AGG_TIME_TRADES, "1min", "1minute"]:
+        raise ValueError(
+            f"agg_time must be 'day', 'minute' (aggs), or '1minute' (trades), not '{agg_time}'"
+        )
+
+    # We need to know the start and end dates of the session before the bundle is
+    # registered because even though we only need it for ingest, the metadata in
+    # the writer is initialized and written before our ingest function is called.
+    if start_date is None or end_date is None:
+        config = PolygonConfig(
+            environ=environ,
+            calendar_name=calendar_name,
+            start_date=start_date,
+            end_date=end_date,
+            agg_time=agg_time,
+        )
+        first_aggs_date, last_aggs_date = config.find_first_and_last_aggs(
+            config.aggs_dir if agg_time in [AGG_TIME_DAY, AGG_TIME_MINUTE] else config.trades_dir,
+            config.csv_paths_pattern,
+        )
+        # print(f"{bundlename=} {first_aggs_date=} {last_aggs_date=}")
+        if start_date is None:
+            start_date = first_aggs_date
+        if end_date is None:
+            end_date = last_aggs_date
+
+    start_session = parse_date(start_date, raise_oob=False) if start_date else None
+    end_session = parse_date(end_date, raise_oob=False) if end_date else None
+    # print(f"Registered {bundlename=} {agg_time=} {start_session=} {end_session=}")
+
     register(
         bundlename,
-        (
-            polygon_equities_bundle_minute
-            if agg_time == "minute"
-            else polygon_equities_bundle_day
-        ),
+        ingest_polygon_equities_bundle_for_agg_time(agg_time),
         start_session=start_session,
         end_session=end_session,
         calendar_name=calendar_name,
-
+        minutes_per_day=minutes_per_day,
         # create_writers=True,
     )
 
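Based on the new register_polygon_equities_bundle signature above, registration (typically placed in Zipline's extension.py) might look like the sketch below. The bundle names and dates are made up, and it assumes the function is exported from the package as in earlier releases:

    from zipline_polygon_bundle import register_polygon_equities_bundle

    # Daily bars from Polygon day aggregates.
    register_polygon_equities_bundle(
        "polygon",
        start_date="2023-01-01",
        end_date="2023-12-31",
        calendar_name="XNYS",
        agg_time="day",
    )

    # Minute bars from Polygon minute aggregates.
    register_polygon_equities_bundle(
        "polygon-minute",
        start_date="2023-01-01",
        end_date="2023-12-31",
        calendar_name="XNYS",
        agg_time="minute",
    )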
@@ -517,12 +511,12 @@ def register_polygon_equities_bundle(
 # config = PolygonConfig(
 #     environ=os.environ,
 #     calendar_name="XNYS",
-#     #
-#     #
-#
-#     #
-#
-#     #
+#     # start_date="2003-10-01",
+#     # start_date="2018-01-01",
+#     start_date="2023-01-01",
+#     # end_date="2023-01-12",
+#     end_date="2023-12-31",
+#     # end_date="2024-06-30",
 # )
 # splits = load_polygon_splits(config)
 # splits.info()