zipline_polygon_bundle-0.1.6-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,543 @@
1
+ from zipline.data.bundles import register
2
+ from zipline.data.resample import minute_frame_to_session_frame
3
+
4
+ from .config import PolygonConfig
5
+ from .concat_all_aggs import concat_all_aggs_from_csv, generate_csv_agg_tables
6
+ from .adjustments import load_splits, load_dividends
7
+
8
+ import pyarrow
9
+ import pyarrow.compute
+ import pyarrow.dataset
10
+
11
+ import pandas as pd
12
+ import logging
13
+
14
+ import concurrent.futures
15
+
16
+
17
+ # TODO: Change warnings to be relative to number of days in the range.
18
+
19
+
20
+ def symbol_to_upper(s: str) -> str:
21
+ if s.isupper():
22
+ return s
23
+ return "".join(map(lambda c: ("^" + c.upper()) if c.islower() else c, s))
24
+
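As a quick illustration of the escaping scheme (a sketch, not part of the packaged module; it assumes symbol_to_upper above is in scope):

    # Already-uppercase symbols pass through unchanged.
    assert symbol_to_upper("AAPL") == "AAPL"
    # Lowercase characters are uppercased and escaped with a leading ^.
    assert symbol_to_upper("BRK.a") == "BRK.^A"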
25
+
26
+ def generate_all_agg_tables_from_csv(
27
+ config: PolygonConfig,
28
+ ):
29
+ paths, schema, tables = generate_csv_agg_tables(config)
30
+ for table in tables:
31
+ table = table.sort_by([("ticker", "ascending"), ("window_start", "ascending")])
32
+ yield table
33
+
34
+
35
+ # def remove_duplicated_index(df: pd.DataFrame) -> pd.DataFrame:
36
+ # duplicated_index = df.index.duplicated(keep=False)
37
+ # if not duplicated_index.any():
38
+ # return df
39
+ # # Find duplicate index values (date) with zero volume or transactions
40
+ #     duplicated_index_with_zero_activity = duplicated_index & (
41
+ #         (df["volume"] == 0) | (df["transactions"] == 0))
42
+ # if duplicated_index_with_zero_activity.any():
43
+ # print(
44
+ # f" WARNING: Got dupes with zero activity {df[duplicated_index_with_zero_activity]=}"
45
+ # )
46
+ # df = df[~duplicated_index_with_zero_activity]
47
+ # duplicated_index = df.index.duplicated(keep=False)
48
+ # if not duplicated_index.any():
49
+ # return df
50
+ # print(f" WARNING: Dropping dupes {df[duplicated_index]=}")
51
+ #     df = df[~df.index.duplicated(keep="first")]
52
+ # return df
53
+
54
+
55
+ def aggregate_multiple_aggs_per_date(df: pd.DataFrame) -> pd.DataFrame:
56
+ duplicated_index = df.index.duplicated(keep=False)
57
+ if not duplicated_index.any():
58
+ return df
59
+ duplicates = df[duplicated_index]
60
+ duplicate_index_values = duplicates.index.values
61
+ print()
62
+ if duplicates["symbol"].nunique() != 1:
63
+ logging.error(f"{duplicates['symbol'].unique()=} {duplicate_index_values=}")
64
+ logging.warning(
65
+ f"Aggregating dupes df[df.index.duplicated(keep=False)]=\n{duplicates}"
66
+ )
67
+ df = df.groupby(df.index).agg(
68
+ {
69
+ "symbol": "first",
70
+ "volume": "sum",
71
+ "open": "first",
72
+ "close": "last",
73
+ "high": "max",
74
+ "low": "min",
75
+ "transactions": "sum",
76
+ }
77
+ )
78
+ print(f"WARNING: Aggregated dupes df=\n{df[df.index.isin(duplicate_index_values)]}")
79
+ return df
80
+
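A minimal sketch of how the aggregation above combines two bars that share a date, using made-up values for a hypothetical ticker (assumes pandas and the aggregate_multiple_aggs_per_date function defined above; not part of the packaged module):

    import pandas as pd

    idx = pd.to_datetime(["2019-08-13", "2019-08-13"])
    dupes = pd.DataFrame(
        {
            "symbol": ["XYZ", "XYZ"],
            "volume": [100, 50],
            "open": [10.0, 10.5],
            "close": [10.4, 10.2],
            "high": [10.6, 10.7],
            "low": [9.9, 10.1],
            "transactions": [8, 3],
        },
        index=idx,
    )
    combined = aggregate_multiple_aggs_per_date(dupes)
    # One row remains: open=10.0 (first), close=10.2 (last), high=10.7 (max),
    # low=9.9 (min), volume=150 and transactions=11 (sums).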
81
+
82
+ def process_day_aggregates(
83
+ table,
84
+ sessions,
85
+ metadata,
86
+ calendar,
87
+ symbol_to_sid: dict[str, int],
88
+ dates_with_data: set,
89
+ ):
90
+ for symbol, sid in symbol_to_sid.items():
91
+ df = table.filter(
92
+ pyarrow.compute.field("symbol") == pyarrow.scalar(symbol)
93
+ ).to_pandas()
94
+ # The SQL schema zipline uses for symbols ignores case
95
+ sql_symbol = symbol_to_upper(symbol)
96
+ df["symbol"] = sql_symbol
97
+ df["day"] = pd.to_datetime(df["day"].dt.date)
98
+ df = df.set_index("day")
99
+ if not df.index.is_monotonic_increasing:
100
+ print(f" INFO: {symbol=} {sid=} not monotonic increasing")
101
+ df.sort_index(inplace=True)
102
+ # Remove duplicates
103
+ df = df[~df.index.duplicated(keep="first")]
104
+ # Take days as per calendar
105
+ df = df[df.index.isin(sessions)]
106
+ # 2019-08-13 has a bunch of tickers with multiple day aggs per date
107
+ df = aggregate_multiple_aggs_per_date(df)
108
+ if len(df) < 1:
109
+ continue
110
+ # Check first and last date.
111
+ start_date = df.index[0]
112
+ dates_with_data.add(start_date.date())
113
+ end_date = df.index[-1]
114
+ dates_with_data.add(end_date.date())
115
+ try:
116
+ duplicated_index = df.index.duplicated(keep=False)
117
+ df_with_duplicates = df[duplicated_index]
118
+ if len(df_with_duplicates) > 0:
119
+ print(f" WARNING: {symbol=} {sid=} {len(df_with_duplicates)=}")
120
+ df_with_duplicates.info()
121
+ print(df_with_duplicates)
122
+             # Sync to the official exchange calendar
123
+ df = df.reindex(sessions.tz_localize(None))
124
+ except ValueError as e:
125
+ print(f" ERROR: {symbol=} {sid=} {e}")
126
+ print(f"{start_date=} {end_date=} {sessions[0]=} {sessions[-1]=}")
127
+ df.info()
128
+ # Missing volume and transactions are zero
129
+ df["volume"] = df["volume"].fillna(0)
130
+ df["transactions"] = df["transactions"].fillna(0)
131
+ # TODO: These fills should have the same price for OHLC (open for backfill, close for forward fill)
132
+ # Forward fill missing price data (better than backfill)
133
+ df.ffill(inplace=True)
134
+ # Back fill missing data (maybe necessary for before the first day bar)
135
+ df.bfill(inplace=True)
136
+ # There should be no missing data
137
+ if df.isnull().sum().sum() > 0:
138
+ print(f" WARNING: Missing data for {symbol=} {sid=}")
139
+
140
+ # The auto_close date is the day after the last trade.
141
+ auto_close_date = end_date + pd.Timedelta(days=1)
142
+
143
+ # Add a row to the metadata DataFrame. Don't forget to add an exchange field.
144
+ metadata.loc[sid] = (
145
+ start_date,
146
+ end_date,
147
+ auto_close_date,
148
+ sql_symbol,
149
+ calendar.name,
150
+ symbol,
151
+ )
152
+ if len(df) > 0:
153
+ yield sid, df
154
+ return
155
+
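A minimal sketch of the calendar reindex and fill policy used in process_day_aggregates, with made-up sessions and prices (assumes pandas; not part of the packaged module):

    import pandas as pd

    sessions = pd.to_datetime(["2023-01-03", "2023-01-04", "2023-01-05"])
    bars = pd.DataFrame(
        {"open": [10.0], "high": [10.5], "low": [9.8], "close": [10.2],
         "volume": [1000], "transactions": [12]},
        index=pd.to_datetime(["2023-01-04"]),
    )
    bars = bars.reindex(sessions)
    # Missing volume/transactions become zero; prices are then forward- and back-filled.
    bars["volume"] = bars["volume"].fillna(0)
    bars["transactions"] = bars["transactions"].fillna(0)
    bars = bars.ffill().bfill()
    # 2023-01-03 gets the 2023-01-04 prices via bfill (with volume 0);
    # 2023-01-05 gets them via ffill (with volume 0).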
156
+
157
+ def rename_polygon_to_zipline(table: pyarrow.Table, time_name: str) -> pyarrow.Table:
158
+ table = table.rename_columns(
159
+ [
160
+ (
161
+ "symbol"
162
+ if name == "ticker"
163
+ else time_name if name == "window_start" else name
164
+ )
165
+ for name in table.column_names
166
+ ]
167
+ )
168
+ return table
169
+
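A quick check of the renaming, assuming pyarrow is imported as at the top of this module (illustrative only):

    t = pyarrow.table({"ticker": ["A"], "window_start": [1], "open": [1.0]})
    renamed = rename_polygon_to_zipline(t, "day")
    assert renamed.column_names == ["symbol", "day", "open"]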
170
+
171
+ def polygon_equities_bundle_day(
172
+ environ,
173
+ asset_db_writer,
174
+ minute_bar_writer,
175
+ daily_bar_writer,
176
+ adjustment_writer,
177
+ calendar,
178
+ start_session,
179
+ end_session,
180
+ cache,
181
+ show_progress,
182
+ output_dir,
183
+ ):
184
+ config = PolygonConfig(
185
+ environ=environ,
186
+ calendar_name=calendar.name,
187
+ start_session=start_session,
188
+ end_session=end_session,
189
+ agg_time="day",
190
+ )
191
+
192
+ by_ticker_aggs_arrow_dir = concat_all_aggs_from_csv(config)
193
+ aggregates = pyarrow.dataset.dataset(by_ticker_aggs_arrow_dir)
194
+
195
+     # Zipline symbols are case-insensitive, so lowercase characters are escaped by
196
+     # uppercasing them with a ^ prefix (the SQL schema zipline uses for symbols ignores case).
197
+     # We put the original symbol in the asset_name field.
198
+ metadata = pd.DataFrame(
199
+ columns=(
200
+ "start_date",
201
+ "end_date",
202
+ "auto_close_date",
203
+ "symbol",
204
+ "exchange",
205
+ "asset_name",
206
+ )
207
+ )
208
+
209
+ table = aggregates.to_table()
210
+ table = rename_polygon_to_zipline(table, "day")
211
+ # Get all the symbols in the table by using value_counts to tabulate the unique values.
212
+ # pyarrow.Table.column returns a pyarrow.ChunkedArray.
213
+ # https://arrow.apache.org/docs/python/generated/pyarrow.ChunkedArray.html#pyarrow.ChunkedArray.value_counts
214
+ symbols = sorted(table.column("symbol").value_counts().field(0).to_pylist())
215
+ symbol_to_sid = {symbol: sid for sid, symbol in enumerate(symbols)}
216
+ dates_with_data = set()
217
+
218
+ # Get data for all stocks and write to Zipline
219
+ daily_bar_writer.write(
220
+ process_day_aggregates(
221
+ table=table,
222
+ sessions=calendar.sessions_in_range(start_session, end_session),
223
+ metadata=metadata,
224
+ calendar=calendar,
225
+ symbol_to_sid=symbol_to_sid,
226
+ dates_with_data=dates_with_data,
227
+ ),
228
+ show_progress=show_progress,
229
+ )
230
+
231
+ # Write the metadata
232
+ asset_db_writer.write(equities=metadata)
233
+
234
+ # Load splits and dividends
235
+     first_start_date = min(dates_with_data)
236
+     last_end_date = max(dates_with_data)
237
+     splits = load_splits(config, first_start_date, last_end_date, symbol_to_sid)
238
+     dividends = load_dividends(config, first_start_date, last_end_date, symbol_to_sid)
239
+
240
+ # Write splits and dividends
241
+ adjustment_writer.write(splits=splits, dividends=dividends)
242
+
243
+
244
+ def process_minute_fragment(
245
+ fragment,
246
+ sessions,
247
+ minutes,
248
+ metadata,
249
+ calendar,
250
+ symbol_to_sid: dict[str, int],
251
+ dates_with_data: set,
252
+ agg_time: str,
253
+ ):
254
+ table = fragment.to_table()
255
+ print(f" {table.num_rows=}")
256
+ table = rename_polygon_to_zipline(table, "timestamp")
257
+ table = table.sort_by([("symbol", "ascending"), ("timestamp", "ascending")])
258
+ table = table.filter(pyarrow.compute.field("timestamp").isin(minutes))
259
+ table_df = table.to_pandas()
260
+ for symbol, df in table_df.groupby("symbol"):
261
+ # print(f"\n{symbol=} {len(df)=} {df['timestamp'].min()} {df['timestamp'].max()}")
262
+ if symbol not in symbol_to_sid:
263
+ symbol_to_sid[symbol] = len(symbol_to_sid) + 1
264
+ sid = symbol_to_sid[symbol]
265
+ # The SQL schema zipline uses for symbols ignores case
266
+ sql_symbol = symbol_to_upper(symbol)
267
+ df["symbol"] = sql_symbol
268
+ df = df.set_index("timestamp")
269
+ if agg_time == "day":
270
+ df.drop(columns=["symbol", "transactions"], inplace=True)
271
+ # Check first and last date.
272
+ start_date = df.index[0].date()
273
+ start_timestamp = df.index[0]
274
+ dates_with_data.add(start_date)
275
+ end_date = df.index[-1].date()
276
+ end_timestamp = df.index[-1]
277
+ dates_with_data.add(end_date)
278
+ df = df[df.index.isin(minutes)]
279
+ len_before = len(df)
280
+ if len(df) < 1:
281
+ # TODO: Move sid assignment until after this check for no data.
282
+ print(
283
+ f" WARNING: No data for {symbol=} {sid=} {len_before=} {start_timestamp=} {end_timestamp=}"
284
+ )
285
+ continue
286
+ df = minute_frame_to_session_frame(df, calendar)
287
+ df["symbol"] = sql_symbol
288
+ df = df[df.index.isin(sessions)]
289
+
290
+ # The auto_close date is the day after the last trade.
291
+ auto_close_date = end_date + pd.Timedelta(days=1)
292
+
293
+         # If metadata already has this sid, just extend the end_date and auto_close_date.
294
+ if sid in metadata.index:
295
+ if metadata.loc[sid, "start_date"] >= start_date:
296
+ print(
297
+ f" ERROR: {symbol=} {sid=} {metadata.loc[sid, 'start_date']=} >= {start_date=}"
298
+ )
299
+ if metadata.loc[sid, "end_date"] >= start_date:
300
+ print(
301
+                     f"   ERROR: {symbol=} {sid=} {metadata.loc[sid, 'end_date']=} >= {start_date=}"
302
+ )
303
+ metadata.loc[sid, "end_date"] = end_date
304
+ metadata.loc[sid, "auto_close_date"] = auto_close_date
305
+ else:
306
+ # Add a row to the metadata DataFrame. Don't forget to add an exchange field.
307
+ metadata.loc[sid] = (
308
+ start_date,
309
+ end_date,
310
+ auto_close_date,
311
+ symbol_to_upper(symbol),
312
+ calendar.name,
313
+ symbol,
314
+ )
315
+ df = df.reindex(sessions.tz_localize(None))
316
+ # df = df.reindex(sessions)
317
+ # Missing volume and transactions are zero
318
+ df["volume"] = df["volume"].fillna(0)
319
+ # df["transactions"] = df["transactions"].fillna(0)
320
+ # Forward fill missing price data (better than backfill)
321
+ # TODO: These fills should have the same price for OHLC (open for backfill, close for forward fill)
322
+ df.ffill(inplace=True)
323
+ # Back fill missing data (maybe necessary for before the first day bar)
324
+ df.bfill(inplace=True)
325
+ if len(df) > 0:
326
+ # print(f"\n{symbol=} {sid=} {len_before=} {start_timestamp=} {end_date=} {end_timestamp=} {len(df)=}")
327
+ yield sid, df
328
+ else:
329
+ print(
330
+ f" WARNING: No day bars for {symbol=} {sid=} {len_before=} {start_date=} {start_timestamp=} {end_date=} {end_timestamp=}"
331
+ )
332
+ else:
333
+ len_before = len(df)
334
+ df = df[df.index.isin(minutes)]
335
+ if len(df) < 2:
336
+ print(
337
+ f" WARNING: Not enough data for {symbol=} {sid=} {len(df)=} {len_before=}"
338
+ )
339
+ continue
340
+
341
+             # A df with 1 bar crashes zipline/data/bcolz_minute_bars.py line 747:
342
+ # pd.Timestamp(dts[0]), direction="previous"
343
+ if len(df) > 1:
344
+ yield sid, df
345
+ else:
346
+ print(
347
+ f" WARNING: Not enough minute bars for {symbol=} {sid=} {len(df)=}"
348
+ )
349
+ return
350
+
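A rough sketch of the minute-to-session roll-up that minute_frame_to_session_frame performs for the agg_time == "day" path above, with made-up prices (assumes zipline-reloaded and exchange_calendars are installed; not part of the packaged module):

    import pandas as pd
    from exchange_calendars import get_calendar
    from zipline.data.resample import minute_frame_to_session_frame

    cal = get_calendar("XNYS")
    idx = pd.to_datetime(["2023-01-04 14:30", "2023-01-04 14:31"], utc=True)
    minute_bars = pd.DataFrame(
        {"open": [10.0, 10.1], "high": [10.2, 10.3], "low": [9.9, 10.0],
         "close": [10.1, 10.2], "volume": [100, 200]},
        index=idx,
    )
    session_bars = minute_frame_to_session_frame(minute_bars, cal)
    # Expect a single 2023-01-04 session bar: open=10.0, high=10.3, low=9.9,
    # close=10.2, volume=300.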
351
+
352
+ def process_minute_aggregates(
353
+ fragments,
354
+ sessions,
355
+ minutes,
356
+ metadata,
357
+ calendar,
358
+ symbol_to_sid: dict[str, int],
359
+ dates_with_data: set,
360
+ agg_time: str,
361
+ ):
362
+     # Process one Hive partition (fragment) at a time because each ticker's rows are complete within its partition.
363
+ for fragment in fragments:
364
+ yield from process_minute_fragment(
365
+ fragment=fragment,
366
+ sessions=sessions,
367
+ minutes=minutes,
368
+ metadata=metadata,
369
+ calendar=calendar,
370
+ symbol_to_sid=symbol_to_sid,
371
+ dates_with_data=dates_with_data,
372
+ agg_time=agg_time,
373
+ )
374
+
375
+     # This doesn't seem to be much faster than the above; probably limited by the GIL?
376
+     # Also, to use this we'd need to make sure symbol_to_sid and dates_with_data are thread-safe.
377
+ # with concurrent.futures.ThreadPoolExecutor(max_workers=12) as executor:
378
+ # futures = [
379
+ # executor.submit(
380
+ # process_minute_fragment,
381
+ # fragment,
382
+ # sessions,
383
+ # minutes,
384
+ # metadata,
385
+ # calendar,
386
+ # symbol_to_sid,
387
+ # dates_with_data,
388
+ # agg_time,
389
+ # )
390
+ # for fragment in fragments
391
+ # ]
392
+ # for future in concurrent.futures.as_completed(futures):
393
+ # yield from future.result()
394
+
395
+
396
+ def polygon_equities_bundle_minute(
397
+ environ,
398
+ asset_db_writer,
399
+ minute_bar_writer,
400
+ daily_bar_writer,
401
+ adjustment_writer,
402
+ calendar,
403
+ start_session,
404
+ end_session,
405
+ cache,
406
+ show_progress,
407
+ output_dir,
408
+ ):
409
+ config = PolygonConfig(
410
+ environ=environ,
411
+ calendar_name=calendar.name,
412
+ start_session=start_session,
413
+ end_session=end_session,
414
+ agg_time="minute",
415
+ )
416
+
417
+ by_ticker_aggs_arrow_dir = concat_all_aggs_from_csv(config)
418
+ aggregates = pyarrow.dataset.dataset(by_ticker_aggs_arrow_dir)
419
+ # print(f"{aggregates.schema=}")
420
+ # 3.5 billion rows for 10 years of minute data.
421
+ # print(f"{aggregates.count_rows()=}")
422
+ # Can't sort the dataset because that reads it all into memory.
423
+ # aggregates = aggregates.sort_by([("ticker", "ascending"), ("window_start", "ascending")])
424
+ # print("Sorted")
425
+
426
+     # Zipline symbols are case-insensitive, so lowercase characters are escaped by
427
+     # uppercasing them with a ^ prefix (the SQL schema zipline uses for symbols ignores case).
428
+     # We put the original symbol in the asset_name field.
429
+ metadata = pd.DataFrame(
430
+ columns=(
431
+ "start_date",
432
+ "end_date",
433
+ "auto_close_date",
434
+ "symbol",
435
+ "exchange",
436
+ "asset_name",
437
+ )
438
+ )
439
+
440
+ symbol_to_sid = {}
441
+ dates_with_data = set()
442
+
443
+ # Get data for all stocks and write to Zipline
444
+ daily_bar_writer.write(
445
+ process_minute_aggregates(
446
+ fragments=aggregates.get_fragments(),
447
+ sessions=calendar.sessions_in_range(start_session, end_session),
448
+ minutes=calendar.sessions_minutes(start_session, end_session),
449
+ metadata=metadata,
450
+ calendar=calendar,
451
+ symbol_to_sid=symbol_to_sid,
452
+ dates_with_data=dates_with_data,
453
+ agg_time="day",
454
+ ),
455
+ show_progress=show_progress,
456
+ )
457
+
458
+ # Get data for all stocks and write to Zipline
459
+ minute_bar_writer.write(
460
+ process_minute_aggregates(
461
+ fragments=aggregates.get_fragments(),
462
+ sessions=calendar.sessions_in_range(start_session, end_session),
463
+ minutes=calendar.sessions_minutes(start_session, end_session),
464
+ metadata=metadata,
465
+ calendar=calendar,
466
+ symbol_to_sid=symbol_to_sid,
467
+ dates_with_data=dates_with_data,
468
+ agg_time="minute",
469
+ ),
470
+ show_progress=show_progress,
471
+ )
472
+
473
+ # Write the metadata
474
+ asset_db_writer.write(equities=metadata)
475
+
476
+     first_start_date = min(dates_with_data)
478
+     last_end_date = max(dates_with_data)
479
+     splits = load_splits(config, first_start_date, last_end_date, symbol_to_sid)
480
+     dividends = load_dividends(config, first_start_date, last_end_date, symbol_to_sid)
480
+ dividends = load_dividends(config, first_start_end, last_end_date, symbol_to_sid)
481
+
482
+ # Write splits and dividends
483
+ adjustment_writer.write(splits=splits, dividends=dividends)
484
+
485
+
486
+ def register_polygon_equities_bundle(
487
+ bundlename,
488
+ start_session=None,
489
+ end_session=None,
490
+ calendar_name="XNYS",
491
+ agg_time="day",
492
+ # ticker_list=None,
493
+ # watchlists=None,
494
+ # include_asset_types=None,
495
+ ):
496
+ if agg_time not in ["day", "minute"]:
497
+ raise ValueError(f"agg_time must be 'day' or 'minute', not '{agg_time}'")
498
+ register(
499
+ bundlename,
500
+ (
501
+ polygon_equities_bundle_minute
502
+ if agg_time == "minute"
503
+ else polygon_equities_bundle_day
504
+ ),
505
+ start_session=start_session,
506
+ end_session=end_session,
507
+ calendar_name=calendar_name,
508
+ # minutes_per_day=390,
509
+ # create_writers=True,
510
+ )
511
+
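For context, a hypothetical extension.py snippet showing how this registration helper is typically used before running zipline ingest. The bundle name and dates are illustrative, it assumes the helper is importable from the package top level, and the Polygon environment variables that PolygonConfig reads (e.g. POLYGON_MIRROR_DIR, as in the commented-out block below) must be set:

    import pandas as pd
    from zipline_polygon_bundle import register_polygon_equities_bundle

    register_polygon_equities_bundle(
        "polygon",
        start_session=pd.Timestamp("2023-01-03"),
        end_session=pd.Timestamp("2023-12-29"),
        calendar_name="XNYS",
        agg_time="day",
    )
    # Then, from the shell:
    #   zipline ingest -b polygon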
512
+
513
+ # if __name__ == "__main__":
514
+ # logging.basicConfig(level=logging.WARNING)
515
+ # os.environ["POLYGON_MIRROR_DIR"] = "/Volumes/Oahu/Mirror/files.polygon.io"
516
+ # os.environ["ZIPLINE_ROOT"] = "/Volumes/Oahu/Workspaces/zipline"
517
+ # config = PolygonConfig(
518
+ # environ=os.environ,
519
+ # calendar_name="XNYS",
520
+ # # start_session="2003-10-01",
521
+ # # start_session="2018-01-01",
522
+ # start_session="2023-01-01",
523
+ # # end_session="2023-01-12",
524
+ # end_session="2023-12-31",
525
+ # # end_session="2024-06-30",
526
+ # )
527
+ # splits = load_polygon_splits(config)
528
+ # splits.info()
529
+ # print(splits.head())
530
+ # dividends = load_polygon_dividends(config)
531
+ # dividends.info()
532
+ # print(dividends.head())
533
+ # tickers = set(
534
+ # splits["ticker"].unique().tolist() + dividends["ticker"].unique().tolist()
535
+ # )
536
+ # print(f"{len(tickers)=}")
537
+ # ticker_to_sid = {ticker: sid for sid, ticker in enumerate(tickers)}
538
+ # splits = load_splits(config, ticker_to_sid)
539
+ # splits.info()
540
+ # print(splits.head())
541
+ # dividends = load_dividends(config, ticker_to_sid)
542
+ # dividends.info()
543
+ # print(dividends.head())