zipline_polygon_bundle 0.2.0.dev1__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
@@ -1,13 +1,11 @@
- import os
  from zipline.data.bundles import register
  from zipline.data.resample import minute_frame_to_session_frame

  from exchange_calendars.calendar_helpers import parse_date
- from exchange_calendars.calendar_utils import get_calendar

  from .concat_all_aggs import concat_all_aggs_from_csv, generate_csv_agg_tables
  from .adjustments import load_splits, load_dividends
- from .config import PolygonConfig
+ from .config import PolygonConfig, AGG_TIME_DAY, AGG_TIME_MINUTE, AGG_TIME_TRADES
  from .nyse_all_hours_calendar import register_nyse_all_hours_calendar
  from .trades import convert_trades_to_custom_aggs, scatter_custom_aggs_to_by_ticker

@@ -16,6 +14,9 @@ import pyarrow.compute
  import pyarrow.dataset

  import pandas as pd
+
+ import os
+ from filelock import FileLock
  import logging


@@ -37,61 +38,34 @@ def generate_all_agg_tables_from_csv(
  yield table


- # def remove_duplicated_index(df: pd.DataFrame) -> pd.DataFrame:
- # duplicated_index = df.index.duplicated(keep=False)
- # if not duplicated_index.any():
- # return df
- # # Find duplicate index values (date) with zero volume or transactions
- # duplicated_index_with_zero_activity = duplicated_index & (
- # df["volume"] == 0) | (df["transactions"] == 0)
- # if duplicated_index_with_zero_activity.any():
- # print(
- # f" WARNING: Got dupes with zero activity {df[duplicated_index_with_zero_activity]=}"
- # )
- # df = df[~duplicated_index_with_zero_activity]
- # duplicated_index = df.index.duplicated(keep=False)
- # if not duplicated_index.any():
- # return df
- # print(f" WARNING: Dropping dupes {df[duplicated_index]=}")
- # df = df[df.index.duplicated(keep="first")]
- # return df
-
-
- def aggregate_multiple_aggs_per_date(df: pd.DataFrame) -> pd.DataFrame:
- duplicated_index = df.index.duplicated(keep=False)
- if not duplicated_index.any():
- return df
- duplicates = df[duplicated_index]
- duplicate_index_values = duplicates.index.values
- print()
- if duplicates["symbol"].nunique() != 1:
- logging.error(f"{duplicates['symbol'].unique()=} {duplicate_index_values=}")
- logging.warning(
- f"Aggregating dupes df[df.index.duplicated(keep=False)]=\n{duplicates}"
- )
- df = df.groupby(df.index).agg(
- {
- "symbol": "first",
- "volume": "sum",
- "open": "first",
- "close": "last",
- "high": "max",
- "low": "min",
- "transactions": "sum",
- }
+ def rename_polygon_to_zipline(table: pyarrow.Table, time_name: str) -> pyarrow.Table:
+ table = table.rename_columns(
+ [
+ (
+ "symbol"
+ if name == "ticker"
+ else time_name if name == "window_start" else name
+ )
+ for name in table.column_names
+ ]
  )
- print(f"WARNING: Aggregated dupes df=\n{df[df.index.isin(duplicate_index_values)]}")
- return df
+ return table


- def process_day_aggregates(
+ def process_day_table(
  table,
  sessions,
+ minutes,
  metadata,
  calendar,
  symbol_to_sid: dict[str, int],
- dates_with_data: set,
+ dates_with_data: set[pd.Timestamp],
+ agg_time: str,
  ):
+ table = rename_polygon_to_zipline(table, "day")
+ symbols = table.column("symbol").unique().to_pylist()
+ for sid, symbol in enumerate(symbols):
+ symbol_to_sid[symbol] = sid
  for symbol, sid in symbol_to_sid.items():
  df = table.filter(
  pyarrow.compute.field("symbol") == pyarrow.scalar(symbol)
@@ -99,24 +73,25 @@ def process_day_aggregates(
  # The SQL schema zipline uses for symbols ignores case
  sql_symbol = symbol_to_upper(symbol)
  df["symbol"] = sql_symbol
- df["day"] = pd.to_datetime(df["day"].dt.date)
+ df["day"] = pd.to_datetime(df["day"].dt.tz_convert(calendar.tz.key).dt.date)
  df = df.set_index("day")
  if not df.index.is_monotonic_increasing:
- print(f" INFO: {symbol=} {sid=} not monotonic increasing")
+ print(f" INFO: {symbol=} {sid=} not monotonic increasing: {df.index.min()=} {df.index.max()=}")
  df.sort_index(inplace=True)
  # Remove duplicates
  df = df[~df.index.duplicated(keep="first")]
  # Take days as per calendar
  df = df[df.index.isin(sessions)]
  # 2019-08-13 has a bunch of tickers with multiple day aggs per date
- df = aggregate_multiple_aggs_per_date(df)
+ # TODO: Actually they're for different days so if the filtering doesn't work then do something about it.
+ # df = aggregate_multiple_aggs_per_date(df)
  if len(df) < 1:
  continue
  # Check first and last date.
  start_date = df.index[0]
- dates_with_data.add(start_date.date())
+ dates_with_data.add(start_date)
  end_date = df.index[-1]
- dates_with_data.add(end_date.date())
+ dates_with_data.add(end_date)
  try:
  duplicated_index = df.index.duplicated(keep=False)
  df_with_duplicates = df[duplicated_index]
@@ -159,133 +134,22 @@ def process_day_aggregates(
  return


- def rename_polygon_to_zipline(table: pyarrow.Table, time_name: str) -> pyarrow.Table:
- table = table.rename_columns(
- [
- (
- "symbol"
- if name == "ticker"
- else time_name if name == "window_start" else name
- )
- for name in table.column_names
- ]
- )
- return table
-
-
- def polygon_equities_bundle_day(
- environ,
- asset_db_writer,
- minute_bar_writer,
- daily_bar_writer,
- adjustment_writer,
- calendar,
- start_date,
- end_date,
- cache,
- show_progress,
- output_dir,
- ):
- config = PolygonConfig(
- environ=environ,
- calendar_name=calendar.name,
- start_date=start_date,
- end_date=end_date,
- agg_time="day",
- )
-
- by_ticker_aggs_arrow_dir = concat_all_aggs_from_csv(config)
- aggregates = pyarrow.dataset.dataset(by_ticker_aggs_arrow_dir)
-
- # Zipline uses case-insensitive symbols, so we need to convert them to uppercase with a ^ prefix when lowercase.
- # This is because the SQL schema zipline uses for symbols ignores case.
- # We put the original symbol in the asset_name field.
- metadata = pd.DataFrame(
- columns=(
- "start_date",
- "end_date",
- "auto_close_date",
- "symbol",
- "exchange",
- "asset_name",
- )
- )
-
- # Only get the columns Zipline allows.
- table = aggregates.to_table(
- columns=[
- "ticker",
- "window_start",
- "open",
- "high",
- "low",
- "close",
- "volume",
- "transactions",
- ]
- )
- table = rename_polygon_to_zipline(table, "day")
- # Get all the symbols in the table by using value_counts to tabulate the unique values.
- # pyarrow.Table.column returns a pyarrow.ChunkedArray.
- # https://arrow.apache.org/docs/python/generated/pyarrow.ChunkedArray.html#pyarrow.ChunkedArray.value_counts
- symbols = sorted(table.column("symbol").value_counts().field(0).to_pylist())
- symbol_to_sid = {symbol: sid for sid, symbol in enumerate(symbols)}
- dates_with_data = set()
-
- # Get data for all stocks and write to Zipline
- daily_bar_writer.write(
- process_day_aggregates(
- table=table,
- sessions=calendar.sessions_in_range(start_date, end_date),
- metadata=metadata,
- calendar=calendar,
- symbol_to_sid=symbol_to_sid,
- dates_with_data=dates_with_data,
- ),
- show_progress=show_progress,
- )
-
- # Write the metadata
- asset_db_writer.write(equities=metadata)
-
- # Load splits and dividends
- first_start_end = min(dates_with_data)
- last_end_date = max(dates_with_data)
- splits = load_splits(config, first_start_end, last_end_date, symbol_to_sid)
- dividends = load_dividends(config, first_start_end, last_end_date, symbol_to_sid)
-
- # Write splits and dividends
- adjustment_writer.write(splits=splits, dividends=dividends)
-
-
- def process_minute_fragment(
- fragment,
+ def process_minute_table(
+ table,
  sessions,
  minutes,
  metadata,
  calendar,
  symbol_to_sid: dict[str, int],
- dates_with_data: set,
+ dates_with_data: set[pd.Timestamp],
  agg_time: str,
  ):
- # Only get the columns Zipline allows.
- table = fragment.to_table(
- columns=[
- "ticker",
- "window_start",
- "open",
- "high",
- "low",
- "close",
- "volume",
- "transactions",
- ]
- )
- print(f" {table.num_rows=}")
  table = rename_polygon_to_zipline(table, "timestamp")
- table = table.sort_by([("symbol", "ascending"), ("timestamp", "ascending")])
+ # print(f"{minutes[:5]=}\n{minutes[-5:]=}")
  table = table.filter(pyarrow.compute.field("timestamp").isin(minutes))
+ # print(f"filtered {table.num_rows=}")
  table_df = table.to_pandas()
+ # print(f"{table_df.head()=}")
  for symbol, df in table_df.groupby("symbol"):
  # print(f"\n{symbol=} {len(df)=} {df['timestamp'].min()} {df['timestamp'].max()}")
  if symbol not in symbol_to_sid:
@@ -295,29 +159,35 @@ def process_minute_fragment(
  sql_symbol = symbol_to_upper(symbol)
  df["symbol"] = sql_symbol
  df = df.set_index("timestamp")
- if agg_time == "day":
+ # Shouldn't need to do this because the table is sorted.
+ if not df.index.is_monotonic_increasing:
+ print(f" INFO: {symbol=} {sid=} not monotonic increasing")
+ df.sort_index(inplace=True)
+ if agg_time == AGG_TIME_DAY:
  df.drop(columns=["symbol", "transactions"], inplace=True)
- # Check first and last date.
- start_date = df.index[0].date()
- start_timestamp = df.index[0]
+ # Remember first and last date.
+ start_date = df.index[0].tz_convert(calendar.tz.key).normalize()
  dates_with_data.add(start_date)
- end_date = df.index[-1].date()
- end_timestamp = df.index[-1]
+ end_date = df.index[-1].tz_convert(calendar.tz.key).normalize()
  dates_with_data.add(end_date)
  df = df[df.index.isin(minutes)]
  len_before = len(df)
+ # print(f"{start_date=} {end_date=} {dates_with_data=}")
+ # print(f"day pre {df.head()=}")
  if len(df) < 1:
  # TODO: Move sid assignment until after this check for no data.
  print(
- f" WARNING: No data for {symbol=} {sid=} {len_before=} {start_timestamp=} {end_timestamp=}"
+ f" WARNING: No data for {symbol=} {sid=} {len_before=} {start_date=} {end_date=}"
  )
  continue
  df = minute_frame_to_session_frame(df, calendar)
- df["symbol"] = sql_symbol
+ # print(f"day sess {df.head()=}")
+ # df["symbol"] = sql_symbol
  df = df[df.index.isin(sessions)]

  # The auto_close date is the day after the last trade.
- auto_close_date = end_date + pd.Timedelta(days=1)
+ # auto_close_date = end_date + pd.Timedelta(days=1)
+ auto_close_date = None

  # If metadata already has this sid, just extend the end_date and ac_date.
  if sid in metadata.index:
@@ -337,12 +207,12 @@ def process_minute_fragment(
  start_date,
  end_date,
  auto_close_date,
- symbol_to_upper(symbol),
+ sql_symbol,
  calendar.name,
  symbol,
  )
- df = df.reindex(sessions.tz_localize(None))
- # df = df.reindex(sessions)
+ # df = df.reindex(sessions.tz_localize(None))
+ df = df.reindex(sessions)
  # Missing volume and transactions are zero
  df["volume"] = df["volume"].fillna(0)
  # df["transactions"] = df["transactions"].fillna(0)
@@ -350,13 +220,14 @@ def process_minute_fragment(
  # TODO: These fills should have the same price for OHLC (open for backfill, close for forward fill)
  df.ffill(inplace=True)
  # Back fill missing data (maybe necessary for before the first day bar)
+ # TODO: Don't want to backfill future values. What's better here?
  df.bfill(inplace=True)
  if len(df) > 0:
  # print(f"\n{symbol=} {sid=} {len_before=} {start_timestamp=} {end_date=} {end_timestamp=} {len(df)=}")
  yield sid, df
  else:
  print(
- f" WARNING: No day bars for {symbol=} {sid=} {len_before=} {start_date=} {start_timestamp=} {end_date=} {end_timestamp=}"
+ f" WARNING: No day bars for {symbol=} {sid=} {len_before=} {start_date=} {start_date=} {end_date=} {end_date=}"
  )
  else:
  len_before = len(df)
@@ -378,20 +249,35 @@ def process_minute_fragment(
  return


- def process_minute_aggregates(
+ def process_aggregates(
+ process_table_func,
  fragments,
  sessions,
  minutes,
  metadata,
  calendar,
  symbol_to_sid: dict[str, int],
- dates_with_data: set,
+ dates_with_data: set[pd.Timestamp],
  agg_time: str,
  ):
- # We want to do this by Hive partition at a time because each ticker will be complete.
+ # We do this by Hive partition at a time because each ticker will be complete.
  for fragment in fragments:
- yield from process_minute_fragment(
- fragment=fragment,
+ # Only get the columns Zipline allows.
+ table = fragment.to_table(
+ columns=[
+ "ticker",
+ "window_start",
+ "open",
+ "high",
+ "low",
+ "close",
+ "volume",
+ "transactions",
+ ]
+ )
+ table = table.sort_by([("ticker", "ascending"), ("window_start", "ascending")])
+ yield from process_table_func(
+ table=table,
  sessions=sessions,
  minutes=minutes,
  metadata=metadata,
@@ -400,6 +286,7 @@ def process_minute_aggregates(
  dates_with_data=dates_with_data,
  agg_time=agg_time,
  )
+ del table

  # This doesn't seem to be hardly any faster than the above, something with the GIL?
  # Also to use this we'd need to make sure the symbol_to_sid and dates_with_data are thread safe.
@@ -422,7 +309,8 @@ def process_minute_aggregates(
  # yield from future.result()


- def polygon_equities_bundle_minute(
+ def ingest_polygon_equities_bundle(
+ agg_time: str,
  environ,
  asset_db_writer,
  minute_bar_writer,
@@ -440,10 +328,20 @@ def polygon_equities_bundle_minute(
  calendar_name=calendar.name,
  start_date=start_date,
  end_date=end_date,
- agg_time="minute",
+ agg_time=agg_time,
  )

- by_ticker_aggs_arrow_dir = concat_all_aggs_from_csv(config)
+ print(f"{calendar.name=} {start_date=} {end_date=}")
+ print(f"{calendar.sessions_in_range(start_date, end_date)[:4]}")
+ print(f"{calendar.sessions_in_range(start_date, end_date)[-4:]}")
+ print(f"{calendar.sessions_minutes(start_date, end_date)[:4]}")
+ print(f"{calendar.sessions_minutes(start_date, end_date)[-4:]}")
+
+ if agg_time in [AGG_TIME_TRADES, "1min", "1minute"]:
+ convert_trades_to_custom_aggs(config, overwrite=False)
+ by_ticker_aggs_arrow_dir = scatter_custom_aggs_to_by_ticker(config)
+ else:
+ by_ticker_aggs_arrow_dir = concat_all_aggs_from_csv(config)
  aggregates = pyarrow.dataset.dataset(by_ticker_aggs_arrow_dir)
  # print(f"{aggregates.schema=}")
  # 3.5 billion rows for 10 years of minute data.
@@ -467,11 +365,13 @@ def polygon_equities_bundle_minute(
  )

  symbol_to_sid = {}
+ # Keep track of earliest and latest dates with data across all symbols.
  dates_with_data = set()

  # Get data for all stocks and write to Zipline
  daily_bar_writer.write(
- process_minute_aggregates(
+ process_aggregates(
+ process_day_table if config.agg_time == AGG_TIME_DAY else process_minute_table,
  fragments=aggregates.get_fragments(),
  sessions=calendar.sessions_in_range(start_date, end_date),
  minutes=calendar.sessions_minutes(start_date, end_date),
@@ -479,25 +379,26 @@ def polygon_equities_bundle_minute(
  calendar=calendar,
  symbol_to_sid=symbol_to_sid,
  dates_with_data=dates_with_data,
- agg_time="day",
+ agg_time=AGG_TIME_DAY,
  ),
  show_progress=show_progress,
  )

- # Get data for all stocks and write to Zipline
- minute_bar_writer.write(
- process_minute_aggregates(
- fragments=aggregates.get_fragments(),
- sessions=calendar.sessions_in_range(start_date, end_date),
- minutes=calendar.sessions_minutes(start_date, end_date),
- metadata=metadata,
- calendar=calendar,
- symbol_to_sid=symbol_to_sid,
- dates_with_data=dates_with_data,
- agg_time="minute",
- ),
- show_progress=show_progress,
- )
+ if config.agg_time != AGG_TIME_DAY:
+ minute_bar_writer.write(
+ process_aggregates(
+ process_minute_table,
+ fragments=aggregates.get_fragments(),
+ sessions=calendar.sessions_in_range(start_date, end_date),
+ minutes=calendar.sessions_minutes(start_date, end_date),
+ metadata=metadata,
+ calendar=calendar,
+ symbol_to_sid=symbol_to_sid,
+ dates_with_data=dates_with_data,
+ agg_time=AGG_TIME_MINUTE,
+ ),
+ show_progress=show_progress,
+ )

  # Write the metadata
  asset_db_writer.write(equities=metadata)
@@ -512,95 +413,36 @@ def polygon_equities_bundle_minute(
  adjustment_writer.write(splits=splits, dividends=dividends)


- def polygon_equities_bundle_trades(
- environ,
- asset_db_writer,
- minute_bar_writer,
- daily_bar_writer,
- adjustment_writer,
- calendar,
- start_date,
- end_date,
- cache,
- show_progress,
- output_dir,
- ):
- # TODO: Support agg durations other than `1min`.
- config = PolygonConfig(
- environ=environ,
- calendar_name=calendar.name,
- start_date=start_date,
- end_date=end_date,
- agg_time="1min",
- )
-
- convert_trades_to_custom_aggs(config, overwrite=False)
- by_ticker_aggs_arrow_dir = scatter_custom_aggs_to_by_ticker(config)
- aggregates = pyarrow.dataset.dataset(by_ticker_aggs_arrow_dir)
- # 3.5 billion rows for 10 years of minute data.
- # print(f"{aggregates.count_rows()=}")
- # Can't sort the dataset because that reads it all into memory.
- # aggregates = aggregates.sort_by([("ticker", "ascending"), ("window_start", "ascending")])
- # print("Sorted")
-
- # Zipline uses case-insensitive symbols, so we need to convert them to uppercase with a ^ prefix when lowercase.
- # This is because the SQL schema zipline uses for symbols ignores case.
- # We put the original symbol in the asset_name field.
- metadata = pd.DataFrame(
- columns=(
- "start_date",
- "end_date",
- "auto_close_date",
- "symbol",
- "exchange",
- "asset_name",
- )
- )
-
- symbol_to_sid = {}
- dates_with_data = set()
-
- # Get data for all stocks and write to Zipline
- daily_bar_writer.write(
- process_minute_aggregates(
- fragments=aggregates.get_fragments(),
- sessions=calendar.sessions_in_range(start_date, end_date),
- minutes=calendar.sessions_minutes(start_date, end_date),
- metadata=metadata,
- calendar=calendar,
- symbol_to_sid=symbol_to_sid,
- dates_with_data=dates_with_data,
- agg_time="day",
- ),
- show_progress=show_progress,
- )
-
- # Get data for all stocks and write to Zipline
- minute_bar_writer.write(
- process_minute_aggregates(
- fragments=aggregates.get_fragments(),
- sessions=calendar.sessions_in_range(start_date, end_date),
- minutes=calendar.sessions_minutes(start_date, end_date),
- metadata=metadata,
+ def ingest_polygon_equities_bundle_for_agg_time(agg_time: str):
+ def ingest_polygon_equities_bundle_inner(
+ environ,
+ asset_db_writer,
+ minute_bar_writer,
+ daily_bar_writer,
+ adjustment_writer,
+ calendar,
+ start_date,
+ end_date,
+ cache,
+ show_progress,
+ output_dir,
+ ):
+ return ingest_polygon_equities_bundle(
+ agg_time=agg_time,
+ environ=environ,
+ asset_db_writer=asset_db_writer,
+ minute_bar_writer=minute_bar_writer,
+ daily_bar_writer=daily_bar_writer,
+ adjustment_writer=adjustment_writer,
  calendar=calendar,
- symbol_to_sid=symbol_to_sid,
- dates_with_data=dates_with_data,
- agg_time="minute",
- ),
- show_progress=show_progress,
- )
-
- # Write the metadata
- asset_db_writer.write(equities=metadata)
-
- # Load splits and dividends
- first_start_end = min(dates_with_data)
- last_end_date = max(dates_with_data)
- splits = load_splits(config, first_start_end, last_end_date, symbol_to_sid)
- dividends = load_dividends(config, first_start_end, last_end_date, symbol_to_sid)
+ start_date=start_date,
+ end_date=end_date,
+ cache=cache,
+ show_progress=show_progress,
+ output_dir=output_dir,
+ )

- # Write splits and dividends
- adjustment_writer.write(splits=splits, dividends=dividends)
+ return ingest_polygon_equities_bundle_inner


  def register_polygon_equities_bundle(
@@ -608,16 +450,22 @@ def register_polygon_equities_bundle(
  start_date=None,
  end_date=None,
  calendar_name="XNYS",
- agg_time="day",
+ agg_time=AGG_TIME_DAY,
+ minutes_per_day=390,
+ environ=os.environ,
  # ticker_list=None,
  # watchlists=None,
  # include_asset_types=None,
  ):
  register_nyse_all_hours_calendar()

- if agg_time not in ["day", "minute", "1min"]:
+ # pd.set_option("display.max_columns", None)
+ # pd.set_option("display.width", 500)
+
+ # Note that "minute" is the Polygon minute aggs and "1minute" is the trades.
+ if agg_time not in [AGG_TIME_DAY, AGG_TIME_MINUTE, AGG_TIME_TRADES, "1min", "1minute"]:
  raise ValueError(
- f"agg_time must be 'day', 'minute' (aggs), or '1min' (trades), not '{agg_time}'"
+ f"agg_time must be 'day', 'minute' (aggs), or '1minute' (trades), not '{agg_time}'"
  )

  # We need to know the start and end dates of the session before the bundle is
@@ -625,36 +473,33 @@ def register_polygon_equities_bundle(
  # the writer is initialized and written before our ingest function is called.
  if start_date is None or end_date is None:
  config = PolygonConfig(
- environ=os.environ,
+ environ=environ,
  calendar_name=calendar_name,
  start_date=start_date,
  end_date=end_date,
  agg_time=agg_time,
  )
  first_aggs_date, last_aggs_date = config.find_first_and_last_aggs(
- config.aggs_dir if agg_time in ["day", "minute"] else config.trades_dir,
+ config.aggs_dir if agg_time in [AGG_TIME_DAY, AGG_TIME_MINUTE] else config.trades_dir,
  config.csv_paths_pattern,
  )
+ # print(f"{bundlename=} {first_aggs_date=} {last_aggs_date=}")
  if start_date is None:
  start_date = first_aggs_date
  if end_date is None:
  end_date = last_aggs_date

+ start_session = parse_date(start_date, raise_oob=False) if start_date else None
+ end_session = parse_date(end_date, raise_oob=False) if end_date else None
+ # print(f"Registered {bundlename=} {agg_time=} {start_session=} {end_session=}")
+
  register(
  bundlename,
- (
- polygon_equities_bundle_day
- if agg_time == "day"
- else (
- polygon_equities_bundle_minute
- if agg_time == "minute"
- else polygon_equities_bundle_trades
- )
- ),
- start_session=parse_date(start_date, raise_oob=False) if start_date else None,
- end_session=parse_date(end_date, raise_oob=False) if end_date else None,
+ ingest_polygon_equities_bundle_for_agg_time(agg_time),
+ start_session=start_session,
+ end_session=end_session,
  calendar_name=calendar_name,
- # minutes_per_day=390,
+ minutes_per_day=minutes_per_day,
  # create_writers=True,
  )
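
For anyone adopting the reworked registration API shown above, here is a minimal, hypothetical usage sketch (not part of the diff) of calling register_polygon_equities_bundle from a Zipline extension file. Only the function name, the agg_time, minutes_per_day, and environ keywords, and the AGG_TIME_* constants come from the diff; the import paths and the "polygon" bundle name are assumptions for illustration.

# Hypothetical extension.py sketch; import paths and bundle name are assumptions.
from zipline_polygon_bundle import register_polygon_equities_bundle
from zipline_polygon_bundle.config import AGG_TIME_DAY  # constants added in 0.2.1

register_polygon_equities_bundle(
    "polygon",              # bundle name, ingested with: zipline ingest -b polygon
    calendar_name="XNYS",
    agg_time=AGG_TIME_DAY,  # daily aggs only; AGG_TIME_MINUTE / AGG_TIME_TRADES also write minute bars
    minutes_per_day=390,    # new keyword in this release (default 390)
)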