zipline_polygon_bundle-0.1.8-py3-none-any.whl → zipline_polygon_bundle-0.2.0.dev1-py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as published to their public registry, and is provided for informational purposes only.
zipline_polygon_bundle/__init__.py

@@ -6,10 +6,11 @@ from .bundle import (
 )

 from .config import PolygonConfig
+from .nyse_all_hours_calendar import NYSE_ALL_HOURS, register_nyse_all_hours_calendar
 from .concat_all_aggs import concat_all_aggs_from_csv, generate_csv_agg_tables
 from .adjustments import load_splits, load_dividends, load_conditions
 from .trades import trades_schema, trades_dataset, cast_trades, date_to_path
-from .trades import custom_aggs_partitioning, custom_aggs_schema, trades_to_custom_aggs, convert_all_to_custom_aggs
+from .trades import custom_aggs_partitioning, custom_aggs_schema, trades_to_custom_aggs, convert_trades_to_custom_aggs
 from .trades import get_custom_aggs_dates, generate_csv_trades_tables, compute_signals_for_all_custom_aggs
 from .quotes import quotes_schema, quotes_dataset, cast_quotes
 # from .tickers_and_names import load_all_tickers, merge_tickers, ticker_names_from_merged_tickers, get_ticker_universe
@@ -18,6 +19,8 @@ from .tickers_and_names import PolygonAssets, get_ticker_universe

 __all__ = [
     "register_polygon_equities_bundle",
+    "register_nyse_all_hours_calendar",
+    "NYSE_ALL_HOURS",
     "symbol_to_upper",
     "polygon_equities_bundle_day",
     "polygon_equities_bundle_minute",
@@ -36,7 +39,7 @@ __all__ = [
     "custom_aggs_partitioning",
     "custom_aggs_schema",
     "trades_to_custom_aggs",
-    "convert_all_to_custom_aggs",
+    "convert_trades_to_custom_aggs",
     "compute_signals_for_all_custom_aggs",
     "quotes_schema",
     "quotes_dataset",
zipline_polygon_bundle/bundle.py

@@ -3,11 +3,13 @@ from zipline.data.bundles import register
 from zipline.data.resample import minute_frame_to_session_frame

 from exchange_calendars.calendar_helpers import parse_date
-from zipline.utils.calendar_utils import get_calendar
+from exchange_calendars.calendar_utils import get_calendar

-from .config import PolygonConfig
 from .concat_all_aggs import concat_all_aggs_from_csv, generate_csv_agg_tables
 from .adjustments import load_splits, load_dividends
+from .config import PolygonConfig
+from .nyse_all_hours_calendar import register_nyse_all_hours_calendar
+from .trades import convert_trades_to_custom_aggs, scatter_custom_aggs_to_by_ticker

 import pyarrow
 import pyarrow.compute
@@ -29,7 +31,7 @@ def symbol_to_upper(s: str) -> str:
 def generate_all_agg_tables_from_csv(
     config: PolygonConfig,
 ):
-    paths, schema, tables = generate_csv_agg_tables(config)
+    schema, tables = generate_csv_agg_tables(config)
     for table in tables:
         table = table.sort_by([("ticker", "ascending"), ("window_start", "ascending")])
         yield table
@@ -209,7 +211,19 @@ def polygon_equities_bundle_day(
         )
     )

-    table = aggregates.to_table()
+    # Only get the columns Zipline allows.
+    table = aggregates.to_table(
+        columns=[
+            "ticker",
+            "window_start",
+            "open",
+            "high",
+            "low",
+            "close",
+            "volume",
+            "transactions",
+        ]
+    )
     table = rename_polygon_to_zipline(table, "day")
     # Get all the symbols in the table by using value_counts to tabulate the unique values.
     # pyarrow.Table.column returns a pyarrow.ChunkedArray.
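
The column list passed to to_table() is ordinary pyarrow dataset projection: only the named columns are read from the parquet files, which matters at this data volume. A minimal sketch of the same pattern (the path is a stand-in for illustration):

    import pyarrow.dataset as ds

    # Hypothetical directory of parquet aggregates.
    aggregates = ds.dataset("data/by_ticker_aggs", format="parquet")
    # Projection happens at scan time; unused columns are never materialized.
    table = aggregates.to_table(columns=["ticker", "window_start", "close", "volume"])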
@@ -254,7 +268,19 @@ def process_minute_fragment(
     dates_with_data: set,
     agg_time: str,
 ):
-    table = fragment.to_table()
+    # Only get the columns Zipline allows.
+    table = fragment.to_table(
+        columns=[
+            "ticker",
+            "window_start",
+            "open",
+            "high",
+            "low",
+            "close",
+            "volume",
+            "transactions",
+        ]
+    )
     print(f" {table.num_rows=}")
     table = rename_polygon_to_zipline(table, "timestamp")
     table = table.sort_by([("symbol", "ascending"), ("timestamp", "ascending")])
@@ -486,6 +512,97 @@ def polygon_equities_bundle_minute(
     adjustment_writer.write(splits=splits, dividends=dividends)


+def polygon_equities_bundle_trades(
+    environ,
+    asset_db_writer,
+    minute_bar_writer,
+    daily_bar_writer,
+    adjustment_writer,
+    calendar,
+    start_date,
+    end_date,
+    cache,
+    show_progress,
+    output_dir,
+):
+    # TODO: Support agg durations other than `1min`.
+    config = PolygonConfig(
+        environ=environ,
+        calendar_name=calendar.name,
+        start_date=start_date,
+        end_date=end_date,
+        agg_time="1min",
+    )
+
+    convert_trades_to_custom_aggs(config, overwrite=False)
+    by_ticker_aggs_arrow_dir = scatter_custom_aggs_to_by_ticker(config)
+    aggregates = pyarrow.dataset.dataset(by_ticker_aggs_arrow_dir)
+    # 3.5 billion rows for 10 years of minute data.
+    # print(f"{aggregates.count_rows()=}")
+    # Can't sort the dataset because that reads it all into memory.
+    # aggregates = aggregates.sort_by([("ticker", "ascending"), ("window_start", "ascending")])
+    # print("Sorted")
+
+    # Zipline uses case-insensitive symbols, so we need to convert them to uppercase with a ^ prefix when lowercase.
+    # This is because the SQL schema zipline uses for symbols ignores case.
+    # We put the original symbol in the asset_name field.
+    metadata = pd.DataFrame(
+        columns=(
+            "start_date",
+            "end_date",
+            "auto_close_date",
+            "symbol",
+            "exchange",
+            "asset_name",
+        )
+    )
+
+    symbol_to_sid = {}
+    dates_with_data = set()
+
+    # Get data for all stocks and write to Zipline
+    daily_bar_writer.write(
+        process_minute_aggregates(
+            fragments=aggregates.get_fragments(),
+            sessions=calendar.sessions_in_range(start_date, end_date),
+            minutes=calendar.sessions_minutes(start_date, end_date),
+            metadata=metadata,
+            calendar=calendar,
+            symbol_to_sid=symbol_to_sid,
+            dates_with_data=dates_with_data,
+            agg_time="day",
+        ),
+        show_progress=show_progress,
+    )
+
+    # Get data for all stocks and write to Zipline
+    minute_bar_writer.write(
+        process_minute_aggregates(
+            fragments=aggregates.get_fragments(),
+            sessions=calendar.sessions_in_range(start_date, end_date),
+            minutes=calendar.sessions_minutes(start_date, end_date),
+            metadata=metadata,
+            calendar=calendar,
+            symbol_to_sid=symbol_to_sid,
+            dates_with_data=dates_with_data,
+            agg_time="minute",
+        ),
+        show_progress=show_progress,
+    )
+
+    # Write the metadata
+    asset_db_writer.write(equities=metadata)
+
+    # Load splits and dividends
+    first_start_end = min(dates_with_data)
+    last_end_date = max(dates_with_data)
+    splits = load_splits(config, first_start_end, last_end_date, symbol_to_sid)
+    dividends = load_dividends(config, first_start_end, last_end_date, symbol_to_sid)
+
+    # Write splits and dividends
+    adjustment_writer.write(splits=splits, dividends=dividends)
+
+
 def register_polygon_equities_bundle(
     bundlename,
     start_date=None,
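
The new polygon_equities_bundle_trades runs the trades pipeline in two stages before writing: trades flat files are aggregated into 1-minute bars, then scattered into a by-ticker Arrow dataset that the existing process_minute_aggregates writers consume. A rough standalone sketch of those stages, assuming the same PolygonConfig keyword arguments the function above uses (dates are illustrative):

    import os
    import pyarrow.dataset

    from zipline_polygon_bundle import PolygonConfig, convert_trades_to_custom_aggs
    from zipline_polygon_bundle.trades import scatter_custom_aggs_to_by_ticker

    config = PolygonConfig(
        environ=os.environ,
        calendar_name="NYSE_ALL_HOURS",
        start_date="2024-01-02",
        end_date="2024-12-31",
        agg_time="1min",
    )
    convert_trades_to_custom_aggs(config, overwrite=False)    # trades CSVs -> 1min aggs
    by_ticker_dir = scatter_custom_aggs_to_by_ticker(config)  # repartition by ticker
    aggregates = pyarrow.dataset.dataset(by_ticker_dir)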
@@ -496,10 +613,15 @@ def register_polygon_equities_bundle(
     # watchlists=None,
     # include_asset_types=None,
 ):
-    if agg_time not in ["day", "minute"]:
-        raise ValueError(f"agg_time must be 'day' or 'minute', not '{agg_time}'")
+    register_nyse_all_hours_calendar()
+
+    if agg_time not in ["day", "minute", "1min"]:
+        raise ValueError(
+            f"agg_time must be 'day', 'minute' (aggs), or '1min' (trades), not '{agg_time}'"
+        )
+
     # We need to know the start and end dates of the session before the bundle is
-    # registered because even though we only need it for ingest, the metadata in
+    # registered because even though we only need it for ingest, the metadata in
     # the writer is initialized and written before our ingest function is called.
     if start_date is None or end_date is None:
         config = PolygonConfig(
@@ -509,23 +631,28 @@
             end_date=end_date,
             agg_time=agg_time,
         )
-        first_aggs_date, last_aggs_date = config.find_first_and_last_aggs()
+        first_aggs_date, last_aggs_date = config.find_first_and_last_aggs(
+            config.aggs_dir if agg_time in ["day", "minute"] else config.trades_dir,
+            config.csv_paths_pattern,
+        )
         if start_date is None:
             start_date = first_aggs_date
         if end_date is None:
             end_date = last_aggs_date

-    calendar = get_calendar(calendar_name)
-
     register(
         bundlename,
         (
-            polygon_equities_bundle_minute
-            if agg_time == "minute"
-            else polygon_equities_bundle_day
+            polygon_equities_bundle_day
+            if agg_time == "day"
+            else (
+                polygon_equities_bundle_minute
+                if agg_time == "minute"
+                else polygon_equities_bundle_trades
+            )
         ),
-        start_session=parse_date(start_date, calendar=calendar),
-        end_session=parse_date(end_date, calendar=calendar),
+        start_session=parse_date(start_date, raise_oob=False) if start_date else None,
+        end_session=parse_date(end_date, raise_oob=False) if end_date else None,
         calendar_name=calendar_name,
         # minutes_per_day=390,
         # create_writers=True,
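
With the extra agg_time value wired through register(), selecting the trades-based bundle is just a matter of passing "1min". A hedged sketch of an extension.py entry, assuming calendar_name and agg_time are keyword parameters of register_polygon_equities_bundle (which the validation above implies):

    # e.g. in ~/.zipline/extension.py
    from zipline_polygon_bundle import register_polygon_equities_bundle, NYSE_ALL_HOURS

    register_polygon_equities_bundle(
        "polygon-trades",
        calendar_name=NYSE_ALL_HOURS,
        agg_time="1min",  # "day" and "minute" still select the aggs-based bundles
    )

After that, `zipline ingest -b polygon-trades` would invoke polygon_equities_bundle_trades.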
zipline_polygon_bundle/concat_all_aggs.py

@@ -1,10 +1,9 @@
-from .config import PolygonConfig
+from .config import PolygonConfig, PARTITION_COLUMN_NAME, to_partition_key

 import shutil
 from typing import Iterator, Tuple, List, Union

 import argparse
-import glob
 import os

 import pyarrow as pa
@@ -15,25 +14,6 @@ from pyarrow import compute as pa_compute
 import pandas as pd


-PARTITION_COLUMN_NAME = "part"
-PARTITION_KEY_LENGTH = 2
-
-
-def to_partition_key(s: str) -> str:
-    """
-    Partition key is low cardinality and must be filesystem-safe.
-    The reason for partitioning is to keep the parquet files from getting too big.
-    10 years of minute aggs for US stocks is 83GB gzipped. A single parquet would be 62GB on disk.
-    Currently the first two characters so files stay under 1GB. Weird characters are replaced with "A".
-    """
-    k = (s + "A")[0:PARTITION_KEY_LENGTH].upper()
-    if k.isalpha():
-        return k
-    # Replace non-alpha characters with "A".
-    k = "".join([c if c.isalpha() else "A" for c in k])
-    return k
-
-
 def generate_tables_from_csv_files(
     paths: Iterator[Union[str, os.PathLike]],
     schema: pa.Schema,
zipline_polygon_bundle/config.py

@@ -1,7 +1,9 @@
-from exchange_calendars.calendar_helpers import Date, parse_date, parse_timestamp
-from zipline.utils.calendar_utils import get_calendar
+from exchange_calendars.calendar_helpers import Date, parse_date
+from exchange_calendars.calendar_utils import get_calendar

-from typing import Iterator, Tuple, List
+from .nyse_all_hours_calendar import NYSE_ALL_HOURS
+
+from typing import Iterator, Tuple

 import pandas as pd
 from pyarrow.fs import LocalFileSystem
@@ -10,6 +12,25 @@ import re
 import fnmatch


+PARTITION_COLUMN_NAME = "part"
+PARTITION_KEY_LENGTH = 2
+
+
+def to_partition_key(s: str) -> str:
+    """
+    Partition key is low cardinality and must be filesystem-safe.
+    The reason for partitioning is to keep the parquet files from getting too big.
+    10 years of minute aggs for US stocks is 83GB gzipped. A single parquet would be 62GB on disk.
+    Currently the first two characters so files stay under 1GB. Weird characters are replaced with "A".
+    """
+    k = (s + "A")[0:PARTITION_KEY_LENGTH].upper()
+    if k.isalpha():
+        return k
+    # Replace non-alpha characters with "A".
+    k = "".join([c if c.isalpha() else "A" for c in k])
+    return k
+
+
 class PolygonConfig:
     def __init__(
         self,
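
A few concrete keys, derived from the function above, make the pad-truncate-sanitize behavior easier to see:

    assert to_partition_key("AAPL") == "AA"
    assert to_partition_key("BRK.A") == "BR"  # the dot falls outside the 2-char key
    assert to_partition_key("1INCH") == "AI"  # leading digit replaced with "A"
    assert to_partition_key("A") == "AA"      # padded before truncation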
@@ -61,7 +82,8 @@ class PolygonConfig:
         # TODO: Restore non-recusive option. Always recursive for now.
         self.csv_paths_pattern = environ.get(
             # "POLYGON_FLAT_FILES_CSV_PATTERN", "**/*.csv.gz"
-            "POLYGON_FLAT_FILES_CSV_PATTERN", "*.csv.gz"
+            "POLYGON_FLAT_FILES_CSV_PATTERN",
+            "*.csv.gz",
         )
         self.asset_files_dir = os.path.join(self.flat_files_dir, self.asset_subdir)
         self.minute_aggs_dir = os.path.join(self.asset_files_dir, "minute_aggs_v1")
@@ -114,7 +136,10 @@ class PolygonConfig:

     @property
     def calendar(self):
-        return get_calendar(self.calendar_name)
+        # If you don't give a start date you'll only get 20 years from today.
+        if self.calendar_name in [NYSE_ALL_HOURS, "us_futures", "CMES", "XNYS", "NYSE"]:
+            return get_calendar(self.calendar_name, side="right", start=pd.Timestamp("1990-01-01"))
+        return get_calendar(self.calendar_name, side="right")

     def ticker_file_path(self, date: pd.Timestamp):
         ticker_year_dir = os.path.join(
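
The explicit start matters because exchange_calendars only builds roughly 20 years of sessions by default, which would silently truncate older history. A minimal sketch of the difference, using nothing beyond the exchange_calendars public API:

    import pandas as pd
    from exchange_calendars import get_calendar

    xnys_default = get_calendar("XNYS", side="right")
    xnys_full = get_calendar("XNYS", side="right", start=pd.Timestamp("1990-01-01"))
    print(xnys_default.first_session)  # roughly 20 years before today
    print(xnys_full.first_session)     # first session on/after 1990-01-01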
@@ -131,8 +156,16 @@ class PolygonConfig:

     @property
     def by_ticker_aggs_arrow_dir(self):
-        return os.path.join(self.by_ticker_dir,
-                            f"{self.start_timestamp.date().isoformat()}_{self.end_timestamp.date().isoformat()}.arrow")
+        # TODO: Don't split these up by ingestion range. They're already time indexed.
+        # Only reason to separate them is if we're worried about (or want) data being different across ingestions.
+        # This scattering is really slow and is usually gonna be redundant.
+        # This wasn't a problem when start/end dates were the calendar bounds when omitted.
+        # Can't just drop this because concat_all_aggs_from_csv will skip if it exists.
+        return os.path.join(
+            self.by_ticker_dir,
+            f"{self.start_timestamp.date().isoformat()}_{self.end_timestamp.date().isoformat()}.arrow",
+            # "aggs.arrow",
+        )

     def api_cache_path(
         self, start_date: Date, end_date: Date, filename: str, extension=".parquet"
@@ -153,13 +186,13 @@ class PolygonConfig:
         for filename in sorted(filenames):
             yield os.path.join(root, filename)

-    def find_first_and_last_aggs(self) -> Tuple[str, str]:
+    def find_first_and_last_aggs(self, aggs_dir, file_pattern) -> Tuple[str | None, str | None]:
         # Find the path to the lexically first and last paths in aggs_dir that matches csv_paths_pattern.
         # Would like to use Path.walk(top_down=True) but it is only availble in Python 3.12+.
         # This needs to be efficient because it is called on every init, even though we only need it for ingest.
         # But we can't call it in ingest because the writer initializes and writes the metadata before it is called.
         paths = []
-        for root, dirnames, filenames in os.walk(self.aggs_dir, topdown=True):
+        for root, dirnames, filenames in os.walk(aggs_dir, topdown=True):
             if dirnames:
                 # We only want first and last in each directory.
                 sorted_dirs = sorted(dirnames)
@@ -169,15 +202,15 @@ class PolygonConfig:
                     else sorted_dirs
                 )
             # Filter out filenames that don't match the pattern.
-            filenames = fnmatch.filter(filenames, self.csv_paths_pattern)
+            filenames = fnmatch.filter(filenames, file_pattern)
             if filenames:
                 filenames = sorted(filenames)
                 paths.append(os.path.join(root, filenames[0]))
                 if len(filenames) > 1:
                     paths.append(os.path.join(root, filenames[-1]))
+        if not paths:
+            return None, None
         paths = sorted(paths)
-        if len(paths) < 2:
-            raise ValueError(f"Need more than one aggs file but found {len(paths)} paths in {self.aggs_dir}")
         return self.file_path_to_name(paths[0]), self.file_path_to_name(paths[-1])

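find_first_and_last_aggs now takes the directory and pattern explicitly (so the trades dir can be scanned for the "1min" bundle) and returns (None, None) instead of raising when no flat files are found. A brief sketch of the new call, mirroring the caller in bundle.py above:

    first, last = config.find_first_and_last_aggs(
        config.aggs_dir if agg_time in ["day", "minute"] else config.trades_dir,
        config.csv_paths_pattern,
    )
    if first is None:
        print("No flat files found to derive start/end dates from.")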
zipline_polygon_bundle/nyse_all_hours_calendar.py (new file)

@@ -0,0 +1,25 @@
+import datetime
+from exchange_calendars.calendar_utils import get_calendar_names, register_calendar_type
+from exchange_calendars.exchange_calendar_xnys import XNYSExchangeCalendar
+
+
+NYSE_ALL_HOURS = "NYSE_ALL_HOURS"
+
+
+class USExtendedHoursExchangeCalendar(XNYSExchangeCalendar):
+    """
+    A calendar for extended hours which runs from 4 AM to 8 PM.
+    """
+
+    name = NYSE_ALL_HOURS
+
+    open_times = ((None, datetime.time(4)),)
+
+    close_times = ((None, datetime.time(20)),)
+
+    regular_early_close = datetime.time(13)
+
+
+def register_nyse_all_hours_calendar():
+    if NYSE_ALL_HOURS not in get_calendar_names():
+        register_calendar_type(NYSE_ALL_HOURS, USExtendedHoursExchangeCalendar)
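
The new calendar registers under the name NYSE_ALL_HOURS and reuses XNYS holidays while widening the session to 04:00-20:00 America/New_York (with 13:00 early closes). A minimal sketch of registering and inspecting it, assuming the helpers exported from __init__.py above:

    import pandas as pd
    from exchange_calendars import get_calendar
    from zipline_polygon_bundle import NYSE_ALL_HOURS, register_nyse_all_hours_calendar

    register_nyse_all_hours_calendar()  # no-op if already registered
    cal = get_calendar(NYSE_ALL_HOURS, start=pd.Timestamp("2024-01-02"))
    session = cal.sessions_in_range("2024-01-02", "2024-01-31")[0]
    # Minutes are UTC timestamps; expect a 04:00-20:00 New York session
    # (exact boundary labels depend on the `side` argument).
    print(cal.session_first_minute(session), cal.session_last_minute(session))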
zipline_polygon_bundle/tickers_and_names.py

@@ -48,7 +48,10 @@ class PolygonAssets:
         active: bool = True,
     ):
         response = self.polygon_client.list_tickers(
-            market=self.config.market, active=active, date=date.date(), limit=500
+            market=self.config.market,
+            active=active,
+            date=date.date().isoformat(),
+            limit=500,
         )
         tickers_df = pd.DataFrame(list(response))
         # The currency info is for crypto. The source_feed is always NA.
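
Passing date.date().isoformat() hands the Polygon client a plain "YYYY-MM-DD" string instead of a datetime.date, presumably to make the query-string serialization explicit; e.g.:

    import pandas as pd

    date = pd.Timestamp("2025-01-02")
    print(date.date().isoformat())  # "2025-01-02"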