zipline_polygon_bundle 0.1.8__py3-none-any.whl → 0.2.0.dev1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- zipline_polygon_bundle/__init__.py +5 -2
- zipline_polygon_bundle/bundle.py +143 -16
- zipline_polygon_bundle/concat_all_aggs.py +1 -21
- zipline_polygon_bundle/config.py +45 -12
- zipline_polygon_bundle/nyse_all_hours_calendar.py +25 -0
- zipline_polygon_bundle/tickers_and_names.py +4 -1
- zipline_polygon_bundle/trades.py +474 -237
- {zipline_polygon_bundle-0.1.8.dist-info → zipline_polygon_bundle-0.2.0.dev1.dist-info}/METADATA +1 -1
- zipline_polygon_bundle-0.2.0.dev1.dist-info/RECORD +17 -0
- zipline_polygon_bundle-0.1.8.dist-info/RECORD +0 -16
- {zipline_polygon_bundle-0.1.8.dist-info → zipline_polygon_bundle-0.2.0.dev1.dist-info}/LICENSE +0 -0
- {zipline_polygon_bundle-0.1.8.dist-info → zipline_polygon_bundle-0.2.0.dev1.dist-info}/WHEEL +0 -0
zipline_polygon_bundle/__init__.py
CHANGED
@@ -6,10 +6,11 @@ from .bundle import (
 )
 
 from .config import PolygonConfig
+from .nyse_all_hours_calendar import NYSE_ALL_HOURS, register_nyse_all_hours_calendar
 from .concat_all_aggs import concat_all_aggs_from_csv, generate_csv_agg_tables
 from .adjustments import load_splits, load_dividends, load_conditions
 from .trades import trades_schema, trades_dataset, cast_trades, date_to_path
-from .trades import custom_aggs_partitioning, custom_aggs_schema, trades_to_custom_aggs,
+from .trades import custom_aggs_partitioning, custom_aggs_schema, trades_to_custom_aggs, convert_trades_to_custom_aggs
 from .trades import get_custom_aggs_dates, generate_csv_trades_tables, compute_signals_for_all_custom_aggs
 from .quotes import quotes_schema, quotes_dataset, cast_quotes
 # from .tickers_and_names import load_all_tickers, merge_tickers, ticker_names_from_merged_tickers, get_ticker_universe
@@ -18,6 +19,8 @@ from .tickers_and_names import PolygonAssets, get_ticker_universe
 
 __all__ = [
     "register_polygon_equities_bundle",
+    "register_nyse_all_hours_calendar",
+    "NYSE_ALL_HOURS",
     "symbol_to_upper",
     "polygon_equities_bundle_day",
     "polygon_equities_bundle_minute",
@@ -36,7 +39,7 @@ __all__ = [
     "custom_aggs_partitioning",
     "custom_aggs_schema",
     "trades_to_custom_aggs",
-    "
+    "convert_trades_to_custom_aggs",
     "compute_signals_for_all_custom_aggs",
     "quotes_schema",
     "quotes_dataset",
zipline_polygon_bundle/bundle.py
CHANGED
@@ -3,11 +3,13 @@ from zipline.data.bundles import register
 from zipline.data.resample import minute_frame_to_session_frame
 
 from exchange_calendars.calendar_helpers import parse_date
-from
+from exchange_calendars.calendar_utils import get_calendar
 
-from .config import PolygonConfig
 from .concat_all_aggs import concat_all_aggs_from_csv, generate_csv_agg_tables
 from .adjustments import load_splits, load_dividends
+from .config import PolygonConfig
+from .nyse_all_hours_calendar import register_nyse_all_hours_calendar
+from .trades import convert_trades_to_custom_aggs, scatter_custom_aggs_to_by_ticker
 
 import pyarrow
 import pyarrow.compute
@@ -29,7 +31,7 @@ def symbol_to_upper(s: str) -> str:
 def generate_all_agg_tables_from_csv(
     config: PolygonConfig,
 ):
-
+    schema, tables = generate_csv_agg_tables(config)
     for table in tables:
         table = table.sort_by([("ticker", "ascending"), ("window_start", "ascending")])
         yield table
@@ -209,7 +211,19 @@ def polygon_equities_bundle_day(
         )
     )
 
-
+    # Only get the columns Zipline allows.
+    table = aggregates.to_table(
+        columns=[
+            "ticker",
+            "window_start",
+            "open",
+            "high",
+            "low",
+            "close",
+            "volume",
+            "transactions",
+        ]
+    )
     table = rename_polygon_to_zipline(table, "day")
     # Get all the symbols in the table by using value_counts to tabulate the unique values.
     # pyarrow.Table.column returns a pyarrow.ChunkedArray.
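
Passing `columns=` to `to_table()` projects while scanning, so the columns Zipline doesn't accept are never materialized from disk. A minimal self-contained sketch of the same pyarrow technique (toy table and a hypothetical "extra" column, not the bundle's real schema):

import pyarrow as pa
import pyarrow.dataset as ds

table = pa.table({"ticker": ["TEST"], "window_start": [0], "open": [1.0], "extra": [0]})
dataset = ds.dataset(table)  # in-memory dataset, standing in for the on-disk aggs
projected = dataset.to_table(columns=["ticker", "window_start", "open"])
print(projected.column_names)  # ['ticker', 'window_start', 'open']; "extra" is never read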
@@ -254,7 +268,19 @@ def process_minute_fragment(
     dates_with_data: set,
     agg_time: str,
 ):
-
+    # Only get the columns Zipline allows.
+    table = fragment.to_table(
+        columns=[
+            "ticker",
+            "window_start",
+            "open",
+            "high",
+            "low",
+            "close",
+            "volume",
+            "transactions",
+        ]
+    )
     print(f" {table.num_rows=}")
     table = rename_polygon_to_zipline(table, "timestamp")
     table = table.sort_by([("symbol", "ascending"), ("timestamp", "ascending")])
@@ -486,6 +512,97 @@ def polygon_equities_bundle_minute(
     adjustment_writer.write(splits=splits, dividends=dividends)
 
 
+def polygon_equities_bundle_trades(
+    environ,
+    asset_db_writer,
+    minute_bar_writer,
+    daily_bar_writer,
+    adjustment_writer,
+    calendar,
+    start_date,
+    end_date,
+    cache,
+    show_progress,
+    output_dir,
+):
+    # TODO: Support agg durations other than `1min`.
+    config = PolygonConfig(
+        environ=environ,
+        calendar_name=calendar.name,
+        start_date=start_date,
+        end_date=end_date,
+        agg_time="1min",
+    )
+
+    convert_trades_to_custom_aggs(config, overwrite=False)
+    by_ticker_aggs_arrow_dir = scatter_custom_aggs_to_by_ticker(config)
+    aggregates = pyarrow.dataset.dataset(by_ticker_aggs_arrow_dir)
+    # 3.5 billion rows for 10 years of minute data.
+    # print(f"{aggregates.count_rows()=}")
+    # Can't sort the dataset because that reads it all into memory.
+    # aggregates = aggregates.sort_by([("ticker", "ascending"), ("window_start", "ascending")])
+    # print("Sorted")
+
+    # Zipline uses case-insensitive symbols, so we need to convert them to uppercase with a ^ prefix when lowercase.
+    # This is because the SQL schema zipline uses for symbols ignores case.
+    # We put the original symbol in the asset_name field.
+    metadata = pd.DataFrame(
+        columns=(
+            "start_date",
+            "end_date",
+            "auto_close_date",
+            "symbol",
+            "exchange",
+            "asset_name",
+        )
+    )
+
+    symbol_to_sid = {}
+    dates_with_data = set()
+
+    # Get data for all stocks and write to Zipline
+    daily_bar_writer.write(
+        process_minute_aggregates(
+            fragments=aggregates.get_fragments(),
+            sessions=calendar.sessions_in_range(start_date, end_date),
+            minutes=calendar.sessions_minutes(start_date, end_date),
+            metadata=metadata,
+            calendar=calendar,
+            symbol_to_sid=symbol_to_sid,
+            dates_with_data=dates_with_data,
+            agg_time="day",
+        ),
+        show_progress=show_progress,
+    )
+
+    # Get data for all stocks and write to Zipline
+    minute_bar_writer.write(
+        process_minute_aggregates(
+            fragments=aggregates.get_fragments(),
+            sessions=calendar.sessions_in_range(start_date, end_date),
+            minutes=calendar.sessions_minutes(start_date, end_date),
+            metadata=metadata,
+            calendar=calendar,
+            symbol_to_sid=symbol_to_sid,
+            dates_with_data=dates_with_data,
+            agg_time="minute",
+        ),
+        show_progress=show_progress,
+    )
+
+    # Write the metadata
+    asset_db_writer.write(equities=metadata)
+
+    # Load splits and dividends
+    first_start_end = min(dates_with_data)
+    last_end_date = max(dates_with_data)
+    splits = load_splits(config, first_start_end, last_end_date, symbol_to_sid)
+    dividends = load_dividends(config, first_start_end, last_end_date, symbol_to_sid)
+
+    # Write splits and dividends
+    adjustment_writer.write(splits=splits, dividends=dividends)
+
+
 def register_polygon_equities_bundle(
     bundlename,
     start_date=None,
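
The new `polygon_equities_bundle_trades` runs a two-stage pipeline before the usual writer calls: trades flat files are aggregated into 1-minute custom aggs, then scattered into a by-ticker Arrow dataset. A standalone sketch of just those two stages (placeholder dates; assumes `scatter_custom_aggs_to_by_ticker` is importable from `.trades` as in the imports above):

import os

from zipline_polygon_bundle import PolygonConfig, register_nyse_all_hours_calendar, NYSE_ALL_HOURS
from zipline_polygon_bundle.trades import convert_trades_to_custom_aggs, scatter_custom_aggs_to_by_ticker

register_nyse_all_hours_calendar()
config = PolygonConfig(
    environ=os.environ,
    calendar_name=NYSE_ALL_HOURS,
    start_date="2023-01-03",  # placeholder ingest range
    end_date="2023-12-29",
    agg_time="1min",
)
convert_trades_to_custom_aggs(config, overwrite=False)    # trades CSVs -> 1min aggs (skips existing output)
by_ticker_dir = scatter_custom_aggs_to_by_ticker(config)  # repartition the aggs by ticker
print(by_ticker_dir)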
@@ -496,10 +613,15 @@ def register_polygon_equities_bundle(
     # watchlists=None,
     # include_asset_types=None,
 ):
-
-
+    register_nyse_all_hours_calendar()
+
+    if agg_time not in ["day", "minute", "1min"]:
+        raise ValueError(
+            f"agg_time must be 'day', 'minute' (aggs), or '1min' (trades), not '{agg_time}'"
+        )
+
     # We need to know the start and end dates of the session before the bundle is
-    # registered because even though we only need it for ingest, the metadata in
+    # registered because even though we only need it for ingest, the metadata in
     # the writer is initialized and written before our ingest function is called.
     if start_date is None or end_date is None:
         config = PolygonConfig(
@@ -509,23 +631,28 @@ def register_polygon_equities_bundle(
             end_date=end_date,
             agg_time=agg_time,
         )
-        first_aggs_date, last_aggs_date = config.find_first_and_last_aggs(
+        first_aggs_date, last_aggs_date = config.find_first_and_last_aggs(
+            config.aggs_dir if agg_time in ["day", "minute"] else config.trades_dir,
+            config.csv_paths_pattern,
+        )
         if start_date is None:
             start_date = first_aggs_date
         if end_date is None:
             end_date = last_aggs_date
 
-    calendar = get_calendar(calendar_name)
-
     register(
         bundlename,
         (
-
-            if agg_time == "
-            else
+            polygon_equities_bundle_day
+            if agg_time == "day"
+            else (
+                polygon_equities_bundle_minute
+                if agg_time == "minute"
+                else polygon_equities_bundle_trades
+            )
         ),
-        start_session=parse_date(start_date,
-        end_session=parse_date(end_date,
+        start_session=parse_date(start_date, raise_oob=False) if start_date else None,
+        end_session=parse_date(end_date, raise_oob=False) if end_date else None,
        calendar_name=calendar_name,
        # minutes_per_day=390,
        # create_writers=True,
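
With the three-way dispatch above, registering a trades-derived minute bundle looks the same as the existing aggs bundles. A hedged sketch for a Zipline extension.py (the bundle name is hypothetical; `calendar_name` and `agg_time` keywords are inferred from the signature and register() call above):

from zipline_polygon_bundle import register_polygon_equities_bundle, NYSE_ALL_HOURS

register_polygon_equities_bundle(
    "polygon-trades",              # hypothetical bundle name
    calendar_name=NYSE_ALL_HOURS,  # extended-hours sessions; registered by the call above
    agg_time="1min",               # "day"/"minute" use aggs flat files; "1min" builds bars from trades
)

After that, `zipline ingest -b polygon-trades` would invoke `polygon_equities_bundle_trades`.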
zipline_polygon_bundle/concat_all_aggs.py
CHANGED
@@ -1,10 +1,9 @@
-from .config import PolygonConfig
+from .config import PolygonConfig, PARTITION_COLUMN_NAME, to_partition_key
 
 import shutil
 from typing import Iterator, Tuple, List, Union
 
 import argparse
-import glob
 import os
 
 import pyarrow as pa
@@ -15,25 +14,6 @@ from pyarrow import compute as pa_compute
 import pandas as pd
 
 
-PARTITION_COLUMN_NAME = "part"
-PARTITION_KEY_LENGTH = 2
-
-
-def to_partition_key(s: str) -> str:
-    """
-    Partition key is low cardinality and must be filesystem-safe.
-    The reason for partitioning is to keep the parquet files from getting too big.
-    10 years of minute aggs for US stocks is 83GB gzipped. A single parquet would be 62GB on disk.
-    Currently the first two characters so files stay under 1GB. Weird characters are replaced with "A".
-    """
-    k = (s + "A")[0:PARTITION_KEY_LENGTH].upper()
-    if k.isalpha():
-        return k
-    # Replace non-alpha characters with "A".
-    k = "".join([c if c.isalpha() else "A" for c in k])
-    return k
-
-
 def generate_tables_from_csv_files(
     paths: Iterator[Union[str, os.PathLike]],
     schema: pa.Schema,
zipline_polygon_bundle/config.py
CHANGED
@@ -1,7 +1,9 @@
-from exchange_calendars.calendar_helpers import Date, parse_date
-from
+from exchange_calendars.calendar_helpers import Date, parse_date
+from exchange_calendars.calendar_utils import get_calendar
 
-from
+from .nyse_all_hours_calendar import NYSE_ALL_HOURS
+
+from typing import Iterator, Tuple
 
 import pandas as pd
 from pyarrow.fs import LocalFileSystem
@@ -10,6 +12,25 @@ import re
 import fnmatch
 
 
+PARTITION_COLUMN_NAME = "part"
+PARTITION_KEY_LENGTH = 2
+
+
+def to_partition_key(s: str) -> str:
+    """
+    Partition key is low cardinality and must be filesystem-safe.
+    The reason for partitioning is to keep the parquet files from getting too big.
+    10 years of minute aggs for US stocks is 83GB gzipped. A single parquet would be 62GB on disk.
+    Currently the first two characters so files stay under 1GB. Weird characters are replaced with "A".
+    """
+    k = (s + "A")[0:PARTITION_KEY_LENGTH].upper()
+    if k.isalpha():
+        return k
+    # Replace non-alpha characters with "A".
+    k = "".join([c if c.isalpha() else "A" for c in k])
+    return k
+
+
 class PolygonConfig:
     def __init__(
         self,
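
Worked examples of the relocated helper; the values follow directly from the implementation above:

from zipline_polygon_bundle.config import to_partition_key

assert to_partition_key("AAPL") == "AA"  # first two characters, uppercased
assert to_partition_key("a") == "AA"     # "A" is appended before slicing, so short keys pad out
assert to_partition_key("1X") == "AX"    # non-alpha characters are replaced with "A"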
@@ -61,7 +82,8 @@ class PolygonConfig:
         # TODO: Restore non-recusive option. Always recursive for now.
         self.csv_paths_pattern = environ.get(
             # "POLYGON_FLAT_FILES_CSV_PATTERN", "**/*.csv.gz"
-            "POLYGON_FLAT_FILES_CSV_PATTERN",
+            "POLYGON_FLAT_FILES_CSV_PATTERN",
+            "*.csv.gz",
         )
         self.asset_files_dir = os.path.join(self.flat_files_dir, self.asset_subdir)
         self.minute_aggs_dir = os.path.join(self.asset_files_dir, "minute_aggs_v1")
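
Note the new default drops the recursive `**/` prefix; the commented-out line suggests recursive matching can still be opted into through the environment variable, e.g.:

import os

os.environ["POLYGON_FLAT_FILES_CSV_PATTERN"] = "**/*.csv.gz"  # restore recursive matching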
@@ -114,7 +136,10 @@ class PolygonConfig:
 
     @property
     def calendar(self):
-
+        # If you don't give a start date you'll only get 20 years from today.
+        if self.calendar_name in [NYSE_ALL_HOURS, "us_futures", "CMES", "XNYS", "NYSE"]:
+            return get_calendar(self.calendar_name, side="right", start=pd.Timestamp("1990-01-01"))
+        return get_calendar(self.calendar_name, side="right")
 
     def ticker_file_path(self, date: pd.Timestamp):
         ticker_year_dir = os.path.join(
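
The `start=` workaround matters because exchange_calendars defaults to roughly 20 years of history back from today. A quick sketch of the difference, using the same public `get_calendar` as the property above:

import pandas as pd
from exchange_calendars.calendar_utils import get_calendar

default_cal = get_calendar("XNYS", side="right")
full_cal = get_calendar("XNYS", side="right", start=pd.Timestamp("1990-01-01"))
print(default_cal.first_session)  # roughly 20 years before today
print(full_cal.first_session)     # the first NYSE session on or after 1990-01-01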
@@ -131,8 +156,16 @@ class PolygonConfig:
 
     @property
     def by_ticker_aggs_arrow_dir(self):
-
-
+        # TODO: Don't split these up by ingestion range. They're already time indexed.
+        # Only reason to separate them is if we're worried about (or want) data being different across ingestions.
+        # This scattering is really slow and is usually gonna be redundant.
+        # This wasn't a problem when start/end dates were the calendar bounds when omitted.
+        # Can't just drop this because concat_all_aggs_from_csv will skip if it exists.
+        return os.path.join(
+            self.by_ticker_dir,
+            f"{self.start_timestamp.date().isoformat()}_{self.end_timestamp.date().isoformat()}.arrow",
+            # "aggs.arrow",
+        )
 
     def api_cache_path(
         self, start_date: Date, end_date: Date, filename: str, extension=".parquet"
@@ -153,13 +186,13 @@ class PolygonConfig:
             for filename in sorted(filenames):
                 yield os.path.join(root, filename)
 
-    def find_first_and_last_aggs(self) -> Tuple[str, str]:
+    def find_first_and_last_aggs(self, aggs_dir, file_pattern) -> Tuple[str | None, str | None]:
         # Find the path to the lexically first and last paths in aggs_dir that matches csv_paths_pattern.
         # Would like to use Path.walk(top_down=True) but it is only availble in Python 3.12+.
         # This needs to be efficient because it is called on every init, even though we only need it for ingest.
         # But we can't call it in ingest because the writer initializes and writes the metadata before it is called.
         paths = []
-        for root, dirnames, filenames in os.walk(
+        for root, dirnames, filenames in os.walk(aggs_dir, topdown=True):
             if dirnames:
                 # We only want first and last in each directory.
                 sorted_dirs = sorted(dirnames)
@@ -169,15 +202,15 @@ class PolygonConfig:
                     else sorted_dirs
                 )
             # Filter out filenames that don't match the pattern.
-            filenames = fnmatch.filter(filenames,
+            filenames = fnmatch.filter(filenames, file_pattern)
             if filenames:
                 filenames = sorted(filenames)
                 paths.append(os.path.join(root, filenames[0]))
                 if len(filenames) > 1:
                     paths.append(os.path.join(root, filenames[-1]))
+        if not paths:
+            return None, None
         paths = sorted(paths)
-        if len(paths) < 2:
-            raise ValueError(f"Need more than one aggs file but found {len(paths)} paths in {self.aggs_dir}")
         return self.file_path_to_name(paths[0]), self.file_path_to_name(paths[-1])
 
 
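
Callers now choose the directory and pattern, and get `(None, None)` instead of a ValueError when nothing matches. A usage sketch mirroring the call site added in bundle.py (calendar name and error handling are illustrative):

import os

from zipline_polygon_bundle import PolygonConfig

config = PolygonConfig(
    environ=os.environ, calendar_name="XNYS", start_date=None, end_date=None, agg_time="day"
)
first, last = config.find_first_and_last_aggs(config.aggs_dir, config.csv_paths_pattern)
if first is None:
    raise SystemExit(f"No aggs flat files found under {config.aggs_dir}")
print(first, last)  # names of the lexically first and last matching flat files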
zipline_polygon_bundle/nyse_all_hours_calendar.py
ADDED
@@ -0,0 +1,25 @@
+import datetime
+from exchange_calendars.calendar_utils import get_calendar_names, register_calendar_type
+from exchange_calendars.exchange_calendar_xnys import XNYSExchangeCalendar
+
+
+NYSE_ALL_HOURS = "NYSE_ALL_HOURS"
+
+
+class USExtendedHoursExchangeCalendar(XNYSExchangeCalendar):
+    """
+    A calendar for extended hours which runs from 4 AM to 8 PM.
+    """
+
+    name = NYSE_ALL_HOURS
+
+    open_times = ((None, datetime.time(4)),)
+
+    close_times = ((None, datetime.time(20)),)
+
+    regular_early_close = datetime.time(13)
+
+
+def register_nyse_all_hours_calendar():
+    if NYSE_ALL_HOURS not in get_calendar_names():
+        register_calendar_type(NYSE_ALL_HOURS, USExtendedHoursExchangeCalendar)
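
A short usage sketch of the new calendar (the printed times are an assumption from the class attributes above; exchange_calendars returns opens and closes in UTC):

from exchange_calendars.calendar_utils import get_calendar

from zipline_polygon_bundle import register_nyse_all_hours_calendar, NYSE_ALL_HOURS

register_nyse_all_hours_calendar()
cal = get_calendar(NYSE_ALL_HOURS)
session = cal.sessions_in_range("2024-01-02", "2024-01-05")[0]
print(cal.session_open(session))   # 2024-01-02 09:00 UTC, i.e. 04:00 US/Eastern
print(cal.session_close(session))  # 2024-01-03 01:00 UTC, i.e. 20:00 US/Eastern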
zipline_polygon_bundle/tickers_and_names.py
CHANGED
@@ -48,7 +48,10 @@ class PolygonAssets:
         active: bool = True,
     ):
         response = self.polygon_client.list_tickers(
-            market=self.config.market,
+            market=self.config.market,
+            active=active,
+            date=date.date().isoformat(),
+            limit=500,
         )
         tickers_df = pd.DataFrame(list(response))
         # The currency info is for crypto. The source_feed is always NA.