zipline_polygon_bundle 0.2.0.dev1__py3-none-any.whl → 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- zipline_polygon_bundle/__init__.py +7 -9
- zipline_polygon_bundle/adjustments.py +27 -32
- zipline_polygon_bundle/bundle.py +157 -312
- zipline_polygon_bundle/compute_signals.py +261 -0
- zipline_polygon_bundle/concat_all_aggs.py +130 -25
- zipline_polygon_bundle/config.py +57 -32
- zipline_polygon_bundle/trades.py +196 -607
- {zipline_polygon_bundle-0.2.0.dev1.dist-info → zipline_polygon_bundle-0.2.1.dist-info}/METADATA +8 -6
- zipline_polygon_bundle-0.2.1.dist-info/RECORD +18 -0
- {zipline_polygon_bundle-0.2.0.dev1.dist-info → zipline_polygon_bundle-0.2.1.dist-info}/WHEEL +1 -1
- zipline_polygon_bundle-0.2.0.dev1.dist-info/RECORD +0 -17
- {zipline_polygon_bundle-0.2.0.dev1.dist-info → zipline_polygon_bundle-0.2.1.dist-info}/LICENSE +0 -0
zipline_polygon_bundle/compute_signals.py
ADDED
@@ -0,0 +1,261 @@
+from .config import PolygonConfig
+from .trades import custom_aggs_schema, custom_aggs_partitioning
+
+import datetime
+import numpy as np
+import pyarrow as pa
+import pyarrow.compute as pa_compute
+import pyarrow.dataset as pa_ds
+import pandas_ta as ta
+import pandas as pd
+
+
+def calculate_mfi(typical_price: pd.Series, money_flow: pd.Series, period: int):
+    mf_sign = np.where(typical_price > np.roll(typical_price, shift=1), 1, -1)
+    signed_mf = money_flow * mf_sign
+
+    # Calculate gain and loss using vectorized operations
+    positive_mf = np.maximum(signed_mf, 0)
+    negative_mf = np.maximum(-signed_mf, 0)
+
+    mf_avg_gain = (
+        np.convolve(positive_mf, np.ones(period), mode="full")[: len(positive_mf)]
+        / period
+    )
+    mf_avg_loss = (
+        np.convolve(negative_mf, np.ones(period), mode="full")[: len(negative_mf)]
+        / period
+    )
+
+    epsilon = 1e-10  # Small epsilon value to avoid division by zero
+    mfi = 100 - (100 / (1 + mf_avg_gain / (mf_avg_loss + epsilon)))
+    return mfi
+
+
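Note: the `np.convolve(..., mode="full")[: len(x)] / period` idiom above is a trailing windowed mean, zero-padded at the start (the first `period - 1` values average fewer real observations over a full `period` divisor). A quick check with made-up numbers:

    import numpy as np
    import pandas as pd

    x = np.array([1.0, 2.0, 3.0, 4.0, 5.0])
    period = 3
    trailing_sums = np.convolve(x, np.ones(period), mode="full")[: len(x)]
    print(trailing_sums)  # [ 1.  3.  6.  9. 12.] -- zero-padded trailing sums
    # For full windows this matches pandas:
    print(pd.Series(x).rolling(period).sum().to_numpy())  # [nan nan 6. 9. 12.]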
+# https://github.com/twopirllc/pandas-ta/blob/main/pandas_ta/momentum/stoch.py
+# https://github.com/twopirllc/pandas-ta/blob/development/pandas_ta/momentum/stoch.py
+# `k` vs `fast_k` arg names.
+# https://github.com/twopirllc/pandas-ta/issues/726
+# Results affected by values outside range
+# https://github.com/twopirllc/pandas-ta/issues/535
+
+
+def calculate_stoch(
+    high: pd.Series,
+    low: pd.Series,
+    close: pd.Series,
+    k: int = 14,
+    d: int = 3,
+    smooth_k: int = 3,
+    mamode: str = "sma",
+):
+    """Indicator: Stochastic Oscillator (STOCH)"""
+    lowest_low = low.rolling(k).min()
+    highest_high = high.rolling(k).max()
+
+    stoch = 100 * (close - lowest_low)
+    stoch /= ta.utils.non_zero_range(highest_high, lowest_low)
+
+    stoch_k = ta.overlap.ma(
+        mamode, stoch.loc[stoch.first_valid_index() :,], length=smooth_k
+    )
+    stoch_d = (
+        ta.overlap.ma(mamode, stoch_k.loc[stoch_k.first_valid_index() :,], length=d)
+        if stoch_k is not None
+        else None
+    )
+    # Histogram
+    stoch_h = stoch_k - stoch_d if stoch_d is not None else None
+
+    return stoch_k, stoch_d, stoch_h
+
+
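A small usage sketch for calculate_stoch on synthetic, steadily rising bars (values made up; requires pandas_ta):

    import numpy as np
    import pandas as pd

    close = pd.Series(np.linspace(100.0, 103.0, 30))
    high = close + 0.05   # synthetic highs/lows hugging the close
    low = close - 0.05
    stoch_k, stoch_d, stoch_h = calculate_stoch(high, low, close, k=14, d=3, smooth_k=3)
    print(stoch_k.dropna().tail())  # %K stays well above 50 in a steady uptrend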
+def compute_per_ticker_signals(df: pd.DataFrame, period: int = 14) -> pd.DataFrame:
+    df = df.set_index("window_start").sort_index()
+    session_index = pd.date_range(
+        start=df.index[0], end=df.index[-1], freq=pd.Timedelta(seconds=60)
+    )
+    df = df.reindex(session_index)
+    df.index.rename("window_start", inplace=True)
+
+    # df["minute_of_day"] = (df.index.hour * 60) + df.index.minute
+    # df["day_of_week"] = df.index.day_of_week
+
+    df.transactions = df.transactions.fillna(0)
+    df.volume = df.volume.fillna(0)
+    df.total = df.total.fillna(0)
+    df.close = df.close.ffill()
+    close = df.close
+    df.vwap = df.vwap.fillna(close)
+    df.high = df.high.fillna(close)
+    df.low = df.low.fillna(close)
+    df.open = df.open.fillna(close)
+    price_open = df.open
+    high = df.high
+    low = df.low
+    vwap = df.vwap
+    # volume = df.volume
+    total = df.total
+    next_close = close.shift()
+
+    # TODO: Odometer rollover signal. Relative difference to nearest power of 10.
+    # Something about log10 being a whole number? When is $50 the rollover vs $100 or $10?
+
+    # "True (Typical?) Price" which I think is an approximation of VWAP.
+    # Trouble with both is that if there are no trades in a bar we get NaN.
+    # That then means we get NaN for averages for the next period-1 bars too.
+    # Question is whether to ffill the price for these calculations.
+    df["TP"] = (high + low + close) / 3
+
+    # Gain/loss in this bar.
+    df["ret1bar"] = close.div(price_open).sub(1)
+
+    for t in range(2, period):
+        df[f"ret{t}bar"] = close.div(price_open.shift(t - 1)).sub(1)
+
+    # Average True Range (ATR)
+    true_range = pd.concat(
+        [high.sub(low), high.sub(next_close).abs(), low.sub(next_close).abs()], axis=1
+    ).max(1)
+    # Normalized ATR (NATR) or Average of Normalized TR.
+    # Choice of NATR operations ordering discussion: https://www.macroption.com/normalized-atr/
+    # He doesn't talk about VWAP but I think that is a better normalizing price for a bar.
+    # atr = true_range.ewm(span=period).mean()
+    # df["natr_c"] = atr / close
+    # df["antr_c"] = (true_range / close).ewm(span=period).mean()
+    # df["natr_v"] = atr / vwap
+    # df["antr_v"] = (true_range / vwap).ewm(span=period).mean()
+    df["NATR"] = (true_range / vwap).ewm(span=period).mean()
+
+    # True Price as HLC average VS VWAP.
+    # VWAP is better I think but is quite different than standard CCI.
+    # Three ways to compute CCI, all give the same value using TP.
+    # tp = (high + low + close) / 3
+    # df['SMA'] = ta.sma(tp, length=period)
+    # df['sma_r'] = tp.rolling(period).mean()
+    # df['MAD'] = ta.mad(tp, length=period)
+    # # Series.mad deprecated. mad = (s - s.mean()).abs().mean()
+    # df['mad_r'] = tp.rolling(period).apply(lambda x: (pd.Series(x) - pd.Series(x).mean()).abs().mean())
+
+    # df['cci_r'] = (tp - df['sma_r']) / (0.015 * df['mad_r'])
+    # df['CCI'] = (tp - df['SMA']) / (0.015 * df['MAD'])
+    # df['cci_ta'] = ta.cci(high=high, low=low, close=close, length=period)
+
+    df["taCCI"] = ta.cci(high=high, low=low, close=close, length=period)
+
+    # https://gist.github.com/quantra-go-algo/1b37bfb74d69148f0dfbdb5a2c7bdb25
+    # https://medium.com/@huzaifazahoor654/how-to-calculate-cci-in-python-a-step-by-step-guide-9a3f61698be6
+    sma = pd.Series(ta.sma(vwap, length=period))
+    mad = pd.Series(ta.mad(vwap, length=period))
+    df["CCI"] = (vwap - sma) / (0.015 * mad)
+
+    # df['MFI'] = calculate_mfi(high=high, low=low, close=close, volume=volume, period=period)
+    df["MFI"] = calculate_mfi(typical_price=vwap, money_flow=total, period=period)
+
+    # We use Stochastic (rather than MACD because we need a ticker independent indicator.
+    # IOW a percentage price oscillator (PPO) rather than absolute price oscillator (APO).
+    # https://www.alpharithms.com/moving-average-convergence-divergence-macd-031217/
+    # We're using 14/3 currently rather than the usual 26/12 popular for MACD though.
+    stoch_k, stoch_d, stoch_h = calculate_stoch(high, low, close, k=period)
+    df["STOCHk"] = stoch_k
+    df["STOCHd"] = stoch_d
+    df["STOCHh"] = stoch_h
+
+    return df
+
+
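The reindex-to-a-minute-grid pattern at the top of compute_per_ticker_signals, in isolation on toy data: missing minutes become rows with zero volume and forward-filled prices:

    import pandas as pd

    df = pd.DataFrame(
        {"close": [10.0, 10.5], "volume": [100, 200]},
        index=pd.to_datetime(["2024-01-02 09:30", "2024-01-02 09:33"]),
    )
    grid = pd.date_range(df.index[0], df.index[-1], freq=pd.Timedelta(seconds=60))
    df = df.reindex(grid)
    df["volume"] = df["volume"].fillna(0)
    df["close"] = df["close"].ffill()
    print(df)  # 09:31 and 09:32 now exist with volume 0.0 and close 10.0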
+def iterate_all_aggs_tables(
+    config: PolygonConfig,
+    valid_tickers: pa.Array,
+):
+    schedule = config.calendar.trading_index(
+        start=config.start_timestamp, end=config.end_timestamp, period="1D"
+    )
+    for timestamp in schedule:
+        date = timestamp.to_pydatetime().date()
+        aggs_ds = pa_ds.dataset(
+            config.aggs_dir,
+            format="parquet",
+            schema=custom_aggs_schema(tz=config.calendar.tz.key),
+            partitioning=custom_aggs_partitioning(),
+        )
+        date_filter_expr = (
+            (pa_compute.field("year") == date.year)
+            & (pa_compute.field("month") == date.month)
+            & (pa_compute.field("date") == date)
+        )
+        # print(f"{date_filter_expr=}")
+        for fragment in aggs_ds.get_fragments(filter=date_filter_expr):
+            session_filter = (
+                (pa_compute.field("window_start") >= start_dt)
+                & (pa_compute.field("window_start") < end_dt)
+                & pa_compute.is_in(pa_compute.field("ticker"), valid_tickers)
+            )
+            # Sorting table doesn't seem to avoid needing to sort the df. Maybe use_threads=False on to_pandas would help?
+            # table = fragment.to_table(filter=session_filter).sort_by([('ticker', 'ascending'), ('window_start', 'descending')])
+            table = fragment.to_table(filter=session_filter)
+            if table.num_rows > 0:
+                metadata = (
+                    dict(table.schema.metadata) if table.schema.metadata else dict()
+                )
+                metadata["date"] = date.isoformat()
+                table = table.replace_schema_metadata(metadata)
+                yield table
+
+
+# def iterate_all_aggs_with_signals(config: PolygonConfig):
+#     for table in iterate_all_aggs_tables(config):
+#         df = table.to_pandas()
+#         df = df.groupby("ticker").apply(
+#             compute_per_ticker_signals, include_groups=False
+#         )
+#         yield pa.Table.from_pandas(df)
+
+
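Note: `start_dt` and `end_dt` are used in the session filter above but are not defined anywhere in this new file, so consuming iterate_all_aggs_tables as released would raise NameError. A sketch of one plausible definition inside the `for timestamp in schedule:` loop, assuming exchange_calendars-style session accessors on config.calendar (an assumption, not the author's code):

    # Hypothetical fix sketch:
    start_dt = config.calendar.session_open(timestamp)   # tz-aware session open
    end_dt = config.calendar.session_close(timestamp)    # tz-aware session close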
+def file_visitor(written_file):
+    print(f"{written_file.path=}")
+
+
+def compute_signals_for_all_aggs(
+    from_config: PolygonConfig,
+    to_config: PolygonConfig,
+    valid_tickers: pa.Array,
+    overwrite: bool = False,
+) -> str:
+    if overwrite:
+        print("WARNING: overwrite not implemented/ignored.")
+
+    # Need a different aggs_dir for the signals because schema is different.
+    print(f"{to_config.aggs_dir=}")
+
+    for aggs_table in iterate_all_aggs_tables(from_config, valid_tickers):
+        metadata = aggs_table.schema.metadata
+        date = datetime.date.fromisoformat(metadata[b"date"].decode("utf-8"))
+        print(f"{date=}")
+        df = aggs_table.to_pandas()
+        df = df.groupby("ticker").apply(
+            compute_per_ticker_signals, include_groups=False
+        )
+        table = pa.Table.from_pandas(df)
+        if table.num_rows > 0:
+            table = table.replace_schema_metadata(metadata)
+            table = table.append_column("date", pa.array(np.full(len(table), date)))
+            table = table.append_column(
+                "year", pa.array(np.full(len(table), date.year), type=pa.uint16())
+            )
+            table = table.append_column(
+                "month", pa.array(np.full(len(table), date.month), type=pa.uint8())
+            )
+            table = table.sort_by(
+                [("ticker", "ascending"), ("window_start", "ascending")]
+            )
+            pa_ds.write_dataset(
+                table,
+                filesystem=to_config.filesystem,
+                base_dir=to_config.aggs_dir,
+                partitioning=custom_aggs_partitioning(),
+                format="parquet",
+                existing_data_behavior="overwrite_or_ignore",
+                file_visitor=file_visitor,
+            )
+    return to_config.aggs_dir
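A hypothetical driver for the new module. The `environ` parameter name and keyword usage are assumptions inferred from the PolygonConfig constructor fragment in the config.py diff below; dates, tickers, and the signals directory are placeholders. Per the comment in compute_signals_for_all_aggs, to_config must point at a different aggs_dir (here via CUSTOM_ASSET_FILES_DIR) because the signals schema differs:

    import os
    import pyarrow as pa

    from zipline_polygon_bundle.compute_signals import compute_signals_for_all_aggs
    from zipline_polygon_bundle.config import PolygonConfig

    from_config = PolygonConfig(
        environ=os.environ,  # assumed parameter name
        calendar_name="XNYS",
        start_date="2024-01-02",
        end_date="2024-01-31",
        agg_time="1min",  # timedelta-style agg_time selects trades-based custom aggs
    )
    to_config = PolygonConfig(
        environ={**os.environ, "CUSTOM_ASSET_FILES_DIR": "/data/polygon/signals"},
        calendar_name="XNYS",
        start_date="2024-01-02",
        end_date="2024-01-31",
        agg_time="1min",
    )
    valid_tickers = pa.array(["AAPL", "MSFT"])  # placeholder tickers
    compute_signals_for_all_aggs(from_config, to_config, valid_tickers)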
zipline_polygon_bundle/concat_all_aggs.py
CHANGED
@@ -1,24 +1,43 @@
 from .config import PolygonConfig, PARTITION_COLUMN_NAME, to_partition_key
 
 import shutil
-from typing import Iterator, Tuple, …
+from typing import Iterator, Tuple, Union
 
 import argparse
 import os
+import datetime
 
 import pyarrow as pa
-…
+import pyarrow.compute as pa_compute
+import pyarrow.csv as pa_csv
+import pyarrow.dataset as pa_ds
+import pyarrow.fs as pa_fs
 
 import pandas as pd
 
 
+# def get_by_ticker_dates(config: PolygonConfig, schema) -> set[datetime.date]:
+#     file_info = config.filesystem.get_file_info(config.by_ticker_dir)
+#     if file_info.type == pa_fs.FileType.NotFound:
+#         return set()
+#     partitioning = None
+#     if PARTITION_COLUMN_NAME in schema.names:
+#         partitioning = pa_ds.partitioning(
+#             pa.schema([(PARTITION_COLUMN_NAME, pa.string())]), flavor="hive"
+#         )
+#     by_ticker_aggs_ds = pa_ds.dataset(config.by_ticker_aggs_arrow_dir, schema=schema, partitioning=partitioning)
+#     return set(
+#         [
+#             pa_ds.get_partition_keys(fragment.partition_expression).get("date")
+#             for fragment in by_ticker_aggs_ds.get_fragments()
+#         ]
+#     )
+
+
 def generate_tables_from_csv_files(
-    paths: Iterator[Union[str, os.PathLike]],
+    config: PolygonConfig,
     schema: pa.Schema,
-    start_timestamp: pd.Timestamp,
-    limit_timestamp: pd.Timestamp,
+    overwrite: bool = False,
 ) -> Iterator[pa.Table]:
     empty_table = schema.empty_table()
     # TODO: Find which column(s) need to be cast to int64 from the schema.
@@ -29,16 +48,35 @@ def generate_tables_from_csv_files(
     )
     csv_schema = empty_table.schema
 
+    existing_by_ticker_dates = set()
+    if not overwrite:
+        # print("Getting existing by_ticker_dates")
+        # existing_by_ticker_dates = get_by_ticker_dates(config, schema)
+        print(f"{len(existing_by_ticker_dates)=}")
+
+    schedule = config.calendar.trading_index(
+        start=config.start_timestamp, end=config.end_timestamp, period="1D"
+    )
+    start_timestamp = config.start_timestamp.tz_localize(config.calendar.tz.key)
+    limit_timestamp = (config.end_timestamp + pd.Timedelta(days=1)).tz_localize(
+        config.calendar.tz.key)
+    # print(f"{start_timestamp=} {limit_timestamp=} {config.calendar.tz=} {schedule[:2]=} {schedule[-2:]=}")
+
     tables_read_count = 0
     skipped_table_count = 0
-    for path in paths:
+    for timestamp in schedule:
+        date: datetime.date = timestamp.tz_localize(config.calendar.tz.key).to_pydatetime().date()
+        # print(f"{date=} {timestamp=}")
+        if date in existing_by_ticker_dates:
+            continue
+        csv_path = config.date_to_csv_file_path(date)
         convert_options = pa_csv.ConvertOptions(
             column_types=csv_schema,
             strings_can_be_null=False,
             quoted_strings_can_be_null=False,
         )
 
-        table = pa_csv.read_csv(path, convert_options=convert_options)
+        table = pa_csv.read_csv(csv_path, convert_options=convert_options)
         tables_read_count += 1
         table = table.set_column(
             table.column_names.index("window_start"),
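For reference, a sketch of the schedule that now drives this loop, assuming an exchange_calendars calendar such as XNYS (dates made up). With period="1D" the index holds one label per trading session, which the loop localizes before mapping to a CSV path:

    import exchange_calendars as xcals

    calendar = xcals.get_calendar("XNYS")
    schedule = calendar.trading_index(start="2024-01-02", end="2024-01-05", period="1D")
    print(schedule)  # one entry per trading session in the range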
@@ -76,18 +114,86 @@ def generate_tables_from_csv_files(
             skipped_table_count += 1
             continue
 
+        if PARTITION_COLUMN_NAME in schema.names:
+            print(f"{date=}")
         yield table
     print(f"{tables_read_count=} {skipped_table_count=}")
 
 
+# def generate_tables_from_csv_files(
+#     paths: Iterator[Union[str, os.PathLike]],
+#     schema: pa.Schema,
+#     start_timestamp: pd.Timestamp,
+#     limit_timestamp: pd.Timestamp,
+# ) -> Iterator[pa.Table]:
+#     empty_table = schema.empty_table()
+#     # TODO: Find which column(s) need to be cast to int64 from the schema.
+#     empty_table = empty_table.set_column(
+#         empty_table.column_names.index("window_start"),
+#         "window_start",
+#         empty_table.column("window_start").cast(pa.int64()),
+#     )
+#     csv_schema = empty_table.schema
+
+#     tables_read_count = 0
+#     skipped_table_count = 0
+#     for path in paths:
+#         convert_options = pa_csv.ConvertOptions(
+#             column_types=csv_schema,
+#             strings_can_be_null=False,
+#             quoted_strings_can_be_null=False,
+#         )
+
+#         table = pa_csv.read_csv(path, convert_options=convert_options)
+#         tables_read_count += 1
+#         table = table.set_column(
+#             table.column_names.index("window_start"),
+#             "window_start",
+#             table.column("window_start").cast(schema.field("window_start").type),
+#         )
+#         if PARTITION_COLUMN_NAME in schema.names:
+#             table = table.append_column(
+#                 PARTITION_COLUMN_NAME,
+#                 pa.array(
+#                     [
+#                         to_partition_key(ticker)
+#                         for ticker in table.column("ticker").to_pylist()
+#                     ]
+#                 ),
+#             )
+#         expr = (
+#             pa_compute.field("window_start")
+#             >= pa.scalar(start_timestamp, type=schema.field("window_start").type)
+#         ) & (
+#             pa_compute.field("window_start")
+#             < pa.scalar(
+#                 limit_timestamp,
+#                 type=schema.field("window_start").type,
+#             )
+#         )
+#         table = table.filter(expr)
+
+#         # TODO: Also check that these rows are within range for this file's date (not just the whole session).
+#         # And if we're doing that (figuring date for each file), we can just skip reading the file.
+#         # Might able to do a single comparison using compute.days_between.
+#         # https://arrow.apache.org/docs/python/generated/pyarrow.compute.days_between.html
+
+#         if table.num_rows == 0:
+#             skipped_table_count += 1
+#             continue
+
+#         yield table
+#     print(f"{tables_read_count=} {skipped_table_count=}")
+
+
 def generate_csv_agg_tables(
-    config: PolygonConfig,
+    config: PolygonConfig, overwrite: bool = False
 ) -> Tuple[pa.Schema, Iterator[pa.Table]]:
     """zipline does bundle ingestion one ticker at a time."""
 
     # Polygon Aggregate flatfile timestamps are in nanoseconds (like trades), not milliseconds as the docs say.
     # I make the timestamp timezone-aware because that's how Unix timestamps work and it may help avoid mistakes.
-    timestamp_type = pa.timestamp("ns", tz=…
+    timestamp_type = pa.timestamp("ns", tz='UTC')
 
     # But we can't use the timestamp type in the schema here because it's not supported by the CSV reader.
     # So we'll use int64 and cast it after reading the CSV file.
@@ -121,14 +227,12 @@ def generate_csv_agg_tables(
         pa.field(PARTITION_COLUMN_NAME, pa.string(), nullable=False)
     )
 
-    # TODO: Use generator like os.walk for paths.
     return (
         polygon_aggs_schema,
         generate_tables_from_csv_files(
-            …
+            config,
             schema=polygon_aggs_schema,
-            …
-            limit_timestamp=config.end_timestamp + pd.to_timedelta(1, unit="day"),
+            overwrite=overwrite,
        ),
     )
 
@@ -143,16 +247,17 @@ def concat_all_aggs_from_csv(
     config: PolygonConfig,
     overwrite: bool = False,
 ) -> str:
-    schema, tables = generate_csv_agg_tables(config)
+    schema, tables = generate_csv_agg_tables(config, overwrite=overwrite)
 
     by_ticker_aggs_arrow_dir = config.by_ticker_aggs_arrow_dir
-    if os.path.exists(by_ticker_aggs_arrow_dir):
-        …
+    # if os.path.exists(by_ticker_aggs_arrow_dir):
+    #     if overwrite:
+    #         print(f"Removing {by_ticker_aggs_arrow_dir=}")
+    #         shutil.rmtree(by_ticker_aggs_arrow_dir)
+    #     else:
+    #         # TODO: Validate the existing data.
+    #         print(f"Found existing {by_ticker_aggs_arrow_dir=}")
+    #         return by_ticker_aggs_arrow_dir
 
     partitioning = None
     if PARTITION_COLUMN_NAME in schema.names:
@@ -160,7 +265,7 @@ def concat_all_aggs_from_csv(
             pa.schema([(PARTITION_COLUMN_NAME, pa.string())]), flavor="hive"
         )
 
-    …
+    print(f"Scattering aggregates by ticker to {by_ticker_aggs_arrow_dir=}")
     pa_ds.write_dataset(
         generate_batches_from_tables(tables),
         schema=schema,
@@ -169,7 +274,7 @@ def concat_all_aggs_from_csv(
         format="parquet",
         existing_data_behavior="overwrite_or_ignore",
     )
-    print(f"…
+    print(f"Scattered aggregates by ticker to {by_ticker_aggs_arrow_dir=}")
     return by_ticker_aggs_arrow_dir
 
 
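A sketch of reading back the scattered by-ticker dataset, assuming the hive layout written above with partition column "part" (PARTITION_COLUMN_NAME); the path and partition key value are placeholders:

    import pyarrow as pa
    import pyarrow.dataset as pa_ds

    partitioning = pa_ds.partitioning(pa.schema([("part", pa.string())]), flavor="hive")
    ds = pa_ds.dataset("/path/to/by_ticker_dir", format="parquet", partitioning=partitioning)
    table = ds.to_table(filter=(pa_ds.field("part") == "AA"))  # one partition key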
zipline_polygon_bundle/config.py
CHANGED
@@ -1,5 +1,5 @@
 from exchange_calendars.calendar_helpers import Date, parse_date
-from …
+from zipline.utils.calendar_utils import get_calendar
 
 from .nyse_all_hours_calendar import NYSE_ALL_HOURS
 
@@ -8,9 +8,13 @@ from typing import Iterator, Tuple
 import pandas as pd
 from pyarrow.fs import LocalFileSystem
 import os
+import datetime
 import re
 import fnmatch
 
+AGG_TIME_DAY = "day"
+AGG_TIME_MINUTE = "minute"
+AGG_TIME_TRADES = "1minute"
 
 PARTITION_COLUMN_NAME = "part"
 PARTITION_KEY_LENGTH = 2
@@ -38,12 +42,12 @@ class PolygonConfig:
         calendar_name: str,
         start_date: Date,
         end_date: Date,
-        agg_time: str = …
-        custom_aggs_format: str = "{config.agg_timedelta.seconds}sec_aggs",
+        agg_time: str = AGG_TIME_DAY,
     ):
         self.calendar_name = calendar_name
         self.start_date = start_date
         self.end_date = end_date
+        # TODO: We can't use PolygonConfig.calendar because it gets these properties for start/end session.
         self.start_timestamp = (
             parse_date(start_date, calendar=self.calendar)
             if start_date
@@ -54,6 +58,8 @@ class PolygonConfig:
             if end_date
             else self.calendar.last_session
         )
+        self.agg_time = agg_time
+
         self.max_workers = None
         if environ.get("POLYGON_MAX_WORKERS", "").strip() != "":
             self.max_workers = int(environ.get("POLYGON_MAX_WORKERS"))
@@ -92,54 +98,67 @@ class PolygonConfig:
         self.quotes_dir = os.path.join(self.asset_files_dir, "quotes_v1")
 
         # TODO: The "by ticker" files are temporary/intermediate and should/could be in the zipline data dir.
+        self.custom_asset_files_dir = environ.get(
+            "CUSTOM_ASSET_FILES_DIR", self.asset_files_dir
+        )
+
+        self.cache_dir = os.path.join(self.custom_asset_files_dir, "api_cache")
+
+        self.lock_file_path = os.path.join(self.custom_asset_files_dir, "ingest.lock")
+        self.custom_aggs_dates_path = os.path.join(self.custom_asset_files_dir, "aggs_dates.json")
+        self.by_ticker_dates_path = os.path.join(self.custom_asset_files_dir, "by_ticker_dates.json")
+
         self.minute_by_ticker_dir = os.path.join(
-            self.…
+            self.custom_asset_files_dir, "minute_by_ticker_v1"
+        )
+        self.day_by_ticker_dir = os.path.join(
+            self.custom_asset_files_dir, "day_by_ticker_v1"
         )
-        self.day_by_ticker_dir = os.path.join(self.asset_files_dir, "day_by_ticker_v1")
 
+        # If agg_time begins with a digit, it is a timedelta string and we're using custom aggs from trades.
         if bool(re.match(r"^\d", agg_time)):
             self.agg_timedelta = pd.to_timedelta(agg_time)
-            self.…
-            …
+            self.csv_files_dir = self.trades_dir
+            self.custom_aggs_name_format = environ.get(
+                "CUSTOM_AGGS_NAME_FORMAT", "{config.agg_timedelta.seconds}sec_aggs"
             )
-            self.…
-            self.custom_asset_files_dir,
+            self.aggs_dir = os.path.join(
+                self.custom_asset_files_dir,
+                self.custom_aggs_name_format.format(config=self),
             )
-            self.…
+            self.by_ticker_dir = os.path.join(
                 self.custom_asset_files_dir,
-                (…
+                (self.custom_aggs_name_format + "_by_ticker").format(config=self),
             )
-            …
-            self.by_ticker_dir = self.custom_aggs_by_ticker_dir
-        elif agg_time == "minute":
+        elif agg_time == AGG_TIME_MINUTE:
             self.agg_timedelta = pd.to_timedelta("1minute")
             self.aggs_dir = self.minute_aggs_dir
+            self.csv_files_dir = self.aggs_dir
             self.by_ticker_dir = self.minute_by_ticker_dir
-        elif agg_time == …
+        elif agg_time == AGG_TIME_DAY:
             self.agg_timedelta = pd.to_timedelta("1day")
             self.aggs_dir = self.day_aggs_dir
+            self.csv_files_dir = self.aggs_dir
             self.by_ticker_dir = self.day_by_ticker_dir
         else:
             raise ValueError(
                 f"agg_time must be 'minute', 'day', or a timedelta string; got '{agg_time=}'"
             )
-        self.agg_time = agg_time
 
         self.arrow_format = environ.get(
-            "POLYGON_ARROW_FORMAT", "parquet" if self.agg_time == …
+            "POLYGON_ARROW_FORMAT", "parquet" if self.agg_time == AGG_TIME_DAY else "hive"
         )
         # self.by_ticker_hive_dir = os.path.join(
         #     self.by_ticker_dir,
         #     f"{self.agg_time}_{self.start_timestamp.date().isoformat()}_{self.end_timestamp.date().isoformat()}.hive",
         # )
-        self.cache_dir = os.path.join(self.asset_files_dir, "api_cache")
 
     @property
     def calendar(self):
-        # …
-        …
-        return get_calendar(self.calendar_name, …
+        # print call stack
+        # import traceback
+        # traceback.print_stack()
+        return get_calendar(self.calendar_name, start_session=self.start_date, end_session=self.end_date)
 
     def ticker_file_path(self, date: pd.Timestamp):
         ticker_year_dir = os.path.join(
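The custom-aggs locations are now environment-driven; a sketch of overriding them before constructing a PolygonConfig (variable names are from the diff above, values made up):

    import os

    os.environ["CUSTOM_ASSET_FILES_DIR"] = "/data/polygon/custom"
    os.environ["CUSTOM_AGGS_NAME_FORMAT"] = "{config.agg_timedelta.seconds}sec_aggs"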
@@ -154,6 +173,9 @@ class PolygonConfig:
         # TODO: Use csv_paths_pattern to remove the suffixes
         return os.path.basename(path).removesuffix(".gz").removesuffix(".csv")
 
+    def date_to_csv_file_path(self, date: datetime.date, ext=".csv.gz"):
+        return f"{self.csv_files_dir}/{date.strftime('%Y/%m/%Y-%m-%d') + ext}"
+
     @property
     def by_ticker_aggs_arrow_dir(self):
         # TODO: Don't split these up by ingestion range. They're already time indexed.
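What the new helper produces, relative to csv_files_dir (matching Polygon's year/month flat-file layout):

    import datetime

    print(datetime.date(2024, 3, 7).strftime("%Y/%m/%Y-%m-%d") + ".csv.gz")
    # -> 2024/03/2024-03-07.csv.gz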
@@ -161,19 +183,20 @@ class PolygonConfig:
         # This scattering is really slow and is usually gonna be redundant.
         # This wasn't a problem when start/end dates were the calendar bounds when omitted.
         # Can't just drop this because concat_all_aggs_from_csv will skip if it exists.
-        return os.path.join(
-            …
-        )
+        # return os.path.join(
+        #     self.by_ticker_dir,
+        #     f"{self.start_timestamp.date().isoformat()}_{self.end_timestamp.date().isoformat()}.arrow",
+        #     # "aggs.arrow",
+        # )
+        return self.by_ticker_dir
 
     def api_cache_path(
-        self, …
+        self, first_day: pd.Timestamp, last_day: pd.Timestamp, filename: str, extension=".parquet"
     ):
-        …
+        first_day_str = first_day.date().isoformat()
+        last_day_str = last_day.date().isoformat()
         return os.path.join(
-            self.cache_dir, f"{…
+            self.cache_dir, f"{first_day_str}_{last_day_str}/{filename}{extension}"
         )
 
     def csv_paths(self) -> Iterator[str]:
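The reworked api_cache_path nests cache files under a first-day/last-day directory; the filename here is made up:

    import pandas as pd

    first_day = pd.Timestamp("2024-01-02")
    last_day = pd.Timestamp("2024-01-31")
    print(f"{first_day.date().isoformat()}_{last_day.date().isoformat()}/tickers.parquet")
    # -> 2024-01-02_2024-01-31/tickers.parquet (joined under cache_dir)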
@@ -186,7 +209,9 @@ class PolygonConfig:
         for filename in sorted(filenames):
             yield os.path.join(root, filename)
 
-    def find_first_and_last_aggs(…
+    def find_first_and_last_aggs(
+        self, aggs_dir, file_pattern
+    ) -> Tuple[str | None, str | None]:
         # Find the path to the lexically first and last paths in aggs_dir that matches csv_paths_pattern.
         # Would like to use Path.walk(top_down=True) but it is only availble in Python 3.12+.
         # This needs to be efficient because it is called on every init, even though we only need it for ingest.