zipline_polygon_bundle 0.1.8__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- zipline_polygon_bundle/__init__.py +12 -11
- zipline_polygon_bundle/adjustments.py +27 -32
- zipline_polygon_bundle/bundle.py +172 -200
- zipline_polygon_bundle/compute_signals.py +261 -0
- zipline_polygon_bundle/concat_all_aggs.py +129 -44
- zipline_polygon_bundle/config.py +90 -32
- zipline_polygon_bundle/nyse_all_hours_calendar.py +25 -0
- zipline_polygon_bundle/tickers_and_names.py +4 -1
- zipline_polygon_bundle/trades.py +352 -526
- {zipline_polygon_bundle-0.1.8.dist-info → zipline_polygon_bundle-0.2.0.dist-info}/METADATA +7 -5
- zipline_polygon_bundle-0.2.0.dist-info/RECORD +18 -0
- zipline_polygon_bundle-0.1.8.dist-info/RECORD +0 -16
- {zipline_polygon_bundle-0.1.8.dist-info → zipline_polygon_bundle-0.2.0.dist-info}/LICENSE +0 -0
- {zipline_polygon_bundle-0.1.8.dist-info → zipline_polygon_bundle-0.2.0.dist-info}/WHEEL +0 -0
zipline_polygon_bundle/trades.py
CHANGED
@@ -1,33 +1,29 @@
-from .config import PolygonConfig
+from .config import PolygonConfig, PARTITION_COLUMN_NAME, to_partition_key
 
 from typing import Iterator, Tuple
 
 import pyarrow as pa
-
-
-
-
-from pyarrow import csv as pa_csv
-from pyarrow import fs as pa_fs
+import pyarrow.compute as pa_compute
+import pyarrow.csv as pa_csv
+import pyarrow.dataset as pa_ds
+import pyarrow.fs as pa_fs
 
 from fsspec.implementations.arrow import ArrowFSWrapper
 
+import os
 import datetime
-
+
 import numpy as np
 import pandas as pd
 
-import pandas_ta as ta
-
-# from concurrent.futures import ThreadPoolExecutor
-# from concurrent.futures import ProcessPoolExecutor
-
 
 def trades_schema(raw: bool = False) -> pa.Schema:
     # There is some problem reading the timestamps as timestamps so we have to read as integer then change the schema.
     # Polygon Aggregate flatfile timestamps are in nanoseconds (like trades), not milliseconds as the docs say.
     # I make the timestamp timezone-aware because that's how Unix timestamps work and it may help avoid mistakes.
-    #
+    # The timezone is America/New_York because that's the US exchanges timezone and the date is a trading day.
+    # timestamp_type = pa.timestamp("ns", tz="America/New_York")
+    # timestamp_type = pa.int64() if raw else pa.timestamp("ns", tz=tz)
     timestamp_type = pa.int64() if raw else pa.timestamp("ns", tz="UTC")
 
     # Polygon price scale is 4 decimal places (i.e. hundredths of a penny), but we'll use 10 because we have precision to spare.
@@ -36,22 +32,22 @@ def trades_schema(raw: bool = False) -> pa.Schema:
     price_type = pa.float64()
 
     return pa.schema(
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        [
+            pa.field("ticker", pa.string(), nullable=False),
+            pa.field("conditions", pa.string(), nullable=False),
+            pa.field("correction", pa.string(), nullable=False),
+            pa.field("exchange", pa.int8(), nullable=False),
+            pa.field("id", pa.string(), nullable=False),
+            pa.field("participant_timestamp", timestamp_type, nullable=False),
+            pa.field("price", price_type, nullable=False),
+            pa.field("sequence_number", pa.int64(), nullable=False),
+            pa.field("sip_timestamp", timestamp_type, nullable=False),
+            pa.field("size", pa.int64(), nullable=False),
+            pa.field("tape", pa.int8(), nullable=False),
+            pa.field("trf_id", pa.int64(), nullable=False),
+            pa.field("trf_timestamp", timestamp_type, nullable=False),
+        ]
+    )
 
 
 def trades_dataset(config: PolygonConfig) -> pa_ds.Dataset:
@@ -68,20 +64,26 @@ def trades_dataset(config: PolygonConfig) -> pa_ds.Dataset:
         fsspec.glob(os.path.join(config.trades_dir, config.csv_paths_pattern))
     )
 
-    return pa_ds.FileSystemDataset.from_paths(
-
-
-
+    return pa_ds.FileSystemDataset.from_paths(
+        paths,
+        format=pa_ds.CsvFileFormat(),
+        schema=trades_schema(raw=True),
+        filesystem=config.filesystem,
+    )
 
 
-def cast_strings_to_list(
+def cast_strings_to_list(
+    string_array, separator=",", default="0", value_type=pa.uint8()
+):
     """Cast a PyArrow StringArray of comma-separated numbers to a ListArray of values."""
 
     # Create a mask to identify empty strings
     is_empty = pa_compute.equal(pa_compute.utf8_trim_whitespace(string_array), "")
 
     # Use replace_with_mask to replace empty strings with the default ("0")
-    filled_column = pa_compute.replace_with_mask(
+    filled_column = pa_compute.replace_with_mask(
+        string_array, is_empty, pa.scalar(default)
+    )
 
     # Split the strings by comma
     split_array = pa_compute.split_pattern(filled_column, pattern=separator)
@@ -92,254 +94,78 @@ def cast_strings_to_list(string_array, separator=",", default="0", value_type=pa
     return int_list_array
 
 
-def cast_trades(trades):
-    trades = trades.cast(trades_schema())
-    condition_values = cast_strings_to_list(trades.column("conditions").combine_chunks())
-    return trades.append_column('condition_values', condition_values)
-
-
-def date_to_path(date, ext=".csv.gz"):
-    # return f"{date.year}/{date.month:02}/{date.isoformat()}{ext}"
-    return date.strftime("%Y/%m/%Y-%m-%d") + ext
-
-
-def convert_to_custom_aggs_file(config: PolygonConfig,
-                                overwrite: bool,
-                                timestamp: pd.Timestamp,
-                                start_session: pd.Timestamp,
-                                end_session: pd.Timestamp):
-    date = timestamp.to_pydatetime().date()
-    aggs_date_path = date_to_path(date, ext=".parquet")
-    aggs_path = f"{config.custom_aggs_dir}/{aggs_date_path}"
-    # aggs_by_ticker_path = f"{config.custom_aggs_by_ticker_dir}/{aggs_date_path}"
-    fsspec = ArrowFSWrapper(config.filesystem)
-    if fsspec.exists(aggs_path) or fsspec.exists(aggs_by_ticker_path):
-        if overwrite:
-            if fsspec.exists(aggs_path):
-                config.filesystem.delete_file(aggs_path)
-            if fsspec.exists(aggs_by_ticker_path):
-                config.filesystem.delete_file(aggs_by_ticker_path)
-        else:
-            if fsspec.exists(aggs_path):
-                print(f"SKIPPING: {date=} File exists {aggs_path=}")
-            if fsspec.exists(aggs_by_ticker_path):
-                print(f"SKIPPING: {date=} File exists {aggs_by_ticker_path=}")
-            return
-    fsspec.mkdir(fsspec._parent(aggs_path))
-    fsspec.mkdir(fsspec._parent(aggs_by_ticker_path))
-    trades_path = f"{config.trades_dir}/{date_to_path(date)}"
-    if not fsspec.exists(trades_path):
-        print(f"ERROR: Trades file missing. Skipping {date=}. {trades_path=}")
-        return
-    print(f"{trades_path=}")
-    format = pa_ds.CsvFileFormat()
-    trades_ds = pa_ds.FileSystemDataset.from_paths([trades_path], format=format, schema=trades_schema(raw=True), filesystem=config.filesystem)
-    fragments = trades_ds.get_fragments()
-    fragment = next(fragments)
-    try:
-        next(fragments)
-        print("ERROR: More than one fragment for {path=}")
-    except StopIteration:
-        pass
-    trades = fragment.to_table(schema=trades_ds.schema)
+def cast_trades(trades) -> pa.Table:
     trades = trades.cast(trades_schema())
-
-
-    if min_timestamp < start_session:
-        print(f"ERROR: {min_timestamp=} < {start_session=}")
-    if max_timestamp >= end_session:
-        print(f"ERROR: {max_timestamp=} >= {end_session=}")
-    trades_df = trades.to_pandas()
-    trades_df["window_start"] = trades_df["sip_timestamp"].dt.floor(aggregate_timedelta)
-    aggs_df = trades_df.groupby(["ticker", "window_start"]).agg(
-        open=('price', 'first'),
-        high=('price', 'max'),
-        low=('price', 'min'),
-        close=('price', 'last'),
-        volume=('size', 'sum'),
+    condition_values = cast_strings_to_list(
+        trades.column("conditions").combine_chunks()
     )
-
-    aggs_df.reset_index(inplace=True)
-    aggs_table = pa.Table.from_pandas(aggs_df).select(['ticker', 'volume', 'open', 'close', 'high', 'low', 'window_start', 'transactions'])
-    aggs_table = aggs_table.sort_by([('ticker', 'ascending'), ('window_start', 'ascending')])
-    print(f"{aggs_by_ticker_path=}")
-    pa_parquet.write_table(table=aggs_table,
-                           where=aggs_by_ticker_path, filesystem=to_config.filesystem)
-    aggs_table = aggs_table.sort_by([('window_start', 'ascending'), ('ticker', 'ascending')])
-    print(f"{aggs_path=}")
-    pa_parquet.write_table(table=aggs_table,
-                           where=aggs_path, filesystem=to_config.filesystem)
-
-
-# def convert_to_custom_aggs(config: PolygonConfig,
-#                            overwrite: bool,
-#                            timestamp: pd.Timestamp,
-#                            start_session: pd.Timestamp,
-#                            end_session: pd.Timestamp):
-#     date = timestamp.to_pydatetime().date()
-#     aggs_date_path = date_to_path(date, ext=".parquet")
-#     aggs_path = f"{config.custom_aggs_dir}/{aggs_date_path}"
-#     # aggs_by_ticker_path = f"{config.custom_aggs_by_ticker_dir}/{aggs_date_path}"
-#     fsspec = ArrowFSWrapper(config.filesystem)
-#     if fsspec.exists(aggs_path) or fsspec.exists(aggs_by_ticker_path):
-#         if overwrite:
-#             if fsspec.exists(aggs_path):
-#                 config.filesystem.delete_file(aggs_path)
-#             if fsspec.exists(aggs_by_ticker_path):
-#                 config.filesystem.delete_file(aggs_by_ticker_path)
-#         else:
-#             if fsspec.exists(aggs_path):
-#                 print(f"SKIPPING: {date=} File exists {aggs_path=}")
-#             if fsspec.exists(aggs_by_ticker_path):
-#                 print(f"SKIPPING: {date=} File exists {aggs_by_ticker_path=}")
-#             return
-#     fsspec.mkdir(fsspec._parent(aggs_path))
-#     fsspec.mkdir(fsspec._parent(aggs_by_ticker_path))
-#     trades_path = f"{config.trades_dir}/{date_to_path(date)}"
-#     if not fsspec.exists(trades_path):
-#         print(f"ERROR: Trades file missing. Skipping {date=}. {trades_path=}")
-#         return
-#     print(f"{trades_path=}")
-#     format = pa_ds.CsvFileFormat()
-#     trades_ds = pa_ds.FileSystemDataset.from_paths([trades_path], format=format, schema=trades_schema(raw=True), filesystem=config.filesystem)
-#     fragments = trades_ds.get_fragments()
-#     fragment = next(fragments)
-#     try:
-#         next(fragments)
-#         print("ERROR: More than one fragment for {path=}")
-#     except StopIteration:
-#         pass
-#     trades = fragment.to_table(schema=trades_ds.schema)
-#     trades = trades.cast(trades_schema())
-#     min_timestamp = pa.compute.min(trades.column('sip_timestamp')).as_py()
-#     max_timestamp = pa.compute.max(trades.column('sip_timestamp')).as_py()
-#     if min_timestamp < start_session:
-#         print(f"ERROR: {min_timestamp=} < {start_session=}")
-#     if max_timestamp >= end_session:
-#         print(f"ERROR: {max_timestamp=} >= {end_session=}")
-#     trades_df = trades.to_pandas()
-#     trades_df["window_start"] = trades_df["sip_timestamp"].dt.floor(aggregate_timedelta)
-#     aggs_df = trades_df.groupby(["ticker", "window_start"]).agg(
-#         open=('price', 'first'),
-#         high=('price', 'max'),
-#         low=('price', 'min'),
-#         close=('price', 'last'),
-#         volume=('size', 'sum'),
-#     )
-#     aggs_df['transactions'] = trades_df.groupby(["ticker", "window_start"]).size()
-#     aggs_df.reset_index(inplace=True)
-#     aggs_table = pa.Table.from_pandas(aggs_df).select(['ticker', 'volume', 'open', 'close', 'high', 'low', 'window_start', 'transactions'])
-#     aggs_table = aggs_table.sort_by([('ticker', 'ascending'), ('window_start', 'ascending')])
-#     print(f"{aggs_by_ticker_path=}")
-#     pa_parquet.write_table(table=aggs_table,
-#                            where=aggs_by_ticker_path, filesystem=to_config.filesystem)
-#     aggs_table = aggs_table.sort_by([('window_start', 'ascending'), ('ticker', 'ascending')])
-#     print(f"{aggs_path=}")
-#     pa_parquet.write_table(table=aggs_table,
-#                            where=aggs_path, filesystem=to_config.filesystem)
-#     pa_ds.write_dataset(
-#         generate_batches_from_tables(tables),
-#         schema=schema,
-#         base_dir=by_ticker_aggs_arrow_dir,
-#         partitioning=partitioning,
-#         format="parquet",
-#         existing_data_behavior="overwrite_or_ignore",
-#     )
-
-
-# def generate_csv_trades_tables(
-#     config: PolygonConfig,
-# ) -> Tuple[datetime.date, Iterator[pa.Table]]:
-#     """Generator for trades tables from flatfile CSVs."""
-#     # Use pandas_market_calendars so we can get extended hours.
-#     # NYSE and NASDAQ have extended hours but XNYS does not.
-#     calendar = pandas_market_calendars.get_calendar(config.calendar_name)
-#     schedule = calendar.schedule(start_date=config.start_timestamp, end_date=config.end_timestamp, start="pre", end="post")
-#     for timestamp, session in schedule.iterrows():
-#         date = timestamp.to_pydatetime().date()
-#         trades_csv_path = f"{config.trades_dir}/{date_to_path(date)}"
-#         format = pa_ds.CsvFileFormat()
-#         trades_ds = pa_ds.FileSystemDataset.from_paths([trades_csv_path], format=format, schema=trades_schema(raw=True), filesystem=config.filesystem)
-#         fragments = trades_ds.get_fragments()
-#         fragment = next(fragments)
-#         try:
-#             next(fragments)
-#             print("ERROR: More than one fragment for {path=}")
-#         except StopIteration:
-#             pass
-#         trades = fragment.to_table(schema=trades_ds.schema)
-#         trades = trades.cast(trades_schema())
-#         min_timestamp = pa.compute.min(trades.column('sip_timestamp')).as_py()
-#         max_timestamp = pa.compute.max(trades.column('sip_timestamp')).as_py()
-#         start_session = session['pre']
-#         end_session = session['post']
-#         # print(f"{start_session=} {end_session=}")
-#         # print(f"{min_timestamp=} {max_timestamp=}")
-#         if min_timestamp < start_session:
-#             print(f"ERROR: {min_timestamp=} < {start_session=}")
-#         # The end_session is supposed to be a limit but there are many with trades at that second.
-#         if max_timestamp >= (end_session + pd.Timedelta(seconds=1)):
-#             # print(f"ERROR: {max_timestamp=} >= {end_session=}")
-#             print(f"ERROR: {max_timestamp=} > {end_session+pd.Timedelta(seconds=1)=}")
-#         yield date, trades
-#         del fragment
-#         del fragments
-#     del trades_ds
+    return trades.append_column("condition_values", condition_values)
 
 
 def custom_aggs_schema(raw: bool = False) -> pa.Schema:
+    # timestamp_type = pa.int64() if raw else pa.timestamp("ns", tz=tz)
     timestamp_type = pa.int64() if raw else pa.timestamp("ns", tz="UTC")
     price_type = pa.float64()
     return pa.schema(
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        [
+            pa.field("ticker", pa.string(), nullable=False),
+            pa.field("volume", pa.int64(), nullable=False),
+            pa.field("open", price_type, nullable=False),
+            pa.field("close", price_type, nullable=False),
+            pa.field("high", price_type, nullable=False),
+            pa.field("low", price_type, nullable=False),
+            pa.field("window_start", timestamp_type, nullable=False),
+            pa.field("transactions", pa.int64(), nullable=False),
+            pa.field("date", pa.date32(), nullable=False),
+            pa.field("year", pa.uint16(), nullable=False),
+            pa.field("month", pa.uint8(), nullable=False),
+            pa.field(PARTITION_COLUMN_NAME, pa.string(), nullable=False),
+        ]
+    )
 
 
 def custom_aggs_partitioning() -> pa.Schema:
     return pa_ds.partitioning(
-        pa.schema(
+        pa.schema(
+            [("year", pa.uint16()), ("month", pa.uint8()), ("date", pa.date32())]
+        ),
+        flavor="hive",
     )
 
 
-def
-    file_info = config.filesystem.get_file_info(config.
+def get_aggs_dates(config: PolygonConfig) -> set[datetime.date]:
+    file_info = config.filesystem.get_file_info(config.aggs_dir)
     if file_info.type == pa_fs.FileType.NotFound:
         return set()
-    aggs_ds = pa_ds.dataset(
-
-
-
-
+    aggs_ds = pa_ds.dataset(
+        config.aggs_dir,
+        format="parquet",
+        schema=custom_aggs_schema(),
+        partitioning=custom_aggs_partitioning(),
+    )
+    return set(
+        [
+            pa_ds.get_partition_keys(fragment.partition_expression).get("date")
+            for fragment in aggs_ds.get_fragments()
+        ]
+    )
 
 
 def generate_csv_trades_tables(
     config: PolygonConfig, overwrite: bool = False
-) -> Tuple[datetime.date,
+) -> Iterator[Tuple[datetime.date, pa.Table]]:
     """Generator for trades tables from flatfile CSVs."""
-
+    existing_aggs_dates = set()
     if not overwrite:
-
-
-
-
-
-
-        date
-        if date in custom_aggs_dates:
+        existing_aggs_dates = get_aggs_dates(config)
+    schedule = config.calendar.trading_index(
+        start=config.start_timestamp, end=config.end_timestamp, period="1D"
+    )
+    for timestamp in schedule:
+        date: datetime.date = timestamp.to_pydatetime().date()
+        if date in existing_aggs_dates:
             continue
-        trades_csv_path =
+        trades_csv_path = config.date_to_csv_file_path(date)
         convert_options = pa_csv.ConvertOptions(column_types=trades_schema(raw=True))
         trades = pa_csv.read_csv(trades_csv_path, convert_options=convert_options)
         trades = trades.cast(trades_schema())
@@ -359,77 +185,87 @@ def generate_csv_trades_tables(
         del trades
 
 
-def trades_to_custom_aggs(
-
+def trades_to_custom_aggs(
+    config: PolygonConfig,
+    date: datetime.date,
+    table: pa.Table,
+    include_trf: bool = False,
+) -> pa.Table:
+    print(f"{date=} {pa.default_memory_pool()=}")
+    # print(f"{datetime.datetime.now()=} {date=} {pa.default_memory_pool()=}")
     # print(f"{resource.getrusage(resource.RUSAGE_SELF).ru_maxrss=}")
     table = table.filter(pa_compute.greater(table["size"], 0))
     table = table.filter(pa_compute.equal(table["correction"], "0"))
     if not include_trf:
         table = table.filter(pa_compute.not_equal(table["exchange"], 4))
-    table = table.append_column(
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    table = table.append_column(
+        "price_total", pa_compute.multiply(table["price"], table["size"])
+    )
+    table = table.append_column(
+        "window_start",
+        pa_compute.floor_temporal(
+            table["sip_timestamp"], multiple=config.agg_timedelta.seconds, unit="second"
+        ),
+    )
+    table = table.group_by(["ticker", "window_start"], use_threads=False).aggregate(
+        [
+            ("price", "first"),
+            ("price", "max"),
+            ("price", "min"),
+            ("price", "last"),
+            ("price_total", "sum"),
+            ("size", "sum"),
+            ([], "count_all"),
+        ]
+    )
+    table = table.rename_columns(
+        {
+            "price_first": "open",
+            "price_max": "high",
+            "price_min": "low",
+            "price_last": "close",
+            "size_sum": "volume",
+            "price_total_sum": "total",
+            "count_all": "transactions",
+        }
+    )
+    table = table.append_column(
+        "vwap", pa_compute.divide(table["total"], table["volume"])
+    )
     # table.append_column('date', pa.array([date] * len(table), type=pa.date32()))
     # table.append_column('year', pa.array([date.year] * len(table), type=pa.uint16()))
     # table.append_column('month', pa.array([date.month] * len(table), type=pa.uint8()))
-    table = table.append_column(
-    table = table.append_column(
-
-
+    table = table.append_column("date", pa.array(np.full(len(table), date)))
+    table = table.append_column(
+        "year", pa.array(np.full(len(table), date.year), type=pa.uint16())
+    )
+    table = table.append_column(
+        "month", pa.array(np.full(len(table), date.month), type=pa.uint8())
+    )
+    table = table.append_column(
+        PARTITION_COLUMN_NAME,
+        pa.array(
+            [to_partition_key(ticker) for ticker in table.column("ticker").to_pylist()]
+        ),
+    )
+    table = table.sort_by([("window_start", "ascending"), ("ticker", "ascending")])
+    # print(f"aggs {date=} {table.to_pandas().head()=}")
     return table
 
 
-def generate_custom_agg_batches_from_tables(config: PolygonConfig)
-
-
-
-
-
-
-def generate_custom_agg_tables(config: PolygonConfig) -> pa.Table:
-    for date, trades_table in generate_csv_trades_tables(config):
-        yield trades_to_custom_aggs(config, date, trades_table)
-
-
-def configure_write_custom_aggs_to_dataset(config: PolygonConfig):
-    def write_custom_aggs_to_dataset(args: Tuple[datetime.date, pa.Table]):
-        date, table = args
-        pa_ds.write_dataset(
-            trades_to_custom_aggs(config, date, table),
-            filesystem=config.filesystem,
-            base_dir=config.custom_aggs_dir,
-            partitioning=custom_aggs_partitioning(),
-            format="parquet",
-            existing_data_behavior="overwrite_or_ignore",
-        )
-    return write_custom_aggs_to_dataset
+# def generate_custom_agg_batches_from_tables(config: PolygonConfig):
+#     for date, trades_table in generate_csv_trades_tables(config):
+#         aggs_table = trades_to_custom_aggs(config, date, trades_table)
+#         yield aggs_table
+#         del aggs_table
+#         del trades_table
 
 
 def file_visitor(written_file):
     print(f"{written_file.path=}")
 
 
-def
+def convert_trades_to_custom_aggs(
     config: PolygonConfig, overwrite: bool = False
 ) -> str:
     if overwrite:
|
     # MAX_FILES_OPEN = 8
     # MIN_ROWS_PER_GROUP = 100_000
 
-    print(f"{config.
+    print(f"{config.aggs_dir=}")
 
     # pa.set_memory_pool()
 
@@ -446,26 +282,25 @@ def convert_all_to_custom_aggs(
     #     generate_custom_agg_batches_from_tables(config),
     #     schema=custom_aggs_schema(),
     #     filesystem=config.filesystem,
-    #     base_dir=config.
+    #     base_dir=config.aggs_dir,
     #     partitioning=custom_aggs_partitioning(),
     #     format="parquet",
     #     existing_data_behavior="overwrite_or_ignore",
-    #     max_open_files = MAX_FILES_OPEN,
-    #     min_rows_per_group = MIN_ROWS_PER_GROUP,
+    #     # max_open_files = MAX_FILES_OPEN,
+    #     # min_rows_per_group = MIN_ROWS_PER_GROUP,
     # )
 
     for date, trades_table in generate_csv_trades_tables(config):
         aggs_table = trades_to_custom_aggs(config, date, trades_table)
         pa_ds.write_dataset(
             aggs_table,
-            # schema=custom_aggs_schema(),
             filesystem=config.filesystem,
-            base_dir=config.
+            base_dir=config.aggs_dir,
             partitioning=custom_aggs_partitioning(),
             format="parquet",
             existing_data_behavior="overwrite_or_ignore",
             file_visitor=file_visitor,
-            # max_open_files=
+            # max_open_files=10,
             # min_rows_per_group=MIN_ROWS_PER_GROUP,
         )
         del aggs_table
@@ -477,8 +312,8 @@ def convert_all_to_custom_aggs(
     #     generate_csv_trades_tables(config),
     # )
 
-    print(f"Generated aggregates to {config.
-    return config.
+    print(f"Generated aggregates to {config.aggs_dir=}")
+    return config.aggs_dir
 
 
 # https://github.com/twopirllc/pandas-ta/issues/731#issuecomment-1766786952
@@ -500,208 +335,199 @@ def convert_all_to_custom_aggs(
 #     mfi = 100 - 100 / (1 + mf_avg_gain / (mf_avg_loss + epsilon))
 #     return mfi
 
-def calculate_mfi(typical_price: pd.Series, money_flow: pd.Series, period: int):
-    mf_sign = np.where(typical_price > np.roll(typical_price, shift=1), 1, -1)
-    signed_mf = money_flow * mf_sign
-
-    # Calculate gain and loss using vectorized operations
-    positive_mf = np.maximum(signed_mf, 0)
-    negative_mf = np.maximum(-signed_mf, 0)
-
-    mf_avg_gain = np.convolve(positive_mf, np.ones(period), mode='full')[:len(positive_mf)] / period
-    mf_avg_loss = np.convolve(negative_mf, np.ones(period), mode='full')[:len(negative_mf)] / period
-
-    epsilon = 1e-10  # Small epsilon value to avoid division by zero
-    mfi = 100 - (100 / (1 + mf_avg_gain / (mf_avg_loss + epsilon)))
-    return mfi
-
-
-# https://github.com/twopirllc/pandas-ta/blob/main/pandas_ta/momentum/stoch.py
-# https://github.com/twopirllc/pandas-ta/blob/development/pandas_ta/momentum/stoch.py
-# `k` vs `fast_k` arg names.
-# https://github.com/twopirllc/pandas-ta/issues/726
-# Results affected by values outside range
-# https://github.com/twopirllc/pandas-ta/issues/535
-
-def calculate_stoch(high: pd.Series, low: pd.Series, close: pd.Series, k: int = 14, d: int = 3, smooth_k: int = 3, mamode:str = "sma"):
-    """Indicator: Stochastic Oscillator (STOCH)"""
-    lowest_low = low.rolling(k).min()
-    highest_high = high.rolling(k).max()
-
-    stoch = 100 * (close - lowest_low)
-    stoch /= ta.utils.non_zero_range(highest_high, lowest_low)
-
-    stoch_k = ta.overlap.ma(mamode, stoch.loc[stoch.first_valid_index():,], length=smooth_k)
-    stoch_d = ta.overlap.ma(mamode, stoch_k.loc[stoch_k.first_valid_index():,], length=d) if stoch_k is not None else None
-    # Histogram
-    stoch_h = stoch_k - stoch_d if stoch_d is not None else None
-
-    return stoch_k, stoch_d, stoch_h
-
-
-def compute_per_ticker_signals(df: pd.DataFrame, period: int = 14) -> pd.DataFrame:
-    df = df.set_index('window_start').sort_index()
-    session_index = pd.date_range(start=df.index[0],
-                                  end=df.index[-1],
-                                  freq=pd.Timedelta(seconds=60))
-    df = df.reindex(session_index)
-    df.index.rename('window_start', inplace=True)
-
-    # df["minute_of_day"] = (df.index.hour * 60) + df.index.minute
-    # df["day_of_week"] = df.index.day_of_week
-
-    df.transactions = df.transactions.fillna(0)
-    df.volume = df.volume.fillna(0)
-    df.total = df.total.fillna(0)
-    df.close = df.close.ffill()
-    close = df.close
-    df.vwap = df.vwap.fillna(close)
-    df.high = df.high.fillna(close)
-    df.low = df.low.fillna(close)
-    df.open = df.open.fillna(close)
-    price_open = df.open
-    high = df.high
-    low = df.low
-    vwap = df.vwap
-    # volume = df.volume
-    total = df.total
-    next_close = close.shift()
-
-    # TODO: Odometer rollover signal. Relative difference to nearest power of 10.
-    # Something about log10 being a whole number? When is $50 the rollover vs $100 or $10?
-
-    # "True (Typical?) Price" which I think is an approximation of VWAP.
-    # Trouble with both is that if there are no trades in a bar we get NaN.
-    # That then means we get NaN for averages for the next period-1 bars too.
-    # Question is whether to ffill the price for these calculations.
-    df["TP"] = (high + low + close) / 3
-
-    # Gain/loss in this bar.
-    df["ret1bar"] = close.div(price_open).sub(1)
-
-    for t in range(2, period):
-        df[f'ret{t}bar'] = close.div(price_open.shift(t-1)).sub(1)
-
-    # Average True Range (ATR)
-    true_range = pd.concat([high.sub(low),
-                            high.sub(next_close).abs(),
-                            low.sub(next_close).abs()], axis=1).max(1)
-    # Normalized ATR (NATR) or Average of Normalized TR.
-    # Choice of NATR operations ordering discussion: https://www.macroption.com/normalized-atr/
-    # He doesn't talk about VWAP but I think that is a better normalizing price for a bar.
-    # atr = true_range.ewm(span=period).mean()
-    # df["natr_c"] = atr / close
-    # df["antr_c"] = (true_range / close).ewm(span=period).mean()
-    # df["natr_v"] = atr / vwap
-    # df["antr_v"] = (true_range / vwap).ewm(span=period).mean()
-    df["NATR"] = (true_range / vwap).ewm(span=period).mean()
-
-    # True Price as HLC average VS VWAP.
-    # VWAP is better I think but is quite different than standard CCI.
-    # Three ways to compute CCI, all give the same value using TP.
-    # tp = (high + low + close) / 3
-    # df['SMA'] = ta.sma(tp, length=period)
-    # df['sma_r'] = tp.rolling(period).mean()
-    # df['MAD'] = ta.mad(tp, length=period)
-    # # Series.mad deprecated. mad = (s - s.mean()).abs().mean()
-    # df['mad_r'] = tp.rolling(period).apply(lambda x: (pd.Series(x) - pd.Series(x).mean()).abs().mean())
-
-    # df['cci_r'] = (tp - df['sma_r']) / (0.015 * df['mad_r'])
-    # df['CCI'] = (tp - df['SMA']) / (0.015 * df['MAD'])
-    # df['cci_ta'] = ta.cci(high=high, low=low, close=close, length=period)
-
-    df['taCCI'] = ta.cci(high=high, low=low, close=close, length=period)
-
-    # https://gist.github.com/quantra-go-algo/1b37bfb74d69148f0dfbdb5a2c7bdb25
-    # https://medium.com/@huzaifazahoor654/how-to-calculate-cci-in-python-a-step-by-step-guide-9a3f61698be6
-    sma = pd.Series(ta.sma(vwap, length=period))
-    mad = pd.Series(ta.mad(vwap, length=period))
-    df['CCI'] = (vwap - sma) / (0.015 * mad)
-
-    # df['MFI'] = calculate_mfi(high=high, low=low, close=close, volume=volume, period=period)
-    df['MFI'] = calculate_mfi(typical_price=vwap, money_flow=total, period=period)
-
-    # We use Stochastic (rather than MACD because we need a ticker independent indicator.
-    # IOW a percentage price oscillator (PPO) rather than absolute price oscillator (APO).
-    # https://www.alpharithms.com/moving-average-convergence-divergence-macd-031217/
-    # We're using 14/3 currently rather than the usual 26/12 popular for MACD though.
-    stoch_k, stoch_d, stoch_h = calculate_stoch(high, low, close, k=period)
-    df["STOCHk"] = stoch_k
-    df["STOCHd"] = stoch_d
-    df["STOCHh"] = stoch_h
-
-    return df
-
-
-def iterate_all_aggs_tables(config: PolygonConfig, valid_tickers: pa.Array, start_session: str = "pre", end_session: str = "market_open"):
-    calendar = pandas_market_calendars.get_calendar(config.calendar_name)
-    schedule = calendar.schedule(start_date=config.start_date,
-                                 end_date=config.end_date,
-                                 start="pre",
-                                 end="post")
-    for date, sessions in schedule.iterrows():
-        # print(f"{date=} {sessions=}")
-        start_dt = sessions[start_session]
-        end_dt = sessions[end_session]
-        # print(f"{date=} {start_dt=} {end_dt=}")
-        aggs_ds = pa_ds.dataset(config.custom_aggs_dir,
-                                format="parquet",
-                                schema=custom_aggs_schema(),
-                                partitioning=custom_aggs_partitioning())
-        date_filter_expr = ((pc.field('year') == date.year)
-                            & (pc.field('month') == date.month)
-                            & (pc.field('date') == date.to_pydatetime().date()))
-        # print(f"{date_filter_expr=}")
-        for fragment in aggs_ds.get_fragments(filter=date_filter_expr):
-            session_filter = ((pc.field('window_start') >= start_dt)
-                              & (pc.field('window_start') < end_dt)
-                              & pc.is_in(pc.field('ticker'), valid_tickers)
-                              )
-            # Sorting table doesn't seem to avoid needing to sort the df. Maybe use_threads=False on to_pandas would help?
-            # table = fragment.to_table(filter=session_filter).sort_by([('ticker', 'ascending'), ('window_start', 'descending')])
-            table = fragment.to_table(filter=session_filter)
-            if table.num_rows > 0:
-                metadata = dict(table.schema.metadata) if table.schema.metadata else dict()
-                metadata["date"] = date.date().isoformat()
-                table = table.replace_schema_metadata(metadata)
-                yield table
-
-
-def iterate_all_aggs_with_signals(config: PolygonConfig):
-    for table in iterate_all_aggs_tables(config):
-        df = table.to_pandas()
-        df = df.groupby("ticker").apply(compute_per_ticker_signals, include_groups=False)
-        yield pa.Table.from_pandas(df)
-
-
-def compute_signals_for_all_custom_aggs(
-    from_config: PolygonConfig, to_config: PolygonConfig, valid_tickers: pa.Array, overwrite: bool = False
-) -> str:
-    if overwrite:
-        print("WARNING: overwrite not implemented/ignored.")
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+def get_by_ticker_aggs_dates(config: PolygonConfig) -> set[datetime.date]:
+    file_info = config.filesystem.get_file_info(config.by_ticker_aggs_arrow_dir)
+    if file_info.type == pa_fs.FileType.NotFound:
+        return set()
+    by_ticker_aggs_ds = pa_ds.dataset(
+        config.by_ticker_aggs_arrow_dir,
+        format="parquet",
+        schema=custom_aggs_schema(),
+        partitioning=custom_aggs_partitioning(),
+    )
+    return set(
+        [
+            pa_ds.get_partition_keys(fragment.partition_expression).get("date")
+            for fragment in by_ticker_aggs_ds.get_fragments()
+        ]
+    )
+
+
+def batches_for_date(aggs_ds: pa_ds.Dataset, date: pd.Timestamp):
+    date_filter_expr = (
+        (pa_compute.field("year") == date.year)
+        & (pa_compute.field("month") == date.month)
+        & (pa_compute.field("date") == date.date())
+    )
+    print(f"table for {date=}")
+    # return aggs_ds.scanner(filter=date_filter_expr).to_batches()
+    table = aggs_ds.scanner(filter=date_filter_expr).to_table()
+    table = table.sort_by([("part", "ascending"), ("ticker", "ascending"), ("window_start", "ascending"), ])
+    return table.to_batches()
+
+def generate_batches_for_schedule(config, aggs_ds):
+    schedule = config.calendar.trading_index(
+        start=config.start_timestamp, end=config.end_timestamp, period="1D"
+    )
+    for timestamp in schedule:
+        # print(f"{timestamp=}")
+        yield from batches_for_date(aggs_ds=aggs_ds, date=timestamp)
+
+
+# def scatter_custom_aggs_to_by_ticker(
+#     config: PolygonConfig,
+#     overwrite: bool = False,
+# ) -> str:
+#     lock = FileLock(config.lock_file_path, blocking=False)
+#     with lock:
+#         if not lock.is_locked:
+#             raise IOError("Failed to acquire lock for updating custom assets.")
+#         with open(config.by_ticker_dates_path, "a") as f:
+#             f.write("I have a bad feeling about this.")
+#             by_ticker_aggs_arrow_dir = scatter_custom_aggs_to_by_ticker_(config, overwrite)
+
+#     print(f"Scattered custom aggregates by ticker to {by_ticker_aggs_arrow_dir=}")
+#     return by_ticker_aggs_arrow_dir
+
+
+def filter_by_date(config: PolygonConfig) -> pa_compute.Expression:
+    start_date = config.start_timestamp.tz_localize(config.calendar.tz.key).date()
+    limit_date = (
+        (config.end_timestamp + pd.Timedelta(days=1))
+        .tz_localize(config.calendar.tz.key)
+        .date()
+    )
+    return (pa_compute.field("date") >= start_date) & (
+        pa_compute.field("date") <= limit_date
+    )
+
+
+# def generate_batches_with_partition(
+#     config: PolygonConfig,
+#     aggs_ds: pa_ds.Dataset,
+# ) -> Iterator[pa.Table]:
+#     for fragment in aggs_ds.sort_by("date").get_fragments(
+#         filter=filter_by_date(config),
+#     ):
+#         for batch in fragment.to_batches():
+#             # batch = batch.append_column(
+#             #     PARTITION_COLUMN_NAME,
+#             #     pa.array(
+#             #         [
+#             #             to_partition_key(ticker)
+#             #             for ticker in batch.column("ticker").to_pylist()
+#             #         ]
+#             #     ),
+#             # )
+#             yield batch.sort_by(
+#                 [("ticker", "ascending"), ("window_start", "ascending")]
+#             )
+#             del batch
+#         del fragment
+
+
+def generate_batches_with_partition(
+    config: PolygonConfig,
+    aggs_ds: pa_ds.Dataset,
+) -> Iterator[pa.Table]:
+    for fragment in (
+        aggs_ds.filter(filter_by_date(config))
+        .sort_by([(PARTITION_COLUMN_NAME, "ascending"), ("date", "ascending")])
+        .get_fragments()
+    ):
+        for batch in fragment.to_batches():
+            yield batch.sort_by(
+                [("ticker", "ascending"), ("window_start", "ascending")]
             )
-
+            del batch
+        del fragment
+
+
+def scatter_custom_aggs_to_by_ticker(config, overwrite=False) -> str:
+    aggs_ds = pa_ds.dataset(
+        config.aggs_dir,
+        format="parquet",
+        schema=custom_aggs_schema(),
+        partitioning=custom_aggs_partitioning(),
+    )
+    by_ticker_schema = aggs_ds.schema
+    partitioning = pa_ds.partitioning(
+        pa.schema([(PARTITION_COLUMN_NAME, pa.string())]),
+        flavor="hive",
+    )
+    by_ticker_aggs_arrow_dir = config.by_ticker_aggs_arrow_dir
+    print(f"Scattering custom aggregates by ticker to {by_ticker_aggs_arrow_dir=}")
+    pa_ds.write_dataset(
+        # generate_batches_with_partition(config=config, aggs_ds=aggs_ds),
+        generate_batches_for_schedule(config=config, aggs_ds=aggs_ds),
+        schema=by_ticker_schema,
+        base_dir=by_ticker_aggs_arrow_dir,
+        partitioning=partitioning,
+        format="parquet",
+        existing_data_behavior="overwrite_or_ignore",
+    )
+    print(f"Scattered aggregates by ticker to {by_ticker_aggs_arrow_dir=}")
+    return by_ticker_aggs_arrow_dir
+
+
+# def scatter_custom_aggs_to_by_ticker(config, overwrite=False) -> str:
+#     file_info = config.filesystem.get_file_info(config.aggs_dir)
+#     if file_info.type == pa_fs.FileType.NotFound:
+#         raise FileNotFoundError(f"{config.aggs_dir=} not found.")
+
+#     by_ticker_aggs_arrow_dir = config.by_ticker_aggs_arrow_dir
+#     if os.path.exists(by_ticker_aggs_arrow_dir):
+#         if overwrite:
+#             print(f"Removing {by_ticker_aggs_arrow_dir=}")
+#             shutil.rmtree(by_ticker_aggs_arrow_dir)
+
+#     schedule = config.calendar.trading_index(
+#         start=config.start_timestamp, end=config.end_timestamp, period="1D"
+#     )
+#     assert type(schedule) is pd.DatetimeIndex
+
+#     print(f"Scattering custom aggregates by ticker to {by_ticker_aggs_arrow_dir=}")
+#     aggs_ds = pa_ds.dataset(
+#         config.aggs_dir,
+#         format="parquet",
+#         schema=custom_aggs_schema(),
+#         partitioning=custom_aggs_partitioning(),
+#     )
+#     by_ticker_partitioning = pa_ds.partitioning(
+#         pa.schema([(PARTITION_COLUMN_NAME, pa.string())]),
+#         # pa.schema(
+#         #     [
+#         #         (PARTITION_COLUMN_NAME, pa.string()),
+#         #         ("year", pa.uint16()),
+#         #         ("month", pa.uint8()),
+#         #         ("date", pa.date32()),
+#         #     ]
+#         # ),
+#         flavor="hive",
+#     )
+#     by_ticker_schema = custom_aggs_schema()
+#     by_ticker_schema = by_ticker_schema.append(
+#         pa.field(PARTITION_COLUMN_NAME, pa.string(), nullable=False),
+#     )

+#     # TODO: Collect the dates we've scattered and write a special partition key with them.
+#     pa_ds.write_dataset(
+#         generate_batches_for_schedule(schedule, aggs_ds),
+#         schema=by_ticker_schema,
+#         base_dir=by_ticker_aggs_arrow_dir,
+#         partitioning=by_ticker_partitioning,
+#         format="parquet",
+#         existing_data_behavior="overwrite_or_ignore",
+#         # max_open_files=250,
+#         # file_visitor=file_visitor,
+#     )

+#     return by_ticker_aggs_arrow_dir


+# def generate_tables_from_custom_aggs_ds(
+#     aggs_ds: pa_ds.Dataset, schedule: pd.DatetimeIndex
+# ):
+#     for timestamp in schedule:
+#         yield table_for_date(aggs_ds=aggs_ds, date=timestamp.to_pydatetime().date())