zipline_polygon_bundle 0.2.0.dev1__py3-none-any.whl → 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- zipline_polygon_bundle/__init__.py +7 -9
- zipline_polygon_bundle/adjustments.py +27 -32
- zipline_polygon_bundle/bundle.py +157 -312
- zipline_polygon_bundle/compute_signals.py +261 -0
- zipline_polygon_bundle/concat_all_aggs.py +130 -25
- zipline_polygon_bundle/config.py +57 -32
- zipline_polygon_bundle/trades.py +196 -607
- {zipline_polygon_bundle-0.2.0.dev1.dist-info → zipline_polygon_bundle-0.2.1.dist-info}/METADATA +8 -6
- zipline_polygon_bundle-0.2.1.dist-info/RECORD +18 -0
- {zipline_polygon_bundle-0.2.0.dev1.dist-info → zipline_polygon_bundle-0.2.1.dist-info}/WHEEL +1 -1
- zipline_polygon_bundle-0.2.0.dev1.dist-info/RECORD +0 -17
- {zipline_polygon_bundle-0.2.0.dev1.dist-info → zipline_polygon_bundle-0.2.1.dist-info}/LICENSE +0 -0
zipline_polygon_bundle/trades.py
CHANGED
@@ -1,29 +1,29 @@
 from .config import PolygonConfig, PARTITION_COLUMN_NAME, to_partition_key
 
-from typing import Iterator, Tuple
+from typing import Iterator, Tuple
 
 import pyarrow as pa
-import pyarrow.dataset as pa_ds
 import pyarrow.compute as pa_compute
 import pyarrow.csv as pa_csv
+import pyarrow.dataset as pa_ds
 import pyarrow.fs as pa_fs
 
 from fsspec.implementations.arrow import ArrowFSWrapper
 
 import os
 import datetime
-import shutil
 
 import numpy as np
 import pandas as pd
-import pandas_ta as ta
 
 
 def trades_schema(raw: bool = False) -> pa.Schema:
     # There is some problem reading the timestamps as timestamps so we have to read as integer then change the schema.
     # Polygon Aggregate flatfile timestamps are in nanoseconds (like trades), not milliseconds as the docs say.
     # I make the timestamp timezone-aware because that's how Unix timestamps work and it may help avoid mistakes.
-    #
+    # The timezone is America/New_York because that's the US exchanges timezone and the date is a trading day.
+    # timestamp_type = pa.timestamp("ns", tz="America/New_York")
+    # timestamp_type = pa.int64() if raw else pa.timestamp("ns", tz=tz)
     timestamp_type = pa.int64() if raw else pa.timestamp("ns", tz="UTC")
 
     # Polygon price scale is 4 decimal places (i.e. hundredths of a penny), but we'll use 10 because we have precision to spare.
@@ -94,7 +94,7 @@ def cast_strings_to_list(
     return int_list_array
 
 
-def cast_trades(trades):
+def cast_trades(trades) -> pa.Table:
     trades = trades.cast(trades_schema())
     condition_values = cast_strings_to_list(
         trades.column("conditions").combine_chunks()
@@ -102,220 +102,8 @@ def cast_trades(trades):
     return trades.append_column("condition_values", condition_values)
 
 
-def date_to_path(date, ext=".csv.gz"):
-    # return f"{date.year}/{date.month:02}/{date.isoformat()}{ext}"
-    return date.strftime("%Y/%m/%Y-%m-%d") + ext
-
-
-# def convert_to_custom_aggs_file(
-# config: PolygonConfig,
-# overwrite: bool,
-# timestamp: pd.Timestamp,
-# start_session: pd.Timestamp,
-# end_session: pd.Timestamp,
-# ):
-# date = timestamp.to_pydatetime().date()
-# aggs_date_path = date_to_path(date, ext=".parquet")
-# aggs_path = f"{config.custom_aggs_dir}/{aggs_date_path}"
-# # aggs_by_ticker_path = f"{config.custom_aggs_by_ticker_dir}/{aggs_date_path}"
-# fsspec = ArrowFSWrapper(config.filesystem)
-# if fsspec.exists(aggs_path) or fsspec.exists(aggs_by_ticker_path):
-# if overwrite:
-# if fsspec.exists(aggs_path):
-# config.filesystem.delete_file(aggs_path)
-# if fsspec.exists(aggs_by_ticker_path):
-# config.filesystem.delete_file(aggs_by_ticker_path)
-# else:
-# if fsspec.exists(aggs_path):
-# print(f"SKIPPING: {date=} File exists {aggs_path=}")
-# if fsspec.exists(aggs_by_ticker_path):
-# print(f"SKIPPING: {date=} File exists {aggs_by_ticker_path=}")
-# return
-# fsspec.mkdir(fsspec._parent(aggs_path))
-# fsspec.mkdir(fsspec._parent(aggs_by_ticker_path))
-# trades_path = f"{config.trades_dir}/{date_to_path(date)}"
-# if not fsspec.exists(trades_path):
-# print(f"ERROR: Trades file missing. Skipping {date=}. {trades_path=}")
-# return
-# print(f"{trades_path=}")
-# format = pa_ds.CsvFileFormat()
-# trades_ds = pa_ds.FileSystemDataset.from_paths(
-# [trades_path],
-# format=format,
-# schema=trades_schema(raw=True),
-# filesystem=config.filesystem,
-# )
-# fragments = trades_ds.get_fragments()
-# fragment = next(fragments)
-# try:
-# next(fragments)
-# print("ERROR: More than one fragment for {path=}")
-# except StopIteration:
-# pass
-# trades = fragment.to_table(schema=trades_ds.schema)
-# trades = trades.cast(trades_schema())
-# min_timestamp = pa.compute.min(trades.column("sip_timestamp")).as_py()
-# max_timestamp = pa.compute.max(trades.column("sip_timestamp")).as_py()
-# if min_timestamp < start_session:
-# print(f"ERROR: {min_timestamp=} < {start_session=}")
-# if max_timestamp >= end_session:
-# print(f"ERROR: {max_timestamp=} >= {end_session=}")
-# trades_df = trades.to_pandas()
-# trades_df["window_start"] = trades_df["sip_timestamp"].dt.floor(aggregate_timedelta)
-# aggs_df = trades_df.groupby(["ticker", "window_start"]).agg(
-# open=("price", "first"),
-# high=("price", "max"),
-# low=("price", "min"),
-# close=("price", "last"),
-# volume=("size", "sum"),
-# )
-# aggs_df["transactions"] = trades_df.groupby(["ticker", "window_start"]).size()
-# aggs_df.reset_index(inplace=True)
-# aggs_table = pa.Table.from_pandas(aggs_df).select(
-# [
-# "ticker",
-# "volume",
-# "open",
-# "close",
-# "high",
-# "low",
-# "window_start",
-# "transactions",
-# ]
-# )
-# aggs_table = aggs_table.sort_by(
-# [("ticker", "ascending"), ("window_start", "ascending")]
-# )
-# print(f"{aggs_by_ticker_path=}")
-# pa_parquet.write_table(
-# table=aggs_table, where=aggs_by_ticker_path, filesystem=to_config.filesystem
-# )
-# aggs_table = aggs_table.sort_by(
-# [("window_start", "ascending"), ("ticker", "ascending")]
-# )
-# print(f"{aggs_path=}")
-# pa_parquet.write_table(
-# table=aggs_table, where=aggs_path, filesystem=to_config.filesystem
-# )
-
-
-# def convert_to_custom_aggs(config: PolygonConfig,
-# overwrite: bool,
-# timestamp: pd.Timestamp,
-# start_session: pd.Timestamp,
-# end_session: pd.Timestamp):
-# date = timestamp.to_pydatetime().date()
-# aggs_date_path = date_to_path(date, ext=".parquet")
-# aggs_path = f"{config.custom_aggs_dir}/{aggs_date_path}"
-# # aggs_by_ticker_path = f"{config.custom_aggs_by_ticker_dir}/{aggs_date_path}"
-# fsspec = ArrowFSWrapper(config.filesystem)
-# if fsspec.exists(aggs_path) or fsspec.exists(aggs_by_ticker_path):
-# if overwrite:
-# if fsspec.exists(aggs_path):
-# config.filesystem.delete_file(aggs_path)
-# if fsspec.exists(aggs_by_ticker_path):
-# config.filesystem.delete_file(aggs_by_ticker_path)
-# else:
-# if fsspec.exists(aggs_path):
-# print(f"SKIPPING: {date=} File exists {aggs_path=}")
-# if fsspec.exists(aggs_by_ticker_path):
-# print(f"SKIPPING: {date=} File exists {aggs_by_ticker_path=}")
-# return
-# fsspec.mkdir(fsspec._parent(aggs_path))
-# fsspec.mkdir(fsspec._parent(aggs_by_ticker_path))
-# trades_path = f"{config.trades_dir}/{date_to_path(date)}"
-# if not fsspec.exists(trades_path):
-# print(f"ERROR: Trades file missing. Skipping {date=}. {trades_path=}")
-# return
-# print(f"{trades_path=}")
-# format = pa_ds.CsvFileFormat()
-# trades_ds = pa_ds.FileSystemDataset.from_paths([trades_path], format=format, schema=trades_schema(raw=True), filesystem=config.filesystem)
-# fragments = trades_ds.get_fragments()
-# fragment = next(fragments)
-# try:
-# next(fragments)
-# print("ERROR: More than one fragment for {path=}")
-# except StopIteration:
-# pass
-# trades = fragment.to_table(schema=trades_ds.schema)
-# trades = trades.cast(trades_schema())
-# min_timestamp = pa.compute.min(trades.column('sip_timestamp')).as_py()
-# max_timestamp = pa.compute.max(trades.column('sip_timestamp')).as_py()
-# if min_timestamp < start_session:
-# print(f"ERROR: {min_timestamp=} < {start_session=}")
-# if max_timestamp >= end_session:
-# print(f"ERROR: {max_timestamp=} >= {end_session=}")
-# trades_df = trades.to_pandas()
-# trades_df["window_start"] = trades_df["sip_timestamp"].dt.floor(aggregate_timedelta)
-# aggs_df = trades_df.groupby(["ticker", "window_start"]).agg(
-# open=('price', 'first'),
-# high=('price', 'max'),
-# low=('price', 'min'),
-# close=('price', 'last'),
-# volume=('size', 'sum'),
-# )
-# aggs_df['transactions'] = trades_df.groupby(["ticker", "window_start"]).size()
-# aggs_df.reset_index(inplace=True)
-# aggs_table = pa.Table.from_pandas(aggs_df).select(['ticker', 'volume', 'open', 'close', 'high', 'low', 'window_start', 'transactions'])
-# aggs_table = aggs_table.sort_by([('ticker', 'ascending'), ('window_start', 'ascending')])
-# print(f"{aggs_by_ticker_path=}")
-# pa_parquet.write_table(table=aggs_table,
-# where=aggs_by_ticker_path, filesystem=to_config.filesystem)
-# aggs_table = aggs_table.sort_by([('window_start', 'ascending'), ('ticker', 'ascending')])
-# print(f"{aggs_path=}")
-# pa_parquet.write_table(table=aggs_table,
-# where=aggs_path, filesystem=to_config.filesystem)
-# pa_ds.write_dataset(
-# generate_batches_from_tables(tables),
-# schema=schema,
-# base_dir=by_ticker_aggs_arrow_dir,
-# partitioning=partitioning,
-# format="parquet",
-# existing_data_behavior="overwrite_or_ignore",
-# )
-
-
-# def generate_csv_trades_tables(
-# config: PolygonConfig,
-# ) -> Tuple[datetime.date, Iterator[pa.Table]]:
-# """Generator for trades tables from flatfile CSVs."""
-# # Use pandas_market_calendars so we can get extended hours.
-# # NYSE and NASDAQ have extended hours but XNYS does not.
-# calendar = pandas_market_calendars.get_calendar(config.calendar_name)
-# schedule = calendar.schedule(start_date=config.start_timestamp, end_date=config.end_timestamp, start="pre", end="post")
-# for timestamp, session in schedule.iterrows():
-# date = timestamp.to_pydatetime().date()
-# trades_csv_path = f"{config.trades_dir}/{date_to_path(date)}"
-# format = pa_ds.CsvFileFormat()
-# trades_ds = pa_ds.FileSystemDataset.from_paths([trades_csv_path], format=format, schema=trades_schema(raw=True), filesystem=config.filesystem)
-# fragments = trades_ds.get_fragments()
-# fragment = next(fragments)
-# try:
-# next(fragments)
-# print("ERROR: More than one fragment for {path=}")
-# except StopIteration:
-# pass
-# trades = fragment.to_table(schema=trades_ds.schema)
-# trades = trades.cast(trades_schema())
-# min_timestamp = pa.compute.min(trades.column('sip_timestamp')).as_py()
-# max_timestamp = pa.compute.max(trades.column('sip_timestamp')).as_py()
-# start_session = session['pre']
-# end_session = session['post']
-# # print(f"{start_session=} {end_session=}")
-# # print(f"{min_timestamp=} {max_timestamp=}")
-# if min_timestamp < start_session:
-# print(f"ERROR: {min_timestamp=} < {start_session=}")
-# # The end_session is supposed to be a limit but there are many with trades at that second.
-# if max_timestamp >= (end_session + pd.Timedelta(seconds=1)):
-# # print(f"ERROR: {max_timestamp=} >= {end_session=}")
-# print(f"ERROR: {max_timestamp=} > {end_session+pd.Timedelta(seconds=1)=}")
-# yield date, trades
-# del fragment
-# del fragments
-# del trades_ds
-
-
 def custom_aggs_schema(raw: bool = False) -> pa.Schema:
+    # timestamp_type = pa.int64() if raw else pa.timestamp("ns", tz=tz)
     timestamp_type = pa.int64() if raw else pa.timestamp("ns", tz="UTC")
     price_type = pa.float64()
     return pa.schema(
@@ -331,6 +119,7 @@ def custom_aggs_schema(raw: bool = False) -> pa.Schema:
             pa.field("date", pa.date32(), nullable=False),
             pa.field("year", pa.uint16(), nullable=False),
             pa.field("month", pa.uint8(), nullable=False),
+            pa.field(PARTITION_COLUMN_NAME, pa.string(), nullable=False),
         ]
     )
 
@@ -344,12 +133,12 @@ def custom_aggs_partitioning() -> pa.Schema:
     )
 
 
-def
-    file_info = config.filesystem.get_file_info(config.
+def get_aggs_dates(config: PolygonConfig) -> set[datetime.date]:
+    file_info = config.filesystem.get_file_info(config.aggs_dir)
     if file_info.type == pa_fs.FileType.NotFound:
         return set()
     aggs_ds = pa_ds.dataset(
-        config.
+        config.aggs_dir,
         format="parquet",
         schema=custom_aggs_schema(),
         partitioning=custom_aggs_partitioning(),
@@ -366,17 +155,17 @@ def generate_csv_trades_tables(
     config: PolygonConfig, overwrite: bool = False
 ) -> Iterator[Tuple[datetime.date, pa.Table]]:
     """Generator for trades tables from flatfile CSVs."""
-
+    existing_aggs_dates = set()
     if not overwrite:
-
+        existing_aggs_dates = get_aggs_dates(config)
     schedule = config.calendar.trading_index(
         start=config.start_timestamp, end=config.end_timestamp, period="1D"
     )
     for timestamp in schedule:
-        date = timestamp.to_pydatetime().date()
-        if date in
+        date: datetime.date = timestamp.to_pydatetime().date()
+        if date in existing_aggs_dates:
             continue
-        trades_csv_path =
+        trades_csv_path = config.date_to_csv_file_path(date)
         convert_options = pa_csv.ConvertOptions(column_types=trades_schema(raw=True))
         trades = pa_csv.read_csv(trades_csv_path, convert_options=convert_options)
         trades = trades.cast(trades_schema())
@@ -402,7 +191,8 @@ def trades_to_custom_aggs(
     table: pa.Table,
     include_trf: bool = False,
 ) -> pa.Table:
-    print(f"{
+    print(f"{date=} {pa.default_memory_pool()=}")
+    # print(f"{datetime.datetime.now()=} {date=} {pa.default_memory_pool()=}")
     # print(f"{resource.getrusage(resource.RUSAGE_SELF).ru_maxrss=}")
     table = table.filter(pa_compute.greater(table["size"], 0))
     table = table.filter(pa_compute.equal(table["correction"], "0"))
@@ -452,37 +242,25 @@ def trades_to_custom_aggs(
     table = table.append_column(
         "month", pa.array(np.full(len(table), date.month), type=pa.uint8())
     )
+    table = table.append_column(
+        PARTITION_COLUMN_NAME,
+        pa.array(
+            [to_partition_key(ticker) for ticker in table.column("ticker").to_pylist()]
+        ),
+    )
     table = table.sort_by([("window_start", "ascending"), ("ticker", "ascending")])
+    # print(f"aggs {date=} {table.to_pandas().head()=}")
     return table
 
 
-# def generate_custom_agg_batches_from_tables(config: PolygonConfig)
+# def generate_custom_agg_batches_from_tables(config: PolygonConfig):
 # for date, trades_table in generate_csv_trades_tables(config):
-#
-#
+# aggs_table = trades_to_custom_aggs(config, date, trades_table)
+# yield aggs_table
+# del aggs_table
 # del trades_table
 
 
-# def generate_custom_agg_tables(config: PolygonConfig) -> pa.Table:
-# for date, trades_table in generate_csv_trades_tables(config):
-# yield trades_to_custom_aggs(config, date, trades_table)
-
-
-# def configure_write_custom_aggs_to_dataset(config: PolygonConfig):
-# def write_custom_aggs_to_dataset(args: Tuple[datetime.date, pa.Table]):
-# date, table = args
-# pa_ds.write_dataset(
-# trades_to_custom_aggs(config, date, table),
-# filesystem=config.filesystem,
-# base_dir=config.custom_aggs_dir,
-# partitioning=custom_aggs_partitioning(),
-# format="parquet",
-# existing_data_behavior="overwrite_or_ignore",
-# )
-
-# return write_custom_aggs_to_dataset
-
-
 def file_visitor(written_file):
     print(f"{written_file.path=}")
 
@@ -504,26 +282,25 @@ def convert_trades_to_custom_aggs(
     # generate_custom_agg_batches_from_tables(config),
     # schema=custom_aggs_schema(),
     # filesystem=config.filesystem,
-    # base_dir=config.
+    # base_dir=config.aggs_dir,
     # partitioning=custom_aggs_partitioning(),
     # format="parquet",
     # existing_data_behavior="overwrite_or_ignore",
-    # max_open_files = MAX_FILES_OPEN,
-    # min_rows_per_group = MIN_ROWS_PER_GROUP,
+    # # max_open_files = MAX_FILES_OPEN,
+    # # min_rows_per_group = MIN_ROWS_PER_GROUP,
     # )
 
     for date, trades_table in generate_csv_trades_tables(config):
         aggs_table = trades_to_custom_aggs(config, date, trades_table)
         pa_ds.write_dataset(
             aggs_table,
-            # schema=custom_aggs_schema(),
             filesystem=config.filesystem,
             base_dir=config.aggs_dir,
             partitioning=custom_aggs_partitioning(),
             format="parquet",
             existing_data_behavior="overwrite_or_ignore",
             file_visitor=file_visitor,
-            # max_open_files=
+            # max_open_files=10,
             # min_rows_per_group=MIN_ROWS_PER_GROUP,
         )
         del aggs_table
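For orientation between hunks: the per-day roll-up that `trades_to_custom_aggs` applies is the usual trades-to-OHLCV aggregation, which the commented-out code removed earlier in this diff spelled out with a pandas groupby. A minimal standalone sketch of that idea (hypothetical helper, not part of the package; column names follow the trades flatfile schema used in this file, and the one-minute window is an assumption):

import pandas as pd

def trades_to_minute_bars(trades_df: pd.DataFrame, window: str = "1min") -> pd.DataFrame:
    # Assumes trades_df has ticker, sip_timestamp (datetime64), price, and size columns.
    trades_df = trades_df.copy()
    trades_df["window_start"] = trades_df["sip_timestamp"].dt.floor(window)
    aggs = trades_df.groupby(["ticker", "window_start"]).agg(
        open=("price", "first"),
        high=("price", "max"),
        low=("price", "min"),
        close=("price", "last"),
        volume=("size", "sum"),
    )
    # Trade count per bar, aligned on the same (ticker, window_start) index.
    aggs["transactions"] = trades_df.groupby(["ticker", "window_start"]).size()
    return aggs.reset_index()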
@@ -559,386 +336,198 @@ def convert_trades_to_custom_aggs(
 # return mfi
 
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-# # https://github.com/apache/arrow/issues/39839#issuecomment-1915981816
-# # Also I don't think you can use those in a format string without a separator.
-
-# # Polygon price scale is 4 decimal places (i.e. hundredths of a penny), but we'll use 10 because we have precision to spare.
-# # price_type = pa.decimal128(precision=38, scale=10)
-# # 64bit float a little overkill but avoids any plausible truncation error.
-# price_type = pa.float64()
-
-# custom_aggs_schema = pa.schema(
-# [
-# pa.field("ticker", pa.string(), nullable=False),
-# pa.field("volume", pa.int64(), nullable=False),
-# pa.field("open", price_type, nullable=False),
-# pa.field("close", price_type, nullable=False),
-# pa.field("high", price_type, nullable=False),
-# pa.field("low", price_type, nullable=False),
-# pa.field("window_start", timestamp_type, nullable=False),
-# pa.field("transactions", pa.int64(), nullable=False),
-# pa.field(PARTITION_COLUMN_NAME, pa.string(), nullable=False),
-# ]
-# )
-
-# # TODO: Use generator like os.walk for paths.
-# return (
-# custom_aggs_schema,
-# generate_tables_from_custom_aggs(
-# paths=config.csv_paths(),
-# schema=custom_aggs_schema,
-# start_timestamp=config.start_timestamp,
-# limit_timestamp=config.end_timestamp + pd.to_timedelta(1, unit="day"),
-# ),
-# )
+def get_by_ticker_aggs_dates(config: PolygonConfig) -> set[datetime.date]:
+    file_info = config.filesystem.get_file_info(config.by_ticker_aggs_arrow_dir)
+    if file_info.type == pa_fs.FileType.NotFound:
+        return set()
+    by_ticker_aggs_ds = pa_ds.dataset(
+        config.by_ticker_aggs_arrow_dir,
+        format="parquet",
+        schema=custom_aggs_schema(),
+        partitioning=custom_aggs_partitioning(),
+    )
+    return set(
+        [
+            pa_ds.get_partition_keys(fragment.partition_expression).get("date")
+            for fragment in by_ticker_aggs_ds.get_fragments()
+        ]
+    )
 
-# def get_custom_aggs_dates(config: PolygonConfig) -> set[datetime.date]:
-# file_info = config.filesystem.get_file_info(config.custom_aggs_dir)
-# if file_info.type == pa_fs.FileType.NotFound:
-# return set()
-# aggs_ds = pa_ds.dataset(
-# config.custom_aggs_dir,
-# format="parquet",
-# schema=custom_aggs_schema(),
-# partitioning=custom_aggs_partitioning(),
-# )
-# return set(
-# [
-# pa_ds.get_partition_keys(fragment.partition_expression).get("date")
-# for fragment in aggs_ds.get_fragments()
-# ]
-# )
 
+def batches_for_date(aggs_ds: pa_ds.Dataset, date: pd.Timestamp):
+    date_filter_expr = (
+        (pa_compute.field("year") == date.year)
+        & (pa_compute.field("month") == date.month)
+        & (pa_compute.field("date") == date.date())
+    )
+    print(f"table for {date=}")
+    # return aggs_ds.scanner(filter=date_filter_expr).to_batches()
+    table = aggs_ds.scanner(filter=date_filter_expr).to_table()
+    table = table.sort_by([("part", "ascending"), ("ticker", "ascending"), ("window_start", "ascending"), ])
+    return table.to_batches()
 
-def
-
-
+def generate_batches_for_schedule(config, aggs_ds):
+    schedule = config.calendar.trading_index(
+        start=config.start_timestamp, end=config.end_timestamp, period="1D"
+    )
     for timestamp in schedule:
-
-
-            (pa_compute.field("year") == date.year)
-            & (pa_compute.field("month") == date.month)
-            & (pa_compute.field("date") == date)
-        )
-        for batch in aggs_ds.to_batches(filter=date_filter_expr):
-            # TODO: Check that these rows are within range for this file's date (not just the whole session).
-            # And if we're doing that (figuring date for each file), we can just skip reading the file.
-            # Might able to do a single comparison using compute.days_between.
-            # https://arrow.apache.org/docs/python/generated/pyarrow.compute.days_between.html
-            batch = batch.append_column(
-                PARTITION_COLUMN_NAME,
-                pa.array(
-                    [
-                        to_partition_key(ticker)
-                        for ticker in batch.column("ticker").to_pylist()
-                    ]
-                ),
-            )
-            yield batch
+        # print(f"{timestamp=}")
+        yield from batches_for_date(aggs_ds=aggs_ds, date=timestamp)
 
 
-def scatter_custom_aggs_to_by_ticker(
+# def scatter_custom_aggs_to_by_ticker(
+# config: PolygonConfig,
+# overwrite: bool = False,
+# ) -> str:
+# lock = FileLock(config.lock_file_path, blocking=False)
+# with lock:
+# if not lock.is_locked:
+# raise IOError("Failed to acquire lock for updating custom assets.")
+# with open(config.by_ticker_dates_path, "a") as f:
+# f.write("I have a bad feeling about this.")
+# by_ticker_aggs_arrow_dir = scatter_custom_aggs_to_by_ticker_(config, overwrite)
+
+# print(f"Scattered custom aggregates by ticker to {by_ticker_aggs_arrow_dir=}")
+# return by_ticker_aggs_arrow_dir
+
+
+def filter_by_date(config: PolygonConfig) -> pa_compute.Expression:
+    start_date = config.start_timestamp.tz_localize(config.calendar.tz.key).date()
+    limit_date = (
+        (config.end_timestamp + pd.Timedelta(days=1))
+        .tz_localize(config.calendar.tz.key)
+        .date()
+    )
+    return (pa_compute.field("date") >= start_date) & (
+        pa_compute.field("date") <= limit_date
+    )
+
+
+# def generate_batches_with_partition(
+# config: PolygonConfig,
+# aggs_ds: pa_ds.Dataset,
+# ) -> Iterator[pa.Table]:
+# for fragment in aggs_ds.sort_by("date").get_fragments(
+# filter=filter_by_date(config),
+# ):
+# for batch in fragment.to_batches():
+# # batch = batch.append_column(
+# # PARTITION_COLUMN_NAME,
+# # pa.array(
+# # [
+# # to_partition_key(ticker)
+# # for ticker in batch.column("ticker").to_pylist()
+# # ]
+# # ),
+# # )
+# yield batch.sort_by(
+# [("ticker", "ascending"), ("window_start", "ascending")]
+# )
+# del batch
+# del fragment
+
+
+def generate_batches_with_partition(
     config: PolygonConfig,
-
-) ->
-
-
-
+    aggs_ds: pa_ds.Dataset,
+) -> Iterator[pa.Table]:
+    for fragment in (
+        aggs_ds.filter(filter_by_date(config))
+        .sort_by([(PARTITION_COLUMN_NAME, "ascending"), ("date", "ascending")])
+        .get_fragments()
+    ):
+        for batch in fragment.to_batches():
+            yield batch.sort_by(
+                [("ticker", "ascending"), ("window_start", "ascending")]
+            )
+            del batch
+        del fragment
 
-    by_ticker_aggs_arrow_dir = config.by_ticker_aggs_arrow_dir
-    if os.path.exists(by_ticker_aggs_arrow_dir):
-        if overwrite:
-            print(f"Removing {by_ticker_aggs_arrow_dir=}")
-            shutil.rmtree(by_ticker_aggs_arrow_dir)
-        else:
-            print(f"Found existing {by_ticker_aggs_arrow_dir=}")
-            return by_ticker_aggs_arrow_dir
 
+def scatter_custom_aggs_to_by_ticker(config, overwrite=False) -> str:
     aggs_ds = pa_ds.dataset(
-        config.
+        config.aggs_dir,
         format="parquet",
         schema=custom_aggs_schema(),
         partitioning=custom_aggs_partitioning(),
     )
-
-        start=config.start_timestamp, end=config.end_timestamp, period="1D"
-    )
-    assert type(schedule) is pd.DatetimeIndex
+    by_ticker_schema = aggs_ds.schema
     partitioning = pa_ds.partitioning(
-        pa.schema([(PARTITION_COLUMN_NAME, pa.string())]),
+        pa.schema([(PARTITION_COLUMN_NAME, pa.string())]),
+        flavor="hive",
     )
-
-
-
+    by_ticker_aggs_arrow_dir = config.by_ticker_aggs_arrow_dir
+    print(f"Scattering custom aggregates by ticker to {by_ticker_aggs_arrow_dir=}")
     pa_ds.write_dataset(
-
-
+        # generate_batches_with_partition(config=config, aggs_ds=aggs_ds),
+        generate_batches_for_schedule(config=config, aggs_ds=aggs_ds),
+        schema=by_ticker_schema,
         base_dir=by_ticker_aggs_arrow_dir,
         partitioning=partitioning,
         format="parquet",
         existing_data_behavior="overwrite_or_ignore",
     )
-    print(f"Scattered
+    print(f"Scattered aggregates by ticker to {by_ticker_aggs_arrow_dir=}")
     return by_ticker_aggs_arrow_dir
 
 
-def
-
-
-
-    # Calculate gain and loss using vectorized operations
-    positive_mf = np.maximum(signed_mf, 0)
-    negative_mf = np.maximum(-signed_mf, 0)
-
-    mf_avg_gain = (
-        np.convolve(positive_mf, np.ones(period), mode="full")[: len(positive_mf)]
-        / period
-    )
-    mf_avg_loss = (
-        np.convolve(negative_mf, np.ones(period), mode="full")[: len(negative_mf)]
-        / period
-    )
-
-    epsilon = 1e-10  # Small epsilon value to avoid division by zero
-    mfi = 100 - (100 / (1 + mf_avg_gain / (mf_avg_loss + epsilon)))
-    return mfi
-
-
-# https://github.com/twopirllc/pandas-ta/blob/main/pandas_ta/momentum/stoch.py
-# https://github.com/twopirllc/pandas-ta/blob/development/pandas_ta/momentum/stoch.py
-# `k` vs `fast_k` arg names.
-# https://github.com/twopirllc/pandas-ta/issues/726
-# Results affected by values outside range
-# https://github.com/twopirllc/pandas-ta/issues/535
-
-
-def calculate_stoch(
-    high: pd.Series,
-    low: pd.Series,
-    close: pd.Series,
-    k: int = 14,
-    d: int = 3,
-    smooth_k: int = 3,
-    mamode: str = "sma",
-):
-    """Indicator: Stochastic Oscillator (STOCH)"""
-    lowest_low = low.rolling(k).min()
-    highest_high = high.rolling(k).max()
+# def scatter_custom_aggs_to_by_ticker(config, overwrite=False) -> str:
+# file_info = config.filesystem.get_file_info(config.aggs_dir)
+# if file_info.type == pa_fs.FileType.NotFound:
+# raise FileNotFoundError(f"{config.aggs_dir=} not found.")
 
-
-
+# by_ticker_aggs_arrow_dir = config.by_ticker_aggs_arrow_dir
+# if os.path.exists(by_ticker_aggs_arrow_dir):
+# if overwrite:
+# print(f"Removing {by_ticker_aggs_arrow_dir=}")
+# shutil.rmtree(by_ticker_aggs_arrow_dir)
 
-
-
-
-
-        ta.overlap.ma(mamode, stoch_k.loc[stoch_k.first_valid_index() :,], length=d)
-        if stoch_k is not None
-        else None
-    )
-    # Histogram
-    stoch_h = stoch_k - stoch_d if stoch_d is not None else None
+# schedule = config.calendar.trading_index(
+# start=config.start_timestamp, end=config.end_timestamp, period="1D"
+# )
+# assert type(schedule) is pd.DatetimeIndex
 
-
+# print(f"Scattering custom aggregates by ticker to {by_ticker_aggs_arrow_dir=}")
+# aggs_ds = pa_ds.dataset(
+# config.aggs_dir,
+# format="parquet",
+# schema=custom_aggs_schema(),
+# partitioning=custom_aggs_partitioning(),
+# )
+# by_ticker_partitioning = pa_ds.partitioning(
+# pa.schema([(PARTITION_COLUMN_NAME, pa.string())]),
+# # pa.schema(
+# # [
+# # (PARTITION_COLUMN_NAME, pa.string()),
+# # ("year", pa.uint16()),
+# # ("month", pa.uint8()),
+# # ("date", pa.date32()),
+# # ]
+# # ),
+# flavor="hive",
+# )
+# by_ticker_schema = custom_aggs_schema()
+# by_ticker_schema = by_ticker_schema.append(
+# pa.field(PARTITION_COLUMN_NAME, pa.string(), nullable=False),
+# )
 
+# # TODO: Collect the dates we've scattered and write a special partition key with them.
+# pa_ds.write_dataset(
+# generate_batches_for_schedule(schedule, aggs_ds),
+# schema=by_ticker_schema,
+# base_dir=by_ticker_aggs_arrow_dir,
+# partitioning=by_ticker_partitioning,
+# format="parquet",
+# existing_data_behavior="overwrite_or_ignore",
+# # max_open_files=250,
+# # file_visitor=file_visitor,
+# )
 
-
-    df = df.set_index("window_start").sort_index()
-    session_index = pd.date_range(
-        start=df.index[0], end=df.index[-1], freq=pd.Timedelta(seconds=60)
-    )
-    df = df.reindex(session_index)
-    df.index.rename("window_start", inplace=True)
-
-    # df["minute_of_day"] = (df.index.hour * 60) + df.index.minute
-    # df["day_of_week"] = df.index.day_of_week
-
-    df.transactions = df.transactions.fillna(0)
-    df.volume = df.volume.fillna(0)
-    df.total = df.total.fillna(0)
-    df.close = df.close.ffill()
-    close = df.close
-    df.vwap = df.vwap.fillna(close)
-    df.high = df.high.fillna(close)
-    df.low = df.low.fillna(close)
-    df.open = df.open.fillna(close)
-    price_open = df.open
-    high = df.high
-    low = df.low
-    vwap = df.vwap
-    # volume = df.volume
-    total = df.total
-    next_close = close.shift()
-
-    # TODO: Odometer rollover signal. Relative difference to nearest power of 10.
-    # Something about log10 being a whole number? When is $50 the rollover vs $100 or $10?
-
-    # "True (Typical?) Price" which I think is an approximation of VWAP.
-    # Trouble with both is that if there are no trades in a bar we get NaN.
-    # That then means we get NaN for averages for the next period-1 bars too.
-    # Question is whether to ffill the price for these calculations.
-    df["TP"] = (high + low + close) / 3
-
-    # Gain/loss in this bar.
-    df["ret1bar"] = close.div(price_open).sub(1)
-
-    for t in range(2, period):
-        df[f"ret{t}bar"] = close.div(price_open.shift(t - 1)).sub(1)
-
-    # Average True Range (ATR)
-    true_range = pd.concat(
-        [high.sub(low), high.sub(next_close).abs(), low.sub(next_close).abs()], axis=1
-    ).max(1)
-    # Normalized ATR (NATR) or Average of Normalized TR.
-    # Choice of NATR operations ordering discussion: https://www.macroption.com/normalized-atr/
-    # He doesn't talk about VWAP but I think that is a better normalizing price for a bar.
-    # atr = true_range.ewm(span=period).mean()
-    # df["natr_c"] = atr / close
-    # df["antr_c"] = (true_range / close).ewm(span=period).mean()
-    # df["natr_v"] = atr / vwap
-    # df["antr_v"] = (true_range / vwap).ewm(span=period).mean()
-    df["NATR"] = (true_range / vwap).ewm(span=period).mean()
-
-    # True Price as HLC average VS VWAP.
-    # VWAP is better I think but is quite different than standard CCI.
-    # Three ways to compute CCI, all give the same value using TP.
-    # tp = (high + low + close) / 3
-    # df['SMA'] = ta.sma(tp, length=period)
-    # df['sma_r'] = tp.rolling(period).mean()
-    # df['MAD'] = ta.mad(tp, length=period)
-    # # Series.mad deprecated. mad = (s - s.mean()).abs().mean()
-    # df['mad_r'] = tp.rolling(period).apply(lambda x: (pd.Series(x) - pd.Series(x).mean()).abs().mean())
-
-    # df['cci_r'] = (tp - df['sma_r']) / (0.015 * df['mad_r'])
-    # df['CCI'] = (tp - df['SMA']) / (0.015 * df['MAD'])
-    # df['cci_ta'] = ta.cci(high=high, low=low, close=close, length=period)
-
-    df["taCCI"] = ta.cci(high=high, low=low, close=close, length=period)
-
-    # https://gist.github.com/quantra-go-algo/1b37bfb74d69148f0dfbdb5a2c7bdb25
-    # https://medium.com/@huzaifazahoor654/how-to-calculate-cci-in-python-a-step-by-step-guide-9a3f61698be6
-    sma = pd.Series(ta.sma(vwap, length=period))
-    mad = pd.Series(ta.mad(vwap, length=period))
-    df["CCI"] = (vwap - sma) / (0.015 * mad)
-
-    # df['MFI'] = calculate_mfi(high=high, low=low, close=close, volume=volume, period=period)
-    df["MFI"] = calculate_mfi(typical_price=vwap, money_flow=total, period=period)
-
-    # We use Stochastic (rather than MACD because we need a ticker independent indicator.
-    # IOW a percentage price oscillator (PPO) rather than absolute price oscillator (APO).
-    # https://www.alpharithms.com/moving-average-convergence-divergence-macd-031217/
-    # We're using 14/3 currently rather than the usual 26/12 popular for MACD though.
-    stoch_k, stoch_d, stoch_h = calculate_stoch(high, low, close, k=period)
-    df["STOCHk"] = stoch_k
-    df["STOCHd"] = stoch_d
-    df["STOCHh"] = stoch_h
-
-    return df
-
-
-def iterate_all_aggs_tables(
-    config: PolygonConfig,
-    valid_tickers: pa.Array,
-):
-    schedule = config.calendar.trading_index(
-        start=config.start_timestamp, end=config.end_timestamp, period="1D"
-    )
-    for timestamp in schedule:
-        date = timestamp.to_pydatetime().date()
-        aggs_ds = pa_ds.dataset(
-            config.custom_aggs_dir,
-            format="parquet",
-            schema=custom_aggs_schema(),
-            partitioning=custom_aggs_partitioning(),
-        )
-        date_filter_expr = (
-            (pa_compute.field("year") == date.year)
-            & (pa_compute.field("month") == date.month)
-            & (pa_compute.field("date") == date)
-        )
-        # print(f"{date_filter_expr=}")
-        for fragment in aggs_ds.get_fragments(filter=date_filter_expr):
-            session_filter = (
-                (pa_compute.field("window_start") >= start_dt)
-                & (pa_compute.field("window_start") < end_dt)
-                & pa_compute.is_in(pa_compute.field("ticker"), valid_tickers)
-            )
-            # Sorting table doesn't seem to avoid needing to sort the df. Maybe use_threads=False on to_pandas would help?
-            # table = fragment.to_table(filter=session_filter).sort_by([('ticker', 'ascending'), ('window_start', 'descending')])
-            table = fragment.to_table(filter=session_filter)
-            if table.num_rows > 0:
-                metadata = (
-                    dict(table.schema.metadata) if table.schema.metadata else dict()
-                )
-                metadata["date"] = date.isoformat()
-                table = table.replace_schema_metadata(metadata)
-                yield table
-
-
-# def iterate_all_aggs_with_signals(config: PolygonConfig):
-# for table in iterate_all_aggs_tables(config):
-# df = table.to_pandas()
-# df = df.groupby("ticker").apply(
-# compute_per_ticker_signals, include_groups=False
-# )
-# yield pa.Table.from_pandas(df)
-
-
-def compute_signals_for_all_custom_aggs(
-    from_config: PolygonConfig,
-    to_config: PolygonConfig,
-    valid_tickers: pa.Array,
-    overwrite: bool = False,
-) -> str:
-    if overwrite:
-        print("WARNING: overwrite not implemented/ignored.")
+# return by_ticker_aggs_arrow_dir
 
-    print(f"{to_config.custom_aggs_dir=}")
 
-
-
-
-
-
-        df = df.groupby("ticker").apply(
-            compute_per_ticker_signals, include_groups=False
-        )
-        table = pa.Table.from_pandas(df)
-        if table.num_rows > 0:
-            table = table.replace_schema_metadata(metadata)
-            table = table.append_column("date", pa.array(np.full(len(table), date)))
-            table = table.append_column(
-                "year", pa.array(np.full(len(table), date.year), type=pa.uint16())
-            )
-            table = table.append_column(
-                "month", pa.array(np.full(len(table), date.month), type=pa.uint8())
-            )
-            table = table.sort_by(
-                [("ticker", "ascending"), ("window_start", "ascending")]
-            )
-            pa_ds.write_dataset(
-                table,
-                filesystem=to_config.filesystem,
-                base_dir=to_config.custom_aggs_dir,
-                partitioning=custom_aggs_partitioning(),
-                format="parquet",
-                existing_data_behavior="overwrite_or_ignore",
-                file_visitor=file_visitor,
-            )
-    return to_config.custom_aggs_dir
+# def generate_tables_from_custom_aggs_ds(
+# aggs_ds: pa_ds.Dataset, schedule: pd.DatetimeIndex
+# ):
+# for timestamp in schedule:
+# yield table_for_date(aggs_ds=aggs_ds, date=timestamp.to_pydatetime().date())