zipline_polygon_bundle 0.1.7__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- zipline_polygon_bundle/__init__.py +33 -5
- zipline_polygon_bundle/adjustments.py +60 -31
- zipline_polygon_bundle/bundle.py +202 -208
- zipline_polygon_bundle/compute_signals.py +261 -0
- zipline_polygon_bundle/concat_all_aggs.py +140 -70
- zipline_polygon_bundle/concat_all_aggs_partitioned.py +6 -6
- zipline_polygon_bundle/config.py +167 -36
- zipline_polygon_bundle/nyse_all_hours_calendar.py +25 -0
- zipline_polygon_bundle/polygon_file_reader.py +1 -1
- zipline_polygon_bundle/process_all_aggs.py +2 -2
- zipline_polygon_bundle/quotes.py +101 -0
- zipline_polygon_bundle/tickers_and_names.py +5 -38
- zipline_polygon_bundle/trades.py +533 -0
- {zipline_polygon_bundle-0.1.7.dist-info → zipline_polygon_bundle-0.2.0.dist-info}/METADATA +10 -5
- zipline_polygon_bundle-0.2.0.dist-info/RECORD +18 -0
- zipline_polygon_bundle-0.1.7.dist-info/RECORD +0 -14
- {zipline_polygon_bundle-0.1.7.dist-info → zipline_polygon_bundle-0.2.0.dist-info}/LICENSE +0 -0
- {zipline_polygon_bundle-0.1.7.dist-info → zipline_polygon_bundle-0.2.0.dist-info}/WHEEL +0 -0
zipline_polygon_bundle/trades.py (new file, 533 added lines)
@@ -0,0 +1,533 @@
from .config import PolygonConfig, PARTITION_COLUMN_NAME, to_partition_key

from typing import Iterator, Tuple

import pyarrow as pa
import pyarrow.compute as pa_compute
import pyarrow.csv as pa_csv
import pyarrow.dataset as pa_ds
import pyarrow.fs as pa_fs

from fsspec.implementations.arrow import ArrowFSWrapper

import os
import datetime

import numpy as np
import pandas as pd


def trades_schema(raw: bool = False) -> pa.Schema:
    # There is some problem reading the timestamps as timestamps so we have to read as integer then change the schema.
    # Polygon Aggregate flatfile timestamps are in nanoseconds (like trades), not milliseconds as the docs say.
    # I make the timestamp timezone-aware because that's how Unix timestamps work and it may help avoid mistakes.
    # The timezone is America/New_York because that's the US exchanges timezone and the date is a trading day.
    # timestamp_type = pa.timestamp("ns", tz="America/New_York")
    # timestamp_type = pa.int64() if raw else pa.timestamp("ns", tz=tz)
    timestamp_type = pa.int64() if raw else pa.timestamp("ns", tz="UTC")

    # Polygon price scale is 4 decimal places (i.e. hundredths of a penny), but we'll use 10 because we have precision to spare.
    # price_type = pa.decimal128(precision=38, scale=10)
    # 64bit float a little overkill but avoids any plausible truncation error.
    price_type = pa.float64()

    return pa.schema(
        [
            pa.field("ticker", pa.string(), nullable=False),
            pa.field("conditions", pa.string(), nullable=False),
            pa.field("correction", pa.string(), nullable=False),
            pa.field("exchange", pa.int8(), nullable=False),
            pa.field("id", pa.string(), nullable=False),
            pa.field("participant_timestamp", timestamp_type, nullable=False),
            pa.field("price", price_type, nullable=False),
            pa.field("sequence_number", pa.int64(), nullable=False),
            pa.field("sip_timestamp", timestamp_type, nullable=False),
            pa.field("size", pa.int64(), nullable=False),
            pa.field("tape", pa.int8(), nullable=False),
            pa.field("trf_id", pa.int64(), nullable=False),
            pa.field("trf_timestamp", timestamp_type, nullable=False),
        ]
    )


def trades_dataset(config: PolygonConfig) -> pa_ds.Dataset:
    """
    Create a pyarrow dataset from the trades files.
    """

    # https://arrow.apache.org/docs/python/filesystems.html#using-arrow-filesystems-with-fsspec
    # https://filesystem-spec.readthedocs.io/en/latest/_modules/fsspec/spec.html#AbstractFileSystem.glob.
    fsspec = ArrowFSWrapper(config.filesystem)

    # We sort by path because they have the year and month in the dir names and the date in the filename.
    paths = sorted(
        fsspec.glob(os.path.join(config.trades_dir, config.csv_paths_pattern))
    )

    return pa_ds.FileSystemDataset.from_paths(
        paths,
        format=pa_ds.CsvFileFormat(),
        schema=trades_schema(raw=True),
        filesystem=config.filesystem,
    )


def cast_strings_to_list(
    string_array, separator=",", default="0", value_type=pa.uint8()
):
    """Cast a PyArrow StringArray of comma-separated numbers to a ListArray of values."""

    # Create a mask to identify empty strings
    is_empty = pa_compute.equal(pa_compute.utf8_trim_whitespace(string_array), "")

    # Use replace_with_mask to replace empty strings with the default ("0")
    filled_column = pa_compute.replace_with_mask(
        string_array, is_empty, pa.scalar(default)
    )

    # Split the strings by comma
    split_array = pa_compute.split_pattern(filled_column, pattern=separator)

    # Cast each element in the resulting lists to integers
    int_list_array = pa_compute.cast(split_array, pa.list_(value_type))

    return int_list_array


def cast_trades(trades) -> pa.Table:
    trades = trades.cast(trades_schema())
    condition_values = cast_strings_to_list(
        trades.column("conditions").combine_chunks()
    )
    return trades.append_column("condition_values", condition_values)


def custom_aggs_schema(raw: bool = False) -> pa.Schema:
    # timestamp_type = pa.int64() if raw else pa.timestamp("ns", tz=tz)
    timestamp_type = pa.int64() if raw else pa.timestamp("ns", tz="UTC")
    price_type = pa.float64()
    return pa.schema(
        [
            pa.field("ticker", pa.string(), nullable=False),
            pa.field("volume", pa.int64(), nullable=False),
            pa.field("open", price_type, nullable=False),
            pa.field("close", price_type, nullable=False),
            pa.field("high", price_type, nullable=False),
            pa.field("low", price_type, nullable=False),
            pa.field("window_start", timestamp_type, nullable=False),
            pa.field("transactions", pa.int64(), nullable=False),
            pa.field("date", pa.date32(), nullable=False),
            pa.field("year", pa.uint16(), nullable=False),
            pa.field("month", pa.uint8(), nullable=False),
            pa.field(PARTITION_COLUMN_NAME, pa.string(), nullable=False),
        ]
    )


def custom_aggs_partitioning() -> pa.Schema:
    return pa_ds.partitioning(
        pa.schema(
            [("year", pa.uint16()), ("month", pa.uint8()), ("date", pa.date32())]
        ),
        flavor="hive",
    )


def get_aggs_dates(config: PolygonConfig) -> set[datetime.date]:
    file_info = config.filesystem.get_file_info(config.aggs_dir)
    if file_info.type == pa_fs.FileType.NotFound:
        return set()
    aggs_ds = pa_ds.dataset(
        config.aggs_dir,
        format="parquet",
        schema=custom_aggs_schema(),
        partitioning=custom_aggs_partitioning(),
    )
    return set(
        [
            pa_ds.get_partition_keys(fragment.partition_expression).get("date")
            for fragment in aggs_ds.get_fragments()
        ]
    )


def generate_csv_trades_tables(
    config: PolygonConfig, overwrite: bool = False
) -> Iterator[Tuple[datetime.date, pa.Table]]:
    """Generator for trades tables from flatfile CSVs."""
    existing_aggs_dates = set()
    if not overwrite:
        existing_aggs_dates = get_aggs_dates(config)
    schedule = config.calendar.trading_index(
        start=config.start_timestamp, end=config.end_timestamp, period="1D"
    )
    for timestamp in schedule:
        date: datetime.date = timestamp.to_pydatetime().date()
        if date in existing_aggs_dates:
            continue
        trades_csv_path = config.date_to_csv_file_path(date)
        convert_options = pa_csv.ConvertOptions(column_types=trades_schema(raw=True))
        trades = pa_csv.read_csv(trades_csv_path, convert_options=convert_options)
        trades = trades.cast(trades_schema())
        # min_timestamp = pa.compute.min(trades.column('sip_timestamp')).as_py()
        # max_timestamp = pa.compute.max(trades.column('sip_timestamp')).as_py()
        # start_session = session['pre']
        # end_session = session['post']
        # # print(f"{start_session=} {end_session=}")
        # # print(f"{min_timestamp=} {max_timestamp=}")
        # if min_timestamp < start_session:
        #     print(f"ERROR: {min_timestamp=} < {start_session=}")
        # # The end_session is supposed to be a limit but there are many with trades at that second.
        # if max_timestamp >= (end_session + pd.Timedelta(seconds=1)):
        #     # print(f"ERROR: {max_timestamp=} >= {end_session=}")
        #     print(f"ERROR: {max_timestamp=} > {end_session+pd.Timedelta(seconds=1)=}")
        yield date, trades
        del trades


def trades_to_custom_aggs(
    config: PolygonConfig,
    date: datetime.date,
    table: pa.Table,
    include_trf: bool = False,
) -> pa.Table:
    print(f"{date=} {pa.default_memory_pool()=}")
    # print(f"{datetime.datetime.now()=} {date=} {pa.default_memory_pool()=}")
    # print(f"{resource.getrusage(resource.RUSAGE_SELF).ru_maxrss=}")
    table = table.filter(pa_compute.greater(table["size"], 0))
    table = table.filter(pa_compute.equal(table["correction"], "0"))
    if not include_trf:
        table = table.filter(pa_compute.not_equal(table["exchange"], 4))
    table = table.append_column(
        "price_total", pa_compute.multiply(table["price"], table["size"])
    )
    table = table.append_column(
        "window_start",
        pa_compute.floor_temporal(
            table["sip_timestamp"], multiple=config.agg_timedelta.seconds, unit="second"
        ),
    )
    table = table.group_by(["ticker", "window_start"], use_threads=False).aggregate(
        [
            ("price", "first"),
            ("price", "max"),
            ("price", "min"),
            ("price", "last"),
            ("price_total", "sum"),
            ("size", "sum"),
            ([], "count_all"),
        ]
    )
    table = table.rename_columns(
        {
            "price_first": "open",
            "price_max": "high",
            "price_min": "low",
            "price_last": "close",
            "size_sum": "volume",
            "price_total_sum": "total",
            "count_all": "transactions",
        }
    )
    table = table.append_column(
        "vwap", pa_compute.divide(table["total"], table["volume"])
    )
    # table.append_column('date', pa.array([date] * len(table), type=pa.date32()))
    # table.append_column('year', pa.array([date.year] * len(table), type=pa.uint16()))
    # table.append_column('month', pa.array([date.month] * len(table), type=pa.uint8()))
    table = table.append_column("date", pa.array(np.full(len(table), date)))
    table = table.append_column(
        "year", pa.array(np.full(len(table), date.year), type=pa.uint16())
    )
    table = table.append_column(
        "month", pa.array(np.full(len(table), date.month), type=pa.uint8())
    )
    table = table.append_column(
        PARTITION_COLUMN_NAME,
        pa.array(
            [to_partition_key(ticker) for ticker in table.column("ticker").to_pylist()]
        ),
    )
    table = table.sort_by([("window_start", "ascending"), ("ticker", "ascending")])
    # print(f"aggs {date=} {table.to_pandas().head()=}")
    return table


# def generate_custom_agg_batches_from_tables(config: PolygonConfig):
#     for date, trades_table in generate_csv_trades_tables(config):
#         aggs_table = trades_to_custom_aggs(config, date, trades_table)
#         yield aggs_table
#         del aggs_table
#         del trades_table


def file_visitor(written_file):
    print(f"{written_file.path=}")


def convert_trades_to_custom_aggs(
    config: PolygonConfig, overwrite: bool = False
) -> str:
    if overwrite:
        print("WARNING: overwrite not implemented/ignored.")

    # MAX_FILES_OPEN = 8
    # MIN_ROWS_PER_GROUP = 100_000

    print(f"{config.aggs_dir=}")

    # pa.set_memory_pool()

    # pa_ds.write_dataset(
    #     generate_custom_agg_batches_from_tables(config),
    #     schema=custom_aggs_schema(),
    #     filesystem=config.filesystem,
    #     base_dir=config.aggs_dir,
    #     partitioning=custom_aggs_partitioning(),
    #     format="parquet",
    #     existing_data_behavior="overwrite_or_ignore",
    #     # max_open_files = MAX_FILES_OPEN,
    #     # min_rows_per_group = MIN_ROWS_PER_GROUP,
    # )

    for date, trades_table in generate_csv_trades_tables(config):
        aggs_table = trades_to_custom_aggs(config, date, trades_table)
        pa_ds.write_dataset(
            aggs_table,
            filesystem=config.filesystem,
            base_dir=config.aggs_dir,
            partitioning=custom_aggs_partitioning(),
            format="parquet",
            existing_data_behavior="overwrite_or_ignore",
            file_visitor=file_visitor,
            # max_open_files=10,
            # min_rows_per_group=MIN_ROWS_PER_GROUP,
        )
        del aggs_table
        del trades_table

    # with ProcessPoolExecutor(max_workers=1) as executor:
    #     executor.map(
    #         configure_write_custom_aggs_to_dataset(config),
    #         generate_csv_trades_tables(config),
    #     )

    print(f"Generated aggregates to {config.aggs_dir=}")
    return config.aggs_dir


# https://github.com/twopirllc/pandas-ta/issues/731#issuecomment-1766786952

# def calculate_mfi(high, low, close, volume, period):
#     typical_price = (high + low + close) / 3
#     money_flow = typical_price * volume
#     mf_sign = np.where(typical_price > np.roll(typical_price, shift=1), 1, -1)
#     signed_mf = money_flow * mf_sign

#     # Calculate gain and loss using vectorized operations
#     positive_mf = np.maximum(signed_mf, 0)
#     negative_mf = np.maximum(-signed_mf, 0)

#     mf_avg_gain = np.convolve(positive_mf, np.ones(period), mode='full')[:len(positive_mf)] / period
#     mf_avg_loss = np.convolve(negative_mf, np.ones(period), mode='full')[:len(negative_mf)] / period

#     epsilon = 1e-10  # Small epsilon value to avoid division by zero
#     mfi = 100 - 100 / (1 + mf_avg_gain / (mf_avg_loss + epsilon))
#     return mfi


def get_by_ticker_aggs_dates(config: PolygonConfig) -> set[datetime.date]:
    file_info = config.filesystem.get_file_info(config.by_ticker_aggs_arrow_dir)
    if file_info.type == pa_fs.FileType.NotFound:
        return set()
    by_ticker_aggs_ds = pa_ds.dataset(
        config.by_ticker_aggs_arrow_dir,
        format="parquet",
        schema=custom_aggs_schema(),
        partitioning=custom_aggs_partitioning(),
    )
    return set(
        [
            pa_ds.get_partition_keys(fragment.partition_expression).get("date")
            for fragment in by_ticker_aggs_ds.get_fragments()
        ]
    )


def batches_for_date(aggs_ds: pa_ds.Dataset, date: pd.Timestamp):
    date_filter_expr = (
        (pa_compute.field("year") == date.year)
        & (pa_compute.field("month") == date.month)
        & (pa_compute.field("date") == date.date())
    )
    print(f"table for {date=}")
    # return aggs_ds.scanner(filter=date_filter_expr).to_batches()
    table = aggs_ds.scanner(filter=date_filter_expr).to_table()
    table = table.sort_by(
        [("part", "ascending"), ("ticker", "ascending"), ("window_start", "ascending")]
    )
    return table.to_batches()


def generate_batches_for_schedule(config, aggs_ds):
    schedule = config.calendar.trading_index(
        start=config.start_timestamp, end=config.end_timestamp, period="1D"
    )
    for timestamp in schedule:
        # print(f"{timestamp=}")
        yield from batches_for_date(aggs_ds=aggs_ds, date=timestamp)


# def scatter_custom_aggs_to_by_ticker(
#     config: PolygonConfig,
#     overwrite: bool = False,
# ) -> str:
#     lock = FileLock(config.lock_file_path, blocking=False)
#     with lock:
#         if not lock.is_locked:
#             raise IOError("Failed to acquire lock for updating custom assets.")
#         with open(config.by_ticker_dates_path, "a") as f:
#             f.write("I have a bad feeling about this.")
#             by_ticker_aggs_arrow_dir = scatter_custom_aggs_to_by_ticker_(config, overwrite)

#     print(f"Scattered custom aggregates by ticker to {by_ticker_aggs_arrow_dir=}")
#     return by_ticker_aggs_arrow_dir


def filter_by_date(config: PolygonConfig) -> pa_compute.Expression:
    start_date = config.start_timestamp.tz_localize(config.calendar.tz.key).date()
    limit_date = (
        (config.end_timestamp + pd.Timedelta(days=1))
        .tz_localize(config.calendar.tz.key)
        .date()
    )
    return (pa_compute.field("date") >= start_date) & (
        pa_compute.field("date") <= limit_date
    )


# def generate_batches_with_partition(
#     config: PolygonConfig,
#     aggs_ds: pa_ds.Dataset,
# ) -> Iterator[pa.Table]:
#     for fragment in aggs_ds.sort_by("date").get_fragments(
#         filter=filter_by_date(config),
#     ):
#         for batch in fragment.to_batches():
#             # batch = batch.append_column(
#             #     PARTITION_COLUMN_NAME,
#             #     pa.array(
#             #         [
#             #             to_partition_key(ticker)
#             #             for ticker in batch.column("ticker").to_pylist()
#             #         ]
#             #     ),
#             # )
#             yield batch.sort_by(
#                 [("ticker", "ascending"), ("window_start", "ascending")]
#             )
#             del batch
#         del fragment


def generate_batches_with_partition(
    config: PolygonConfig,
    aggs_ds: pa_ds.Dataset,
) -> Iterator[pa.Table]:
    for fragment in (
        aggs_ds.filter(filter_by_date(config))
        .sort_by([(PARTITION_COLUMN_NAME, "ascending"), ("date", "ascending")])
        .get_fragments()
    ):
        for batch in fragment.to_batches():
            yield batch.sort_by(
                [("ticker", "ascending"), ("window_start", "ascending")]
            )
            del batch
        del fragment


def scatter_custom_aggs_to_by_ticker(config, overwrite=False) -> str:
    aggs_ds = pa_ds.dataset(
        config.aggs_dir,
        format="parquet",
        schema=custom_aggs_schema(),
        partitioning=custom_aggs_partitioning(),
    )
    by_ticker_schema = aggs_ds.schema
    partitioning = pa_ds.partitioning(
        pa.schema([(PARTITION_COLUMN_NAME, pa.string())]),
        flavor="hive",
    )
    by_ticker_aggs_arrow_dir = config.by_ticker_aggs_arrow_dir
    print(f"Scattering custom aggregates by ticker to {by_ticker_aggs_arrow_dir=}")
    pa_ds.write_dataset(
        # generate_batches_with_partition(config=config, aggs_ds=aggs_ds),
        generate_batches_for_schedule(config=config, aggs_ds=aggs_ds),
        schema=by_ticker_schema,
        base_dir=by_ticker_aggs_arrow_dir,
        partitioning=partitioning,
        format="parquet",
        existing_data_behavior="overwrite_or_ignore",
    )
    print(f"Scattered aggregates by ticker to {by_ticker_aggs_arrow_dir=}")
    return by_ticker_aggs_arrow_dir


# def scatter_custom_aggs_to_by_ticker(config, overwrite=False) -> str:
#     file_info = config.filesystem.get_file_info(config.aggs_dir)
#     if file_info.type == pa_fs.FileType.NotFound:
#         raise FileNotFoundError(f"{config.aggs_dir=} not found.")

#     by_ticker_aggs_arrow_dir = config.by_ticker_aggs_arrow_dir
#     if os.path.exists(by_ticker_aggs_arrow_dir):
#         if overwrite:
#             print(f"Removing {by_ticker_aggs_arrow_dir=}")
#             shutil.rmtree(by_ticker_aggs_arrow_dir)

#     schedule = config.calendar.trading_index(
#         start=config.start_timestamp, end=config.end_timestamp, period="1D"
#     )
#     assert type(schedule) is pd.DatetimeIndex

#     print(f"Scattering custom aggregates by ticker to {by_ticker_aggs_arrow_dir=}")
#     aggs_ds = pa_ds.dataset(
#         config.aggs_dir,
#         format="parquet",
#         schema=custom_aggs_schema(),
#         partitioning=custom_aggs_partitioning(),
#     )
#     by_ticker_partitioning = pa_ds.partitioning(
#         pa.schema([(PARTITION_COLUMN_NAME, pa.string())]),
#         # pa.schema(
#         #     [
#         #         (PARTITION_COLUMN_NAME, pa.string()),
#         #         ("year", pa.uint16()),
#         #         ("month", pa.uint8()),
#         #         ("date", pa.date32()),
#         #     ]
#         # ),
#         flavor="hive",
#     )
#     by_ticker_schema = custom_aggs_schema()
#     by_ticker_schema = by_ticker_schema.append(
#         pa.field(PARTITION_COLUMN_NAME, pa.string(), nullable=False),
#     )

#     # TODO: Collect the dates we've scattered and write a special partition key with them.
#     pa_ds.write_dataset(
#         generate_batches_for_schedule(schedule, aggs_ds),
#         schema=by_ticker_schema,
#         base_dir=by_ticker_aggs_arrow_dir,
#         partitioning=by_ticker_partitioning,
#         format="parquet",
#         existing_data_behavior="overwrite_or_ignore",
#         # max_open_files=250,
#         # file_visitor=file_visitor,
#     )

#     return by_ticker_aggs_arrow_dir


# def generate_tables_from_custom_aggs_ds(
#     aggs_ds: pa_ds.Dataset, schedule: pd.DatetimeIndex
# ):
#     for timestamp in schedule:
#         yield table_for_date(aggs_ds=aggs_ds, date=timestamp.to_pydatetime().date())
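For orientation, here is a minimal sketch of how the new trades.py pipeline above might be driven end to end. Only `convert_trades_to_custom_aggs` and `scatter_custom_aggs_to_by_ticker` come from the code shown; the `PolygonConfig` constructor arguments are assumptions, since config.py is not reproduced in this hunk.

```python
# Hypothetical driver for the trades -> custom aggregates pipeline shown above.
import os

# PolygonConfig is imported by trades.py from zipline_polygon_bundle.config;
# the keyword arguments passed below are illustrative assumptions only.
from zipline_polygon_bundle.config import PolygonConfig
from zipline_polygon_bundle.trades import (
    convert_trades_to_custom_aggs,
    scatter_custom_aggs_to_by_ticker,
)

config = PolygonConfig(
    environ=os.environ,          # assumed: where POLYGON_API_KEY etc. are read from
    calendar_name="NYSE",        # assumed calendar name
    start_date="2023-01-03",     # assumed date range
    end_date="2023-12-29",
)

# 1. Aggregate the Polygon trades flatfile CSVs into day-partitioned parquet aggregates.
aggs_dir = convert_trades_to_custom_aggs(config, overwrite=False)

# 2. Re-partition ("scatter") those aggregates by the ticker-based partition key.
by_ticker_dir = scatter_custom_aggs_to_by_ticker(config)
print(aggs_dir, by_ticker_dir)
```

The first step writes parquet files under `config.aggs_dir` partitioned by year/month/date; the second rewrites them partitioned by `PARTITION_COLUMN_NAME` for consumption by the bundle's ingest step.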
{zipline_polygon_bundle-0.1.7.dist-info → zipline_polygon_bundle-0.2.0.dist-info}/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: zipline_polygon_bundle
-Version: 0.1.7
+Version: 0.2.0
 Summary: A zipline-reloaded data provider bundle for Polygon.io
 License: GNU AFFERO GENERAL PUBLIC LICENSE
         Version 3, 19 November 2007
@@ -666,19 +666,21 @@ License: GNU AFFERO GENERAL PUBLIC LICENSE
 Keywords: zipline,data-bundle,finance
 Author: Jim White
 Author-email: jim@fovi.com
-Requires-Python: >=3.
+Requires-Python: >=3.10,<4.0
 Classifier: Programming Language :: Python :: 3
 Classifier: License :: OSI Approved :: GNU Affero General Public License v3
 Classifier: Operating System :: OS Independent
 Requires-Dist: bcolz-zipline (>=1.2.11)
+Requires-Dist: filelock (>=3.16.0)
+Requires-Dist: fsspec (>=2024.10)
 Requires-Dist: numpy (<2)
 Requires-Dist: pandas (>=2.2,<3)
-Requires-Dist: polygon-api-client
-Requires-Dist: pyarrow
+Requires-Dist: polygon-api-client (>=1.14.2)
+Requires-Dist: pyarrow (>=18.1.0,<19)
 Requires-Dist: pytz (>=2018.5)
 Requires-Dist: requests (>=2.9.1)
 Requires-Dist: toolz (>=0.8.2)
-Requires-Dist: zipline-
+Requires-Dist: zipline-arrow (>=3.2)
 Project-URL: Repository, https://github.com/fovi-llc/zipline-polygon-bundle
 Description-Content-Type: text/markdown

@@ -742,6 +744,9 @@ register_polygon_equities_bundle(
 ## Install the Zipline Polygon.io Bundle PyPi package and check that it works.
 Listing bundles will show if everything is working correctly.
 ```bash
+pip install -U git+https://github.com/fovi-llc/zipline-reloaded.git@calendar
+pip install -U git+https://github.com/fovi-llc/zipline-polygon-bundle.git
+
 pip install zipline_polygon_bundle
 zipline -e extension.py bundles
 ```
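The README context above registers the bundle in an extension.py before running `zipline -e extension.py bundles`. A minimal sketch of such a file follows; `register_polygon_equities_bundle` is exported by the package (it appears in the hunk context above), but its optional arguments are not shown in this diff, so only a bundle name is passed and the rest is left to package defaults.

```python
# extension.py -- illustrative sketch only; arguments beyond the bundle name
# are package-specific and not shown in this diff.
from zipline_polygon_bundle import register_polygon_equities_bundle

# Register a bundle named "polygon" so `zipline -e extension.py bundles` lists it
# and `zipline -e extension.py ingest -b polygon` can ingest it.
register_polygon_equities_bundle("polygon")
```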
zipline_polygon_bundle-0.2.0.dist-info/RECORD (new file)
@@ -0,0 +1,18 @@
+zipline_polygon_bundle/__init__.py,sha256=KGN5kBi021Eiz_GDtxVRTUdXgYWe6loG_C8XcrVNHrY,1765
+zipline_polygon_bundle/adjustments.py,sha256=4garYK7RUrYyCIhCm0ZqHsk3y2bCt9vHUkWoHvVniTA,8233
+zipline_polygon_bundle/bundle.py,sha256=7f_rpVBhR1XyOJ1e7Lulq1Uh4DWJmHxFQKZNfz9OSgQ,19805
+zipline_polygon_bundle/compute_signals.py,sha256=FxcMuwMmxuvyy45y1avdL_uFEn0B4_2ekcv_B4AyPo0,10115
+zipline_polygon_bundle/concat_all_aggs.py,sha256=Nuj0pytQAVoK8OK7qx5m3jWCV8uJIPsa0XHnmicgSmg,12066
+zipline_polygon_bundle/concat_all_aggs_partitioned.py,sha256=AQq4ai5u5GyclWzQq2C8zIvHl_zjvLiDtxarNejwCQ4,6325
+zipline_polygon_bundle/config.py,sha256=_-BlT57ff4byeOJU54tkQ7OdtFmoaA9xHAQDMdGnkb4,10471
+zipline_polygon_bundle/nyse_all_hours_calendar.py,sha256=QrwWHm3_sfwrtt1tN5u6rqjTQcwN3qxyhjNGeHdyqcI,698
+zipline_polygon_bundle/polygon_file_reader.py,sha256=TCq6hKlxixwtL57xLxs9GnvH3MMa6aWBI9mi1-PBNHw,3749
+zipline_polygon_bundle/process_all_aggs.py,sha256=MVhb8xn9-DngSNSrRIpMG4XAgHjMXktoqYrxuM9ph-c,3069
+zipline_polygon_bundle/quotes.py,sha256=yFjlPiQXPp0t6w2Bo96VLtYSqITP7WCLwMp5CH3zx1E,4260
+zipline_polygon_bundle/split_aggs_by_ticker.py,sha256=HI_3nuN6E_VCq7LfOj4Dib_qm8wYME-jdXXX4rt-9YI,2150
+zipline_polygon_bundle/tickers_and_names.py,sha256=BjYquIlSBQGd1yDW3m3cGuXKVvUfh_waYwdMR7eAhuM,15402
+zipline_polygon_bundle/trades.py,sha256=5EXD8FUKPUB4ROTXJsl29_U7wzBPWTGbOkKZMbPWZUU,20133
+zipline_polygon_bundle-0.2.0.dist-info/LICENSE,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
+zipline_polygon_bundle-0.2.0.dist-info/METADATA,sha256=0PIiUhmj7kTVZeo0iNIjlZmYHBCKIaGVCY7zSbmOvqY,46912
+zipline_polygon_bundle-0.2.0.dist-info/WHEEL,sha256=XbeZDeTWKc1w7CSIyre5aMDU_-PohRwTQceYnisIYYY,88
+zipline_polygon_bundle-0.2.0.dist-info/RECORD,,
zipline_polygon_bundle-0.1.7.dist-info/RECORD (removed)
@@ -1,14 +0,0 @@
-zipline_polygon_bundle/__init__.py,sha256=kFkI4ZEej7yeuSig2r59AWLGzqSVh0dON4wdreCeizA,595
-zipline_polygon_bundle/adjustments.py,sha256=k8Ykc4zv49Z8m1veFnJNeoPcw1FMN2dAxqV6xWmUfLw,6814
-zipline_polygon_bundle/bundle.py,sha256=De1IHUjAxoZRaE6fVXY4qa6E7t43q_ELXAmcnJOtJEc,19260
-zipline_polygon_bundle/concat_all_aggs.py,sha256=vv0MDxbSJjgZzstUP1K084FRy3W7w6Tt7FymghwcfMU,9021
-zipline_polygon_bundle/concat_all_aggs_partitioned.py,sha256=b-yvwlQMyv2JO8KeeNUFD0EL0giNxWkS9ukDczgIJ20,6349
-zipline_polygon_bundle/config.py,sha256=s1z4SGCcZH671NT8wjZZMQeBL4ef5SKzn8c8FXbTvlI,4755
-zipline_polygon_bundle/polygon_file_reader.py,sha256=a-MTMc_FnecmB2Q1o_LE03IeqDYvbPYCvFtawxt0INw,3755
-zipline_polygon_bundle/process_all_aggs.py,sha256=QLgH2HpS27JvkMqG1dsG-D0FIUDdXw1IR_UaMIJfdeA,3075
-zipline_polygon_bundle/split_aggs_by_ticker.py,sha256=HI_3nuN6E_VCq7LfOj4Dib_qm8wYME-jdXXX4rt-9YI,2150
-zipline_polygon_bundle/tickers_and_names.py,sha256=VVtI2FD_Gr0YOpCXhUlU0Agg1_-Ul1XW374kVwjMJck,16506
-zipline_polygon_bundle-0.1.7.dist-info/LICENSE,sha256=hIahDEOTzuHCU5J2nd07LWwkLW7Hko4UFO__ffsvB-8,34523
-zipline_polygon_bundle-0.1.7.dist-info/METADATA,sha256=4XfjLKiVXX30yq2xGI-4kuA2b8fzB5J-8AXRzt65HkQ,46667
-zipline_polygon_bundle-0.1.7.dist-info/WHEEL,sha256=XbeZDeTWKc1w7CSIyre5aMDU_-PohRwTQceYnisIYYY,88
-zipline_polygon_bundle-0.1.7.dist-info/RECORD,,
{zipline_polygon_bundle-0.1.7.dist-info → zipline_polygon_bundle-0.2.0.dist-info}/LICENSE: file without changes
{zipline_polygon_bundle-0.1.7.dist-info → zipline_polygon_bundle-0.2.0.dist-info}/WHEEL: file without changes