zipline_polygon_bundle-0.1.6-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- zipline_polygon_bundle/__init__.py +22 -0
- zipline_polygon_bundle/adjustments.py +151 -0
- zipline_polygon_bundle/bundle.py +543 -0
- zipline_polygon_bundle/concat_all_aggs.py +246 -0
- zipline_polygon_bundle/concat_all_aggs_partitioned.py +173 -0
- zipline_polygon_bundle/config.py +113 -0
- zipline_polygon_bundle/polygon_file_reader.py +104 -0
- zipline_polygon_bundle/process_all_aggs.py +100 -0
- zipline_polygon_bundle/split_aggs_by_ticker.py +63 -0
- zipline_polygon_bundle/tickers_and_names.py +422 -0
- zipline_polygon_bundle-0.1.6.dist-info/LICENSE +661 -0
- zipline_polygon_bundle-0.1.6.dist-info/METADATA +797 -0
- zipline_polygon_bundle-0.1.6.dist-info/RECORD +14 -0
- zipline_polygon_bundle-0.1.6.dist-info/WHEEL +4 -0
zipline_polygon_bundle/concat_all_aggs.py
@@ -0,0 +1,246 @@
from .config import PolygonConfig

import shutil
from typing import Iterator, Tuple

import argparse
import glob
import os

import pyarrow as pa
from pyarrow import dataset as pa_ds
from pyarrow import csv as pa_csv

import pandas as pd


PARTITION_COLUMN_NAME = "part"
PARTITION_KEY_LENGTH = 2


def to_partition_key(s: str) -> str:
    """
    Partition key is low cardinality and must be filesystem-safe.
    The reason for partitioning is to keep the parquet files from getting too big.
    10 years of minute aggs for US stocks is 83GB gzipped. A single parquet would be 62GB on disk.
    Currently the first two characters are used so files stay under 1GB. Weird characters are replaced with "A".
    """
    k = (s + "A")[0:PARTITION_KEY_LENGTH].upper()
    if k.isalpha():
        return k
    # Replace non-alpha characters with "A".
    k = "".join([c if c.isalpha() else "A" for c in k])
    return k


def generate_tables_from_csv_files(
    paths: list,
    schema: pa.Schema,
    start_timestamp: pd.Timestamp,
    limit_timestamp: pd.Timestamp,
) -> Iterator[pa.Table]:
    empty_table = schema.empty_table()
    # TODO: Find which column(s) need to be cast to int64 from the schema.
    empty_table = empty_table.set_column(
        empty_table.column_names.index("window_start"),
        "window_start",
        empty_table.column("window_start").cast(pa.int64()),
    )
    csv_schema = empty_table.schema

    tables_read_count = 0
    skipped_table_count = 0
    for path in paths:
        convert_options = pa_csv.ConvertOptions(
            column_types=csv_schema,
            strings_can_be_null=False,
            quoted_strings_can_be_null=False,
        )

        table = pa.csv.read_csv(path, convert_options=convert_options)
        tables_read_count += 1
        table = table.set_column(
            table.column_names.index("window_start"),
            "window_start",
            table.column("window_start").cast(schema.field("window_start").type),
        )
        if PARTITION_COLUMN_NAME in schema.names:
            table = table.append_column(
                PARTITION_COLUMN_NAME,
                pa.array(
                    [
                        to_partition_key(ticker)
                        for ticker in table.column("ticker").to_pylist()
                    ]
                ),
            )
        expr = (
            pa.compute.field("window_start")
            >= pa.scalar(start_timestamp, type=schema.field("window_start").type)
        ) & (
            pa.compute.field("window_start")
            < pa.scalar(
                limit_timestamp,
                type=schema.field("window_start").type,
            )
        )
        table = table.filter(expr)

        # TODO: Also check that these rows are within range for this file's date (not just the whole session).
        # And if we're doing that (figuring date for each file), we can just skip reading the file.
        # Might be able to do a single comparison using compute.days_between.
        # https://arrow.apache.org/docs/python/generated/pyarrow.compute.days_between.html

        if table.num_rows == 0:
            skipped_table_count += 1
            continue

        yield table
    print(f"{tables_read_count=} {skipped_table_count=}")


def generate_csv_agg_tables(
    config: PolygonConfig,
) -> Tuple[list[str], pa.Schema, Iterator[pa.Table]]:
    """zipline does bundle ingestion one ticker at a time."""
    # We sort by path because they have the year and month in the dir names and the date in the filename.
    paths = sorted(
        list(
            glob.glob(
                os.path.join(config.aggs_dir, config.csv_paths_pattern),
                recursive="**" in config.csv_paths_pattern,
            )
        )
    )

    print(f"{len(paths)=}")
    if len(paths) > 0:
        print(f"{paths[0]=}")
        print(f"{paths[-1]=}")

    # Polygon Aggregate flatfile timestamps are in nanoseconds (like trades), not milliseconds as the docs say.
    # I make the timestamp timezone-aware because that's how Unix timestamps work and it may help avoid mistakes.
    timestamp_type = pa.timestamp("ns", tz="UTC")

    # But we can't use the timestamp type in the schema here because it's not supported by the CSV reader.
    # So we'll use int64 and cast it after reading the CSV file.
    # https://github.com/apache/arrow/issues/44030

    # strptime(3) (used by the CSV reader for timestamps in ConvertOptions.timestamp_parsers) supports Unix timestamps (%s) and milliseconds (%f) but not nanoseconds.
    # https://www.geeksforgeeks.org/how-to-use-strptime-with-milliseconds-in-python/
    # Actually that's the wrong strptime (it's Python's). C++ strptime(3) doesn't even support %f.
    # https://github.com/apache/arrow/issues/39839#issuecomment-1915981816
    # Also I don't think you can use those in a format string without a separator.

    # Polygon price scale is 4 decimal places (i.e. hundredths of a penny), but we'll use 10 because we have precision to spare.
    # price_type = pa.decimal128(precision=38, scale=10)
    # A 64-bit float is a little overkill but avoids any plausible truncation error.
    price_type = pa.float64()

    polygon_aggs_schema = pa.schema(
        [
            pa.field("ticker", pa.string(), nullable=False),
            pa.field("volume", pa.int64(), nullable=False),
            pa.field("open", price_type, nullable=False),
            pa.field("close", price_type, nullable=False),
            pa.field("high", price_type, nullable=False),
            pa.field("low", price_type, nullable=False),
            pa.field("window_start", timestamp_type, nullable=False),
            pa.field("transactions", pa.int64(), nullable=False),
        ]
    )
    if config.agg_time == "minute":
        polygon_aggs_schema = polygon_aggs_schema.append(
            pa.field(PARTITION_COLUMN_NAME, pa.string(), nullable=False)
        )

    return (
        paths,
        polygon_aggs_schema,
        generate_tables_from_csv_files(
            paths=paths,
            schema=polygon_aggs_schema,
            start_timestamp=config.start_timestamp,
            limit_timestamp=config.end_timestamp + pd.to_timedelta(1, unit="day"),
        ),
    )


def generate_batches_from_tables(tables):
    for table in tables:
        for batch in table.to_batches():
            yield batch


def concat_all_aggs_from_csv(
    config: PolygonConfig,
    overwrite: bool = False,
) -> str:
    paths, schema, tables = generate_csv_agg_tables(config)

    if len(paths) < 1:
        raise ValueError(f"No Polygon CSV flat files found in {config.aggs_dir=}")
    by_ticker_aggs_arrow_dir = config.by_ticker_aggs_arrow_dir(paths[0], paths[-1])
    if os.path.exists(by_ticker_aggs_arrow_dir):
        if overwrite:
            print(f"Removing {by_ticker_aggs_arrow_dir=}")
            shutil.rmtree(by_ticker_aggs_arrow_dir)
        else:
            print(f"Found existing {by_ticker_aggs_arrow_dir=}")
            return by_ticker_aggs_arrow_dir

    partitioning = None
    if PARTITION_COLUMN_NAME in schema.names:
        partitioning = pa_ds.partitioning(
            pa.schema([(PARTITION_COLUMN_NAME, pa.string())]), flavor="hive"
        )

    # scanner = pa_ds.Scanner.from_batches(source=generate_batches_from_tables(tables), schema=schema)
    pa_ds.write_dataset(
        generate_batches_from_tables(tables),
        schema=schema,
        base_dir=by_ticker_aggs_arrow_dir,
        partitioning=partitioning,
        format="parquet",
        existing_data_behavior="overwrite_or_ignore",
    )
    print(f"Concatenated aggregates to {by_ticker_aggs_arrow_dir=}")
    return by_ticker_aggs_arrow_dir


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--calendar_name", default="XNYS")

    parser.add_argument("--start_session", default="2014-06-16")
    parser.add_argument("--end_session", default="2024-09-06")
    # parser.add_argument("--start_session", default="2020-01-01")
    # parser.add_argument("--end_session", default="2020-12-31")

    parser.add_argument("--agg_time", default="day")

    parser.add_argument("--overwrite", action="store_true")

    # TODO: These defaults should be None but for dev convenience they are set for my local config.
    parser.add_argument("--data_dir", default="/Volumes/Oahu/Mirror/files.polygon.io")
    # parser.add_argument("--aggs_pattern", default="**/*.csv.gz")
    # parser.add_argument("--aggs_pattern", default="2020/10/**/*.csv.gz")

    args = parser.parse_args()

    # Maybe the way to do this is to use the os.environ as the argparser defaults.
    if args.data_dir:
        os.environ["POLYGON_DATA_DIR"] = args.data_dir

    config = PolygonConfig(
        environ=os.environ,
        calendar_name=args.calendar_name,
        start_session=args.start_session,
        end_session=args.end_session,
        agg_time=args.agg_time,
    )

    concat_all_aggs_from_csv(
        config=config,
        overwrite=args.overwrite,
    )
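For reference, a minimal standalone sketch of the two-character partition-key scheme used in concat_all_aggs.py above. The function body is copied from the file; the sample tickers and the expected-output comment are illustrative only and not part of the wheel.

PARTITION_KEY_LENGTH = 2


def to_partition_key(s: str) -> str:
    # First two characters, uppercased; any non-alpha character becomes "A".
    k = (s + "A")[0:PARTITION_KEY_LENGTH].upper()
    if k.isalpha():
        return k
    return "".join([c if c.isalpha() else "A" for c in k])


if __name__ == "__main__":
    for ticker in ["AAPL", "BRK.A", "A", "C1"]:
        print(f"{ticker!r} -> {to_partition_key(ticker)!r}")
    # Expected: 'AAPL' -> 'AA', 'BRK.A' -> 'BR', 'A' -> 'AA', 'C1' -> 'CA'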
zipline_polygon_bundle/concat_all_aggs_partitioned.py
@@ -0,0 +1,173 @@
from .config import PolygonConfig

from typing import Iterator

import argparse
import glob
import os

import pyarrow as pa
from pyarrow import dataset as pa_ds
from pyarrow import csv as pa_csv

PARTITION_COLUMN_NAME = "part"


# To work across all reasonable filesystems, we need to escape the characters in partition keys that are treated weirdly in filenames.
def partition_key_escape(c: str) -> str:
    return ("^" + c.upper()) if c.islower() else ("%" + "%02X" % ord(c))


def to_partition_key(s: str) -> str:
    if s.isalnum() and s.isupper():
        return s
    return "".join(
        [f"{c if (c.isupper() or c.isdigit()) else partition_key_escape(c)}" for c in s]
    )


def read_csv_table(path, timestamp_type: pa.TimestampType, convert_options):
    table = pa.csv.read_csv(path, convert_options=convert_options)
    table = table.set_column(
        table.column_names.index("window_start"),
        "window_start",
        table.column("window_start").cast(timestamp_type),
    )
    return table


def csv_agg_scanner(
    paths: list, schema: pa.Schema, timestamp_type: pa.TimestampType
) -> Iterator[pa.RecordBatch]:
    for path in paths:
        convert_options = pa_csv.ConvertOptions(
            column_types=schema,
            strings_can_be_null=False,
            quoted_strings_can_be_null=False,
        )

        print(f"{path=}")
        table = read_csv_table(
            path=path, timestamp_type=timestamp_type, convert_options=convert_options
        )

        table = table.append_column(
            PARTITION_COLUMN_NAME,
            pa.array(
                [to_partition_key(ticker) for ticker in table.column("ticker").to_pylist()]
            ),
        )

        for batch in table.to_batches():
            yield batch


def concat_all_aggs_from_csv(
    config: PolygonConfig,
    aggs_pattern: str = "**/*.csv.gz",
) -> list:
    """zipline does bundle ingestion one ticker at a time."""

    # We sort by path because they have the year and month in the dir names and the date in the filename.
    paths = sorted(
        list(
            glob.glob(
                os.path.join(config.aggs_dir, aggs_pattern),
                recursive="**" in aggs_pattern,
            )
        )
    )

    # Polygon Aggregate flatfile timestamps are in nanoseconds (like trades), not milliseconds as the docs say.
    # I make the timestamp timezone-aware because that's how Unix timestamps work and it may help avoid mistakes.
    timestamp_type = pa.timestamp("ns", tz="UTC")

    # But we can't use the timestamp type in the schema here because it's not supported by the CSV reader.
    # So we'll use int64 and cast it after reading the CSV file.
    # https://github.com/apache/arrow/issues/44030

    # strptime(3) (used by the CSV reader for timestamps in ConvertOptions.timestamp_parsers) supports Unix timestamps (%s) and milliseconds (%f) but not nanoseconds.
    # https://www.geeksforgeeks.org/how-to-use-strptime-with-milliseconds-in-python/
    # Actually that's the wrong strptime (it's Python's). C++ strptime(3) doesn't even support %f.
    # https://github.com/apache/arrow/issues/39839#issuecomment-1915981816
    # Also I don't think you can use those in a format string without a separator.

    # Polygon price scale is 4 decimal places (i.e. hundredths of a penny), but we'll use 10 because we have precision to spare.
    price_type = pa.decimal128(precision=38, scale=10)

    polygon_aggs_schema = pa.schema(
        [
            pa.field("ticker", pa.string(), nullable=False),
            pa.field("volume", pa.int64(), nullable=False),
            pa.field("open", price_type, nullable=False),
            pa.field("close", price_type, nullable=False),
            pa.field("high", price_type, nullable=False),
            pa.field("low", price_type, nullable=False),
            pa.field("window_start", pa.int64(), nullable=False),
            pa.field("transactions", pa.int64(), nullable=False),
        ]
    )

    partitioned_schema = polygon_aggs_schema.append(
        pa.field(PARTITION_COLUMN_NAME, pa.string(), nullable=False)
    )
    agg_scanner = pa_ds.Scanner.from_batches(
        csv_agg_scanner(paths=paths, schema=polygon_aggs_schema, timestamp_type=timestamp_type),
        schema=partitioned_schema,
    )

    by_ticker_base_dir = os.path.join(
        config.by_ticker_dir,
        f"{config.agg_time}_{config.start_timestamp.date().isoformat()}_{config.end_timestamp.date().isoformat()}.hive",
    )
    partition_by_ticker = pa.dataset.partitioning(
        pa.schema([(PARTITION_COLUMN_NAME, pa.string())]), flavor="hive"
    )
    pa_ds.write_dataset(
        agg_scanner,
        base_dir=by_ticker_base_dir,
        format="parquet",
        partitioning=partition_by_ticker,
        existing_data_behavior="overwrite_or_ignore",
        max_partitions=config.max_partitions,
        max_open_files=config.max_open_files,
    )


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--calendar_name", default="XNYS")

    parser.add_argument("--start_session", default="2014-06-16")
    parser.add_argument("--end_session", default="2024-09-06")
    # parser.add_argument("--start_session", default="2020-10-07")
    # parser.add_argument("--end_session", default="2020-10-15")
    # parser.add_argument("--aggs_pattern", default="2020/10/**/*.csv.gz")
    parser.add_argument("--aggs_pattern", default="**/*.csv.gz")

    parser.add_argument("--overwrite", action="store_true")

    # TODO: These defaults should be None but for dev convenience they are set for my local config.
    parser.add_argument("--agg_time", default="day")
    parser.add_argument("--data_dir", default="/Volumes/Oahu/Mirror/files.polygon.io")

    args = parser.parse_args()

    # Maybe the way to do this is to use the os.environ as the argparser defaults.
    environ = dict(os.environ.items())
    if args.data_dir:
        environ["POLYGON_DATA_DIR"] = args.data_dir
    if args.agg_time:
        environ["POLYGON_AGG_TIME"] = args.agg_time

    config = PolygonConfig(
        environ=environ,
        calendar_name=args.calendar_name,
        start_session=args.start_session,
        end_session=args.end_session,
    )

    concat_all_aggs_from_csv(
        config=config,
        aggs_pattern=args.aggs_pattern,
    )
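For comparison with the truncating scheme in concat_all_aggs.py, here is a minimal standalone sketch of the escaping partition-key scheme used in concat_all_aggs_partitioned.py above. The function bodies are copied from the file; the sample tickers and the expected-output comment are illustrative only.

def partition_key_escape(c: str) -> str:
    # Lowercase letters get a "^" prefix; everything else is %-hex escaped.
    return ("^" + c.upper()) if c.islower() else ("%" + "%02X" % ord(c))


def to_partition_key(s: str) -> str:
    # Uppercase alphanumeric tickers pass through unchanged; other characters are escaped.
    if s.isalnum() and s.isupper():
        return s
    return "".join(
        [f"{c if (c.isupper() or c.isdigit()) else partition_key_escape(c)}" for c in s]
    )


if __name__ == "__main__":
    for ticker in ["AAPL", "BRK.A", "brk.a", "C1"]:
        print(f"{ticker!r} -> {to_partition_key(ticker)!r}")
    # Expected: 'AAPL' -> 'AAPL', 'BRK.A' -> 'BRK%2EA', 'brk.a' -> '^B^R^K%2E^A', 'C1' -> 'C1'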
zipline_polygon_bundle/config.py
@@ -0,0 +1,113 @@
from exchange_calendars.calendar_helpers import Date, parse_date, parse_timestamp
from zipline.utils.calendar_utils import get_calendar

import os
import pandas as pd


class PolygonConfig:
    def __init__(
        self,
        environ: dict,
        calendar_name: str,
        start_session: Date,
        end_session: Date,
        agg_time: str = "day",
    ):
        if agg_time not in ["minute", "day"]:
            raise ValueError(f"agg_time must be 'minute' or 'day', got '{agg_time}'")
        self.calendar_name = calendar_name
        self.start_timestamp = (
            parse_date(start_session, calendar=self.calendar)
            if start_session
            else self.calendar.first_session
        )
        self.end_timestamp = (
            parse_date(end_session, calendar=self.calendar)
            if end_session
            else self.calendar.last_session
        )
        self.max_workers = None
        if environ.get("POLYGON_MAX_WORKERS", "").strip() != "":
            self.max_workers = int(environ.get("POLYGON_MAX_WORKERS"))
        self.api_key = environ.get("POLYGON_API_KEY")
        self.data_dir = environ.get("POLYGON_DATA_DIR", "data/files.polygon.io")
        self.cik_cusip_mapping_csv_path = environ.get(
            "CIK_CUSIP_MAPS_CSV", os.path.join(self.data_dir, "cik-cusip-maps.csv")
        )
        self.asset_subdir = environ.get("POLYGON_ASSET_SUBDIR", "us_stocks_sip")
        self.market = environ.get("POLYGON_MARKET", "stocks")
        self.tickers_dir = environ.get(
            "POLYGON_TICKERS_DIR",
            os.path.join(os.path.join(self.data_dir, "tickers"), self.asset_subdir),
        )
        self.tickers_csv_path = environ.get(
            "POLYGON_TICKERS_CSV",
            os.path.join(
                self.tickers_dir,
                f"tickers_{self.start_timestamp.date().isoformat()}_{self.end_timestamp.date().isoformat()}.csv",
            ),
        )
        self.flat_files_dir = environ.get(
            "POLYGON_FLAT_FILES_DIR", os.path.join(self.data_dir, "flatfiles")
        )
        self.csv_paths_pattern = environ.get("POLYGON_FLAT_FILES_CSV_PATTERN", "**/*.csv.gz")
        self.agg_time = agg_time
        self.asset_files_dir = os.path.join(self.flat_files_dir, self.asset_subdir)
        self.minute_aggs_dir = os.path.join(self.asset_files_dir, "minute_aggs_v1")
        self.day_aggs_dir = os.path.join(self.asset_files_dir, "day_aggs_v1")
        self.aggs_dir = (
            self.minute_aggs_dir if self.agg_time == "minute" else self.day_aggs_dir
        )
        # TODO: The "by ticker" files are temporary/intermediate and should/could be in the zipline data dir.
        self.minute_by_ticker_dir = os.path.join(
            self.asset_files_dir, "minute_by_ticker_v1"
        )
        self.day_by_ticker_dir = os.path.join(self.asset_files_dir, "day_by_ticker_v1")
        self.by_ticker_dir = (
            self.minute_by_ticker_dir
            if self.agg_time == "minute"
            else self.day_by_ticker_dir
        )
        self.arrow_format = environ.get("POLYGON_ARROW_FORMAT", "parquet" if self.agg_time == "day" else "hive")
        # self.by_ticker_hive_dir = os.path.join(
        #     self.by_ticker_dir,
        #     f"{self.agg_time}_{self.start_timestamp.date().isoformat()}_{self.end_timestamp.date().isoformat()}.hive",
        # )
        self.cache_dir = os.path.join(self.asset_files_dir, "api_cache")

    @property
    def calendar(self):
        return get_calendar(self.calendar_name)

    def ticker_file_path(self, date: pd.Timestamp):
        ticker_year_dir = os.path.join(
            self.tickers_dir, f"tickers_{date.strftime('%Y')}"
        )
        os.makedirs(ticker_year_dir, exist_ok=True)
        return os.path.join(
            ticker_year_dir, f"tickers_{date.date().isoformat()}.parquet"
        )

    def file_path_to_name(self, path: str):
        return os.path.basename(path).removesuffix(".gz").removesuffix(".csv")

    def by_ticker_aggs_arrow_dir(self, first_path: str, last_path: str):
        return os.path.join(
            self.by_ticker_dir,
            f"{self.file_path_to_name(first_path)}_{self.file_path_to_name(last_path)}.arrow",
        )

    def api_cache_path(
        self, start_date: Date, end_date: Date, filename: str, extension=".parquet"
    ):
        start_str = parse_date(start_date, calendar=self.calendar).date().isoformat()
        end_str = parse_date(end_date, calendar=self.calendar).date().isoformat()
        return os.path.join(
            self.cache_dir, f"{start_str}_{end_str}/{filename}{extension}"
        )


if __name__ == "__main__":
    config = PolygonConfig(os.environ, "XNYS", "2003-10-01", "2023-01-01")
    print(config.__dict__)
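A usage sketch for PolygonConfig follows. It assumes the zipline and exchange-calendars dependencies are installed and that POLYGON_DATA_DIR points at a local mirror of the Polygon flat files; the dates mirror the commented-out dev defaults in the scripts above, and the printed-path comments are illustrative.

import os

from zipline_polygon_bundle.config import PolygonConfig

# All directory layout is derived from POLYGON_DATA_DIR (default "data/files.polygon.io").
os.environ.setdefault("POLYGON_DATA_DIR", "data/files.polygon.io")

config = PolygonConfig(
    environ=os.environ,
    calendar_name="XNYS",
    start_session="2020-01-01",
    end_session="2020-12-31",
    agg_time="day",
)
print(config.aggs_dir)       # .../flatfiles/us_stocks_sip/day_aggs_v1
print(config.by_ticker_dir)  # .../flatfiles/us_stocks_sip/day_by_ticker_v1
print(config.api_cache_path("2020-01-01", "2020-12-31", "tickers"))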
zipline_polygon_bundle/process_all_aggs.py
@@ -0,0 +1,104 @@
from .config import PolygonConfig

import os
import glob
from concurrent.futures import ProcessPoolExecutor
import pandas as pd


def convert_timestamp(x):
    """
    Polygon timestamps are in nanoseconds, milliseconds, or seconds.
    We can decide automatically based on the size of the number because the only overlaps
    are during the first few months of 1970. And zero is always the same in any case.
    """
    try:
        unix_time = int(x)
        return pd.to_datetime(
            unix_time,
            unit=(
                "ns"
                if unix_time > 100_000_000_000_000
                else "ms" if unix_time > 10_000_000_000 else "s"
            ),
        )
    except Exception as e:
        print(f"ERROR: Failed to convert '{x}': {e}")
        return pd.NaT


def convert_csv_to_parquet(path, extension, compression):
    print(path)
    try:
        bars_df = pd.read_csv(
            path,
            compression=compression,
            converters={"window_start": convert_timestamp},
            dtype={"ticker": "str"},
        )
        # bars_df["ticker"] = bars_df["ticker"].astype(str)
        # bars_df.info()
        # print(f"{bars_df["ticker"].str.len().max()=}")
        if len(bars_df) == 0:
            print(f"WARNING: Empty {path}")
            return
        # if len(bars_df) < 100000:
        #     print(f"WARNING: Short {path}")
        # Don't change the data. We're just converting to Parquet to save time.
        # bars_df.set_index(["window_start", "ticker"], inplace=True)
        # bars_df.sort_index(inplace=True)
        parquet_path = path.removesuffix(extension) + ".parquet"
        bars_df.to_parquet(parquet_path)
        # fp.write(parquet_path, bars_df, has_nulls=False, write_index=False, fixed_text={"ticker": bars_df["ticker"].str.len().max()})
        if not os.path.exists(parquet_path):
            print(f"ERROR: Failed to write {parquet_path}")
    except Exception as e:
        print(f"Failed for {path}: {e}")


def process_all_csv_to_parquet(
    aggs_dir,
    recursive=True,
    extension=".csv.gz",
    compression="infer",
    force=False,
    max_workers=None,
):
    """Big CSV files are very slow to read. So we only read them once and convert them to Parquet."""
    csv_pattern = f"**/*{extension}" if recursive else f"*{extension}"
    paths = list(glob.glob(os.path.join(aggs_dir, csv_pattern), recursive=recursive))
    if force:
        print(f"Removing Parquet files that may exist for {len(paths)} CSV files.")
        for path in paths:
            parquet_path = path.removesuffix(extension) + ".parquet"
            if os.path.exists(parquet_path):
                print(f"Removing {parquet_path}")
                os.remove(parquet_path)
    else:
        csv_file_count = len(paths)
        paths = [
            path
            for path in paths
            if not os.path.exists(path.removesuffix(extension) + ".parquet")
        ]
        if len(paths) < csv_file_count:
            print(f"Skipping {csv_file_count - len(paths)} already converted files.")
    if max_workers == 0:
        for path in paths:
            convert_csv_to_parquet(path, extension=extension, compression=compression)
    else:
        with ProcessPoolExecutor(max_workers=max_workers) as executor:
            executor.map(
                convert_csv_to_parquet,
                paths,
                [extension] * len(paths),
                [compression] * len(paths),
            )


if __name__ == "__main__":
    # os.environ["POLYGON_DATA_DIR"] = "/Volumes/Oahu/Mirror/files.polygon.io"
    config = PolygonConfig(
        environ=os.environ, calendar_name="XNYS", start_session=None, end_session=None
    )
    process_all_csv_to_parquet(config.aggs_dir)
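The unit-inference heuristic in convert_timestamp above can be checked in isolation. A minimal sketch follows; the threshold constants are copied from the function, while the helper name and the sample values are illustrative only.

import pandas as pd


def infer_unit(unix_time: int) -> str:
    # Polygon timestamps may be seconds, milliseconds, or nanoseconds;
    # outside of early 1970 the magnitude alone disambiguates them.
    if unix_time > 100_000_000_000_000:
        return "ns"
    return "ms" if unix_time > 10_000_000_000 else "s"


if __name__ == "__main__":
    for ts in [1_600_000_000, 1_600_000_000_000, 1_600_000_000_000_000_000]:
        unit = infer_unit(ts)
        print(ts, unit, pd.to_datetime(ts, unit=unit))
    # All three print the same instant: 2020-09-13 12:26:40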