zipline_polygon_bundle-0.1.7-py3-none-any.whl → zipline_polygon_bundle-0.2.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- zipline_polygon_bundle/__init__.py +33 -5
- zipline_polygon_bundle/adjustments.py +60 -31
- zipline_polygon_bundle/bundle.py +202 -208
- zipline_polygon_bundle/compute_signals.py +261 -0
- zipline_polygon_bundle/concat_all_aggs.py +140 -70
- zipline_polygon_bundle/concat_all_aggs_partitioned.py +6 -6
- zipline_polygon_bundle/config.py +167 -36
- zipline_polygon_bundle/nyse_all_hours_calendar.py +25 -0
- zipline_polygon_bundle/polygon_file_reader.py +1 -1
- zipline_polygon_bundle/process_all_aggs.py +2 -2
- zipline_polygon_bundle/quotes.py +101 -0
- zipline_polygon_bundle/tickers_and_names.py +5 -38
- zipline_polygon_bundle/trades.py +533 -0
- {zipline_polygon_bundle-0.1.7.dist-info → zipline_polygon_bundle-0.2.0.dist-info}/METADATA +10 -5
- zipline_polygon_bundle-0.2.0.dist-info/RECORD +18 -0
- zipline_polygon_bundle-0.1.7.dist-info/RECORD +0 -14
- {zipline_polygon_bundle-0.1.7.dist-info → zipline_polygon_bundle-0.2.0.dist-info}/LICENSE +0 -0
- {zipline_polygon_bundle-0.1.7.dist-info → zipline_polygon_bundle-0.2.0.dist-info}/WHEEL +0 -0
zipline_polygon_bundle/config.py
CHANGED
@@ -1,8 +1,38 @@
-from exchange_calendars.calendar_helpers import Date, parse_date
+from exchange_calendars.calendar_helpers import Date, parse_date
 from zipline.utils.calendar_utils import get_calendar
 
-import
+from .nyse_all_hours_calendar import NYSE_ALL_HOURS
+
+from typing import Iterator, Tuple
+
 import pandas as pd
+from pyarrow.fs import LocalFileSystem
+import os
+import datetime
+import re
+import fnmatch
+
+AGG_TIME_DAY = "day"
+AGG_TIME_MINUTE = "minute"
+AGG_TIME_TRADES = "1minute"
+
+PARTITION_COLUMN_NAME = "part"
+PARTITION_KEY_LENGTH = 2
+
+
+def to_partition_key(s: str) -> str:
+    """
+    Partition key is low cardinality and must be filesystem-safe.
+    The reason for partitioning is to keep the parquet files from getting too big.
+    10 years of minute aggs for US stocks is 83GB gzipped. A single parquet would be 62GB on disk.
+    Currently the first two characters so files stay under 1GB. Weird characters are replaced with "A".
+    """
+    k = (s + "A")[0:PARTITION_KEY_LENGTH].upper()
+    if k.isalpha():
+        return k
+    # Replace non-alpha characters with "A".
+    k = "".join([c if c.isalpha() else "A" for c in k])
+    return k
 
 
 class PolygonConfig:
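For reference, the new to_partition_key helper buckets tickers into two-character, filesystem-safe partition keys. A few illustrative calls (expected values shown in comments):

import os

from zipline_polygon_bundle.config import to_partition_key

print(to_partition_key("AAPL"))   # "AA"
print(to_partition_key("brk.a"))  # "BR"  (uppercased; still alphabetic, so kept as-is)
print(to_partition_key("1PAS"))   # "AP"  (the non-alpha leading digit becomes "A")
print(to_partition_key("A"))      # "AA"  (padded so one-character tickers still yield two characters)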
@@ -10,33 +40,37 @@ class PolygonConfig:
         self,
         environ: dict,
         calendar_name: str,
-
-
-        agg_time: str =
+        start_date: Date,
+        end_date: Date,
+        agg_time: str = AGG_TIME_DAY,
     ):
-        if agg_time not in ["minute", "day"]:
-            raise ValueError(f"agg_time must be 'minute' or 'day', got '{agg_time}'")
         self.calendar_name = calendar_name
+        self.start_date = start_date
+        self.end_date = end_date
+        # TODO: We can't use PolygonConfig.calendar because it gets these properties for start/end session.
         self.start_timestamp = (
-            parse_date(
-            if
+            parse_date(start_date, calendar=self.calendar)
+            if start_date
             else self.calendar.first_session
         )
         self.end_timestamp = (
-            parse_date(
-            if
+            parse_date(end_date, calendar=self.calendar)
+            if end_date
             else self.calendar.last_session
         )
+        self.agg_time = agg_time
+
         self.max_workers = None
         if environ.get("POLYGON_MAX_WORKERS", "").strip() != "":
             self.max_workers = int(environ.get("POLYGON_MAX_WORKERS"))
         self.api_key = environ.get("POLYGON_API_KEY")
+        self.filesystem = LocalFileSystem()
         self.data_dir = environ.get("POLYGON_DATA_DIR", "data/files.polygon.io")
         self.cik_cusip_mapping_csv_path = environ.get(
             "CIK_CUSIP_MAPS_CSV", os.path.join(self.data_dir, "cik-cusip-maps.csv")
         )
-        self.asset_subdir = environ.get("POLYGON_ASSET_SUBDIR", "us_stocks_sip")
         self.market = environ.get("POLYGON_MARKET", "stocks")
+        self.asset_subdir = environ.get("POLYGON_ASSET_SUBDIR", "us_stocks_sip")
         self.tickers_dir = environ.get(
             "POLYGON_TICKERS_DIR",
             os.path.join(os.path.join(self.data_dir, "tickers"), self.asset_subdir),
@@ -51,34 +85,80 @@
         self.flat_files_dir = environ.get(
             "POLYGON_FLAT_FILES_DIR", os.path.join(self.data_dir, "flatfiles")
         )
-
-        self.
+        # TODO: Restore non-recusive option. Always recursive for now.
+        self.csv_paths_pattern = environ.get(
+            # "POLYGON_FLAT_FILES_CSV_PATTERN", "**/*.csv.gz"
+            "POLYGON_FLAT_FILES_CSV_PATTERN",
+            "*.csv.gz",
+        )
         self.asset_files_dir = os.path.join(self.flat_files_dir, self.asset_subdir)
         self.minute_aggs_dir = os.path.join(self.asset_files_dir, "minute_aggs_v1")
         self.day_aggs_dir = os.path.join(self.asset_files_dir, "day_aggs_v1")
-        self.
-
-
+        self.trades_dir = os.path.join(self.asset_files_dir, "trades_v1")
+        self.quotes_dir = os.path.join(self.asset_files_dir, "quotes_v1")
+
         # TODO: The "by ticker" files are temporary/intermediate and should/could be in the zipline data dir.
+        self.custom_asset_files_dir = environ.get(
+            "CUSTOM_ASSET_FILES_DIR", self.asset_files_dir
+        )
+
+        self.cache_dir = os.path.join(self.custom_asset_files_dir, "api_cache")
+
+        self.lock_file_path = os.path.join(self.custom_asset_files_dir, "ingest.lock")
+        self.custom_aggs_dates_path = os.path.join(self.custom_asset_files_dir, "aggs_dates.json")
+        self.by_ticker_dates_path = os.path.join(self.custom_asset_files_dir, "by_ticker_dates.json")
+
         self.minute_by_ticker_dir = os.path.join(
-            self.
+            self.custom_asset_files_dir, "minute_by_ticker_v1"
         )
-        self.day_by_ticker_dir = os.path.join(
-
-
-
-
+        self.day_by_ticker_dir = os.path.join(
+            self.custom_asset_files_dir, "day_by_ticker_v1"
+        )
+
+        # If agg_time begins with a digit, it is a timedelta string and we're using custom aggs from trades.
+        if bool(re.match(r"^\d", agg_time)):
+            self.agg_timedelta = pd.to_timedelta(agg_time)
+            self.csv_files_dir = self.trades_dir
+            self.custom_aggs_name_format = environ.get(
+                "CUSTOM_AGGS_NAME_FORMAT", "{config.agg_timedelta.seconds}sec_aggs"
+            )
+            self.aggs_dir = os.path.join(
+                self.custom_asset_files_dir,
+                self.custom_aggs_name_format.format(config=self),
+            )
+            self.by_ticker_dir = os.path.join(
+                self.custom_asset_files_dir,
+                (self.custom_aggs_name_format + "_by_ticker").format(config=self),
+            )
+        elif agg_time == AGG_TIME_MINUTE:
+            self.agg_timedelta = pd.to_timedelta("1minute")
+            self.aggs_dir = self.minute_aggs_dir
+            self.csv_files_dir = self.aggs_dir
+            self.by_ticker_dir = self.minute_by_ticker_dir
+        elif agg_time == AGG_TIME_DAY:
+            self.agg_timedelta = pd.to_timedelta("1day")
+            self.aggs_dir = self.day_aggs_dir
+            self.csv_files_dir = self.aggs_dir
+            self.by_ticker_dir = self.day_by_ticker_dir
+        else:
+            raise ValueError(
+                f"agg_time must be 'minute', 'day', or a timedelta string; got '{agg_time=}'"
+            )
+
+        self.arrow_format = environ.get(
+            "POLYGON_ARROW_FORMAT", "parquet" if self.agg_time == AGG_TIME_DAY else "hive"
         )
-        self.arrow_format = environ.get("POLYGON_ARROW_FORMAT", "parquet" if self.agg_time == "day" else "hive")
         # self.by_ticker_hive_dir = os.path.join(
         #     self.by_ticker_dir,
         #     f"{self.agg_time}_{self.start_timestamp.date().isoformat()}_{self.end_timestamp.date().isoformat()}.hive",
         # )
-        self.cache_dir = os.path.join(self.asset_files_dir, "api_cache")
 
     @property
     def calendar(self):
-
+        # print call stack
+        # import traceback
+        # traceback.print_stack()
+        return get_calendar(self.calendar_name, start_session=self.start_date, end_session=self.end_date)
 
     def ticker_file_path(self, date: pd.Timestamp):
         ticker_year_dir = os.path.join(
@@ -88,25 +168,76 @@
         return os.path.join(
             ticker_year_dir, f"tickers_{date.date().isoformat()}.parquet"
        )
-
+
     def file_path_to_name(self, path: str):
+        # TODO: Use csv_paths_pattern to remove the suffixes
         return os.path.basename(path).removesuffix(".gz").removesuffix(".csv")
 
-    def
-        return
-
-
-
+    def date_to_csv_file_path(self, date: datetime.date, ext=".csv.gz"):
+        return f"{self.csv_files_dir}/{date.strftime('%Y/%m/%Y-%m-%d') + ext}"
+
+    @property
+    def by_ticker_aggs_arrow_dir(self):
+        # TODO: Don't split these up by ingestion range. They're already time indexed.
+        # Only reason to separate them is if we're worried about (or want) data being different across ingestions.
+        # This scattering is really slow and is usually gonna be redundant.
+        # This wasn't a problem when start/end dates were the calendar bounds when omitted.
+        # Can't just drop this because concat_all_aggs_from_csv will skip if it exists.
+        # return os.path.join(
+        #     self.by_ticker_dir,
+        #     f"{self.start_timestamp.date().isoformat()}_{self.end_timestamp.date().isoformat()}.arrow",
+        #     # "aggs.arrow",
+        # )
+        return self.by_ticker_dir
 
     def api_cache_path(
-        self,
+        self, first_day: pd.Timestamp, last_day: pd.Timestamp, filename: str, extension=".parquet"
     ):
-
-
+        first_day_str = first_day.date().isoformat()
+        last_day_str = last_day.date().isoformat()
         return os.path.join(
-            self.cache_dir, f"{
+            self.cache_dir, f"{first_day_str}_{last_day_str}/{filename}{extension}"
         )
 
+    def csv_paths(self) -> Iterator[str]:
+        for root, dirnames, filenames in os.walk(self.aggs_dir, topdown=True):
+            if dirnames:
+                dirnames[:] = sorted(dirnames)
+            # Filter out filenames that don't match the pattern.
+            filenames = fnmatch.filter(filenames, self.csv_paths_pattern)
+            if filenames:
+                for filename in sorted(filenames):
+                    yield os.path.join(root, filename)
+
+    def find_first_and_last_aggs(
+        self, aggs_dir, file_pattern
+    ) -> Tuple[str | None, str | None]:
+        # Find the path to the lexically first and last paths in aggs_dir that matches csv_paths_pattern.
+        # Would like to use Path.walk(top_down=True) but it is only availble in Python 3.12+.
+        # This needs to be efficient because it is called on every init, even though we only need it for ingest.
+        # But we can't call it in ingest because the writer initializes and writes the metadata before it is called.
+        paths = []
+        for root, dirnames, filenames in os.walk(aggs_dir, topdown=True):
+            if dirnames:
+                # We only want first and last in each directory.
+                sorted_dirs = sorted(dirnames)
+                dirnames[:] = (
+                    [sorted_dirs[0], sorted_dirs[-1]]
+                    if len(sorted_dirs) > 1
+                    else sorted_dirs
+                )
+            # Filter out filenames that don't match the pattern.
+            filenames = fnmatch.filter(filenames, file_pattern)
+            if filenames:
+                filenames = sorted(filenames)
+                paths.append(os.path.join(root, filenames[0]))
+                if len(filenames) > 1:
+                    paths.append(os.path.join(root, filenames[-1]))
+        if not paths:
+            return None, None
+        paths = sorted(paths)
+        return self.file_path_to_name(paths[0]), self.file_path_to_name(paths[-1])
+
 
 if __name__ == "__main__":
     config = PolygonConfig(os.environ, "XNYS", "2003-10-01", "2023-01-01")
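The PolygonConfig constructor now takes explicit start_date and end_date arguments, and agg_time may be a timedelta-style string (leading digit) to select aggregates computed from trades. A minimal construction sketch, assuming the POLYGON_* environment variables above are set; the dates mirror the ones used in process_all_aggs.py below and the "1min" value is illustrative:

import os

from zipline_polygon_bundle.config import PolygonConfig

# Standard daily aggregates over an explicit date range.
config = PolygonConfig(
    environ=os.environ,
    calendar_name="XNYS",
    start_date="2020-10-07",
    end_date="2020-10-15",
    agg_time="day",
)
print(config.aggs_dir)       # ends with .../day_aggs_v1
print(config.by_ticker_dir)  # ends with .../day_by_ticker_v1

# A leading digit selects custom aggregates built from the trades flat files,
# e.g. one-minute bars; csv_files_dir then points at trades_v1.
trades_config = PolygonConfig(
    environ=os.environ,
    calendar_name="XNYS",
    start_date="2020-10-07",
    end_date="2020-10-15",
    agg_time="1min",
)
print(trades_config.csv_files_dir)  # ends with .../trades_v1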
zipline_polygon_bundle/nyse_all_hours_calendar.py
ADDED
@@ -0,0 +1,25 @@
+import datetime
+from exchange_calendars.calendar_utils import get_calendar_names, register_calendar_type
+from exchange_calendars.exchange_calendar_xnys import XNYSExchangeCalendar
+
+
+NYSE_ALL_HOURS = "NYSE_ALL_HOURS"
+
+
+class USExtendedHoursExchangeCalendar(XNYSExchangeCalendar):
+    """
+    A calendar for extended hours which runs from 4 AM to 8 PM.
+    """
+
+    name = NYSE_ALL_HOURS
+
+    open_times = ((None, datetime.time(4)),)
+
+    close_times = ((None, datetime.time(20)),)
+
+    regular_early_close = datetime.time(13)
+
+
+def register_nyse_all_hours_calendar():
+    if NYSE_ALL_HOURS not in get_calendar_names():
+        register_calendar_type(NYSE_ALL_HOURS, USExtendedHoursExchangeCalendar)
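A sketch of how the new module is intended to be used: register the calendar type once, then request it by name through exchange_calendars. The helper is idempotent because it checks get_calendar_names() first:

from exchange_calendars.calendar_utils import get_calendar

from zipline_polygon_bundle.nyse_all_hours_calendar import (
    NYSE_ALL_HOURS,
    register_nyse_all_hours_calendar,
)

# Safe to call more than once; it only registers the type if the name is unknown.
register_nyse_all_hours_calendar()

calendar = get_calendar(NYSE_ALL_HOURS)
# Extended-hours sessions: open 04:00, close 20:00 (regular early close 13:00).
print(calendar.open_times, calendar.close_times)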
zipline_polygon_bundle/polygon_file_reader.py
CHANGED
@@ -99,6 +99,6 @@ def process_all_csv_to_parquet(
 if __name__ == "__main__":
     # os.environ["POLYGON_DATA_DIR"] = "/Volumes/Oahu/Mirror/files.polygon.io"
     config = PolygonConfig(
-        environ=os.environ, calendar_name="XNYS",
+        environ=os.environ, calendar_name="XNYS", start_date=None, end_date=None
     )
     process_all_csv_to_parquet(config.aggs_dir)
zipline_polygon_bundle/process_all_aggs.py
CHANGED
@@ -74,8 +74,8 @@ if __name__ == "__main__":
     config = PolygonConfig(
         environ=os.environ,
         calendar_name="XNYS",
-
-
+        start_date="2020-10-07",
+        end_date="2020-10-15",
     )
     print(f"{config.aggs_dir=}")
     max_ticker_lens = apply_to_all_aggs(
zipline_polygon_bundle/quotes.py
ADDED
@@ -0,0 +1,101 @@
+from .config import PolygonConfig
+from .trades import cast_strings_to_list
+
+import os
+
+import pyarrow as pa
+from pyarrow import dataset as pa_ds
+from pyarrow import compute as pa_compute
+from pyarrow import fs as pa_fs
+from fsspec.implementations.arrow import ArrowFSWrapper
+from pyarrow import csv as pa_csv
+
+
+def quotes_schema(raw: bool = False) -> pa.Schema:
+    # There is some problem reading the timestamps as timestamps so we have to read as integer then change the schema.
+    # I make the timestamp timezone-aware because that's how Unix timestamps work and it may help avoid mistakes.
+    # timestamp_type = pa.timestamp("ns", tz="UTC")
+    timestamp_type = pa.int64() if raw else pa.timestamp("ns", tz="UTC")
+
+    # Polygon price scale is 4 decimal places (i.e. hundredths of a penny), but we'll use 10 because we have precision to spare.
+    # price_type = pa.decimal128(precision=38, scale=10)
+    # 64bit float a little overkill but avoids any plausible truncation error.
+    price_type = pa.float64()
+
+    # ticker: string
+    # ask_exchange: int64
+    # ask_price: double
+    # ask_size: int64
+    # bid_exchange: int64
+    # bid_price: double
+    # bid_size: int64
+    # conditions: string
+    # indicators: int64
+    # participant_timestamp: int64
+    # sequence_number: int64
+    # sip_timestamp: int64
+    # tape: int64
+    # trf_timestamp: int64
+
+    return pa.schema(
+        [
+            pa.field("ticker", pa.string(), nullable=False),
+            pa.field("ask_exchange", pa.int8(), nullable=False),
+            pa.field("ask_price", price_type, nullable=False),
+            pa.field("ask_size", pa.int64(), nullable=False),
+            pa.field("bid_exchange", pa.int8(), nullable=False),
+            pa.field("bid_price", price_type, nullable=False),
+            pa.field("bid_size", pa.int64(), nullable=False),
+            pa.field("conditions", pa.string(), nullable=False),
+            pa.field("indicators", pa.string(), nullable=False),
+            pa.field("participant_timestamp", timestamp_type, nullable=False),
+            pa.field("sequence_number", pa.int64(), nullable=False),
+            pa.field("sip_timestamp", timestamp_type, nullable=False),
+            pa.field("tape", pa.int8(), nullable=False),
+            pa.field("trf_timestamp", timestamp_type, nullable=False),
+        ]
+    )
+
+
+def quotes_dataset(config: PolygonConfig) -> pa_ds.Dataset:
+    """
+    Create a pyarrow dataset from the quotes files.
+    """
+
+    # https://arrow.apache.org/docs/python/filesystems.html#using-arrow-filesystems-with-fsspec
+    # https://filesystem-spec.readthedocs.io/en/latest/_modules/fsspec/spec.html#AbstractFileSystem.glob.
+    fsspec = ArrowFSWrapper(config.filesystem)
+
+    # We sort by path because they have the year and month in the dir names and the date in the filename.
+    paths = sorted(
+        fsspec.glob(os.path.join(config.quotes_dir, config.csv_paths_pattern))
+    )
+
+    return pa_ds.FileSystemDataset.from_paths(paths,
+                                              format=pa_ds.CsvFileFormat(),
+                                              schema=quotes_schema(raw=True),
+                                              filesystem=config.filesystem)
+
+
+def cast_strings_to_list(string_array, separator=",", default="0", value_type=pa.uint8()):
+    """Cast a PyArrow StringArray of comma-separated numbers to a ListArray of values."""
+
+    # Create a mask to identify empty strings
+    is_empty = pa_compute.equal(pa_compute.utf8_trim_whitespace(string_array), "")
+
+    # Use replace_with_mask to replace empty strings with the default ("0")
+    filled_column = pa_compute.replace_with_mask(string_array, is_empty, pa.scalar(default))
+
+    # Split the strings by comma
+    split_array = pa_compute.split_pattern(filled_column, pattern=separator)
+
+    # Cast each element in the resulting lists to integers
+    int_list_array = pa_compute.cast(split_array, pa.list_(value_type))
+
+    return int_list_array
+
+
+def cast_quotes(quotes):
+    quotes = quotes.cast(quotes_schema())
+    condition_values = cast_strings_to_list(quotes.column("conditions").combine_chunks())
+    return quotes.append_column('condition_values', condition_values)
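The quotes conditions column arrives as comma-separated numeric codes in a string column; cast_strings_to_list turns it into a list<uint8> column, with empty strings falling back to the "0" default. A small sketch with made-up values:

import pyarrow as pa

from zipline_polygon_bundle.quotes import cast_strings_to_list

conditions = pa.array(["1,7", "", " ", "33"])
condition_values = cast_strings_to_list(conditions)
# Roughly [[1, 7], [0], [0], [33]]: whitespace-only and empty strings are
# replaced with "0" before splitting on "," and casting each piece to uint8.
print(condition_values)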
zipline_polygon_bundle/tickers_and_names.py
CHANGED
@@ -3,6 +3,7 @@ from .config import PolygonConfig
 import datetime
 import os
 import pandas as pd
+import csv
 import polygon
 import logging
 from concurrent.futures import ProcessPoolExecutor
@@ -47,7 +48,10 @@ class PolygonAssets:
         active: bool = True,
     ):
         response = self.polygon_client.list_tickers(
-            market=self.config.market,
+            market=self.config.market,
+            active=active,
+            date=date.date().isoformat(),
+            limit=500,
         )
         tickers_df = pd.DataFrame(list(response))
         # The currency info is for crypto. The source_feed is always NA.
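For context, polygon_client is presumably a polygon.RESTClient, so the call above becomes a point-in-time, paginated snapshot of the reference tickers. A standalone sketch, assuming a valid POLYGON_API_KEY; the date is illustrative:

import os

import pandas as pd
import polygon

client = polygon.RESTClient(os.environ["POLYGON_API_KEY"])
# list_tickers returns an iterator; limit=500 is the page size and the client
# iterator handles pagination, so list() drains all pages for that date.
response = client.list_tickers(
    market="stocks",
    active=True,
    date="2020-10-07",
    limit=500,
)
tickers_df = pd.DataFrame(list(response))
print(len(tickers_df), "tickers active on 2020-10-07")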
@@ -383,40 +387,3 @@ def get_ticker_universe(config: PolygonConfig, fetch_missing: bool = False):
     merged_tickers = pd.read_parquet(parquet_path)
     merged_tickers.info()
     return merged_tickers
-
-
-# Initialize ticker files in __main__. Use CLI args to specify start and end dates.
-if __name__ == "__main__":
-    import argparse
-
-    parser = argparse.ArgumentParser(description="Initialize ticker files.")
-    parser.add_argument(
-        "--start-date",
-        type=str,
-        help="Start date in ISO format (YYYY-MM-DD)",
-        default="2014-05-01",
-    )
-    parser.add_argument(
-        "--end-date",
-        type=str,
-        help="End date in ISO format (YYYY-MM-DD)",
-        default="2024-04-01",
-    )
-    args = parser.parse_args()
-
-    start_date = (
-        datetime.datetime.strptime(args.start_date, "%Y-%m-%d").date()
-        if args.start_date
-        else datetime.date.today()
-    )
-    end_date = (
-        datetime.datetime.strptime(args.end_date, "%Y-%m-%d").date()
-        if args.end_date
-        else datetime.date.today()
-    )
-
-    all_tickers = load_all_tickers(start_date, end_date, fetch_missing=True)
-    merged_tickers = merge_tickers(all_tickers)
-    merged_tickers.to_csv(f"data/tickers/us_tickers_{start_date}-{end_date}.csv")
-    ticker_names = ticker_names_from_merged_tickers(merged_tickers)
-    print(ticker_names)