zipline_polygon_bundle 0.1.7-py3-none-any.whl → 0.1.8-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- zipline_polygon_bundle/__init__.py +28 -1
- zipline_polygon_bundle/adjustments.py +34 -0
- zipline_polygon_bundle/bundle.py +47 -25
- zipline_polygon_bundle/concat_all_aggs.py +17 -32
- zipline_polygon_bundle/concat_all_aggs_partitioned.py +6 -6
- zipline_polygon_bundle/config.py +99 -26
- zipline_polygon_bundle/polygon_file_reader.py +1 -1
- zipline_polygon_bundle/process_all_aggs.py +2 -2
- zipline_polygon_bundle/quotes.py +101 -0
- zipline_polygon_bundle/tickers_and_names.py +1 -37
- zipline_polygon_bundle/trades.py +707 -0
- {zipline_polygon_bundle-0.1.7.dist-info → zipline_polygon_bundle-0.1.8.dist-info}/METADATA +6 -3
- zipline_polygon_bundle-0.1.8.dist-info/RECORD +16 -0
- zipline_polygon_bundle-0.1.7.dist-info/RECORD +0 -14
- {zipline_polygon_bundle-0.1.7.dist-info → zipline_polygon_bundle-0.1.8.dist-info}/LICENSE +0 -0
- {zipline_polygon_bundle-0.1.7.dist-info → zipline_polygon_bundle-0.1.8.dist-info}/WHEEL +0 -0
zipline_polygon_bundle/__init__.py
CHANGED
@@ -7,7 +7,14 @@ from .bundle import (

 from .config import PolygonConfig
 from .concat_all_aggs import concat_all_aggs_from_csv, generate_csv_agg_tables
-from .adjustments import load_splits, load_dividends
+from .adjustments import load_splits, load_dividends, load_conditions
+from .trades import trades_schema, trades_dataset, cast_trades, date_to_path
+from .trades import custom_aggs_partitioning, custom_aggs_schema, trades_to_custom_aggs, convert_all_to_custom_aggs
+from .trades import get_custom_aggs_dates, generate_csv_trades_tables, compute_signals_for_all_custom_aggs
+from .quotes import quotes_schema, quotes_dataset, cast_quotes
+# from .tickers_and_names import load_all_tickers, merge_tickers, ticker_names_from_merged_tickers, get_ticker_universe
+from .tickers_and_names import PolygonAssets, get_ticker_universe
+

 __all__ = [
     "register_polygon_equities_bundle",
@@ -19,4 +26,24 @@ __all__ = [
     "generate_csv_agg_tables",
     "load_splits",
     "load_dividends",
+    "load_conditions",
+    "trades_schema",
+    "trades_dataset",
+    "cast_trades",
+    "date_to_path",
+    "get_custom_aggs_dates",
+    "generate_csv_trades_tables",
+    "custom_aggs_partitioning",
+    "custom_aggs_schema",
+    "trades_to_custom_aggs",
+    "convert_all_to_custom_aggs",
+    "compute_signals_for_all_custom_aggs",
+    "quotes_schema",
+    "quotes_dataset",
+    "cast_quotes",
+    # "load_all_tickers",
+    # "merge_tickers",
+    # "ticker_names_from_merged_tickers",
+    "PolygonAssets",
+    "get_ticker_universe",
 ]
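The expanded `__all__` above re-exports the new trades, quotes, and conditions helpers from the package root. A minimal import sketch (names taken directly from the `__all__` list; nothing beyond importability is implied):

    # New names exported at the package root in 0.1.8
    from zipline_polygon_bundle import (
        load_conditions,
        trades_schema, trades_dataset, cast_trades,
        quotes_schema, quotes_dataset, cast_quotes,
        PolygonAssets, get_ticker_universe,
    )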
zipline_polygon_bundle/adjustments.py
CHANGED
@@ -153,3 +153,37 @@ def load_dividends(
     return dividends[
         ["sid", "ex_date", "declared_date", "record_date", "pay_date", "amount"]
     ]
+
+
+def load_conditions(config: PolygonConfig) -> pd.DataFrame:
+    # The API doesn't use dates for the condition codes but this is a way to provide control over caching.
+    # Main thing is to get the current conditions list but we don't want to call more than once a day.
+    conditions_path = config.api_cache_path(
+        start_date=config.start_timestamp.date(), end_date=config.end_timestamp.date(), filename="conditions"
+    )
+    expected_conditions_count = 100
+    if not os.path.exists(conditions_path):
+        client = polygon.RESTClient(api_key=config.api_key)
+        conditions_response = client.list_conditions(
+            limit=1000,
+        )
+        if conditions_response is HTTPResponse:
+            raise ValueError(f"Polygon.list_splits bad HTTPResponse: {conditions_response}")
+        conditions = pd.DataFrame(conditions_response)
+        print(f"Got {len(conditions)=} from Polygon list_conditions.")
+        os.makedirs(os.path.dirname(conditions_path), exist_ok=True)
+        conditions.to_parquet(conditions_path)
+        if len(conditions) < expected_conditions_count:
+            logging.warning(
+                f"Only got {len(conditions)=} from Polygon list_splits (expected {expected_conditions_count=}). "
+            )
+    # We will always load from the file to avoid any chance of weird errors.
+    if os.path.exists(conditions_path):
+        conditions = pd.read_parquet(conditions_path)
+        print(f"Loaded {len(conditions)=} from {conditions_path}")
+        if len(conditions) < expected_conditions_count:
+            logging.warning(
+                f"Only got {len(conditions)=} from cached conditions (expected {expected_conditions_count=}). "
+            )
+        return conditions
+    raise ValueError(f"Failed to load splits from {conditions_path}")
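Based on the code added above, `load_conditions` fetches Polygon's condition-code list once, caches it as parquet under `config.api_cache_path(...)`, and always reloads from that cached file. A usage sketch (the calendar and dates are illustrative; they only scope the cache path, not the API query):

    import os
    from zipline_polygon_bundle import PolygonConfig, load_conditions

    config = PolygonConfig(
        environ=os.environ,          # POLYGON_API_KEY must be set for the API call
        calendar_name="XNYS",
        start_date="2020-10-07",     # illustrative; used only for the cache path
        end_date="2020-10-15",
    )
    conditions = load_conditions(config)   # pandas DataFrame of condition codes
    print(conditions.head())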
zipline_polygon_bundle/bundle.py
CHANGED
@@ -1,18 +1,21 @@
+import os
 from zipline.data.bundles import register
 from zipline.data.resample import minute_frame_to_session_frame

+from exchange_calendars.calendar_helpers import parse_date
+from zipline.utils.calendar_utils import get_calendar
+
 from .config import PolygonConfig
 from .concat_all_aggs import concat_all_aggs_from_csv, generate_csv_agg_tables
 from .adjustments import load_splits, load_dividends

 import pyarrow
 import pyarrow.compute
+import pyarrow.dataset

 import pandas as pd
 import logging

-import concurrent.futures
-

 # TODO: Change warnings to be relative to number of days in the range.

@@ -175,8 +178,8 @@ def polygon_equities_bundle_day(
     daily_bar_writer,
     adjustment_writer,
     calendar,
-
-
+    start_date,
+    end_date,
     cache,
     show_progress,
     output_dir,
@@ -184,8 +187,8 @@ def polygon_equities_bundle_day(
     config = PolygonConfig(
         environ=environ,
         calendar_name=calendar.name,
-
-
+        start_date=start_date,
+        end_date=end_date,
         agg_time="day",
     )

@@ -219,7 +222,7 @@ def polygon_equities_bundle_day(
     daily_bar_writer.write(
         process_day_aggregates(
             table=table,
-            sessions=calendar.sessions_in_range(
+            sessions=calendar.sessions_in_range(start_date, end_date),
             metadata=metadata,
             calendar=calendar,
             symbol_to_sid=symbol_to_sid,
@@ -400,8 +403,8 @@ def polygon_equities_bundle_minute(
     daily_bar_writer,
     adjustment_writer,
     calendar,
-
-
+    start_date,
+    end_date,
     cache,
     show_progress,
     output_dir,
@@ -409,8 +412,8 @@ def polygon_equities_bundle_minute(
     config = PolygonConfig(
         environ=environ,
         calendar_name=calendar.name,
-
-
+        start_date=start_date,
+        end_date=end_date,
         agg_time="minute",
     )

@@ -444,8 +447,8 @@ def polygon_equities_bundle_minute(
     daily_bar_writer.write(
         process_minute_aggregates(
             fragments=aggregates.get_fragments(),
-            sessions=calendar.sessions_in_range(
-            minutes=calendar.sessions_minutes(
+            sessions=calendar.sessions_in_range(start_date, end_date),
+            minutes=calendar.sessions_minutes(start_date, end_date),
             metadata=metadata,
             calendar=calendar,
             symbol_to_sid=symbol_to_sid,
@@ -459,8 +462,8 @@ def polygon_equities_bundle_minute(
     minute_bar_writer.write(
         process_minute_aggregates(
             fragments=aggregates.get_fragments(),
-            sessions=calendar.sessions_in_range(
-            minutes=calendar.sessions_minutes(
+            sessions=calendar.sessions_in_range(start_date, end_date),
+            minutes=calendar.sessions_minutes(start_date, end_date),
             metadata=metadata,
             calendar=calendar,
             symbol_to_sid=symbol_to_sid,
@@ -485,8 +488,8 @@ def polygon_equities_bundle_minute(

 def register_polygon_equities_bundle(
     bundlename,
-
-
+    start_date=None,
+    end_date=None,
     calendar_name="XNYS",
     agg_time="day",
     # ticker_list=None,
@@ -495,6 +498,25 @@ def register_polygon_equities_bundle(
 ):
     if agg_time not in ["day", "minute"]:
         raise ValueError(f"agg_time must be 'day' or 'minute', not '{agg_time}'")
+    # We need to know the start and end dates of the session before the bundle is
+    # registered because even though we only need it for ingest, the metadata in
+    # the writer is initialized and written before our ingest function is called.
+    if start_date is None or end_date is None:
+        config = PolygonConfig(
+            environ=os.environ,
+            calendar_name=calendar_name,
+            start_date=start_date,
+            end_date=end_date,
+            agg_time=agg_time,
+        )
+        first_aggs_date, last_aggs_date = config.find_first_and_last_aggs()
+        if start_date is None:
+            start_date = first_aggs_date
+        if end_date is None:
+            end_date = last_aggs_date
+
+    calendar = get_calendar(calendar_name)
+
     register(
         bundlename,
         (
@@ -502,8 +524,8 @@ def register_polygon_equities_bundle(
             if agg_time == "minute"
             else polygon_equities_bundle_day
        ),
-        start_session=
-        end_session=
+        start_session=parse_date(start_date, calendar=calendar),
+        end_session=parse_date(end_date, calendar=calendar),
         calendar_name=calendar_name,
         # minutes_per_day=390,
         # create_writers=True,
@@ -517,12 +539,12 @@ def register_polygon_equities_bundle(
 # config = PolygonConfig(
 #     environ=os.environ,
 #     calendar_name="XNYS",
-#     #
-#     #
-#
-#     #
-#
-#     #
+#     # start_date="2003-10-01",
+#     # start_date="2018-01-01",
+#     start_date="2023-01-01",
+#     # end_date="2023-01-12",
+#     end_date="2023-12-31",
+#     # end_date="2024-06-30",
 # )
 # splits = load_polygon_splits(config)
 # splits.info()
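The comments added in `register_polygon_equities_bundle` explain the ordering constraint: zipline initializes and writes the bundle metadata before the ingest function runs, so `start_session`/`end_session` must already be resolved at `register()` time, and any missing date is filled from `config.find_first_and_last_aggs()`. A registration sketch based on the signature above (bundle names and dates are illustrative):

    from zipline_polygon_bundle import register_polygon_equities_bundle

    # Explicit date range:
    register_polygon_equities_bundle(
        "polygon",
        start_date="2020-01-03",
        end_date="2020-12-31",
        calendar_name="XNYS",
        agg_time="day",
    )

    # Omit the dates to let the bundle scan the aggregates flat files
    # for the first and last available dates:
    register_polygon_equities_bundle("polygon-minute", agg_time="minute")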
zipline_polygon_bundle/concat_all_aggs.py
CHANGED
@@ -1,7 +1,7 @@
 from .config import PolygonConfig

 import shutil
-from typing import Iterator, Tuple
+from typing import Iterator, Tuple, List, Union

 import argparse
 import glob
@@ -10,6 +10,7 @@ import os
 import pyarrow as pa
 from pyarrow import dataset as pa_ds
 from pyarrow import csv as pa_csv
+from pyarrow import compute as pa_compute

 import pandas as pd

@@ -34,7 +35,7 @@ def to_partition_key(s: str) -> str:


 def generate_tables_from_csv_files(
-    paths:
+    paths: Iterator[Union[str, os.PathLike]],
     schema: pa.Schema,
     start_timestamp: pd.Timestamp,
     limit_timestamp: pd.Timestamp,
@@ -57,7 +58,7 @@ def generate_tables_from_csv_files(
            quoted_strings_can_be_null=False,
        )

-        table =
+        table = pa_csv.read_csv(path, convert_options=convert_options)
         tables_read_count += 1
         table = table.set_column(
             table.column_names.index("window_start"),
@@ -75,10 +76,10 @@ def generate_tables_from_csv_files(
             ),
         )
         expr = (
-
+            pa_compute.field("window_start")
             >= pa.scalar(start_timestamp, type=schema.field("window_start").type)
         ) & (
-
+            pa_compute.field("window_start")
             < pa.scalar(
                 limit_timestamp,
                 type=schema.field("window_start").type,
@@ -101,22 +102,8 @@ def generate_tables_from_csv_files(

 def generate_csv_agg_tables(
     config: PolygonConfig,
-) -> Tuple[
+) -> Tuple[pa.Schema, Iterator[pa.Table]]:
     """zipline does bundle ingestion one ticker at a time."""
-    # We sort by path because they have the year and month in the dir names and the date in the filename.
-    paths = sorted(
-        list(
-            glob.glob(
-                os.path.join(config.aggs_dir, config.csv_paths_pattern),
-                recursive="**" in config.csv_paths_pattern,
-            )
-        )
-    )
-
-    print(f"{len(paths)=}")
-    if len(paths) > 0:
-        print(f"{paths[0]=}")
-        print(f"{paths[-1]=}")

     # Polygon Aggregate flatfile timestamps are in nanoseconds (like trades), not milliseconds as the docs say.
     # I make the timestamp timezone-aware because that's how Unix timestamps work and it may help avoid mistakes.
@@ -154,11 +141,11 @@ def generate_csv_agg_tables(
         pa.field(PARTITION_COLUMN_NAME, pa.string(), nullable=False)
     )

+    # TODO: Use generator like os.walk for paths.
     return (
-        paths,
         polygon_aggs_schema,
         generate_tables_from_csv_files(
-            paths=
+            paths=config.csv_paths(),
             schema=polygon_aggs_schema,
             start_timestamp=config.start_timestamp,
             limit_timestamp=config.end_timestamp + pd.to_timedelta(1, unit="day"),
@@ -176,11 +163,9 @@ def concat_all_aggs_from_csv(
     config: PolygonConfig,
     overwrite: bool = False,
 ) -> str:
-
+    schema, tables = generate_csv_agg_tables(config)

-
-        raise ValueError(f"No Polygon CSV flat files found in {config.aggs_dir=}")
-    by_ticker_aggs_arrow_dir = config.by_ticker_aggs_arrow_dir(paths[0], paths[-1])
+    by_ticker_aggs_arrow_dir = config.by_ticker_aggs_arrow_dir
     if os.path.exists(by_ticker_aggs_arrow_dir):
         if overwrite:
             print(f"Removing {by_ticker_aggs_arrow_dir=}")
@@ -212,10 +197,10 @@ if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("--calendar_name", default="XNYS")

-    parser.add_argument("--
-    parser.add_argument("--
-    # parser.add_argument("--
-    # parser.add_argument("--
+    parser.add_argument("--start_date", default="2014-06-16")
+    parser.add_argument("--end_date", default="2024-09-06")
+    # parser.add_argument("--start_date", default="2020-01-01")
+    # parser.add_argument("--end_date", default="2020-12-31")

     parser.add_argument("--agg_time", default="day")

@@ -235,8 +220,8 @@ if __name__ == "__main__":
     config = PolygonConfig(
         environ=os.environ,
         calendar_name=args.calendar_name,
-
-
+        start_date=args.start_date,
+        end_date=args.end_date,
         agg_time=args.agg_time,
     )

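With these changes `generate_csv_agg_tables` no longer globs and returns a path list; it returns `(schema, table_generator)`, with the CSV paths supplied lazily by `config.csv_paths()`. A consumption sketch mirroring the `schema, tables = generate_csv_agg_tables(config)` call above (dates and environment are illustrative):

    import os
    from zipline_polygon_bundle import PolygonConfig, generate_csv_agg_tables

    config = PolygonConfig(
        environ=os.environ,
        calendar_name="XNYS",
        start_date="2020-10-07",
        end_date="2020-10-15",
        agg_time="day",
    )
    schema, tables = generate_csv_agg_tables(config)
    for table in tables:   # one pyarrow.Table per flat-file CSV, filtered to the date range
        print(table.num_rows)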
zipline_polygon_bundle/concat_all_aggs_partitioned.py
CHANGED
@@ -138,10 +138,10 @@ if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("--calendar_name", default="XNYS")

-    parser.add_argument("--
-    parser.add_argument("--
-    # parser.add_argument("--
-    # parser.add_argument("--
+    parser.add_argument("--start_date", default="2014-06-16")
+    parser.add_argument("--end_date", default="2024-09-06")
+    # parser.add_argument("--start_date", default="2020-10-07")
+    # parser.add_argument("--end_date", default="2020-10-15")
     # parser.add_argument("--aggs_pattern", default="2020/10/**/*.csv.gz")
     parser.add_argument("--aggs_pattern", default="**/*.csv.gz")

@@ -163,8 +163,8 @@ if __name__ == "__main__":
     config = PolygonConfig(
         environ=os.environ,
         calendar_name=args.calendar_name,
-
-
+        start_date=args.start_date,
+        end_date=args.end_date,
     )

     concat_all_aggs_from_csv(
zipline_polygon_bundle/config.py
CHANGED
@@ -1,8 +1,13 @@
 from exchange_calendars.calendar_helpers import Date, parse_date, parse_timestamp
 from zipline.utils.calendar_utils import get_calendar

-import
+from typing import Iterator, Tuple, List
+
 import pandas as pd
+from pyarrow.fs import LocalFileSystem
+import os
+import re
+import fnmatch


 class PolygonConfig:
@@ -10,33 +15,35 @@ class PolygonConfig:
         self,
         environ: dict,
         calendar_name: str,
-
-
+        start_date: Date,
+        end_date: Date,
         agg_time: str = "day",
+        custom_aggs_format: str = "{config.agg_timedelta.seconds}sec_aggs",
     ):
-        if agg_time not in ["minute", "day"]:
-            raise ValueError(f"agg_time must be 'minute' or 'day', got '{agg_time}'")
         self.calendar_name = calendar_name
+        self.start_date = start_date
+        self.end_date = end_date
         self.start_timestamp = (
-            parse_date(
-            if
+            parse_date(start_date, calendar=self.calendar)
+            if start_date
             else self.calendar.first_session
         )
         self.end_timestamp = (
-            parse_date(
-            if
+            parse_date(end_date, calendar=self.calendar)
+            if end_date
             else self.calendar.last_session
         )
         self.max_workers = None
         if environ.get("POLYGON_MAX_WORKERS", "").strip() != "":
             self.max_workers = int(environ.get("POLYGON_MAX_WORKERS"))
         self.api_key = environ.get("POLYGON_API_KEY")
+        self.filesystem = LocalFileSystem()
         self.data_dir = environ.get("POLYGON_DATA_DIR", "data/files.polygon.io")
         self.cik_cusip_mapping_csv_path = environ.get(
             "CIK_CUSIP_MAPS_CSV", os.path.join(self.data_dir, "cik-cusip-maps.csv")
         )
-        self.asset_subdir = environ.get("POLYGON_ASSET_SUBDIR", "us_stocks_sip")
         self.market = environ.get("POLYGON_MARKET", "stocks")
+        self.asset_subdir = environ.get("POLYGON_ASSET_SUBDIR", "us_stocks_sip")
         self.tickers_dir = environ.get(
             "POLYGON_TICKERS_DIR",
             os.path.join(os.path.join(self.data_dir, "tickers"), self.asset_subdir),
@@ -51,25 +58,54 @@ class PolygonConfig:
         self.flat_files_dir = environ.get(
             "POLYGON_FLAT_FILES_DIR", os.path.join(self.data_dir, "flatfiles")
         )
-
-        self.
+        # TODO: Restore non-recusive option.  Always recursive for now.
+        self.csv_paths_pattern = environ.get(
+            # "POLYGON_FLAT_FILES_CSV_PATTERN", "**/*.csv.gz"
+            "POLYGON_FLAT_FILES_CSV_PATTERN", "*.csv.gz"
+        )
         self.asset_files_dir = os.path.join(self.flat_files_dir, self.asset_subdir)
         self.minute_aggs_dir = os.path.join(self.asset_files_dir, "minute_aggs_v1")
         self.day_aggs_dir = os.path.join(self.asset_files_dir, "day_aggs_v1")
-        self.
-
-
+        self.trades_dir = os.path.join(self.asset_files_dir, "trades_v1")
+        self.quotes_dir = os.path.join(self.asset_files_dir, "quotes_v1")
+
         # TODO: The "by ticker" files are temporary/intermediate and should/could be in the zipline data dir.
         self.minute_by_ticker_dir = os.path.join(
             self.asset_files_dir, "minute_by_ticker_v1"
         )
         self.day_by_ticker_dir = os.path.join(self.asset_files_dir, "day_by_ticker_v1")
-
-
-
-
+
+        if bool(re.match(r"^\d", agg_time)):
+            self.agg_timedelta = pd.to_timedelta(agg_time)
+            self.custom_asset_files_dir = environ.get(
+                "CUSTOM_ASSET_FILES_DIR", self.asset_files_dir
+            )
+            self.custom_aggs_dir = os.path.join(
+                self.custom_asset_files_dir, custom_aggs_format.format(config=self)
+            )
+            self.custom_aggs_by_ticker_dir = os.path.join(
+                self.custom_asset_files_dir,
+                (custom_aggs_format + "_by_ticker").format(config=self),
+            )
+            self.aggs_dir = self.custom_aggs_dir
+            self.by_ticker_dir = self.custom_aggs_by_ticker_dir
+        elif agg_time == "minute":
+            self.agg_timedelta = pd.to_timedelta("1minute")
+            self.aggs_dir = self.minute_aggs_dir
+            self.by_ticker_dir = self.minute_by_ticker_dir
+        elif agg_time == "day":
+            self.agg_timedelta = pd.to_timedelta("1day")
+            self.aggs_dir = self.day_aggs_dir
+            self.by_ticker_dir = self.day_by_ticker_dir
+        else:
+            raise ValueError(
+                f"agg_time must be 'minute', 'day', or a timedelta string; got '{agg_time=}'"
+            )
+        self.agg_time = agg_time
+
+        self.arrow_format = environ.get(
+            "POLYGON_ARROW_FORMAT", "parquet" if self.agg_time == "day" else "hive"
         )
-        self.arrow_format = environ.get("POLYGON_ARROW_FORMAT", "parquet" if self.agg_time == "day" else "hive")
         # self.by_ticker_hive_dir = os.path.join(
         #     self.by_ticker_dir,
         #     f"{self.agg_time}_{self.start_timestamp.date().isoformat()}_{self.end_timestamp.date().isoformat()}.hive",
@@ -88,15 +124,15 @@ class PolygonConfig:
         return os.path.join(
             ticker_year_dir, f"tickers_{date.date().isoformat()}.parquet"
         )
-
+
     def file_path_to_name(self, path: str):
+        # TODO: Use csv_paths_pattern to remove the suffixes
         return os.path.basename(path).removesuffix(".gz").removesuffix(".csv")

-
-
-
-
-        )
+    @property
+    def by_ticker_aggs_arrow_dir(self):
+        return os.path.join(self.by_ticker_dir,
+                            f"{self.start_timestamp.date().isoformat()}_{self.end_timestamp.date().isoformat()}.arrow")

     def api_cache_path(
         self, start_date: Date, end_date: Date, filename: str, extension=".parquet"
@@ -107,6 +143,43 @@ class PolygonConfig:
             self.cache_dir, f"{start_str}_{end_str}/{filename}{extension}"
         )

+    def csv_paths(self) -> Iterator[str]:
+        for root, dirnames, filenames in os.walk(self.aggs_dir, topdown=True):
+            if dirnames:
+                dirnames[:] = sorted(dirnames)
+            # Filter out filenames that don't match the pattern.
+            filenames = fnmatch.filter(filenames, self.csv_paths_pattern)
+            if filenames:
+                for filename in sorted(filenames):
+                    yield os.path.join(root, filename)
+
+    def find_first_and_last_aggs(self) -> Tuple[str, str]:
+        # Find the path to the lexically first and last paths in aggs_dir that matches csv_paths_pattern.
+        # Would like to use Path.walk(top_down=True) but it is only availble in Python 3.12+.
+        # This needs to be efficient because it is called on every init, even though we only need it for ingest.
+        # But we can't call it in ingest because the writer initializes and writes the metadata before it is called.
+        paths = []
+        for root, dirnames, filenames in os.walk(self.aggs_dir, topdown=True):
+            if dirnames:
+                # We only want first and last in each directory.
+                sorted_dirs = sorted(dirnames)
+                dirnames[:] = (
+                    [sorted_dirs[0], sorted_dirs[-1]]
+                    if len(sorted_dirs) > 1
+                    else sorted_dirs
+                )
+            # Filter out filenames that don't match the pattern.
+            filenames = fnmatch.filter(filenames, self.csv_paths_pattern)
+            if filenames:
+                filenames = sorted(filenames)
+                paths.append(os.path.join(root, filenames[0]))
+                if len(filenames) > 1:
+                    paths.append(os.path.join(root, filenames[-1]))
+        paths = sorted(paths)
+        if len(paths) < 2:
+            raise ValueError(f"Need more than one aggs file but found {len(paths)} paths in {self.aggs_dir}")
+        return self.file_path_to_name(paths[0]), self.file_path_to_name(paths[-1])
+

 if __name__ == "__main__":
     config = PolygonConfig(os.environ, "XNYS", "2003-10-01", "2023-01-01")
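Taken together, the `PolygonConfig` changes make `start_date`/`end_date` explicit constructor arguments (passing `None` falls back to the calendar's first or last session), let `agg_time` also be a timedelta string that selects the custom-aggregate directories, and add lazy file-walking helpers. A small sketch based on the code above (directory values depend on `POLYGON_DATA_DIR` and are illustrative):

    import os
    from zipline_polygon_bundle import PolygonConfig

    config = PolygonConfig(
        environ=os.environ,
        calendar_name="XNYS",
        start_date=None,        # None -> calendar.first_session
        end_date=None,          # None -> calendar.last_session
        agg_time="1minute",     # a leading digit selects the custom-aggs branch (e.g. 60sec_aggs dirs)
    )
    print(config.aggs_dir)
    first_name, last_name = config.find_first_and_last_aggs()  # raises if fewer than two files found
    for path in config.csv_paths():   # sorted, lazy walk of aggs_dir matching csv_paths_pattern
        print(path)
        break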
zipline_polygon_bundle/polygon_file_reader.py
CHANGED
@@ -99,6 +99,6 @@ def process_all_csv_to_parquet(
 if __name__ == "__main__":
     # os.environ["POLYGON_DATA_DIR"] = "/Volumes/Oahu/Mirror/files.polygon.io"
     config = PolygonConfig(
-        environ=os.environ, calendar_name="XNYS",
+        environ=os.environ, calendar_name="XNYS", start_date=None, end_date=None
     )
     process_all_csv_to_parquet(config.aggs_dir)
zipline_polygon_bundle/process_all_aggs.py
CHANGED
@@ -74,8 +74,8 @@ if __name__ == "__main__":
     config = PolygonConfig(
         environ=os.environ,
         calendar_name="XNYS",
-
-
+        start_date="2020-10-07",
+        end_date="2020-10-15",
     )
     print(f"{config.aggs_dir=}")
     max_ticker_lens = apply_to_all_aggs(