zipline_polygon_bundle 0.1.7__py3-none-any.whl → 0.2.0.dev1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- zipline_polygon_bundle/__init__.py +31 -1
- zipline_polygon_bundle/adjustments.py +34 -0
- zipline_polygon_bundle/bundle.py +183 -34
- zipline_polygon_bundle/concat_all_aggs.py +18 -53
- zipline_polygon_bundle/concat_all_aggs_partitioned.py +6 -6
- zipline_polygon_bundle/config.py +132 -26
- zipline_polygon_bundle/nyse_all_hours_calendar.py +25 -0
- zipline_polygon_bundle/polygon_file_reader.py +1 -1
- zipline_polygon_bundle/process_all_aggs.py +2 -2
- zipline_polygon_bundle/quotes.py +101 -0
- zipline_polygon_bundle/tickers_and_names.py +5 -38
- zipline_polygon_bundle/trades.py +944 -0
- {zipline_polygon_bundle-0.1.7.dist-info → zipline_polygon_bundle-0.2.0.dev1.dist-info}/METADATA +6 -3
- zipline_polygon_bundle-0.2.0.dev1.dist-info/RECORD +17 -0
- zipline_polygon_bundle-0.1.7.dist-info/RECORD +0 -14
- {zipline_polygon_bundle-0.1.7.dist-info → zipline_polygon_bundle-0.2.0.dev1.dist-info}/LICENSE +0 -0
- {zipline_polygon_bundle-0.1.7.dist-info → zipline_polygon_bundle-0.2.0.dev1.dist-info}/WHEEL +0 -0
zipline_polygon_bundle/__init__.py
CHANGED
@@ -6,11 +6,21 @@ from .bundle import (
 )
 
 from .config import PolygonConfig
+from .nyse_all_hours_calendar import NYSE_ALL_HOURS, register_nyse_all_hours_calendar
 from .concat_all_aggs import concat_all_aggs_from_csv, generate_csv_agg_tables
-from .adjustments import load_splits, load_dividends
+from .adjustments import load_splits, load_dividends, load_conditions
+from .trades import trades_schema, trades_dataset, cast_trades, date_to_path
+from .trades import custom_aggs_partitioning, custom_aggs_schema, trades_to_custom_aggs, convert_trades_to_custom_aggs
+from .trades import get_custom_aggs_dates, generate_csv_trades_tables, compute_signals_for_all_custom_aggs
+from .quotes import quotes_schema, quotes_dataset, cast_quotes
+# from .tickers_and_names import load_all_tickers, merge_tickers, ticker_names_from_merged_tickers, get_ticker_universe
+from .tickers_and_names import PolygonAssets, get_ticker_universe
+
 
 __all__ = [
     "register_polygon_equities_bundle",
+    "register_nyse_all_hours_calendar",
+    "NYSE_ALL_HOURS",
     "symbol_to_upper",
     "polygon_equities_bundle_day",
     "polygon_equities_bundle_minute",
@@ -19,4 +29,24 @@ __all__ = [
     "generate_csv_agg_tables",
     "load_splits",
     "load_dividends",
+    "load_conditions",
+    "trades_schema",
+    "trades_dataset",
+    "cast_trades",
+    "date_to_path",
+    "get_custom_aggs_dates",
+    "generate_csv_trades_tables",
+    "custom_aggs_partitioning",
+    "custom_aggs_schema",
+    "trades_to_custom_aggs",
+    "convert_trades_to_custom_aggs",
+    "compute_signals_for_all_custom_aggs",
+    "quotes_schema",
+    "quotes_dataset",
+    "cast_quotes",
+    # "load_all_tickers",
+    # "merge_tickers",
+    # "ticker_names_from_merged_tickers",
+    "PolygonAssets",
+    "get_ticker_universe",
 ]
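Taken together, these two hunks widen the package's public API from aggregates-only ingestion to trades, quotes, condition codes, and the new 24-hour calendar. A minimal import sketch against the new __all__ (only the imported names come from this diff; the bundle name is hypothetical, and this assumes NYSE_ALL_HOURS is the calendar-name string, as its pairing with the calendar registration suggests):

    from zipline_polygon_bundle import (
        NYSE_ALL_HOURS,
        register_polygon_equities_bundle,
    )

    # Hypothetical wiring for an extension.py: a trades-derived
    # 1-minute bundle on the 24-hour calendar.
    register_polygon_equities_bundle(
        "polygon-trades",  # hypothetical bundle name
        calendar_name=NYSE_ALL_HOURS,
        agg_time="1min",
    )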
zipline_polygon_bundle/adjustments.py
CHANGED
@@ -153,3 +153,37 @@ def load_dividends(
     return dividends[
         ["sid", "ex_date", "declared_date", "record_date", "pay_date", "amount"]
     ]
+
+
+def load_conditions(config: PolygonConfig) -> pd.DataFrame:
+    # The API doesn't use dates for the condition codes but this is a way to provide control over caching.
+    # Main thing is to get the current conditions list but we don't want to call more than once a day.
+    conditions_path = config.api_cache_path(
+        start_date=config.start_timestamp.date(), end_date=config.end_timestamp.date(), filename="conditions"
+    )
+    expected_conditions_count = 100
+    if not os.path.exists(conditions_path):
+        client = polygon.RESTClient(api_key=config.api_key)
+        conditions_response = client.list_conditions(
+            limit=1000,
+        )
+        if conditions_response is HTTPResponse:
+            raise ValueError(f"Polygon.list_splits bad HTTPResponse: {conditions_response}")
+        conditions = pd.DataFrame(conditions_response)
+        print(f"Got {len(conditions)=} from Polygon list_conditions.")
+        os.makedirs(os.path.dirname(conditions_path), exist_ok=True)
+        conditions.to_parquet(conditions_path)
+        if len(conditions) < expected_conditions_count:
+            logging.warning(
+                f"Only got {len(conditions)=} from Polygon list_splits (expected {expected_conditions_count=}). "
+            )
+    # We will always load from the file to avoid any chance of weird errors.
+    if os.path.exists(conditions_path):
+        conditions = pd.read_parquet(conditions_path)
+        print(f"Loaded {len(conditions)=} from {conditions_path}")
+        if len(conditions) < expected_conditions_count:
+            logging.warning(
+                f"Only got {len(conditions)=} from cached conditions (expected {expected_conditions_count=}). "
+            )
+        return conditions
+    raise ValueError(f"Failed to load splits from {conditions_path}")
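Review note: "conditions_response is HTTPResponse" is an identity comparison against the class object itself, so it can never be true for a response instance, and the messages still say "list_splits"/"splits", apparently carried over from load_splits. The guard presumably intends an instance check, something like:

    # Presumed intent of the guard above (a sketch, not what ships in 0.2.0.dev1):
    if isinstance(conditions_response, HTTPResponse):
        raise ValueError(f"Polygon list_conditions bad HTTPResponse: {conditions_response}")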
zipline_polygon_bundle/bundle.py
CHANGED
@@ -1,18 +1,23 @@
+import os
 from zipline.data.bundles import register
 from zipline.data.resample import minute_frame_to_session_frame
 
-from .config import PolygonConfig
+from exchange_calendars.calendar_helpers import parse_date
+from exchange_calendars.calendar_utils import get_calendar
+
 from .concat_all_aggs import concat_all_aggs_from_csv, generate_csv_agg_tables
 from .adjustments import load_splits, load_dividends
+from .config import PolygonConfig
+from .nyse_all_hours_calendar import register_nyse_all_hours_calendar
+from .trades import convert_trades_to_custom_aggs, scatter_custom_aggs_to_by_ticker
 
 import pyarrow
 import pyarrow.compute
+import pyarrow.dataset
 
 import pandas as pd
 import logging
 
-import concurrent.futures
-
 
 # TODO: Change warnings to be relative to number of days in the range.
 
@@ -26,7 +31,7 @@ def symbol_to_upper(s: str) -> str:
 def generate_all_agg_tables_from_csv(
     config: PolygonConfig,
 ):
-    paths, schema, tables = generate_csv_agg_tables(config)
+    schema, tables = generate_csv_agg_tables(config)
     for table in tables:
         table = table.sort_by([("ticker", "ascending"), ("window_start", "ascending")])
         yield table
@@ -175,8 +180,8 @@ def polygon_equities_bundle_day(
     daily_bar_writer,
     adjustment_writer,
     calendar,
-    start_session,
-    end_session,
+    start_date,
+    end_date,
     cache,
     show_progress,
     output_dir,
@@ -184,8 +189,8 @@ def polygon_equities_bundle_day(
     config = PolygonConfig(
         environ=environ,
         calendar_name=calendar.name,
-        start_session=start_session,
-        end_session=end_session,
+        start_date=start_date,
+        end_date=end_date,
         agg_time="day",
     )
 
@@ -206,7 +211,19 @@ def polygon_equities_bundle_day(
         )
     )
 
-    table = aggregates.to_table()
+    # Only get the columns Zipline allows.
+    table = aggregates.to_table(
+        columns=[
+            "ticker",
+            "window_start",
+            "open",
+            "high",
+            "low",
+            "close",
+            "volume",
+            "transactions",
+        ]
+    )
     table = rename_polygon_to_zipline(table, "day")
     # Get all the symbols in the table by using value_counts to tabulate the unique values.
     # pyarrow.Table.column returns a pyarrow.ChunkedArray.
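The explicit columns= list above projects the scan down to the OHLCV fields Zipline's writers accept; with pyarrow datasets the projection is applied at scan time, so the other flatfile columns are never materialized. A standalone sketch of the same pattern (the dataset path is hypothetical):

    import pyarrow.dataset

    ZIPLINE_COLUMNS = ["ticker", "window_start", "open", "high", "low", "close", "volume", "transactions"]

    aggs = pyarrow.dataset.dataset("by_ticker_aggs")  # hypothetical path
    table = aggs.to_table(columns=ZIPLINE_COLUMNS)    # only these eight columns are read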
@@ -219,7 +236,7 @@ def polygon_equities_bundle_day(
     daily_bar_writer.write(
         process_day_aggregates(
             table=table,
-            sessions=calendar.sessions_in_range(start_session, end_session),
+            sessions=calendar.sessions_in_range(start_date, end_date),
             metadata=metadata,
             calendar=calendar,
             symbol_to_sid=symbol_to_sid,
@@ -251,7 +268,19 @@ def process_minute_fragment(
     dates_with_data: set,
     agg_time: str,
 ):
-    table = fragment.to_table()
+    # Only get the columns Zipline allows.
+    table = fragment.to_table(
+        columns=[
+            "ticker",
+            "window_start",
+            "open",
+            "high",
+            "low",
+            "close",
+            "volume",
+            "transactions",
+        ]
+    )
     print(f" {table.num_rows=}")
     table = rename_polygon_to_zipline(table, "timestamp")
     table = table.sort_by([("symbol", "ascending"), ("timestamp", "ascending")])
@@ -400,8 +429,8 @@ def polygon_equities_bundle_minute(
     daily_bar_writer,
     adjustment_writer,
     calendar,
-    start_session,
-    end_session,
+    start_date,
+    end_date,
     cache,
     show_progress,
     output_dir,
@@ -409,8 +438,8 @@ def polygon_equities_bundle_minute(
     config = PolygonConfig(
         environ=environ,
         calendar_name=calendar.name,
-        start_session=start_session,
-        end_session=end_session,
+        start_date=start_date,
+        end_date=end_date,
         agg_time="minute",
     )
 
@@ -444,8 +473,99 @@ def polygon_equities_bundle_minute(
     daily_bar_writer.write(
         process_minute_aggregates(
             fragments=aggregates.get_fragments(),
-            sessions=calendar.sessions_in_range(start_session, end_session),
-            minutes=calendar.sessions_minutes(start_session, end_session),
+            sessions=calendar.sessions_in_range(start_date, end_date),
+            minutes=calendar.sessions_minutes(start_date, end_date),
+            metadata=metadata,
+            calendar=calendar,
+            symbol_to_sid=symbol_to_sid,
+            dates_with_data=dates_with_data,
+            agg_time="day",
+        ),
+        show_progress=show_progress,
+    )
+
+    # Get data for all stocks and write to Zipline
+    minute_bar_writer.write(
+        process_minute_aggregates(
+            fragments=aggregates.get_fragments(),
+            sessions=calendar.sessions_in_range(start_date, end_date),
+            minutes=calendar.sessions_minutes(start_date, end_date),
+            metadata=metadata,
+            calendar=calendar,
+            symbol_to_sid=symbol_to_sid,
+            dates_with_data=dates_with_data,
+            agg_time="minute",
+        ),
+        show_progress=show_progress,
+    )
+
+    # Write the metadata
+    asset_db_writer.write(equities=metadata)
+
+    # Load splits and dividends
+    first_start_end = min(dates_with_data)
+    last_end_date = max(dates_with_data)
+    splits = load_splits(config, first_start_end, last_end_date, symbol_to_sid)
+    dividends = load_dividends(config, first_start_end, last_end_date, symbol_to_sid)
+
+    # Write splits and dividends
+    adjustment_writer.write(splits=splits, dividends=dividends)
+
+
+def polygon_equities_bundle_trades(
+    environ,
+    asset_db_writer,
+    minute_bar_writer,
+    daily_bar_writer,
+    adjustment_writer,
+    calendar,
+    start_date,
+    end_date,
+    cache,
+    show_progress,
+    output_dir,
+):
+    # TODO: Support agg durations other than `1min`.
+    config = PolygonConfig(
+        environ=environ,
+        calendar_name=calendar.name,
+        start_date=start_date,
+        end_date=end_date,
+        agg_time="1min",
+    )
+
+    convert_trades_to_custom_aggs(config, overwrite=False)
+    by_ticker_aggs_arrow_dir = scatter_custom_aggs_to_by_ticker(config)
+    aggregates = pyarrow.dataset.dataset(by_ticker_aggs_arrow_dir)
+    # 3.5 billion rows for 10 years of minute data.
+    # print(f"{aggregates.count_rows()=}")
+    # Can't sort the dataset because that reads it all into memory.
+    # aggregates = aggregates.sort_by([("ticker", "ascending"), ("window_start", "ascending")])
+    # print("Sorted")
+
+    # Zipline uses case-insensitive symbols, so we need to convert them to uppercase with a ^ prefix when lowercase.
+    # This is because the SQL schema zipline uses for symbols ignores case.
+    # We put the original symbol in the asset_name field.
+    metadata = pd.DataFrame(
+        columns=(
+            "start_date",
+            "end_date",
+            "auto_close_date",
+            "symbol",
+            "exchange",
+            "asset_name",
+        )
+    )
+
+    symbol_to_sid = {}
+    dates_with_data = set()
+
+    # Get data for all stocks and write to Zipline
+    daily_bar_writer.write(
+        process_minute_aggregates(
+            fragments=aggregates.get_fragments(),
+            sessions=calendar.sessions_in_range(start_date, end_date),
+            minutes=calendar.sessions_minutes(start_date, end_date),
             metadata=metadata,
             calendar=calendar,
             symbol_to_sid=symbol_to_sid,
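The new trades path runs in three stages: convert raw trades flatfiles into 1-minute custom aggregates, scatter those into a by-ticker Arrow dataset, then feed it through the same process_minute_aggregates writers the minute-aggs bundle uses. A sketch of driving just the conversion steps directly, using only names this diff imports (the calendar choice and date range are hypothetical):

    import os

    config = PolygonConfig(
        environ=os.environ,
        calendar_name="XNYS",     # hypothetical calendar choice
        start_date="2023-01-01",  # hypothetical range
        end_date="2023-12-31",
        agg_time="1min",
    )
    convert_trades_to_custom_aggs(config, overwrite=False)
    by_ticker_dir = scatter_custom_aggs_to_by_ticker(config)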
@@ -459,8 +579,8 @@ def polygon_equities_bundle_minute(
     minute_bar_writer.write(
         process_minute_aggregates(
             fragments=aggregates.get_fragments(),
-            sessions=calendar.sessions_in_range(start_session, end_session),
-            minutes=calendar.sessions_minutes(start_session, end_session),
+            sessions=calendar.sessions_in_range(start_date, end_date),
+            minutes=calendar.sessions_minutes(start_date, end_date),
             metadata=metadata,
             calendar=calendar,
             symbol_to_sid=symbol_to_sid,
@@ -485,25 +605,54 @@
 
 def register_polygon_equities_bundle(
     bundlename,
-
-
+    start_date=None,
+    end_date=None,
     calendar_name="XNYS",
     agg_time="day",
     # ticker_list=None,
     # watchlists=None,
     # include_asset_types=None,
 ):
-
-
+    register_nyse_all_hours_calendar()
+
+    if agg_time not in ["day", "minute", "1min"]:
+        raise ValueError(
+            f"agg_time must be 'day', 'minute' (aggs), or '1min' (trades), not '{agg_time}'"
+        )
+
+    # We need to know the start and end dates of the session before the bundle is
+    # registered because even though we only need it for ingest, the metadata in
+    # the writer is initialized and written before our ingest function is called.
+    if start_date is None or end_date is None:
+        config = PolygonConfig(
+            environ=os.environ,
+            calendar_name=calendar_name,
+            start_date=start_date,
+            end_date=end_date,
+            agg_time=agg_time,
+        )
+        first_aggs_date, last_aggs_date = config.find_first_and_last_aggs(
+            config.aggs_dir if agg_time in ["day", "minute"] else config.trades_dir,
+            config.csv_paths_pattern,
+        )
+        if start_date is None:
+            start_date = first_aggs_date
+        if end_date is None:
+            end_date = last_aggs_date
+
     register(
         bundlename,
         (
-            polygon_equities_bundle_minute
-            if agg_time == "minute"
-            else polygon_equities_bundle_day
+            polygon_equities_bundle_day
+            if agg_time == "day"
+            else (
+                polygon_equities_bundle_minute
+                if agg_time == "minute"
+                else polygon_equities_bundle_trades
+            )
         ),
-        start_session=start_session,
-        end_session=end_session,
+        start_session=parse_date(start_date, raise_oob=False) if start_date else None,
+        end_session=parse_date(end_date, raise_oob=False) if end_date else None,
         calendar_name=calendar_name,
         # minutes_per_day=390,
         # create_writers=True,
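With the defaulting above, registration no longer requires an explicit date range: when start_date/end_date are omitted, find_first_and_last_aggs probes the flatfiles (aggs_dir for day/minute, trades_dir for 1min) so Zipline's writer metadata gets a real session range before ingest runs. A usage sketch (bundle names and the explicit range are hypothetical):

    register_polygon_equities_bundle("polygon-day")  # day aggs, dates inferred from flatfiles
    register_polygon_equities_bundle(
        "polygon-trades-1min",
        start_date="2020-01-02",  # hypothetical explicit range
        end_date="2020-12-31",
        agg_time="1min",
    )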
@@ -517,12 +666,12 @@ def register_polygon_equities_bundle(
 # config = PolygonConfig(
 #     environ=os.environ,
 #     calendar_name="XNYS",
-#     # start_session="2003-10-01",
-#     # start_session="2018-01-01",
-#     start_session="2023-01-01",
-#     # end_session="2023-01-12",
-#     end_session="2023-12-31",
-#     # end_session="2024-06-30",
+#     # start_date="2003-10-01",
+#     # start_date="2018-01-01",
+#     start_date="2023-01-01",
+#     # end_date="2023-01-12",
+#     end_date="2023-12-31",
+#     # end_date="2024-06-30",
 # )
 # splits = load_polygon_splits(config)
 # splits.info()
zipline_polygon_bundle/concat_all_aggs.py
CHANGED
@@ -1,40 +1,21 @@
-from .config import PolygonConfig
+from .config import PolygonConfig, PARTITION_COLUMN_NAME, to_partition_key
 
 import shutil
-from typing import Iterator, Tuple
+from typing import Iterator, Tuple, List, Union
 
 import argparse
-import glob
 import os
 
 import pyarrow as pa
 from pyarrow import dataset as pa_ds
 from pyarrow import csv as pa_csv
+from pyarrow import compute as pa_compute
 
 import pandas as pd
 
 
-PARTITION_COLUMN_NAME = "part"
-PARTITION_KEY_LENGTH = 2
-
-
-def to_partition_key(s: str) -> str:
-    """
-    Partition key is low cardinality and must be filesystem-safe.
-    The reason for partitioning is to keep the parquet files from getting too big.
-    10 years of minute aggs for US stocks is 83GB gzipped. A single parquet would be 62GB on disk.
-    Currently the first two characters so files stay under 1GB. Weird characters are replaced with "A".
-    """
-    k = (s + "A")[0:PARTITION_KEY_LENGTH].upper()
-    if k.isalpha():
-        return k
-    # Replace non-alpha characters with "A".
-    k = "".join([c if c.isalpha() else "A" for c in k])
-    return k
-
-
 def generate_tables_from_csv_files(
-    paths: list,
+    paths: Iterator[Union[str, os.PathLike]],
     schema: pa.Schema,
     start_timestamp: pd.Timestamp,
     limit_timestamp: pd.Timestamp,
@@ -57,7 +38,7 @@ def generate_tables_from_csv_files(
             quoted_strings_can_be_null=False,
         )
 
-        table = pa.csv.read_csv(path, convert_options=convert_options)
+        table = pa_csv.read_csv(path, convert_options=convert_options)
         tables_read_count += 1
         table = table.set_column(
             table.column_names.index("window_start"),
@@ -75,10 +56,10 @@ def generate_tables_from_csv_files(
             ),
         )
         expr = (
-            pa.compute.field("window_start")
+            pa_compute.field("window_start")
             >= pa.scalar(start_timestamp, type=schema.field("window_start").type)
         ) & (
-            pa.compute.field("window_start")
+            pa_compute.field("window_start")
             < pa.scalar(
                 limit_timestamp,
                 type=schema.field("window_start").type,
@@ -101,22 +82,8 @@
 
 def generate_csv_agg_tables(
     config: PolygonConfig,
-) -> Tuple[list, pa.Schema, Iterator[pa.Table]]:
+) -> Tuple[pa.Schema, Iterator[pa.Table]]:
     """zipline does bundle ingestion one ticker at a time."""
-    # We sort by path because they have the year and month in the dir names and the date in the filename.
-    paths = sorted(
-        list(
-            glob.glob(
-                os.path.join(config.aggs_dir, config.csv_paths_pattern),
-                recursive="**" in config.csv_paths_pattern,
-            )
-        )
-    )
-
-    print(f"{len(paths)=}")
-    if len(paths) > 0:
-        print(f"{paths[0]=}")
-        print(f"{paths[-1]=}")
 
     # Polygon Aggregate flatfile timestamps are in nanoseconds (like trades), not milliseconds as the docs say.
     # I make the timestamp timezone-aware because that's how Unix timestamps work and it may help avoid mistakes.
@@ -154,11 +121,11 @@ def generate_csv_agg_tables(
         pa.field(PARTITION_COLUMN_NAME, pa.string(), nullable=False)
     )
 
+    # TODO: Use generator like os.walk for paths.
     return (
-        paths,
         polygon_aggs_schema,
         generate_tables_from_csv_files(
-            paths=paths,
+            paths=config.csv_paths(),
            schema=polygon_aggs_schema,
            start_timestamp=config.start_timestamp,
            limit_timestamp=config.end_timestamp + pd.to_timedelta(1, unit="day"),
@@ -176,11 +143,9 @@ def concat_all_aggs_from_csv(
     config: PolygonConfig,
     overwrite: bool = False,
 ) -> str:
-    paths, schema, tables = generate_csv_agg_tables(config)
+    schema, tables = generate_csv_agg_tables(config)
 
-    if len(paths) < 1:
-        raise ValueError(f"No Polygon CSV flat files found in {config.aggs_dir=}")
-    by_ticker_aggs_arrow_dir = config.by_ticker_aggs_arrow_dir(paths[0], paths[-1])
+    by_ticker_aggs_arrow_dir = config.by_ticker_aggs_arrow_dir
     if os.path.exists(by_ticker_aggs_arrow_dir):
         if overwrite:
             print(f"Removing {by_ticker_aggs_arrow_dir=}")
@@ -212,10 +177,10 @@ if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("--calendar_name", default="XNYS")
 
-    parser.add_argument("--start_session", default="2014-06-16")
-    parser.add_argument("--end_session", default="2024-09-06")
-    # parser.add_argument("--start_session", default="2020-01-01")
-    # parser.add_argument("--end_session", default="2020-12-31")
+    parser.add_argument("--start_date", default="2014-06-16")
+    parser.add_argument("--end_date", default="2024-09-06")
+    # parser.add_argument("--start_date", default="2020-01-01")
+    # parser.add_argument("--end_date", default="2020-12-31")
 
     parser.add_argument("--agg_time", default="day")
 
@@ -235,8 +200,8 @@ if __name__ == "__main__":
     config = PolygonConfig(
         environ=os.environ,
         calendar_name=args.calendar_name,
-        start_session=args.start_session,
-        end_session=args.end_session,
+        start_date=args.start_date,
+        end_date=args.end_date,
         agg_time=args.agg_time,
     )
 
zipline_polygon_bundle/concat_all_aggs_partitioned.py
CHANGED
@@ -138,10 +138,10 @@ if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("--calendar_name", default="XNYS")
 
-    parser.add_argument("--start_session", default="2014-06-16")
-    parser.add_argument("--end_session", default="2024-09-06")
-    # parser.add_argument("--start_session", default="2020-10-07")
-    # parser.add_argument("--end_session", default="2020-10-15")
+    parser.add_argument("--start_date", default="2014-06-16")
+    parser.add_argument("--end_date", default="2024-09-06")
+    # parser.add_argument("--start_date", default="2020-10-07")
+    # parser.add_argument("--end_date", default="2020-10-15")
     # parser.add_argument("--aggs_pattern", default="2020/10/**/*.csv.gz")
     parser.add_argument("--aggs_pattern", default="**/*.csv.gz")
 
@@ -163,8 +163,8 @@ if __name__ == "__main__":
     config = PolygonConfig(
         environ=os.environ,
         calendar_name=args.calendar_name,
-        start_session=args.start_session,
-        end_session=args.end_session,
+        start_date=args.start_date,
+        end_date=args.end_date,
     )
 
     concat_all_aggs_from_csv(