zipline_polygon_bundle 0.2.0.dev1__py3-none-any.whl → 0.2.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,16 +1,20 @@
1
1
  from exchange_calendars.calendar_helpers import Date, parse_date
2
- from exchange_calendars.calendar_utils import get_calendar
2
+ from zipline.utils.calendar_utils import get_calendar
3
3
 
4
4
  from .nyse_all_hours_calendar import NYSE_ALL_HOURS
5
5
 
6
- from typing import Iterator, Tuple
6
+ from typing import Iterator, Mapping, Tuple
7
7
 
8
8
  import pandas as pd
9
9
  from pyarrow.fs import LocalFileSystem
10
10
  import os
11
+ import datetime
11
12
  import re
12
13
  import fnmatch
13
14
 
15
+ AGG_TIME_DAY = "day"
16
+ AGG_TIME_MINUTE = "minute"
17
+ AGG_TIME_TRADES = "1minute"
14
18
 
15
19
  PARTITION_COLUMN_NAME = "part"
16
20
  PARTITION_KEY_LENGTH = 2
@@ -34,16 +38,16 @@ def to_partition_key(s: str) -> str:
34
38
  class PolygonConfig:
35
39
  def __init__(
36
40
  self,
37
- environ: dict,
41
+ environ: Mapping[str, str],
38
42
  calendar_name: str,
39
43
  start_date: Date,
40
44
  end_date: Date,
41
- agg_time: str = "day",
42
- custom_aggs_format: str = "{config.agg_timedelta.seconds}sec_aggs",
45
+ agg_time: str = AGG_TIME_DAY,
43
46
  ):
44
47
  self.calendar_name = calendar_name
45
48
  self.start_date = start_date
46
49
  self.end_date = end_date
50
+ # TODO: We can't use PolygonConfig.calendar because it gets these properties for start/end session.
47
51
  self.start_timestamp = (
48
52
  parse_date(start_date, calendar=self.calendar)
49
53
  if start_date
@@ -54,6 +58,8 @@ class PolygonConfig:
54
58
  if end_date
55
59
  else self.calendar.last_session
56
60
  )
61
+ self.agg_time = agg_time
62
+
57
63
  self.max_workers = None
58
64
  if environ.get("POLYGON_MAX_WORKERS", "").strip() != "":
59
65
  self.max_workers = int(environ.get("POLYGON_MAX_WORKERS"))
@@ -65,17 +71,6 @@ class PolygonConfig:
65
71
  )
66
72
  self.market = environ.get("POLYGON_MARKET", "stocks")
67
73
  self.asset_subdir = environ.get("POLYGON_ASSET_SUBDIR", "us_stocks_sip")
68
- self.tickers_dir = environ.get(
69
- "POLYGON_TICKERS_DIR",
70
- os.path.join(os.path.join(self.data_dir, "tickers"), self.asset_subdir),
71
- )
72
- self.tickers_csv_path = environ.get(
73
- "POLYGON_TICKERS_CSV",
74
- os.path.join(
75
- self.tickers_dir,
76
- f"tickers_{self.start_timestamp.date().isoformat()}_{self.end_timestamp.date().isoformat()}.csv",
77
- ),
78
- )
79
74
  self.flat_files_dir = environ.get(
80
75
  "POLYGON_FLAT_FILES_DIR", os.path.join(self.data_dir, "flatfiles")
81
76
  )
@@ -92,54 +87,78 @@ class PolygonConfig:
92
87
  self.quotes_dir = os.path.join(self.asset_files_dir, "quotes_v1")
93
88
 
94
89
  # TODO: The "by ticker" files are temporary/intermediate and should/could be in the zipline data dir.
90
+ self.custom_asset_files_dir = environ.get(
91
+ "CUSTOM_ASSET_FILES_DIR", self.asset_files_dir
92
+ )
93
+ self.tickers_dir = environ.get(
94
+ "POLYGON_TICKERS_DIR",
95
+ os.path.join(self.custom_asset_files_dir, "tickers"),
96
+ )
97
+ self.tickers_csv_path = environ.get(
98
+ "POLYGON_TICKERS_CSV",
99
+ os.path.join(
100
+ self.tickers_dir,
101
+ f"tickers_{self.start_timestamp.date().isoformat()}_{self.end_timestamp.date().isoformat()}.csv",
102
+ ),
103
+ )
104
+
105
+ self.cache_dir = os.path.join(self.custom_asset_files_dir, "api_cache")
106
+
107
+ self.lock_file_path = os.path.join(self.custom_asset_files_dir, "ingest.lock")
108
+ self.custom_aggs_dates_path = os.path.join(self.custom_asset_files_dir, "aggs_dates.json")
109
+ self.by_ticker_dates_path = os.path.join(self.custom_asset_files_dir, "by_ticker_dates.json")
110
+
95
111
  self.minute_by_ticker_dir = os.path.join(
96
- self.asset_files_dir, "minute_by_ticker_v1"
112
+ self.custom_asset_files_dir, "minute_by_ticker_v1"
113
+ )
114
+ self.day_by_ticker_dir = os.path.join(
115
+ self.custom_asset_files_dir, "day_by_ticker_v1"
97
116
  )
98
- self.day_by_ticker_dir = os.path.join(self.asset_files_dir, "day_by_ticker_v1")
99
117
 
118
+ # If agg_time begins with a digit, it is a timedelta string and we're using custom aggs from trades.
100
119
  if bool(re.match(r"^\d", agg_time)):
101
120
  self.agg_timedelta = pd.to_timedelta(agg_time)
102
- self.custom_asset_files_dir = environ.get(
103
- "CUSTOM_ASSET_FILES_DIR", self.asset_files_dir
121
+ self.csv_files_dir = self.trades_dir
122
+ self.custom_aggs_name_format = environ.get(
123
+ "CUSTOM_AGGS_NAME_FORMAT", "{config.agg_timedelta.seconds}sec_aggs"
104
124
  )
105
- self.custom_aggs_dir = os.path.join(
106
- self.custom_asset_files_dir, custom_aggs_format.format(config=self)
125
+ self.aggs_dir = os.path.join(
126
+ self.custom_asset_files_dir,
127
+ self.custom_aggs_name_format.format(config=self),
107
128
  )
108
- self.custom_aggs_by_ticker_dir = os.path.join(
129
+ self.by_ticker_dir = os.path.join(
109
130
  self.custom_asset_files_dir,
110
- (custom_aggs_format + "_by_ticker").format(config=self),
131
+ (self.custom_aggs_name_format + "_by_ticker").format(config=self),
111
132
  )
112
- self.aggs_dir = self.custom_aggs_dir
113
- self.by_ticker_dir = self.custom_aggs_by_ticker_dir
114
- elif agg_time == "minute":
133
+ elif agg_time == AGG_TIME_MINUTE:
115
134
  self.agg_timedelta = pd.to_timedelta("1minute")
116
135
  self.aggs_dir = self.minute_aggs_dir
136
+ self.csv_files_dir = self.aggs_dir
117
137
  self.by_ticker_dir = self.minute_by_ticker_dir
118
- elif agg_time == "day":
138
+ elif agg_time == AGG_TIME_DAY:
119
139
  self.agg_timedelta = pd.to_timedelta("1day")
120
140
  self.aggs_dir = self.day_aggs_dir
141
+ self.csv_files_dir = self.aggs_dir
121
142
  self.by_ticker_dir = self.day_by_ticker_dir
122
143
  else:
123
144
  raise ValueError(
124
145
  f"agg_time must be 'minute', 'day', or a timedelta string; got '{agg_time=}'"
125
146
  )
126
- self.agg_time = agg_time
127
147
 
128
148
  self.arrow_format = environ.get(
129
- "POLYGON_ARROW_FORMAT", "parquet" if self.agg_time == "day" else "hive"
149
+ "POLYGON_ARROW_FORMAT", "parquet" if self.agg_time == AGG_TIME_DAY else "hive"
130
150
  )
131
151
  # self.by_ticker_hive_dir = os.path.join(
132
152
  # self.by_ticker_dir,
133
153
  # f"{self.agg_time}_{self.start_timestamp.date().isoformat()}_{self.end_timestamp.date().isoformat()}.hive",
134
154
  # )
135
- self.cache_dir = os.path.join(self.asset_files_dir, "api_cache")
136
155
 
137
156
  @property
138
157
  def calendar(self):
139
- # If you don't give a start date you'll only get 20 years from today.
140
- if self.calendar_name in [NYSE_ALL_HOURS, "us_futures", "CMES", "XNYS", "NYSE"]:
141
- return get_calendar(self.calendar_name, side="right", start=pd.Timestamp("1990-01-01"))
142
- return get_calendar(self.calendar_name, side="right")
158
+ # print call stack
159
+ # import traceback
160
+ # traceback.print_stack()
161
+ return get_calendar(self.calendar_name, start_session=self.start_date, end_session=self.end_date)
143
162
 
144
163
  def ticker_file_path(self, date: pd.Timestamp):
145
164
  ticker_year_dir = os.path.join(
@@ -154,6 +173,9 @@ class PolygonConfig:
154
173
  # TODO: Use csv_paths_pattern to remove the suffixes
155
174
  return os.path.basename(path).removesuffix(".gz").removesuffix(".csv")
156
175
 
176
+ def date_to_csv_file_path(self, date: datetime.date, ext=".csv.gz"):
177
+ return f"{self.csv_files_dir}/{date.strftime('%Y/%m/%Y-%m-%d') + ext}"
178
+
157
179
  @property
158
180
  def by_ticker_aggs_arrow_dir(self):
159
181
  # TODO: Don't split these up by ingestion range. They're already time indexed.
@@ -161,19 +183,20 @@ class PolygonConfig:
161
183
  # This scattering is really slow and is usually gonna be redundant.
162
184
  # This wasn't a problem when start/end dates were the calendar bounds when omitted.
163
185
  # Can't just drop this because concat_all_aggs_from_csv will skip if it exists.
164
- return os.path.join(
165
- self.by_ticker_dir,
166
- f"{self.start_timestamp.date().isoformat()}_{self.end_timestamp.date().isoformat()}.arrow",
167
- # "aggs.arrow",
168
- )
186
+ # return os.path.join(
187
+ # self.by_ticker_dir,
188
+ # f"{self.start_timestamp.date().isoformat()}_{self.end_timestamp.date().isoformat()}.arrow",
189
+ # # "aggs.arrow",
190
+ # )
191
+ return self.by_ticker_dir
169
192
 
170
193
  def api_cache_path(
171
- self, start_date: Date, end_date: Date, filename: str, extension=".parquet"
194
+ self, first_day: pd.Timestamp, last_day: pd.Timestamp, filename: str, extension=".parquet"
172
195
  ):
173
- start_str = parse_date(start_date, calendar=self.calendar).date().isoformat()
174
- end_str = parse_date(end_date, calendar=self.calendar).date().isoformat()
196
+ first_day_str = first_day.date().isoformat()
197
+ last_day_str = last_day.date().isoformat()
175
198
  return os.path.join(
176
- self.cache_dir, f"{start_str}_{end_str}/{filename}{extension}"
199
+ self.cache_dir, f"{first_day_str}_{last_day_str}/{filename}{extension}"
177
200
  )
178
201
 
179
202
  def csv_paths(self) -> Iterator[str]:
@@ -186,7 +209,9 @@ class PolygonConfig:
186
209
  for filename in sorted(filenames):
187
210
  yield os.path.join(root, filename)
188
211
 
189
- def find_first_and_last_aggs(self, aggs_dir, file_pattern) -> Tuple[str | None, str | None]:
212
+ def find_first_and_last_aggs(
213
+ self, aggs_dir, file_pattern
214
+ ) -> Tuple[str | None, str | None]:
190
215
  # Find the lexically first and last paths in aggs_dir that match csv_paths_pattern.
191
216
  # Would like to use Path.walk(top_down=True) but it is only available in Python 3.12+.
192
217
  # This needs to be efficient because it is called on every init, even though we only need it for ingest.