zipline_polygon_bundle 0.2.0.dev1__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff shows the changes between publicly available package versions as released to a supported public registry. It is provided for informational purposes only.
@@ -0,0 +1,261 @@
+ from .config import PolygonConfig
+ from .trades import custom_aggs_schema, custom_aggs_partitioning
+
+ import datetime
+ import numpy as np
+ import pyarrow as pa
+ import pyarrow.compute as pa_compute
+ import pyarrow.dataset as pa_ds
+ import pandas_ta as ta
+ import pandas as pd
+
+
+ def calculate_mfi(typical_price: pd.Series, money_flow: pd.Series, period: int):
+     mf_sign = np.where(typical_price > np.roll(typical_price, shift=1), 1, -1)
+     signed_mf = money_flow * mf_sign
+
+     # Calculate gain and loss using vectorized operations
+     positive_mf = np.maximum(signed_mf, 0)
+     negative_mf = np.maximum(-signed_mf, 0)
+
+     mf_avg_gain = (
+         np.convolve(positive_mf, np.ones(period), mode="full")[: len(positive_mf)]
+         / period
+     )
+     mf_avg_loss = (
+         np.convolve(negative_mf, np.ones(period), mode="full")[: len(negative_mf)]
+         / period
+     )
+
+     epsilon = 1e-10  # Small epsilon value to avoid division by zero
+     mfi = 100 - (100 / (1 + mf_avg_gain / (mf_avg_loss + epsilon)))
+     return mfi
+
+
+ # https://github.com/twopirllc/pandas-ta/blob/main/pandas_ta/momentum/stoch.py
+ # https://github.com/twopirllc/pandas-ta/blob/development/pandas_ta/momentum/stoch.py
+ # `k` vs `fast_k` arg names.
+ # https://github.com/twopirllc/pandas-ta/issues/726
+ # Results affected by values outside range:
+ # https://github.com/twopirllc/pandas-ta/issues/535
+
+
+ def calculate_stoch(
+     high: pd.Series,
+     low: pd.Series,
+     close: pd.Series,
+     k: int = 14,
+     d: int = 3,
+     smooth_k: int = 3,
+     mamode: str = "sma",
+ ):
+     """Indicator: Stochastic Oscillator (STOCH)"""
+     lowest_low = low.rolling(k).min()
+     highest_high = high.rolling(k).max()
+
+     stoch = 100 * (close - lowest_low)
+     stoch /= ta.utils.non_zero_range(highest_high, lowest_low)
+
+     stoch_k = ta.overlap.ma(
+         mamode, stoch.loc[stoch.first_valid_index() :,], length=smooth_k
+     )
+     stoch_d = (
+         ta.overlap.ma(mamode, stoch_k.loc[stoch_k.first_valid_index() :,], length=d)
+         if stoch_k is not None
+         else None
+     )
+     # Histogram
+     stoch_h = stoch_k - stoch_d if stoch_d is not None else None
+
+     return stoch_k, stoch_d, stoch_h
+
+
+ def compute_per_ticker_signals(df: pd.DataFrame, period: int = 14) -> pd.DataFrame:
+     df = df.set_index("window_start").sort_index()
+     session_index = pd.date_range(
+         start=df.index[0], end=df.index[-1], freq=pd.Timedelta(seconds=60)
+     )
+     df = df.reindex(session_index)
+     df.index.rename("window_start", inplace=True)
+
+     # df["minute_of_day"] = (df.index.hour * 60) + df.index.minute
+     # df["day_of_week"] = df.index.day_of_week
+
+     df.transactions = df.transactions.fillna(0)
+     df.volume = df.volume.fillna(0)
+     df.total = df.total.fillna(0)
+     df.close = df.close.ffill()
+     close = df.close
+     df.vwap = df.vwap.fillna(close)
+     df.high = df.high.fillna(close)
+     df.low = df.low.fillna(close)
+     df.open = df.open.fillna(close)
+     price_open = df.open
+     high = df.high
+     low = df.low
+     vwap = df.vwap
+     # volume = df.volume
+     total = df.total
+     next_close = close.shift()
+
+     # TODO: Odometer rollover signal. Relative difference to nearest power of 10.
+     # Something about log10 being a whole number? When is $50 the rollover vs $100 or $10?
+
+     # "True (Typical?) Price" which I think is an approximation of VWAP.
+     # Trouble with both is that if there are no trades in a bar we get NaN.
+     # That then means we get NaN for averages for the next period-1 bars too.
+     # Question is whether to ffill the price for these calculations.
+     df["TP"] = (high + low + close) / 3
+
+     # Gain/loss in this bar.
+     df["ret1bar"] = close.div(price_open).sub(1)
+
+     for t in range(2, period):
+         df[f"ret{t}bar"] = close.div(price_open.shift(t - 1)).sub(1)
+
+     # Average True Range (ATR)
+     true_range = pd.concat(
+         [high.sub(low), high.sub(next_close).abs(), low.sub(next_close).abs()], axis=1
+     ).max(1)
+     # Normalized ATR (NATR) or Average of Normalized TR.
+     # Choice of NATR operations ordering discussion: https://www.macroption.com/normalized-atr/
+     # He doesn't talk about VWAP but I think that is a better normalizing price for a bar.
+     # atr = true_range.ewm(span=period).mean()
+     # df["natr_c"] = atr / close
+     # df["antr_c"] = (true_range / close).ewm(span=period).mean()
+     # df["natr_v"] = atr / vwap
+     # df["antr_v"] = (true_range / vwap).ewm(span=period).mean()
+     df["NATR"] = (true_range / vwap).ewm(span=period).mean()
+
+     # True Price as HLC average VS VWAP.
+     # VWAP is better I think but is quite different from standard CCI.
+     # Three ways to compute CCI, all give the same value using TP.
+     # tp = (high + low + close) / 3
+     # df['SMA'] = ta.sma(tp, length=period)
+     # df['sma_r'] = tp.rolling(period).mean()
+     # df['MAD'] = ta.mad(tp, length=period)
+     # # Series.mad deprecated. mad = (s - s.mean()).abs().mean()
+     # df['mad_r'] = tp.rolling(period).apply(lambda x: (pd.Series(x) - pd.Series(x).mean()).abs().mean())
+
+     # df['cci_r'] = (tp - df['sma_r']) / (0.015 * df['mad_r'])
+     # df['CCI'] = (tp - df['SMA']) / (0.015 * df['MAD'])
+     # df['cci_ta'] = ta.cci(high=high, low=low, close=close, length=period)
+
+     df["taCCI"] = ta.cci(high=high, low=low, close=close, length=period)
+
+     # https://gist.github.com/quantra-go-algo/1b37bfb74d69148f0dfbdb5a2c7bdb25
+     # https://medium.com/@huzaifazahoor654/how-to-calculate-cci-in-python-a-step-by-step-guide-9a3f61698be6
+     sma = pd.Series(ta.sma(vwap, length=period))
+     mad = pd.Series(ta.mad(vwap, length=period))
+     df["CCI"] = (vwap - sma) / (0.015 * mad)
+
+     # df['MFI'] = calculate_mfi(high=high, low=low, close=close, volume=volume, period=period)
+     df["MFI"] = calculate_mfi(typical_price=vwap, money_flow=total, period=period)
+
+     # We use Stochastic (rather than MACD) because we need a ticker-independent indicator,
+     # i.e. a percentage price oscillator (PPO) rather than an absolute price oscillator (APO).
+     # https://www.alpharithms.com/moving-average-convergence-divergence-macd-031217/
+     # We're using 14/3 currently rather than the 26/12 popular for MACD though.
+     stoch_k, stoch_d, stoch_h = calculate_stoch(high, low, close, k=period)
+     df["STOCHk"] = stoch_k
+     df["STOCHd"] = stoch_d
+     df["STOCHh"] = stoch_h
+
+     return df
+
+
+ def iterate_all_aggs_tables(
+     config: PolygonConfig,
+     valid_tickers: pa.Array,
+ ):
+     schedule = config.calendar.trading_index(
+         start=config.start_timestamp, end=config.end_timestamp, period="1D"
+     )
+     for timestamp in schedule:
+         date = timestamp.to_pydatetime().date()
+         aggs_ds = pa_ds.dataset(
+             config.aggs_dir,
+             format="parquet",
+             schema=custom_aggs_schema(tz=config.calendar.tz.key),
+             partitioning=custom_aggs_partitioning(),
+         )
+         date_filter_expr = (
+             (pa_compute.field("year") == date.year)
+             & (pa_compute.field("month") == date.month)
+             & (pa_compute.field("date") == date)
+         )
+         # print(f"{date_filter_expr=}")
+         for fragment in aggs_ds.get_fragments(filter=date_filter_expr):
+             session_filter = (
+                 (pa_compute.field("window_start") >= start_dt)
+                 & (pa_compute.field("window_start") < end_dt)
+                 & pa_compute.is_in(pa_compute.field("ticker"), valid_tickers)
+             )
+             # Sorting the table doesn't seem to avoid needing to sort the df. Maybe use_threads=False on to_pandas would help?
+             # table = fragment.to_table(filter=session_filter).sort_by([('ticker', 'ascending'), ('window_start', 'descending')])
+             table = fragment.to_table(filter=session_filter)
+             if table.num_rows > 0:
+                 metadata = (
+                     dict(table.schema.metadata) if table.schema.metadata else dict()
+                 )
+                 metadata["date"] = date.isoformat()
+                 table = table.replace_schema_metadata(metadata)
+                 yield table
+
+
+ # def iterate_all_aggs_with_signals(config: PolygonConfig):
+ #     for table in iterate_all_aggs_tables(config):
+ #         df = table.to_pandas()
+ #         df = df.groupby("ticker").apply(
+ #             compute_per_ticker_signals, include_groups=False
+ #         )
+ #         yield pa.Table.from_pandas(df)
+
+
+ def file_visitor(written_file):
+     print(f"{written_file.path=}")
+
+
+ def compute_signals_for_all_aggs(
+     from_config: PolygonConfig,
+     to_config: PolygonConfig,
+     valid_tickers: pa.Array,
+     overwrite: bool = False,
+ ) -> str:
+     if overwrite:
+         print("WARNING: overwrite not implemented/ignored.")
+
+     # Need a different aggs_dir for the signals because the schema is different.
+     print(f"{to_config.aggs_dir=}")
+
+     for aggs_table in iterate_all_aggs_tables(from_config, valid_tickers):
+         metadata = aggs_table.schema.metadata
+         date = datetime.date.fromisoformat(metadata[b"date"].decode("utf-8"))
+         print(f"{date=}")
+         df = aggs_table.to_pandas()
+         df = df.groupby("ticker").apply(
+             compute_per_ticker_signals, include_groups=False
+         )
+         table = pa.Table.from_pandas(df)
+         if table.num_rows > 0:
+             table = table.replace_schema_metadata(metadata)
+             table = table.append_column("date", pa.array(np.full(len(table), date)))
+             table = table.append_column(
+                 "year", pa.array(np.full(len(table), date.year), type=pa.uint16())
+             )
+             table = table.append_column(
+                 "month", pa.array(np.full(len(table), date.month), type=pa.uint8())
+             )
+             table = table.sort_by(
+                 [("ticker", "ascending"), ("window_start", "ascending")]
+             )
+             pa_ds.write_dataset(
+                 table,
+                 filesystem=to_config.filesystem,
+                 base_dir=to_config.aggs_dir,
+                 partitioning=custom_aggs_partitioning(),
+                 format="parquet",
+                 existing_data_behavior="overwrite_or_ignore",
+                 file_visitor=file_visitor,
+             )
+     return to_config.aggs_dir
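
For orientation, here is a minimal, hypothetical sketch of how the new per-ticker signal computation might be exercised on synthetic one-minute bars. The module import path is an assumption (the diff does not show the file name), the synthetic columns simply mirror the ones the function touches, and iterate_all_aggs_tables is not exercised because its session bounds (start_dt/end_dt) are defined outside this hunk.

# Hypothetical usage sketch; module path assumed, bars are synthetic.
import numpy as np
import pandas as pd

from zipline_polygon_bundle.compute_signals import compute_per_ticker_signals  # path assumed

rng = np.random.default_rng(0)
n = 240  # four hours of one-minute bars
window_start = pd.date_range(
    "2024-01-02 09:30", periods=n, freq="1min", tz="America/New_York"
)
close = 100 + rng.normal(0, 0.05, n).cumsum()
bars = pd.DataFrame(
    {
        "window_start": window_start,
        "open": close + rng.normal(0, 0.02, n),
        "high": close + 0.05,
        "low": close - 0.05,
        "close": close,
        "vwap": close + rng.normal(0, 0.01, n),
        "volume": rng.integers(100, 1000, n),
        "transactions": rng.integers(1, 50, n),
    }
)
bars["total"] = bars["vwap"] * bars["volume"]  # dollar volume used as the money flow

signals = compute_per_ticker_signals(bars, period=14)
print(signals[["NATR", "CCI", "MFI", "STOCHk", "STOCHd", "STOCHh"]].tail())
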
@@ -1,24 +1,43 @@
  from .config import PolygonConfig, PARTITION_COLUMN_NAME, to_partition_key

  import shutil
- from typing import Iterator, Tuple, List, Union
+ from typing import Iterator, Tuple, Union

  import argparse
  import os
+ import datetime

  import pyarrow as pa
- from pyarrow import dataset as pa_ds
- from pyarrow import csv as pa_csv
- from pyarrow import compute as pa_compute
+ import pyarrow.compute as pa_compute
+ import pyarrow.csv as pa_csv
+ import pyarrow.dataset as pa_ds
+ import pyarrow.fs as pa_fs

  import pandas as pd


+ # def get_by_ticker_dates(config: PolygonConfig, schema) -> set[datetime.date]:
+ #     file_info = config.filesystem.get_file_info(config.by_ticker_dir)
+ #     if file_info.type == pa_fs.FileType.NotFound:
+ #         return set()
+ #     partitioning = None
+ #     if PARTITION_COLUMN_NAME in schema.names:
+ #         partitioning = pa_ds.partitioning(
+ #             pa.schema([(PARTITION_COLUMN_NAME, pa.string())]), flavor="hive"
+ #         )
+ #     by_ticker_aggs_ds = pa_ds.dataset(config.by_ticker_aggs_arrow_dir, schema=schema, partitioning=partitioning)
+ #     return set(
+ #         [
+ #             pa_ds.get_partition_keys(fragment.partition_expression).get("date")
+ #             for fragment in by_ticker_aggs_ds.get_fragments()
+ #         ]
+ #     )
+
+
  def generate_tables_from_csv_files(
-     paths: Iterator[Union[str, os.PathLike]],
+     config: PolygonConfig,
      schema: pa.Schema,
-     start_timestamp: pd.Timestamp,
-     limit_timestamp: pd.Timestamp,
+     overwrite: bool = False,
  ) -> Iterator[pa.Table]:
      empty_table = schema.empty_table()
      # TODO: Find which column(s) need to be cast to int64 from the schema.
@@ -29,16 +48,35 @@ def generate_tables_from_csv_files(
      )
      csv_schema = empty_table.schema

+     existing_by_ticker_dates = set()
+     if not overwrite:
+         # print("Getting existing by_ticker_dates")
+         # existing_by_ticker_dates = get_by_ticker_dates(config, schema)
+         print(f"{len(existing_by_ticker_dates)=}")
+
+     schedule = config.calendar.trading_index(
+         start=config.start_timestamp, end=config.end_timestamp, period="1D"
+     )
+     start_timestamp = config.start_timestamp.tz_localize(config.calendar.tz.key)
+     limit_timestamp = (config.end_timestamp + pd.Timedelta(days=1)).tz_localize(
+         config.calendar.tz.key)
+     # print(f"{start_timestamp=} {limit_timestamp=} {config.calendar.tz=} {schedule[:2]=} {schedule[-2:]=}")
+
      tables_read_count = 0
      skipped_table_count = 0
-     for path in paths:
+     for timestamp in schedule:
+         date: datetime.date = timestamp.tz_localize(config.calendar.tz.key).to_pydatetime().date()
+         # print(f"{date=} {timestamp=}")
+         if date in existing_by_ticker_dates:
+             continue
+         csv_path = config.date_to_csv_file_path(date)
          convert_options = pa_csv.ConvertOptions(
              column_types=csv_schema,
              strings_can_be_null=False,
              quoted_strings_can_be_null=False,
          )

-         table = pa_csv.read_csv(path, convert_options=convert_options)
+         table = pa_csv.read_csv(csv_path, convert_options=convert_options)
          tables_read_count += 1
          table = table.set_column(
              table.column_names.index("window_start"),
@@ -76,18 +114,86 @@ def generate_tables_from_csv_files(
              skipped_table_count += 1
              continue

+         if PARTITION_COLUMN_NAME in schema.names:
+             print(f"{date=}")
          yield table
      print(f"{tables_read_count=} {skipped_table_count=}")


+ # def generate_tables_from_csv_files(
+ #     paths: Iterator[Union[str, os.PathLike]],
+ #     schema: pa.Schema,
+ #     start_timestamp: pd.Timestamp,
+ #     limit_timestamp: pd.Timestamp,
+ # ) -> Iterator[pa.Table]:
+ #     empty_table = schema.empty_table()
+ #     # TODO: Find which column(s) need to be cast to int64 from the schema.
+ #     empty_table = empty_table.set_column(
+ #         empty_table.column_names.index("window_start"),
+ #         "window_start",
+ #         empty_table.column("window_start").cast(pa.int64()),
+ #     )
+ #     csv_schema = empty_table.schema
+
+ #     tables_read_count = 0
+ #     skipped_table_count = 0
+ #     for path in paths:
+ #         convert_options = pa_csv.ConvertOptions(
+ #             column_types=csv_schema,
+ #             strings_can_be_null=False,
+ #             quoted_strings_can_be_null=False,
+ #         )
+
+ #         table = pa_csv.read_csv(path, convert_options=convert_options)
+ #         tables_read_count += 1
+ #         table = table.set_column(
+ #             table.column_names.index("window_start"),
+ #             "window_start",
+ #             table.column("window_start").cast(schema.field("window_start").type),
+ #         )
+ #         if PARTITION_COLUMN_NAME in schema.names:
+ #             table = table.append_column(
+ #                 PARTITION_COLUMN_NAME,
+ #                 pa.array(
+ #                     [
+ #                         to_partition_key(ticker)
+ #                         for ticker in table.column("ticker").to_pylist()
+ #                     ]
+ #                 ),
+ #             )
+ #         expr = (
+ #             pa_compute.field("window_start")
+ #             >= pa.scalar(start_timestamp, type=schema.field("window_start").type)
+ #         ) & (
+ #             pa_compute.field("window_start")
+ #             < pa.scalar(
+ #                 limit_timestamp,
+ #                 type=schema.field("window_start").type,
+ #             )
+ #         )
+ #         table = table.filter(expr)
+
+ #         # TODO: Also check that these rows are within range for this file's date (not just the whole session).
+ #         # And if we're doing that (figuring date for each file), we can just skip reading the file.
+ #         # Might be able to do a single comparison using compute.days_between.
+ #         # https://arrow.apache.org/docs/python/generated/pyarrow.compute.days_between.html
+
+ #         if table.num_rows == 0:
+ #             skipped_table_count += 1
+ #             continue
+
+ #         yield table
+ #     print(f"{tables_read_count=} {skipped_table_count=}")
+
+
  def generate_csv_agg_tables(
-     config: PolygonConfig,
+     config: PolygonConfig, overwrite: bool = False
  ) -> Tuple[pa.Schema, Iterator[pa.Table]]:
      """zipline does bundle ingestion one ticker at a time."""

      # Polygon Aggregate flatfile timestamps are in nanoseconds (like trades), not milliseconds as the docs say.
      # I make the timestamp timezone-aware because that's how Unix timestamps work and it may help avoid mistakes.
-     timestamp_type = pa.timestamp("ns", tz="UTC")
+     timestamp_type = pa.timestamp("ns", tz='UTC')

      # But we can't use the timestamp type in the schema here because it's not supported by the CSV reader.
      # So we'll use int64 and cast it after reading the CSV file.
@@ -121,14 +227,12 @@ def generate_csv_agg_tables(
          pa.field(PARTITION_COLUMN_NAME, pa.string(), nullable=False)
      )

-     # TODO: Use generator like os.walk for paths.
      return (
          polygon_aggs_schema,
          generate_tables_from_csv_files(
-             paths=config.csv_paths(),
+             config,
              schema=polygon_aggs_schema,
-             start_timestamp=config.start_timestamp,
-             limit_timestamp=config.end_timestamp + pd.to_timedelta(1, unit="day"),
+             overwrite=overwrite,
          ),
      )

@@ -143,16 +247,17 @@ def concat_all_aggs_from_csv(
      config: PolygonConfig,
      overwrite: bool = False,
  ) -> str:
-     schema, tables = generate_csv_agg_tables(config)
+     schema, tables = generate_csv_agg_tables(config, overwrite=overwrite)

      by_ticker_aggs_arrow_dir = config.by_ticker_aggs_arrow_dir
-     if os.path.exists(by_ticker_aggs_arrow_dir):
-         if overwrite:
-             print(f"Removing {by_ticker_aggs_arrow_dir=}")
-             shutil.rmtree(by_ticker_aggs_arrow_dir)
-         else:
-             print(f"Found existing {by_ticker_aggs_arrow_dir=}")
-             return by_ticker_aggs_arrow_dir
+     # if os.path.exists(by_ticker_aggs_arrow_dir):
+     #     if overwrite:
+     #         print(f"Removing {by_ticker_aggs_arrow_dir=}")
+     #         shutil.rmtree(by_ticker_aggs_arrow_dir)
+     #     else:
+     #         # TODO: Validate the existing data.
+     #         print(f"Found existing {by_ticker_aggs_arrow_dir=}")
+     #         return by_ticker_aggs_arrow_dir

      partitioning = None
      if PARTITION_COLUMN_NAME in schema.names:
@@ -160,7 +265,7 @@ def concat_all_aggs_from_csv(
              pa.schema([(PARTITION_COLUMN_NAME, pa.string())]), flavor="hive"
          )

-     # scanner = pa_ds.Scanner.from_batches(source=generate_batches_from_tables(tables), schema=schema)
+     print(f"Scattering aggregates by ticker to {by_ticker_aggs_arrow_dir=}")
      pa_ds.write_dataset(
          generate_batches_from_tables(tables),
          schema=schema,
@@ -169,7 +274,7 @@ def concat_all_aggs_from_csv(
          format="parquet",
          existing_data_behavior="overwrite_or_ignore",
      )
-     print(f"Concatenated aggregates to {by_ticker_aggs_arrow_dir=}")
+     print(f"Scattered aggregates by ticker to {by_ticker_aggs_arrow_dir=}")
      return by_ticker_aggs_arrow_dir
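
The rewritten generator walks the trading calendar and maps each session date directly to a Polygon flat-file path instead of globbing the filesystem. A small, hypothetical sketch of that mapping, mirroring date_to_csv_file_path in config.py below (the directory name is illustrative):

# Hypothetical sketch of the date-to-flatfile mapping used by the new code path.
import datetime

csv_files_dir = "flatfiles/us_stocks_sip/minute_aggs_v1"  # illustrative directory
date = datetime.date(2024, 1, 2)
ext = ".csv.gz"
csv_path = f"{csv_files_dir}/{date.strftime('%Y/%m/%Y-%m-%d') + ext}"
print(csv_path)  # flatfiles/us_stocks_sip/minute_aggs_v1/2024/01/2024-01-02.csv.gz
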
@@ -1,5 +1,5 @@
  from exchange_calendars.calendar_helpers import Date, parse_date
- from exchange_calendars.calendar_utils import get_calendar
+ from zipline.utils.calendar_utils import get_calendar

  from .nyse_all_hours_calendar import NYSE_ALL_HOURS

@@ -8,9 +8,13 @@ from typing import Iterator, Tuple
  import pandas as pd
  from pyarrow.fs import LocalFileSystem
  import os
+ import datetime
  import re
  import fnmatch

+ AGG_TIME_DAY = "day"
+ AGG_TIME_MINUTE = "minute"
+ AGG_TIME_TRADES = "1minute"

  PARTITION_COLUMN_NAME = "part"
  PARTITION_KEY_LENGTH = 2
@@ -38,12 +42,12 @@ class PolygonConfig:
          calendar_name: str,
          start_date: Date,
          end_date: Date,
-         agg_time: str = "day",
-         custom_aggs_format: str = "{config.agg_timedelta.seconds}sec_aggs",
+         agg_time: str = AGG_TIME_DAY,
      ):
          self.calendar_name = calendar_name
          self.start_date = start_date
          self.end_date = end_date
+         # TODO: We can't use PolygonConfig.calendar because it gets these properties for start/end session.
          self.start_timestamp = (
              parse_date(start_date, calendar=self.calendar)
              if start_date
@@ -54,6 +58,8 @@ class PolygonConfig:
              if end_date
              else self.calendar.last_session
          )
+         self.agg_time = agg_time
+
          self.max_workers = None
          if environ.get("POLYGON_MAX_WORKERS", "").strip() != "":
              self.max_workers = int(environ.get("POLYGON_MAX_WORKERS"))
@@ -92,54 +98,67 @@ class PolygonConfig:
          self.quotes_dir = os.path.join(self.asset_files_dir, "quotes_v1")

          # TODO: The "by ticker" files are temporary/intermediate and should/could be in the zipline data dir.
+         self.custom_asset_files_dir = environ.get(
+             "CUSTOM_ASSET_FILES_DIR", self.asset_files_dir
+         )
+
+         self.cache_dir = os.path.join(self.custom_asset_files_dir, "api_cache")
+
+         self.lock_file_path = os.path.join(self.custom_asset_files_dir, "ingest.lock")
+         self.custom_aggs_dates_path = os.path.join(self.custom_asset_files_dir, "aggs_dates.json")
+         self.by_ticker_dates_path = os.path.join(self.custom_asset_files_dir, "by_ticker_dates.json")
+
          self.minute_by_ticker_dir = os.path.join(
-             self.asset_files_dir, "minute_by_ticker_v1"
+             self.custom_asset_files_dir, "minute_by_ticker_v1"
+         )
+         self.day_by_ticker_dir = os.path.join(
+             self.custom_asset_files_dir, "day_by_ticker_v1"
          )
-         self.day_by_ticker_dir = os.path.join(self.asset_files_dir, "day_by_ticker_v1")

+         # If agg_time begins with a digit, it is a timedelta string and we're using custom aggs from trades.
          if bool(re.match(r"^\d", agg_time)):
              self.agg_timedelta = pd.to_timedelta(agg_time)
-             self.custom_asset_files_dir = environ.get(
-                 "CUSTOM_ASSET_FILES_DIR", self.asset_files_dir
+             self.csv_files_dir = self.trades_dir
+             self.custom_aggs_name_format = environ.get(
+                 "CUSTOM_AGGS_NAME_FORMAT", "{config.agg_timedelta.seconds}sec_aggs"
              )
-             self.custom_aggs_dir = os.path.join(
-                 self.custom_asset_files_dir, custom_aggs_format.format(config=self)
+             self.aggs_dir = os.path.join(
+                 self.custom_asset_files_dir,
+                 self.custom_aggs_name_format.format(config=self),
             )
-             self.custom_aggs_by_ticker_dir = os.path.join(
+             self.by_ticker_dir = os.path.join(
                  self.custom_asset_files_dir,
-                 (custom_aggs_format + "_by_ticker").format(config=self),
+                 (self.custom_aggs_name_format + "_by_ticker").format(config=self),
              )
-             self.aggs_dir = self.custom_aggs_dir
-             self.by_ticker_dir = self.custom_aggs_by_ticker_dir
-         elif agg_time == "minute":
+         elif agg_time == AGG_TIME_MINUTE:
              self.agg_timedelta = pd.to_timedelta("1minute")
              self.aggs_dir = self.minute_aggs_dir
+             self.csv_files_dir = self.aggs_dir
              self.by_ticker_dir = self.minute_by_ticker_dir
-         elif agg_time == "day":
+         elif agg_time == AGG_TIME_DAY:
              self.agg_timedelta = pd.to_timedelta("1day")
              self.aggs_dir = self.day_aggs_dir
+             self.csv_files_dir = self.aggs_dir
              self.by_ticker_dir = self.day_by_ticker_dir
          else:
              raise ValueError(
                  f"agg_time must be 'minute', 'day', or a timedelta string; got '{agg_time=}'"
              )
-         self.agg_time = agg_time

          self.arrow_format = environ.get(
-             "POLYGON_ARROW_FORMAT", "parquet" if self.agg_time == "day" else "hive"
+             "POLYGON_ARROW_FORMAT", "parquet" if self.agg_time == AGG_TIME_DAY else "hive"
          )
          # self.by_ticker_hive_dir = os.path.join(
          #     self.by_ticker_dir,
          #     f"{self.agg_time}_{self.start_timestamp.date().isoformat()}_{self.end_timestamp.date().isoformat()}.hive",
          # )
-         self.cache_dir = os.path.join(self.asset_files_dir, "api_cache")

      @property
      def calendar(self):
-         # If you don't give a start date you'll only get 20 years from today.
-         if self.calendar_name in [NYSE_ALL_HOURS, "us_futures", "CMES", "XNYS", "NYSE"]:
-             return get_calendar(self.calendar_name, side="right", start=pd.Timestamp("1990-01-01"))
-         return get_calendar(self.calendar_name, side="right")
+         # print call stack
+         # import traceback
+         # traceback.print_stack()
+         return get_calendar(self.calendar_name, start_session=self.start_date, end_session=self.end_date)

      def ticker_file_path(self, date: pd.Timestamp):
          ticker_year_dir = os.path.join(
@@ -154,6 +173,9 @@ class PolygonConfig:
          # TODO: Use csv_paths_pattern to remove the suffixes
          return os.path.basename(path).removesuffix(".gz").removesuffix(".csv")

+     def date_to_csv_file_path(self, date: datetime.date, ext=".csv.gz"):
+         return f"{self.csv_files_dir}/{date.strftime('%Y/%m/%Y-%m-%d') + ext}"
+
      @property
      def by_ticker_aggs_arrow_dir(self):
          # TODO: Don't split these up by ingestion range. They're already time indexed.
@@ -161,19 +183,20 @@ class PolygonConfig:
          # This scattering is really slow and is usually gonna be redundant.
          # This wasn't a problem when start/end dates were the calendar bounds when omitted.
          # Can't just drop this because concat_all_aggs_from_csv will skip if it exists.
-         return os.path.join(
-             self.by_ticker_dir,
-             f"{self.start_timestamp.date().isoformat()}_{self.end_timestamp.date().isoformat()}.arrow",
-             # "aggs.arrow",
-         )
+         # return os.path.join(
+         #     self.by_ticker_dir,
+         #     f"{self.start_timestamp.date().isoformat()}_{self.end_timestamp.date().isoformat()}.arrow",
+         #     # "aggs.arrow",
+         # )
+         return self.by_ticker_dir

      def api_cache_path(
-         self, start_date: Date, end_date: Date, filename: str, extension=".parquet"
+         self, first_day: pd.Timestamp, last_day: pd.Timestamp, filename: str, extension=".parquet"
      ):
-         start_str = parse_date(start_date, calendar=self.calendar).date().isoformat()
-         end_str = parse_date(end_date, calendar=self.calendar).date().isoformat()
+         first_day_str = first_day.date().isoformat()
+         last_day_str = last_day.date().isoformat()
          return os.path.join(
-             self.cache_dir, f"{start_str}_{end_str}/{filename}{extension}"
+             self.cache_dir, f"{first_day_str}_{last_day_str}/{filename}{extension}"
          )

      def csv_paths(self) -> Iterator[str]:
@@ -186,7 +209,9 @@ class PolygonConfig:
              for filename in sorted(filenames):
                  yield os.path.join(root, filename)

-     def find_first_and_last_aggs(self, aggs_dir, file_pattern) -> Tuple[str | None, str | None]:
+     def find_first_and_last_aggs(
+         self, aggs_dir, file_pattern
+     ) -> Tuple[str | None, str | None]:
          # Find the lexically first and last paths in aggs_dir that match csv_paths_pattern.
          # Would like to use Path.walk(top_down=True) but it is only available in Python 3.12+.
          # This needs to be efficient because it is called on every init, even though we only need it for ingest.
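
To tie the config changes together, here is a hypothetical construction of the updated PolygonConfig using the new agg_time constants. The import path and the environ argument are assumptions based on the hunks above (the full constructor signature is not shown in this diff), and the printed paths depend on your local POLYGON_DATA_DIR / CUSTOM_ASSET_FILES_DIR layout.

# Hypothetical sketch; import path, environ argument, and env-var layout are assumed.
from os import environ

from zipline_polygon_bundle.config import AGG_TIME_MINUTE, PolygonConfig  # path assumed

config = PolygonConfig(
    environ=environ,          # assumed: the constructor reads env vars from this mapping
    calendar_name="XNYS",
    start_date="2024-01-02",
    end_date="2024-06-28",
    agg_time=AGG_TIME_MINUTE,
)

print(config.aggs_dir)       # minute flat-file aggregates dir (also csv_files_dir)
print(config.by_ticker_dir)  # e.g. <CUSTOM_ASSET_FILES_DIR>/minute_by_ticker_v1
print(config.by_ticker_aggs_arrow_dir)  # now just by_ticker_dir (no date-range suffix)
print(config.date_to_csv_file_path(config.start_timestamp.date()))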