zipline_polygon_bundle 0.1.8__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between the package versions exactly as they appear in that public registry.
@@ -1,33 +1,29 @@
- from .config import PolygonConfig
+ from .config import PolygonConfig, PARTITION_COLUMN_NAME, to_partition_key

  from typing import Iterator, Tuple

  import pyarrow as pa
- from pyarrow import dataset as pa_ds
- from pyarrow import compute as pa_compute
- from pyarrow import compute as pc
- from pyarrow import parquet as pa_parquet
- from pyarrow import csv as pa_csv
- from pyarrow import fs as pa_fs
+ import pyarrow.compute as pa_compute
+ import pyarrow.csv as pa_csv
+ import pyarrow.dataset as pa_ds
+ import pyarrow.fs as pa_fs

  from fsspec.implementations.arrow import ArrowFSWrapper

+ import os
  import datetime
- import pandas_market_calendars
+
  import numpy as np
  import pandas as pd

- import pandas_ta as ta
-
- # from concurrent.futures import ThreadPoolExecutor
- # from concurrent.futures import ProcessPoolExecutor
-

  def trades_schema(raw: bool = False) -> pa.Schema:
  # There is some problem reading the timestamps as timestamps so we have to read as integer then change the schema.
  # Polygon Aggregate flatfile timestamps are in nanoseconds (like trades), not milliseconds as the docs say.
  # I make the timestamp timezone-aware because that's how Unix timestamps work and it may help avoid mistakes.
- # timestamp_type = pa.timestamp("ns", tz="UTC")
+ # The timezone is America/New_York because that's the US exchanges timezone and the date is a trading day.
+ # timestamp_type = pa.timestamp("ns", tz="America/New_York")
+ # timestamp_type = pa.int64() if raw else pa.timestamp("ns", tz=tz)
  timestamp_type = pa.int64() if raw else pa.timestamp("ns", tz="UTC")

  # Polygon price scale is 4 decimal places (i.e. hundredths of a penny), but we'll use 10 because we have precision to spare.
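
Note on the raw/aware timestamp handling above: the CSVs are read with int64 timestamp columns and only afterwards cast to a timezone-aware type, which is why `trades_schema` takes a `raw` flag. A minimal sketch of that read-as-int64-then-cast step on a tiny in-memory table (the single `sip_timestamp` column and the example value are illustrative, not taken from the package):

    import pyarrow as pa

    raw_schema = pa.schema([pa.field("sip_timestamp", pa.int64(), nullable=False)])
    aware_schema = pa.schema(
        [pa.field("sip_timestamp", pa.timestamp("ns", tz="UTC"), nullable=False)]
    )

    # 1704067200000000000 ns since the epoch is 2024-01-01T00:00:00Z.
    raw = pa.table({"sip_timestamp": [1704067200000000000]}, schema=raw_schema)

    # Casting int64 nanoseconds to timestamp("ns", tz="UTC") reinterprets the
    # stored integers as UTC instants; no arithmetic is applied to the values.
    aware = raw.cast(aware_schema)
    print(aware.column("sip_timestamp")[0])  # tz-aware 2024-01-01 00:00:00+00:00
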
@@ -36,22 +32,22 @@ def trades_schema(raw: bool = False) -> pa.Schema:
  price_type = pa.float64()

  return pa.schema(
- [
- pa.field("ticker", pa.string(), nullable=False),
- pa.field("conditions", pa.string(), nullable=False),
- pa.field("correction", pa.string(), nullable=False),
- pa.field("exchange", pa.int8(), nullable=False),
- pa.field("id", pa.string(), nullable=False),
- pa.field("participant_timestamp", timestamp_type, nullable=False),
- pa.field("price", price_type, nullable=False),
- pa.field("sequence_number", pa.int64(), nullable=False),
- pa.field("sip_timestamp", timestamp_type, nullable=False),
- pa.field("size", pa.int64(), nullable=False),
- pa.field("tape", pa.int8(), nullable=False),
- pa.field("trf_id", pa.int64(), nullable=False),
- pa.field("trf_timestamp", timestamp_type, nullable=False),
- ]
- )
+ [
+ pa.field("ticker", pa.string(), nullable=False),
+ pa.field("conditions", pa.string(), nullable=False),
+ pa.field("correction", pa.string(), nullable=False),
+ pa.field("exchange", pa.int8(), nullable=False),
+ pa.field("id", pa.string(), nullable=False),
+ pa.field("participant_timestamp", timestamp_type, nullable=False),
+ pa.field("price", price_type, nullable=False),
+ pa.field("sequence_number", pa.int64(), nullable=False),
+ pa.field("sip_timestamp", timestamp_type, nullable=False),
+ pa.field("size", pa.int64(), nullable=False),
+ pa.field("tape", pa.int8(), nullable=False),
+ pa.field("trf_id", pa.int64(), nullable=False),
+ pa.field("trf_timestamp", timestamp_type, nullable=False),
+ ]
+ )


  def trades_dataset(config: PolygonConfig) -> pa_ds.Dataset:
@@ -68,20 +64,26 @@ def trades_dataset(config: PolygonConfig) -> pa_ds.Dataset:
  fsspec.glob(os.path.join(config.trades_dir, config.csv_paths_pattern))
  )

- return pa_ds.FileSystemDataset.from_paths(paths,
- format=pa_ds.CsvFileFormat(),
- schema=trades_schema(raw=True),
- filesystem=config.filesystem)
+ return pa_ds.FileSystemDataset.from_paths(
+ paths,
+ format=pa_ds.CsvFileFormat(),
+ schema=trades_schema(raw=True),
+ filesystem=config.filesystem,
+ )


- def cast_strings_to_list(string_array, separator=",", default="0", value_type=pa.uint8()):
+ def cast_strings_to_list(
+ string_array, separator=",", default="0", value_type=pa.uint8()
+ ):
  """Cast a PyArrow StringArray of comma-separated numbers to a ListArray of values."""

  # Create a mask to identify empty strings
  is_empty = pa_compute.equal(pa_compute.utf8_trim_whitespace(string_array), "")

  # Use replace_with_mask to replace empty strings with the default ("0")
- filled_column = pa_compute.replace_with_mask(string_array, is_empty, pa.scalar(default))
+ filled_column = pa_compute.replace_with_mask(
+ string_array, is_empty, pa.scalar(default)
+ )

  # Split the strings by comma
  split_array = pa_compute.split_pattern(filled_column, pattern=separator)
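
The helper above turns Polygon's comma-separated `conditions` strings into a list column. A small sketch of the same trim/replace/split/cast sequence on a toy array; the final cast to `pa.list_(pa.uint8())` is an assumption about how `int_list_array` is produced, since that line falls between the hunks shown here:

    import pyarrow as pa
    import pyarrow.compute as pa_compute

    conditions = pa.array(["12,37", "", "14"])

    # Empty strings become the default "0" so the split and cast never see "".
    is_empty = pa_compute.equal(pa_compute.utf8_trim_whitespace(conditions), "")
    filled = pa_compute.replace_with_mask(conditions, is_empty, pa.scalar("0"))

    # "12,37" -> ["12", "37"], then each element is parsed as a uint8.
    split = pa_compute.split_pattern(filled, pattern=",")
    condition_values = split.cast(pa.list_(pa.uint8()))
    print(condition_values.to_pylist())  # [[12, 37], [0], [14]]
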
@@ -92,254 +94,78 @@ def cast_strings_to_list(string_array, separator=",", default="0", value_type=pa
  return int_list_array


- def cast_trades(trades):
- trades = trades.cast(trades_schema())
- condition_values = cast_strings_to_list(trades.column("conditions").combine_chunks())
- return trades.append_column('condition_values', condition_values)
-
-
- def date_to_path(date, ext=".csv.gz"):
- # return f"{date.year}/{date.month:02}/{date.isoformat()}{ext}"
- return date.strftime("%Y/%m/%Y-%m-%d") + ext
-
-
- def convert_to_custom_aggs_file(config: PolygonConfig,
- overwrite: bool,
- timestamp: pd.Timestamp,
- start_session: pd.Timestamp,
- end_session: pd.Timestamp):
- date = timestamp.to_pydatetime().date()
- aggs_date_path = date_to_path(date, ext=".parquet")
- aggs_path = f"{config.custom_aggs_dir}/{aggs_date_path}"
- # aggs_by_ticker_path = f"{config.custom_aggs_by_ticker_dir}/{aggs_date_path}"
- fsspec = ArrowFSWrapper(config.filesystem)
- if fsspec.exists(aggs_path) or fsspec.exists(aggs_by_ticker_path):
- if overwrite:
- if fsspec.exists(aggs_path):
- config.filesystem.delete_file(aggs_path)
- if fsspec.exists(aggs_by_ticker_path):
- config.filesystem.delete_file(aggs_by_ticker_path)
- else:
- if fsspec.exists(aggs_path):
- print(f"SKIPPING: {date=} File exists {aggs_path=}")
- if fsspec.exists(aggs_by_ticker_path):
- print(f"SKIPPING: {date=} File exists {aggs_by_ticker_path=}")
- return
- fsspec.mkdir(fsspec._parent(aggs_path))
- fsspec.mkdir(fsspec._parent(aggs_by_ticker_path))
- trades_path = f"{config.trades_dir}/{date_to_path(date)}"
- if not fsspec.exists(trades_path):
- print(f"ERROR: Trades file missing. Skipping {date=}. {trades_path=}")
- return
- print(f"{trades_path=}")
- format = pa_ds.CsvFileFormat()
- trades_ds = pa_ds.FileSystemDataset.from_paths([trades_path], format=format, schema=trades_schema(raw=True), filesystem=config.filesystem)
- fragments = trades_ds.get_fragments()
- fragment = next(fragments)
- try:
- next(fragments)
- print("ERROR: More than one fragment for {path=}")
- except StopIteration:
- pass
- trades = fragment.to_table(schema=trades_ds.schema)
+ def cast_trades(trades) -> pa.Table:
  trades = trades.cast(trades_schema())
- min_timestamp = pa.compute.min(trades.column('sip_timestamp')).as_py()
- max_timestamp = pa.compute.max(trades.column('sip_timestamp')).as_py()
- if min_timestamp < start_session:
- print(f"ERROR: {min_timestamp=} < {start_session=}")
- if max_timestamp >= end_session:
- print(f"ERROR: {max_timestamp=} >= {end_session=}")
- trades_df = trades.to_pandas()
- trades_df["window_start"] = trades_df["sip_timestamp"].dt.floor(aggregate_timedelta)
- aggs_df = trades_df.groupby(["ticker", "window_start"]).agg(
- open=('price', 'first'),
- high=('price', 'max'),
- low=('price', 'min'),
- close=('price', 'last'),
- volume=('size', 'sum'),
+ condition_values = cast_strings_to_list(
+ trades.column("conditions").combine_chunks()
  )
- aggs_df['transactions'] = trades_df.groupby(["ticker", "window_start"]).size()
- aggs_df.reset_index(inplace=True)
- aggs_table = pa.Table.from_pandas(aggs_df).select(['ticker', 'volume', 'open', 'close', 'high', 'low', 'window_start', 'transactions'])
- aggs_table = aggs_table.sort_by([('ticker', 'ascending'), ('window_start', 'ascending')])
- print(f"{aggs_by_ticker_path=}")
- pa_parquet.write_table(table=aggs_table,
- where=aggs_by_ticker_path, filesystem=to_config.filesystem)
- aggs_table = aggs_table.sort_by([('window_start', 'ascending'), ('ticker', 'ascending')])
- print(f"{aggs_path=}")
- pa_parquet.write_table(table=aggs_table,
- where=aggs_path, filesystem=to_config.filesystem)
-
-
- # def convert_to_custom_aggs(config: PolygonConfig,
- # overwrite: bool,
- # timestamp: pd.Timestamp,
- # start_session: pd.Timestamp,
- # end_session: pd.Timestamp):
- # date = timestamp.to_pydatetime().date()
- # aggs_date_path = date_to_path(date, ext=".parquet")
- # aggs_path = f"{config.custom_aggs_dir}/{aggs_date_path}"
- # # aggs_by_ticker_path = f"{config.custom_aggs_by_ticker_dir}/{aggs_date_path}"
- # fsspec = ArrowFSWrapper(config.filesystem)
- # if fsspec.exists(aggs_path) or fsspec.exists(aggs_by_ticker_path):
- # if overwrite:
- # if fsspec.exists(aggs_path):
- # config.filesystem.delete_file(aggs_path)
- # if fsspec.exists(aggs_by_ticker_path):
- # config.filesystem.delete_file(aggs_by_ticker_path)
- # else:
- # if fsspec.exists(aggs_path):
- # print(f"SKIPPING: {date=} File exists {aggs_path=}")
- # if fsspec.exists(aggs_by_ticker_path):
- # print(f"SKIPPING: {date=} File exists {aggs_by_ticker_path=}")
- # return
- # fsspec.mkdir(fsspec._parent(aggs_path))
- # fsspec.mkdir(fsspec._parent(aggs_by_ticker_path))
- # trades_path = f"{config.trades_dir}/{date_to_path(date)}"
- # if not fsspec.exists(trades_path):
- # print(f"ERROR: Trades file missing. Skipping {date=}. {trades_path=}")
- # return
- # print(f"{trades_path=}")
- # format = pa_ds.CsvFileFormat()
- # trades_ds = pa_ds.FileSystemDataset.from_paths([trades_path], format=format, schema=trades_schema(raw=True), filesystem=config.filesystem)
- # fragments = trades_ds.get_fragments()
- # fragment = next(fragments)
- # try:
- # next(fragments)
- # print("ERROR: More than one fragment for {path=}")
- # except StopIteration:
- # pass
- # trades = fragment.to_table(schema=trades_ds.schema)
- # trades = trades.cast(trades_schema())
- # min_timestamp = pa.compute.min(trades.column('sip_timestamp')).as_py()
- # max_timestamp = pa.compute.max(trades.column('sip_timestamp')).as_py()
- # if min_timestamp < start_session:
- # print(f"ERROR: {min_timestamp=} < {start_session=}")
- # if max_timestamp >= end_session:
- # print(f"ERROR: {max_timestamp=} >= {end_session=}")
- # trades_df = trades.to_pandas()
- # trades_df["window_start"] = trades_df["sip_timestamp"].dt.floor(aggregate_timedelta)
- # aggs_df = trades_df.groupby(["ticker", "window_start"]).agg(
- # open=('price', 'first'),
- # high=('price', 'max'),
- # low=('price', 'min'),
- # close=('price', 'last'),
- # volume=('size', 'sum'),
- # )
- # aggs_df['transactions'] = trades_df.groupby(["ticker", "window_start"]).size()
- # aggs_df.reset_index(inplace=True)
- # aggs_table = pa.Table.from_pandas(aggs_df).select(['ticker', 'volume', 'open', 'close', 'high', 'low', 'window_start', 'transactions'])
- # aggs_table = aggs_table.sort_by([('ticker', 'ascending'), ('window_start', 'ascending')])
- # print(f"{aggs_by_ticker_path=}")
- # pa_parquet.write_table(table=aggs_table,
- # where=aggs_by_ticker_path, filesystem=to_config.filesystem)
- # aggs_table = aggs_table.sort_by([('window_start', 'ascending'), ('ticker', 'ascending')])
- # print(f"{aggs_path=}")
- # pa_parquet.write_table(table=aggs_table,
- # where=aggs_path, filesystem=to_config.filesystem)
- # pa_ds.write_dataset(
- # generate_batches_from_tables(tables),
- # schema=schema,
- # base_dir=by_ticker_aggs_arrow_dir,
- # partitioning=partitioning,
- # format="parquet",
- # existing_data_behavior="overwrite_or_ignore",
- # )
-
-
- # def generate_csv_trades_tables(
- # config: PolygonConfig,
- # ) -> Tuple[datetime.date, Iterator[pa.Table]]:
- # """Generator for trades tables from flatfile CSVs."""
- # # Use pandas_market_calendars so we can get extended hours.
- # # NYSE and NASDAQ have extended hours but XNYS does not.
- # calendar = pandas_market_calendars.get_calendar(config.calendar_name)
- # schedule = calendar.schedule(start_date=config.start_timestamp, end_date=config.end_timestamp, start="pre", end="post")
- # for timestamp, session in schedule.iterrows():
- # date = timestamp.to_pydatetime().date()
- # trades_csv_path = f"{config.trades_dir}/{date_to_path(date)}"
- # format = pa_ds.CsvFileFormat()
- # trades_ds = pa_ds.FileSystemDataset.from_paths([trades_csv_path], format=format, schema=trades_schema(raw=True), filesystem=config.filesystem)
- # fragments = trades_ds.get_fragments()
- # fragment = next(fragments)
- # try:
- # next(fragments)
- # print("ERROR: More than one fragment for {path=}")
- # except StopIteration:
- # pass
- # trades = fragment.to_table(schema=trades_ds.schema)
- # trades = trades.cast(trades_schema())
- # min_timestamp = pa.compute.min(trades.column('sip_timestamp')).as_py()
- # max_timestamp = pa.compute.max(trades.column('sip_timestamp')).as_py()
- # start_session = session['pre']
- # end_session = session['post']
- # # print(f"{start_session=} {end_session=}")
- # # print(f"{min_timestamp=} {max_timestamp=}")
- # if min_timestamp < start_session:
- # print(f"ERROR: {min_timestamp=} < {start_session=}")
- # # The end_session is supposed to be a limit but there are many with trades at that second.
- # if max_timestamp >= (end_session + pd.Timedelta(seconds=1)):
- # # print(f"ERROR: {max_timestamp=} >= {end_session=}")
- # print(f"ERROR: {max_timestamp=} > {end_session+pd.Timedelta(seconds=1)=}")
- # yield date, trades
- # del fragment
- # del fragments
- # del trades_ds
+ return trades.append_column("condition_values", condition_values)


  def custom_aggs_schema(raw: bool = False) -> pa.Schema:
+ # timestamp_type = pa.int64() if raw else pa.timestamp("ns", tz=tz)
  timestamp_type = pa.int64() if raw else pa.timestamp("ns", tz="UTC")
  price_type = pa.float64()
  return pa.schema(
- [
- pa.field("ticker", pa.string(), nullable=False),
- pa.field("volume", pa.int64(), nullable=False),
- pa.field("open", price_type, nullable=False),
- pa.field("close", price_type, nullable=False),
- pa.field("high", price_type, nullable=False),
- pa.field("low", price_type, nullable=False),
- pa.field("window_start", timestamp_type, nullable=False),
- pa.field("transactions", pa.int64(), nullable=False),
- pa.field("date", pa.date32(), nullable=False),
- pa.field("year", pa.uint16(), nullable=False),
- pa.field("month", pa.uint8(), nullable=False),
- ]
- )
+ [
+ pa.field("ticker", pa.string(), nullable=False),
+ pa.field("volume", pa.int64(), nullable=False),
+ pa.field("open", price_type, nullable=False),
+ pa.field("close", price_type, nullable=False),
+ pa.field("high", price_type, nullable=False),
+ pa.field("low", price_type, nullable=False),
+ pa.field("window_start", timestamp_type, nullable=False),
+ pa.field("transactions", pa.int64(), nullable=False),
+ pa.field("date", pa.date32(), nullable=False),
+ pa.field("year", pa.uint16(), nullable=False),
+ pa.field("month", pa.uint8(), nullable=False),
+ pa.field(PARTITION_COLUMN_NAME, pa.string(), nullable=False),
+ ]
+ )


  def custom_aggs_partitioning() -> pa.Schema:
  return pa_ds.partitioning(
- pa.schema([('year', pa.uint16()), ('month', pa.uint8()), ('date', pa.date32())]), flavor="hive"
+ pa.schema(
+ [("year", pa.uint16()), ("month", pa.uint8()), ("date", pa.date32())]
+ ),
+ flavor="hive",
  )


- def get_custom_aggs_dates(config: PolygonConfig) -> set[datetime.date]:
- file_info = config.filesystem.get_file_info(config.custom_aggs_dir)
+ def get_aggs_dates(config: PolygonConfig) -> set[datetime.date]:
+ file_info = config.filesystem.get_file_info(config.aggs_dir)
  if file_info.type == pa_fs.FileType.NotFound:
  return set()
- aggs_ds = pa_ds.dataset(config.custom_aggs_dir,
- format="parquet",
- schema=custom_aggs_schema(),
- partitioning=custom_aggs_partitioning())
- return set([pa_ds.get_partition_keys(fragment.partition_expression).get("date") for fragment in aggs_ds.get_fragments()])
+ aggs_ds = pa_ds.dataset(
+ config.aggs_dir,
+ format="parquet",
+ schema=custom_aggs_schema(),
+ partitioning=custom_aggs_partitioning(),
+ )
+ return set(
+ [
+ pa_ds.get_partition_keys(fragment.partition_expression).get("date")
+ for fragment in aggs_ds.get_fragments()
+ ]
+ )


  def generate_csv_trades_tables(
  config: PolygonConfig, overwrite: bool = False
- ) -> Tuple[datetime.date, Iterator[pa.Table]]:
+ ) -> Iterator[Tuple[datetime.date, pa.Table]]:
  """Generator for trades tables from flatfile CSVs."""
- custom_aggs_dates = set()
+ existing_aggs_dates = set()
  if not overwrite:
- custom_aggs_dates = get_custom_aggs_dates(config)
- # Use pandas_market_calendars so we can get extended hours.
- # NYSE and NASDAQ have extended hours but XNYS does not.
- calendar = pandas_market_calendars.get_calendar(config.calendar_name)
- schedule = calendar.schedule(start_date=config.start_timestamp, end_date=config.end_timestamp, start="pre", end="post")
- for timestamp, session in schedule.iterrows():
- date = timestamp.to_pydatetime().date()
- if date in custom_aggs_dates:
+ existing_aggs_dates = get_aggs_dates(config)
+ schedule = config.calendar.trading_index(
+ start=config.start_timestamp, end=config.end_timestamp, period="1D"
+ )
+ for timestamp in schedule:
+ date: datetime.date = timestamp.to_pydatetime().date()
+ if date in existing_aggs_dates:
  continue
- trades_csv_path = f"{config.trades_dir}/{date_to_path(date)}"
+ trades_csv_path = config.date_to_csv_file_path(date)
  convert_options = pa_csv.ConvertOptions(column_types=trades_schema(raw=True))
  trades = pa_csv.read_csv(trades_csv_path, convert_options=convert_options)
  trades = trades.cast(trades_schema())
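
Two pieces make the generator above incremental: `get_aggs_dates` recovers the dates already on disk from the hive partition keys, and the exchange-calendars style `config.calendar.trading_index(..., period="1D")` supplies the trading days to walk, replacing the old pandas_market_calendars schedule. A sketch of the partition-key lookup against a hypothetical local directory (`/tmp/aggs` is illustrative; the three-field partitioning mirrors `custom_aggs_partitioning`):

    import pyarrow as pa
    import pyarrow.dataset as pa_ds

    partitioning = pa_ds.partitioning(
        pa.schema([("year", pa.uint16()), ("month", pa.uint8()), ("date", pa.date32())]),
        flavor="hive",
    )

    # A directory previously written with the same partitioning, e.g.
    # /tmp/aggs/year=2024/month=3/date=2024-03-18/part-0.parquet
    aggs_ds = pa_ds.dataset("/tmp/aggs", format="parquet", partitioning=partitioning)

    # Each fragment carries its partition values as an expression; get_partition_keys
    # turns that back into a {"year": ..., "month": ..., "date": ...} dict.
    existing_dates = {
        pa_ds.get_partition_keys(fragment.partition_expression).get("date")
        for fragment in aggs_ds.get_fragments()
    }
    print(sorted(existing_dates))
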
@@ -359,77 +185,87 @@ def generate_csv_trades_tables(
  del trades


- def trades_to_custom_aggs(config: PolygonConfig, date: datetime.date, table: pa.Table, include_trf: bool = False) -> pa.Table:
- print(f"{datetime.datetime.now()=} {date=} {pa.default_memory_pool()=}")
+ def trades_to_custom_aggs(
+ config: PolygonConfig,
+ date: datetime.date,
+ table: pa.Table,
+ include_trf: bool = False,
+ ) -> pa.Table:
+ print(f"{date=} {pa.default_memory_pool()=}")
+ # print(f"{datetime.datetime.now()=} {date=} {pa.default_memory_pool()=}")
  # print(f"{resource.getrusage(resource.RUSAGE_SELF).ru_maxrss=}")
  table = table.filter(pa_compute.greater(table["size"], 0))
  table = table.filter(pa_compute.equal(table["correction"], "0"))
  if not include_trf:
  table = table.filter(pa_compute.not_equal(table["exchange"], 4))
- table = table.append_column("price_total", pa_compute.multiply(table["price"], table["size"]))
- table = table.append_column("window_start",
- pa_compute.floor_temporal(table["sip_timestamp"],
- multiple=config.agg_timedelta.seconds, unit="second"))
- # TODO: Calculate VWAP.
- table = table.group_by(["ticker", "window_start"], use_threads=False).aggregate([
- ('price', 'first'),
- ('price', 'max'),
- ('price', 'min'),
- ('price', 'last'),
- ('price_total', 'sum'),
- ('size', 'sum'),
- ([], "count_all")
- ])
- table = table.rename_columns({
- 'price_first': 'open',
- 'price_max': 'high',
- 'price_min': 'low',
- 'price_last': 'close',
- 'size_sum': 'volume',
- 'price_total_sum': 'total',
- 'count_all': 'transactions'})
- table = table.append_column("vwap", pa_compute.divide(table['total'], table['volume']))
+ table = table.append_column(
+ "price_total", pa_compute.multiply(table["price"], table["size"])
+ )
+ table = table.append_column(
+ "window_start",
+ pa_compute.floor_temporal(
+ table["sip_timestamp"], multiple=config.agg_timedelta.seconds, unit="second"
+ ),
+ )
+ table = table.group_by(["ticker", "window_start"], use_threads=False).aggregate(
+ [
+ ("price", "first"),
+ ("price", "max"),
+ ("price", "min"),
+ ("price", "last"),
+ ("price_total", "sum"),
+ ("size", "sum"),
+ ([], "count_all"),
+ ]
+ )
+ table = table.rename_columns(
+ {
+ "price_first": "open",
+ "price_max": "high",
+ "price_min": "low",
+ "price_last": "close",
+ "size_sum": "volume",
+ "price_total_sum": "total",
+ "count_all": "transactions",
+ }
+ )
+ table = table.append_column(
+ "vwap", pa_compute.divide(table["total"], table["volume"])
+ )
  # table.append_column('date', pa.array([date] * len(table), type=pa.date32()))
  # table.append_column('year', pa.array([date.year] * len(table), type=pa.uint16()))
  # table.append_column('month', pa.array([date.month] * len(table), type=pa.uint8()))
- table = table.append_column('date', pa.array(np.full(len(table), date)))
- table = table.append_column('year', pa.array(np.full(len(table), date.year), type=pa.uint16()))
- table = table.append_column('month', pa.array(np.full(len(table), date.month), type=pa.uint8()))
- table = table.sort_by([('window_start', 'ascending'), ('ticker', 'ascending')])
+ table = table.append_column("date", pa.array(np.full(len(table), date)))
+ table = table.append_column(
+ "year", pa.array(np.full(len(table), date.year), type=pa.uint16())
+ )
+ table = table.append_column(
+ "month", pa.array(np.full(len(table), date.month), type=pa.uint8())
+ )
+ table = table.append_column(
+ PARTITION_COLUMN_NAME,
+ pa.array(
+ [to_partition_key(ticker) for ticker in table.column("ticker").to_pylist()]
+ ),
+ )
+ table = table.sort_by([("window_start", "ascending"), ("ticker", "ascending")])
+ # print(f"aggs {date=} {table.to_pandas().head()=}")
  return table


- def generate_custom_agg_batches_from_tables(config: PolygonConfig) -> pa.RecordBatch:
- for date, trades_table in generate_csv_trades_tables(config):
- for batch in trades_to_custom_aggs(config, date, trades_table).to_batches():
- yield batch
- del trades_table
-
-
- def generate_custom_agg_tables(config: PolygonConfig) -> pa.Table:
- for date, trades_table in generate_csv_trades_tables(config):
- yield trades_to_custom_aggs(config, date, trades_table)
-
-
- def configure_write_custom_aggs_to_dataset(config: PolygonConfig):
- def write_custom_aggs_to_dataset(args: Tuple[datetime.date, pa.Table]):
- date, table = args
- pa_ds.write_dataset(
- trades_to_custom_aggs(config, date, table),
- filesystem=config.filesystem,
- base_dir=config.custom_aggs_dir,
- partitioning=custom_aggs_partitioning(),
- format="parquet",
- existing_data_behavior="overwrite_or_ignore",
- )
- return write_custom_aggs_to_dataset
+ # def generate_custom_agg_batches_from_tables(config: PolygonConfig):
+ # for date, trades_table in generate_csv_trades_tables(config):
+ # aggs_table = trades_to_custom_aggs(config, date, trades_table)
+ # yield aggs_table
+ # del aggs_table
+ # del trades_table


  def file_visitor(written_file):
  print(f"{written_file.path=}")


- def convert_all_to_custom_aggs(
+ def convert_trades_to_custom_aggs(
  config: PolygonConfig, overwrite: bool = False
  ) -> str:
  if overwrite:
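
The heart of `trades_to_custom_aggs` is an Arrow-native group-by: trades are bucketed into `window_start` with `floor_temporal` and reduced to OHLCV columns by `group_by(...).aggregate(...)`, whose outputs are named `<column>_<aggregation>` before `rename_columns` maps them to bar names. A toy version with fabricated trades and a fixed 60-second window standing in for `config.agg_timedelta`:

    import datetime
    import pyarrow as pa
    import pyarrow.compute as pa_compute

    utc = datetime.timezone.utc
    trades = pa.table(
        {
            "ticker": ["AAPL", "AAPL", "MSFT"],
            "sip_timestamp": pa.array(
                [
                    datetime.datetime(2024, 1, 2, 14, 30, 5, tzinfo=utc),
                    datetime.datetime(2024, 1, 2, 14, 30, 40, tzinfo=utc),
                    datetime.datetime(2024, 1, 2, 14, 30, 20, tzinfo=utc),
                ],
                type=pa.timestamp("ns", tz="UTC"),
            ),
            "price": [185.0, 185.5, 370.0],
            "size": [100, 50, 10],
        }
    )

    # Bucket each trade into its 60-second window, then reduce per (ticker, window).
    trades = trades.append_column(
        "window_start",
        pa_compute.floor_temporal(trades["sip_timestamp"], multiple=60, unit="second"),
    )
    aggs = trades.group_by(["ticker", "window_start"], use_threads=False).aggregate(
        [
            ("price", "first"),
            ("price", "max"),
            ("price", "min"),
            ("price", "last"),
            ("size", "sum"),
            ([], "count_all"),
        ]
    )
    # Aggregate outputs follow the <column>_<aggregation> convention, e.g. price_first.
    aggs = aggs.rename_columns(
        {
            "price_first": "open",
            "price_max": "high",
            "price_min": "low",
            "price_last": "close",
            "size_sum": "volume",
            "count_all": "transactions",
        }
    )
    print(aggs.to_pydict())
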
@@ -438,7 +274,7 @@ def convert_all_to_custom_aggs(
  # MAX_FILES_OPEN = 8
  # MIN_ROWS_PER_GROUP = 100_000

- print(f"{config.custom_aggs_dir=}")
+ print(f"{config.aggs_dir=}")

  # pa.set_memory_pool()

@@ -446,26 +282,25 @@ def convert_all_to_custom_aggs(
  # generate_custom_agg_batches_from_tables(config),
  # schema=custom_aggs_schema(),
  # filesystem=config.filesystem,
- # base_dir=config.custom_aggs_dir,
+ # base_dir=config.aggs_dir,
  # partitioning=custom_aggs_partitioning(),
  # format="parquet",
  # existing_data_behavior="overwrite_or_ignore",
- # max_open_files = MAX_FILES_OPEN,
- # min_rows_per_group = MIN_ROWS_PER_GROUP,
+ # # max_open_files = MAX_FILES_OPEN,
+ # # min_rows_per_group = MIN_ROWS_PER_GROUP,
  # )

  for date, trades_table in generate_csv_trades_tables(config):
  aggs_table = trades_to_custom_aggs(config, date, trades_table)
  pa_ds.write_dataset(
  aggs_table,
- # schema=custom_aggs_schema(),
  filesystem=config.filesystem,
- base_dir=config.custom_aggs_dir,
+ base_dir=config.aggs_dir,
  partitioning=custom_aggs_partitioning(),
  format="parquet",
  existing_data_behavior="overwrite_or_ignore",
  file_visitor=file_visitor,
- # max_open_files=MAX_FILES_OPEN,
+ # max_open_files=10,
  # min_rows_per_group=MIN_ROWS_PER_GROUP,
  )
  del aggs_table
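
Each day's aggregates are written with `pa_ds.write_dataset` under the hive-style year/month/date partitioning, so those column values become directory names, and `existing_data_behavior="overwrite_or_ignore"` lets repeated runs add new dates without disturbing partitions already on disk. A sketch of the resulting layout with a tiny fabricated table (the `/tmp/aggs_demo` path is for illustration only):

    import datetime
    import pyarrow as pa
    import pyarrow.dataset as pa_ds

    table = pa.table(
        {
            "ticker": ["AAPL", "MSFT"],
            "close": [185.5, 370.0],
            "date": pa.array([datetime.date(2024, 3, 18)] * 2, type=pa.date32()),
            "year": pa.array([2024] * 2, type=pa.uint16()),
            "month": pa.array([3] * 2, type=pa.uint8()),
        }
    )

    partitioning = pa_ds.partitioning(
        pa.schema([("year", pa.uint16()), ("month", pa.uint8()), ("date", pa.date32())]),
        flavor="hive",
    )

    pa_ds.write_dataset(
        table,
        base_dir="/tmp/aggs_demo",
        partitioning=partitioning,
        format="parquet",
        existing_data_behavior="overwrite_or_ignore",
        file_visitor=lambda written_file: print(written_file.path),
    )
    # Prints something like:
    # /tmp/aggs_demo/year=2024/month=3/date=2024-03-18/part-0.parquet
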
@@ -477,8 +312,8 @@ def convert_all_to_custom_aggs(
  # generate_csv_trades_tables(config),
  # )

- print(f"Generated aggregates to {config.custom_aggs_dir=}")
- return config.custom_aggs_dir
+ print(f"Generated aggregates to {config.aggs_dir=}")
+ return config.aggs_dir


  # https://github.com/twopirllc/pandas-ta/issues/731#issuecomment-1766786952
@@ -500,208 +335,199 @@ def convert_all_to_custom_aggs(
  # mfi = 100 - 100 / (1 + mf_avg_gain / (mf_avg_loss + epsilon))
  # return mfi

- def calculate_mfi(typical_price: pd.Series, money_flow: pd.Series, period: int):
- mf_sign = np.where(typical_price > np.roll(typical_price, shift=1), 1, -1)
- signed_mf = money_flow * mf_sign
-
- # Calculate gain and loss using vectorized operations
- positive_mf = np.maximum(signed_mf, 0)
- negative_mf = np.maximum(-signed_mf, 0)
-
- mf_avg_gain = np.convolve(positive_mf, np.ones(period), mode='full')[:len(positive_mf)] / period
- mf_avg_loss = np.convolve(negative_mf, np.ones(period), mode='full')[:len(negative_mf)] / period
-
- epsilon = 1e-10 # Small epsilon value to avoid division by zero
- mfi = 100 - (100 / (1 + mf_avg_gain / (mf_avg_loss + epsilon)))
- return mfi
-
-
- # https://github.com/twopirllc/pandas-ta/blob/main/pandas_ta/momentum/stoch.py
- # https://github.com/twopirllc/pandas-ta/blob/development/pandas_ta/momentum/stoch.py
- # `k` vs `fast_k` arg names.
- # https://github.com/twopirllc/pandas-ta/issues/726
- # Results affected by values outside range
- # https://github.com/twopirllc/pandas-ta/issues/535
-
- def calculate_stoch(high: pd.Series, low: pd.Series, close: pd.Series, k: int = 14, d: int = 3, smooth_k: int = 3, mamode:str = "sma"):
- """Indicator: Stochastic Oscillator (STOCH)"""
- lowest_low = low.rolling(k).min()
- highest_high = high.rolling(k).max()
-
- stoch = 100 * (close - lowest_low)
- stoch /= ta.utils.non_zero_range(highest_high, lowest_low)
-
- stoch_k = ta.overlap.ma(mamode, stoch.loc[stoch.first_valid_index():,], length=smooth_k)
- stoch_d = ta.overlap.ma(mamode, stoch_k.loc[stoch_k.first_valid_index():,], length=d) if stoch_k is not None else None
- # Histogram
- stoch_h = stoch_k - stoch_d if stoch_d is not None else None
-
- return stoch_k, stoch_d, stoch_h
-
-
- def compute_per_ticker_signals(df: pd.DataFrame, period: int = 14) -> pd.DataFrame:
- df = df.set_index('window_start').sort_index()
- session_index = pd.date_range(start=df.index[0],
- end=df.index[-1],
- freq=pd.Timedelta(seconds=60))
- df = df.reindex(session_index)
- df.index.rename('window_start', inplace=True)
-
- # df["minute_of_day"] = (df.index.hour * 60) + df.index.minute
- # df["day_of_week"] = df.index.day_of_week
-
- df.transactions = df.transactions.fillna(0)
- df.volume = df.volume.fillna(0)
- df.total = df.total.fillna(0)
- df.close = df.close.ffill()
- close = df.close
- df.vwap = df.vwap.fillna(close)
- df.high = df.high.fillna(close)
- df.low = df.low.fillna(close)
- df.open = df.open.fillna(close)
- price_open = df.open
- high = df.high
- low = df.low
- vwap = df.vwap
- # volume = df.volume
- total = df.total
- next_close = close.shift()
-
- # TODO: Odometer rollover signal. Relative difference to nearest power of 10.
- # Something about log10 being a whole number? When is $50 the rollover vs $100 or $10?
-
- # "True (Typical?) Price" which I think is an approximation of VWAP.
- # Trouble with both is that if there are no trades in a bar we get NaN.
- # That then means we get NaN for averages for the next period-1 bars too.
- # Question is whether to ffill the price for these calculations.
- df["TP"] = (high + low + close) / 3
-
- # Gain/loss in this bar.
- df["ret1bar"] = close.div(price_open).sub(1)
-
- for t in range(2, period):
- df[f'ret{t}bar'] = close.div(price_open.shift(t-1)).sub(1)
-
- # Average True Range (ATR)
- true_range = pd.concat([high.sub(low),
- high.sub(next_close).abs(),
- low.sub(next_close).abs()], axis=1).max(1)
- # Normalized ATR (NATR) or Average of Normalized TR.
- # Choice of NATR operations ordering discussion: https://www.macroption.com/normalized-atr/
- # He doesn't talk about VWAP but I think that is a better normalizing price for a bar.
- # atr = true_range.ewm(span=period).mean()
- # df["natr_c"] = atr / close
- # df["antr_c"] = (true_range / close).ewm(span=period).mean()
- # df["natr_v"] = atr / vwap
- # df["antr_v"] = (true_range / vwap).ewm(span=period).mean()
- df["NATR"] = (true_range / vwap).ewm(span=period).mean()
-
- # True Price as HLC average VS VWAP.
- # VWAP is better I think but is quite different than standard CCI.
- # Three ways to compute CCI, all give the same value using TP.
- # tp = (high + low + close) / 3
- # df['SMA'] = ta.sma(tp, length=period)
- # df['sma_r'] = tp.rolling(period).mean()
- # df['MAD'] = ta.mad(tp, length=period)
- # # Series.mad deprecated. mad = (s - s.mean()).abs().mean()
- # df['mad_r'] = tp.rolling(period).apply(lambda x: (pd.Series(x) - pd.Series(x).mean()).abs().mean())
-
- # df['cci_r'] = (tp - df['sma_r']) / (0.015 * df['mad_r'])
- # df['CCI'] = (tp - df['SMA']) / (0.015 * df['MAD'])
- # df['cci_ta'] = ta.cci(high=high, low=low, close=close, length=period)
-
- df['taCCI'] = ta.cci(high=high, low=low, close=close, length=period)
-
- # https://gist.github.com/quantra-go-algo/1b37bfb74d69148f0dfbdb5a2c7bdb25
- # https://medium.com/@huzaifazahoor654/how-to-calculate-cci-in-python-a-step-by-step-guide-9a3f61698be6
- sma = pd.Series(ta.sma(vwap, length=period))
- mad = pd.Series(ta.mad(vwap, length=period))
- df['CCI'] = (vwap - sma) / (0.015 * mad)
-
- # df['MFI'] = calculate_mfi(high=high, low=low, close=close, volume=volume, period=period)
- df['MFI'] = calculate_mfi(typical_price=vwap, money_flow=total, period=period)
-
- # We use Stochastic (rather than MACD because we need a ticker independent indicator.
- # IOW a percentage price oscillator (PPO) rather than absolute price oscillator (APO).
- # https://www.alpharithms.com/moving-average-convergence-divergence-macd-031217/
- # We're using 14/3 currently rather than the usual 26/12 popular for MACD though.
- stoch_k, stoch_d, stoch_h = calculate_stoch(high, low, close, k=period)
- df["STOCHk"] = stoch_k
- df["STOCHd"] = stoch_d
- df["STOCHh"] = stoch_h
-
- return df
-
-
- def iterate_all_aggs_tables(config: PolygonConfig, valid_tickers: pa.Array, start_session: str = "pre", end_session: str = "market_open"):
- calendar = pandas_market_calendars.get_calendar(config.calendar_name)
- schedule = calendar.schedule(start_date=config.start_date,
- end_date=config.end_date,
- start="pre",
- end="post")
- for date, sessions in schedule.iterrows():
- # print(f"{date=} {sessions=}")
- start_dt = sessions[start_session]
- end_dt = sessions[end_session]
- # print(f"{date=} {start_dt=} {end_dt=}")
- aggs_ds = pa_ds.dataset(config.custom_aggs_dir,
- format="parquet",
- schema=custom_aggs_schema(),
- partitioning=custom_aggs_partitioning())
- date_filter_expr = ((pc.field('year') == date.year)
- & (pc.field('month') == date.month)
- & (pc.field('date') == date.to_pydatetime().date()))
- # print(f"{date_filter_expr=}")
- for fragment in aggs_ds.get_fragments(filter=date_filter_expr):
- session_filter = ((pc.field('window_start') >= start_dt)
- & (pc.field('window_start') < end_dt)
- & pc.is_in(pc.field('ticker'), valid_tickers)
- )
- # Sorting table doesn't seem to avoid needing to sort the df. Maybe use_threads=False on to_pandas would help?
- # table = fragment.to_table(filter=session_filter).sort_by([('ticker', 'ascending'), ('window_start', 'descending')])
- table = fragment.to_table(filter=session_filter)
- if table.num_rows > 0:
- metadata = dict(table.schema.metadata) if table.schema.metadata else dict()
- metadata["date"] = date.date().isoformat()
- table = table.replace_schema_metadata(metadata)
- yield table
-
-
- def iterate_all_aggs_with_signals(config: PolygonConfig):
- for table in iterate_all_aggs_tables(config):
- df = table.to_pandas()
- df = df.groupby("ticker").apply(compute_per_ticker_signals, include_groups=False)
- yield pa.Table.from_pandas(df)
-
-
- def compute_signals_for_all_custom_aggs(
- from_config: PolygonConfig, to_config: PolygonConfig, valid_tickers: pa.Array, overwrite: bool = False
- ) -> str:
- if overwrite:
- print("WARNING: overwrite not implemented/ignored.")

- print(f"{to_config.custom_aggs_dir=}")
-
- for aggs_table in iterate_all_aggs_tables(from_config, valid_tickers):
- metadata = aggs_table.schema.metadata
- date = datetime.date.fromisoformat(metadata[b'date'].decode('utf-8'))
- print(f"{date=}")
- df = aggs_table.to_pandas()
- df = df.groupby("ticker").apply(compute_per_ticker_signals, include_groups=False)
- table = pa.Table.from_pandas(df)
- if table.num_rows > 0:
- table = table.replace_schema_metadata(metadata)
- table = table.append_column('date', pa.array(np.full(len(table), date)))
- table = table.append_column('year', pa.array(np.full(len(table), date.year), type=pa.uint16()))
- table = table.append_column('month', pa.array(np.full(len(table), date.month), type=pa.uint8()))
- table = table.sort_by([('ticker', 'ascending'), ('window_start', 'ascending')])
- pa_ds.write_dataset(
- table,
- filesystem=to_config.filesystem,
- base_dir=to_config.custom_aggs_dir,
- partitioning=custom_aggs_partitioning(),
- format="parquet",
- existing_data_behavior="overwrite_or_ignore",
- file_visitor=file_visitor,
+ def get_by_ticker_aggs_dates(config: PolygonConfig) -> set[datetime.date]:
+ file_info = config.filesystem.get_file_info(config.by_ticker_aggs_arrow_dir)
+ if file_info.type == pa_fs.FileType.NotFound:
+ return set()
+ by_ticker_aggs_ds = pa_ds.dataset(
+ config.by_ticker_aggs_arrow_dir,
+ format="parquet",
+ schema=custom_aggs_schema(),
+ partitioning=custom_aggs_partitioning(),
+ )
+ return set(
+ [
+ pa_ds.get_partition_keys(fragment.partition_expression).get("date")
+ for fragment in by_ticker_aggs_ds.get_fragments()
+ ]
+ )
+
+
+ def batches_for_date(aggs_ds: pa_ds.Dataset, date: pd.Timestamp):
+ date_filter_expr = (
+ (pa_compute.field("year") == date.year)
+ & (pa_compute.field("month") == date.month)
+ & (pa_compute.field("date") == date.date())
+ )
+ print(f"table for {date=}")
+ # return aggs_ds.scanner(filter=date_filter_expr).to_batches()
+ table = aggs_ds.scanner(filter=date_filter_expr).to_table()
+ table = table.sort_by([("part", "ascending"), ("ticker", "ascending"), ("window_start", "ascending"), ])
+ return table.to_batches()
+
+ def generate_batches_for_schedule(config, aggs_ds):
+ schedule = config.calendar.trading_index(
+ start=config.start_timestamp, end=config.end_timestamp, period="1D"
+ )
+ for timestamp in schedule:
+ # print(f"{timestamp=}")
+ yield from batches_for_date(aggs_ds=aggs_ds, date=timestamp)
+
+
+ # def scatter_custom_aggs_to_by_ticker(
+ # config: PolygonConfig,
+ # overwrite: bool = False,
+ # ) -> str:
+ # lock = FileLock(config.lock_file_path, blocking=False)
+ # with lock:
+ # if not lock.is_locked:
+ # raise IOError("Failed to acquire lock for updating custom assets.")
+ # with open(config.by_ticker_dates_path, "a") as f:
+ # f.write("I have a bad feeling about this.")
+ # by_ticker_aggs_arrow_dir = scatter_custom_aggs_to_by_ticker_(config, overwrite)
+
+ # print(f"Scattered custom aggregates by ticker to {by_ticker_aggs_arrow_dir=}")
+ # return by_ticker_aggs_arrow_dir
+
+
+ def filter_by_date(config: PolygonConfig) -> pa_compute.Expression:
+ start_date = config.start_timestamp.tz_localize(config.calendar.tz.key).date()
+ limit_date = (
+ (config.end_timestamp + pd.Timedelta(days=1))
+ .tz_localize(config.calendar.tz.key)
+ .date()
+ )
+ return (pa_compute.field("date") >= start_date) & (
+ pa_compute.field("date") <= limit_date
+ )
+
+
+ # def generate_batches_with_partition(
+ # config: PolygonConfig,
+ # aggs_ds: pa_ds.Dataset,
+ # ) -> Iterator[pa.Table]:
+ # for fragment in aggs_ds.sort_by("date").get_fragments(
+ # filter=filter_by_date(config),
+ # ):
+ # for batch in fragment.to_batches():
+ # # batch = batch.append_column(
+ # # PARTITION_COLUMN_NAME,
+ # # pa.array(
+ # # [
+ # # to_partition_key(ticker)
+ # # for ticker in batch.column("ticker").to_pylist()
+ # # ]
+ # # ),
+ # # )
+ # yield batch.sort_by(
+ # [("ticker", "ascending"), ("window_start", "ascending")]
+ # )
+ # del batch
+ # del fragment
+
+
+ def generate_batches_with_partition(
+ config: PolygonConfig,
+ aggs_ds: pa_ds.Dataset,
+ ) -> Iterator[pa.Table]:
+ for fragment in (
+ aggs_ds.filter(filter_by_date(config))
+ .sort_by([(PARTITION_COLUMN_NAME, "ascending"), ("date", "ascending")])
+ .get_fragments()
+ ):
+ for batch in fragment.to_batches():
+ yield batch.sort_by(
+ [("ticker", "ascending"), ("window_start", "ascending")]
  )
- return to_config.custom_aggs_dir
+ del batch
+ del fragment
+
+
+ def scatter_custom_aggs_to_by_ticker(config, overwrite=False) -> str:
+ aggs_ds = pa_ds.dataset(
+ config.aggs_dir,
+ format="parquet",
+ schema=custom_aggs_schema(),
+ partitioning=custom_aggs_partitioning(),
+ )
+ by_ticker_schema = aggs_ds.schema
+ partitioning = pa_ds.partitioning(
+ pa.schema([(PARTITION_COLUMN_NAME, pa.string())]),
+ flavor="hive",
+ )
+ by_ticker_aggs_arrow_dir = config.by_ticker_aggs_arrow_dir
+ print(f"Scattering custom aggregates by ticker to {by_ticker_aggs_arrow_dir=}")
+ pa_ds.write_dataset(
+ # generate_batches_with_partition(config=config, aggs_ds=aggs_ds),
+ generate_batches_for_schedule(config=config, aggs_ds=aggs_ds),
+ schema=by_ticker_schema,
+ base_dir=by_ticker_aggs_arrow_dir,
+ partitioning=partitioning,
+ format="parquet",
+ existing_data_behavior="overwrite_or_ignore",
+ )
+ print(f"Scattered aggregates by ticker to {by_ticker_aggs_arrow_dir=}")
+ return by_ticker_aggs_arrow_dir
+
+
+ # def scatter_custom_aggs_to_by_ticker(config, overwrite=False) -> str:
+ # file_info = config.filesystem.get_file_info(config.aggs_dir)
+ # if file_info.type == pa_fs.FileType.NotFound:
+ # raise FileNotFoundError(f"{config.aggs_dir=} not found.")
+
+ # by_ticker_aggs_arrow_dir = config.by_ticker_aggs_arrow_dir
+ # if os.path.exists(by_ticker_aggs_arrow_dir):
+ # if overwrite:
+ # print(f"Removing {by_ticker_aggs_arrow_dir=}")
+ # shutil.rmtree(by_ticker_aggs_arrow_dir)
+
+ # schedule = config.calendar.trading_index(
+ # start=config.start_timestamp, end=config.end_timestamp, period="1D"
+ # )
+ # assert type(schedule) is pd.DatetimeIndex
+
+ # print(f"Scattering custom aggregates by ticker to {by_ticker_aggs_arrow_dir=}")
+ # aggs_ds = pa_ds.dataset(
+ # config.aggs_dir,
+ # format="parquet",
+ # schema=custom_aggs_schema(),
+ # partitioning=custom_aggs_partitioning(),
+ # )
+ # by_ticker_partitioning = pa_ds.partitioning(
+ # pa.schema([(PARTITION_COLUMN_NAME, pa.string())]),
+ # # pa.schema(
+ # # [
+ # # (PARTITION_COLUMN_NAME, pa.string()),
+ # # ("year", pa.uint16()),
+ # # ("month", pa.uint8()),
+ # # ("date", pa.date32()),
+ # # ]
+ # # ),
+ # flavor="hive",
+ # )
+ # by_ticker_schema = custom_aggs_schema()
+ # by_ticker_schema = by_ticker_schema.append(
+ # pa.field(PARTITION_COLUMN_NAME, pa.string(), nullable=False),
+ # )
+
+ # # TODO: Collect the dates we've scattered and write a special partition key with them.
+ # pa_ds.write_dataset(
+ # generate_batches_for_schedule(schedule, aggs_ds),
+ # schema=by_ticker_schema,
+ # base_dir=by_ticker_aggs_arrow_dir,
+ # partitioning=by_ticker_partitioning,
+ # format="parquet",
+ # existing_data_behavior="overwrite_or_ignore",
+ # # max_open_files=250,
+ # # file_visitor=file_visitor,
+ # )
+
+ # return by_ticker_aggs_arrow_dir
+
+
+ # def generate_tables_from_custom_aggs_ds(
+ # aggs_ds: pa_ds.Dataset, schedule: pd.DatetimeIndex
+ # ):
+ # for timestamp in schedule:
+ # yield table_for_date(aggs_ds=aggs_ds, date=timestamp.to_pydatetime().date())
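
The new scatter step re-partitions the daily aggregates by a ticker-derived key so a whole ticker bucket can be read back efficiently when the bundle is assembled. `PARTITION_COLUMN_NAME` and `to_partition_key` live in `.config` and are not shown in this diff, and `pa_ds.write_dataset` can consume an iterator of record batches plus an explicit schema, so days are streamed through without concatenating them in memory. A sketch under those assumptions, with a stand-in partition key (first letter of the ticker) and an illustrative output path:

    import pyarrow as pa
    import pyarrow.dataset as pa_ds

    PARTITION_COLUMN_NAME = "part"  # assumption: mirrors the constant in .config


    def to_partition_key(ticker: str) -> str:
        # Stand-in for the real to_partition_key in .config: bucket by first letter.
        return ticker[:1].upper()


    def batches_with_partition_key(tables):
        # Stream one day at a time; write_dataset consumes the batches lazily.
        for table in tables:
            keys = pa.array(
                [to_partition_key(t) for t in table.column("ticker").to_pylist()]
            )
            yield from table.append_column(PARTITION_COLUMN_NAME, keys).to_batches()


    day = pa.table({"ticker": ["AAPL", "MSFT"], "close": [185.5, 370.0]})
    schema = day.schema.append(pa.field(PARTITION_COLUMN_NAME, pa.string()))

    pa_ds.write_dataset(
        batches_with_partition_key([day]),
        schema=schema,
        base_dir="/tmp/by_ticker_demo",  # illustration only
        partitioning=pa_ds.partitioning(
            pa.schema([(PARTITION_COLUMN_NAME, pa.string())]), flavor="hive"
        ),
        format="parquet",
        existing_data_behavior="overwrite_or_ignore",
    )
    # Produces /tmp/by_ticker_demo/part=A/... and /tmp/by_ticker_demo/part=M/...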