zipline_polygon_bundle 0.2.0.dev1__py3-none-any.whl → 0.2.3__py3-none-any.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
@@ -1,29 +1,29 @@
1
1
  from .config import PolygonConfig, PARTITION_COLUMN_NAME, to_partition_key
2
2
 
3
- from typing import Iterator, Tuple, Union
3
+ from typing import Iterator, Tuple
4
4
 
5
5
  import pyarrow as pa
6
- import pyarrow.dataset as pa_ds
7
6
  import pyarrow.compute as pa_compute
8
7
  import pyarrow.csv as pa_csv
8
+ import pyarrow.dataset as pa_ds
9
9
  import pyarrow.fs as pa_fs
10
10
 
11
11
  from fsspec.implementations.arrow import ArrowFSWrapper
12
12
 
13
13
  import os
14
14
  import datetime
15
- import shutil
16
15
 
17
16
  import numpy as np
18
17
  import pandas as pd
19
- import pandas_ta as ta
20
18
 
21
19
 
22
20
  def trades_schema(raw: bool = False) -> pa.Schema:
23
21
  # There is some problem reading the timestamps as timestamps so we have to read as integer then change the schema.
24
22
  # Polygon Aggregate flatfile timestamps are in nanoseconds (like trades), not milliseconds as the docs say.
25
23
  # I make the timestamp timezone-aware because that's how Unix timestamps work and it may help avoid mistakes.
26
- # timestamp_type = pa.timestamp("ns", tz="UTC")
24
+ # The timezone is America/New_York because that's the US exchanges timezone and the date is a trading day.
25
+ # timestamp_type = pa.timestamp("ns", tz="America/New_York")
26
+ # timestamp_type = pa.int64() if raw else pa.timestamp("ns", tz=tz)
27
27
  timestamp_type = pa.int64() if raw else pa.timestamp("ns", tz="UTC")
28
28
 
29
29
  # Polygon price scale is 4 decimal places (i.e. hundredths of a penny), but we'll use 10 because we have precision to spare.
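
The comments above describe reading the epoch-nanosecond timestamp columns as int64 and then casting the table, because the CSV reader cannot parse them as timestamps directly. A minimal sketch of that pattern, using toy data rather than anything from the package (only the column names mirror the flatfiles):

    import io
    import pyarrow as pa
    import pyarrow.csv as pa_csv

    raw_schema = pa.schema([("ticker", pa.string()), ("sip_timestamp", pa.int64())])
    final_schema = pa.schema(
        [("ticker", pa.string()), ("sip_timestamp", pa.timestamp("ns", tz="UTC"))]
    )

    csv_bytes = b"ticker,sip_timestamp\nAAPL,1704214800000000000\n"
    table = pa_csv.read_csv(
        io.BytesIO(csv_bytes),
        convert_options=pa_csv.ConvertOptions(column_types=raw_schema),
    )
    table = table.cast(final_schema)  # int64 epoch nanoseconds -> tz-aware timestamp
    print(table.column("sip_timestamp")[0])

Casting int64 to timestamp("ns", tz="UTC") reinterprets the integers as nanosecond epoch values, which is why the raw schema keeps them as plain integers until after the read.
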
@@ -94,7 +94,7 @@ def cast_strings_to_list(
94
94
  return int_list_array
95
95
 
96
96
 
97
- def cast_trades(trades):
97
+ def cast_trades(trades) -> pa.Table:
98
98
  trades = trades.cast(trades_schema())
99
99
  condition_values = cast_strings_to_list(
100
100
  trades.column("conditions").combine_chunks()
@@ -102,220 +102,8 @@ def cast_trades(trades):
102
102
  return trades.append_column("condition_values", condition_values)
103
103
 
104
104
 
105
- def date_to_path(date, ext=".csv.gz"):
106
- # return f"{date.year}/{date.month:02}/{date.isoformat()}{ext}"
107
- return date.strftime("%Y/%m/%Y-%m-%d") + ext
108
-
109
-
110
- # def convert_to_custom_aggs_file(
111
- # config: PolygonConfig,
112
- # overwrite: bool,
113
- # timestamp: pd.Timestamp,
114
- # start_session: pd.Timestamp,
115
- # end_session: pd.Timestamp,
116
- # ):
117
- # date = timestamp.to_pydatetime().date()
118
- # aggs_date_path = date_to_path(date, ext=".parquet")
119
- # aggs_path = f"{config.custom_aggs_dir}/{aggs_date_path}"
120
- # # aggs_by_ticker_path = f"{config.custom_aggs_by_ticker_dir}/{aggs_date_path}"
121
- # fsspec = ArrowFSWrapper(config.filesystem)
122
- # if fsspec.exists(aggs_path) or fsspec.exists(aggs_by_ticker_path):
123
- # if overwrite:
124
- # if fsspec.exists(aggs_path):
125
- # config.filesystem.delete_file(aggs_path)
126
- # if fsspec.exists(aggs_by_ticker_path):
127
- # config.filesystem.delete_file(aggs_by_ticker_path)
128
- # else:
129
- # if fsspec.exists(aggs_path):
130
- # print(f"SKIPPING: {date=} File exists {aggs_path=}")
131
- # if fsspec.exists(aggs_by_ticker_path):
132
- # print(f"SKIPPING: {date=} File exists {aggs_by_ticker_path=}")
133
- # return
134
- # fsspec.mkdir(fsspec._parent(aggs_path))
135
- # fsspec.mkdir(fsspec._parent(aggs_by_ticker_path))
136
- # trades_path = f"{config.trades_dir}/{date_to_path(date)}"
137
- # if not fsspec.exists(trades_path):
138
- # print(f"ERROR: Trades file missing. Skipping {date=}. {trades_path=}")
139
- # return
140
- # print(f"{trades_path=}")
141
- # format = pa_ds.CsvFileFormat()
142
- # trades_ds = pa_ds.FileSystemDataset.from_paths(
143
- # [trades_path],
144
- # format=format,
145
- # schema=trades_schema(raw=True),
146
- # filesystem=config.filesystem,
147
- # )
148
- # fragments = trades_ds.get_fragments()
149
- # fragment = next(fragments)
150
- # try:
151
- # next(fragments)
152
- # print("ERROR: More than one fragment for {path=}")
153
- # except StopIteration:
154
- # pass
155
- # trades = fragment.to_table(schema=trades_ds.schema)
156
- # trades = trades.cast(trades_schema())
157
- # min_timestamp = pa.compute.min(trades.column("sip_timestamp")).as_py()
158
- # max_timestamp = pa.compute.max(trades.column("sip_timestamp")).as_py()
159
- # if min_timestamp < start_session:
160
- # print(f"ERROR: {min_timestamp=} < {start_session=}")
161
- # if max_timestamp >= end_session:
162
- # print(f"ERROR: {max_timestamp=} >= {end_session=}")
163
- # trades_df = trades.to_pandas()
164
- # trades_df["window_start"] = trades_df["sip_timestamp"].dt.floor(aggregate_timedelta)
165
- # aggs_df = trades_df.groupby(["ticker", "window_start"]).agg(
166
- # open=("price", "first"),
167
- # high=("price", "max"),
168
- # low=("price", "min"),
169
- # close=("price", "last"),
170
- # volume=("size", "sum"),
171
- # )
172
- # aggs_df["transactions"] = trades_df.groupby(["ticker", "window_start"]).size()
173
- # aggs_df.reset_index(inplace=True)
174
- # aggs_table = pa.Table.from_pandas(aggs_df).select(
175
- # [
176
- # "ticker",
177
- # "volume",
178
- # "open",
179
- # "close",
180
- # "high",
181
- # "low",
182
- # "window_start",
183
- # "transactions",
184
- # ]
185
- # )
186
- # aggs_table = aggs_table.sort_by(
187
- # [("ticker", "ascending"), ("window_start", "ascending")]
188
- # )
189
- # print(f"{aggs_by_ticker_path=}")
190
- # pa_parquet.write_table(
191
- # table=aggs_table, where=aggs_by_ticker_path, filesystem=to_config.filesystem
192
- # )
193
- # aggs_table = aggs_table.sort_by(
194
- # [("window_start", "ascending"), ("ticker", "ascending")]
195
- # )
196
- # print(f"{aggs_path=}")
197
- # pa_parquet.write_table(
198
- # table=aggs_table, where=aggs_path, filesystem=to_config.filesystem
199
- # )
200
-
201
-
202
- # def convert_to_custom_aggs(config: PolygonConfig,
203
- # overwrite: bool,
204
- # timestamp: pd.Timestamp,
205
- # start_session: pd.Timestamp,
206
- # end_session: pd.Timestamp):
207
- # date = timestamp.to_pydatetime().date()
208
- # aggs_date_path = date_to_path(date, ext=".parquet")
209
- # aggs_path = f"{config.custom_aggs_dir}/{aggs_date_path}"
210
- # # aggs_by_ticker_path = f"{config.custom_aggs_by_ticker_dir}/{aggs_date_path}"
211
- # fsspec = ArrowFSWrapper(config.filesystem)
212
- # if fsspec.exists(aggs_path) or fsspec.exists(aggs_by_ticker_path):
213
- # if overwrite:
214
- # if fsspec.exists(aggs_path):
215
- # config.filesystem.delete_file(aggs_path)
216
- # if fsspec.exists(aggs_by_ticker_path):
217
- # config.filesystem.delete_file(aggs_by_ticker_path)
218
- # else:
219
- # if fsspec.exists(aggs_path):
220
- # print(f"SKIPPING: {date=} File exists {aggs_path=}")
221
- # if fsspec.exists(aggs_by_ticker_path):
222
- # print(f"SKIPPING: {date=} File exists {aggs_by_ticker_path=}")
223
- # return
224
- # fsspec.mkdir(fsspec._parent(aggs_path))
225
- # fsspec.mkdir(fsspec._parent(aggs_by_ticker_path))
226
- # trades_path = f"{config.trades_dir}/{date_to_path(date)}"
227
- # if not fsspec.exists(trades_path):
228
- # print(f"ERROR: Trades file missing. Skipping {date=}. {trades_path=}")
229
- # return
230
- # print(f"{trades_path=}")
231
- # format = pa_ds.CsvFileFormat()
232
- # trades_ds = pa_ds.FileSystemDataset.from_paths([trades_path], format=format, schema=trades_schema(raw=True), filesystem=config.filesystem)
233
- # fragments = trades_ds.get_fragments()
234
- # fragment = next(fragments)
235
- # try:
236
- # next(fragments)
237
- # print("ERROR: More than one fragment for {path=}")
238
- # except StopIteration:
239
- # pass
240
- # trades = fragment.to_table(schema=trades_ds.schema)
241
- # trades = trades.cast(trades_schema())
242
- # min_timestamp = pa.compute.min(trades.column('sip_timestamp')).as_py()
243
- # max_timestamp = pa.compute.max(trades.column('sip_timestamp')).as_py()
244
- # if min_timestamp < start_session:
245
- # print(f"ERROR: {min_timestamp=} < {start_session=}")
246
- # if max_timestamp >= end_session:
247
- # print(f"ERROR: {max_timestamp=} >= {end_session=}")
248
- # trades_df = trades.to_pandas()
249
- # trades_df["window_start"] = trades_df["sip_timestamp"].dt.floor(aggregate_timedelta)
250
- # aggs_df = trades_df.groupby(["ticker", "window_start"]).agg(
251
- # open=('price', 'first'),
252
- # high=('price', 'max'),
253
- # low=('price', 'min'),
254
- # close=('price', 'last'),
255
- # volume=('size', 'sum'),
256
- # )
257
- # aggs_df['transactions'] = trades_df.groupby(["ticker", "window_start"]).size()
258
- # aggs_df.reset_index(inplace=True)
259
- # aggs_table = pa.Table.from_pandas(aggs_df).select(['ticker', 'volume', 'open', 'close', 'high', 'low', 'window_start', 'transactions'])
260
- # aggs_table = aggs_table.sort_by([('ticker', 'ascending'), ('window_start', 'ascending')])
261
- # print(f"{aggs_by_ticker_path=}")
262
- # pa_parquet.write_table(table=aggs_table,
263
- # where=aggs_by_ticker_path, filesystem=to_config.filesystem)
264
- # aggs_table = aggs_table.sort_by([('window_start', 'ascending'), ('ticker', 'ascending')])
265
- # print(f"{aggs_path=}")
266
- # pa_parquet.write_table(table=aggs_table,
267
- # where=aggs_path, filesystem=to_config.filesystem)
268
- # pa_ds.write_dataset(
269
- # generate_batches_from_tables(tables),
270
- # schema=schema,
271
- # base_dir=by_ticker_aggs_arrow_dir,
272
- # partitioning=partitioning,
273
- # format="parquet",
274
- # existing_data_behavior="overwrite_or_ignore",
275
- # )
276
-
277
-
278
- # def generate_csv_trades_tables(
279
- # config: PolygonConfig,
280
- # ) -> Tuple[datetime.date, Iterator[pa.Table]]:
281
- # """Generator for trades tables from flatfile CSVs."""
282
- # # Use pandas_market_calendars so we can get extended hours.
283
- # # NYSE and NASDAQ have extended hours but XNYS does not.
284
- # calendar = pandas_market_calendars.get_calendar(config.calendar_name)
285
- # schedule = calendar.schedule(start_date=config.start_timestamp, end_date=config.end_timestamp, start="pre", end="post")
286
- # for timestamp, session in schedule.iterrows():
287
- # date = timestamp.to_pydatetime().date()
288
- # trades_csv_path = f"{config.trades_dir}/{date_to_path(date)}"
289
- # format = pa_ds.CsvFileFormat()
290
- # trades_ds = pa_ds.FileSystemDataset.from_paths([trades_csv_path], format=format, schema=trades_schema(raw=True), filesystem=config.filesystem)
291
- # fragments = trades_ds.get_fragments()
292
- # fragment = next(fragments)
293
- # try:
294
- # next(fragments)
295
- # print("ERROR: More than one fragment for {path=}")
296
- # except StopIteration:
297
- # pass
298
- # trades = fragment.to_table(schema=trades_ds.schema)
299
- # trades = trades.cast(trades_schema())
300
- # min_timestamp = pa.compute.min(trades.column('sip_timestamp')).as_py()
301
- # max_timestamp = pa.compute.max(trades.column('sip_timestamp')).as_py()
302
- # start_session = session['pre']
303
- # end_session = session['post']
304
- # # print(f"{start_session=} {end_session=}")
305
- # # print(f"{min_timestamp=} {max_timestamp=}")
306
- # if min_timestamp < start_session:
307
- # print(f"ERROR: {min_timestamp=} < {start_session=}")
308
- # # The end_session is supposed to be a limit but there are many with trades at that second.
309
- # if max_timestamp >= (end_session + pd.Timedelta(seconds=1)):
310
- # # print(f"ERROR: {max_timestamp=} >= {end_session=}")
311
- # print(f"ERROR: {max_timestamp=} > {end_session+pd.Timedelta(seconds=1)=}")
312
- # yield date, trades
313
- # del fragment
314
- # del fragments
315
- # del trades_ds
316
-
317
-
318
105
  def custom_aggs_schema(raw: bool = False) -> pa.Schema:
106
+ # timestamp_type = pa.int64() if raw else pa.timestamp("ns", tz=tz)
319
107
  timestamp_type = pa.int64() if raw else pa.timestamp("ns", tz="UTC")
320
108
  price_type = pa.float64()
321
109
  return pa.schema(
@@ -328,9 +116,11 @@ def custom_aggs_schema(raw: bool = False) -> pa.Schema:
328
116
  pa.field("low", price_type, nullable=False),
329
117
  pa.field("window_start", timestamp_type, nullable=False),
330
118
  pa.field("transactions", pa.int64(), nullable=False),
119
+ pa.field("vwap", price_type, nullable=False),
331
120
  pa.field("date", pa.date32(), nullable=False),
332
121
  pa.field("year", pa.uint16(), nullable=False),
333
122
  pa.field("month", pa.uint8(), nullable=False),
123
+ pa.field(PARTITION_COLUMN_NAME, pa.string(), nullable=False),
334
124
  ]
335
125
  )
336
126
 
@@ -344,12 +134,12 @@ def custom_aggs_partitioning() -> pa.Schema:
344
134
  )
345
135
 
346
136
 
347
- def get_custom_aggs_dates(config: PolygonConfig) -> set[datetime.date]:
348
- file_info = config.filesystem.get_file_info(config.custom_aggs_dir)
137
+ def get_aggs_dates(config: PolygonConfig) -> set[datetime.date]:
138
+ file_info = config.filesystem.get_file_info(config.aggs_dir)
349
139
  if file_info.type == pa_fs.FileType.NotFound:
350
140
  return set()
351
141
  aggs_ds = pa_ds.dataset(
352
- config.custom_aggs_dir,
142
+ config.aggs_dir,
353
143
  format="parquet",
354
144
  schema=custom_aggs_schema(),
355
145
  partitioning=custom_aggs_partitioning(),
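
get_aggs_dates above discovers which trading days already have aggregates by listing the partition keys of the existing dataset fragments. A self-contained sketch of that discovery step, assuming a recent pyarrow where pa_ds.get_partition_keys is public; the package partitions by real date/year/month fields via custom_aggs_partitioning(), while this toy example uses a single string key:

    import tempfile
    import pyarrow as pa
    import pyarrow.dataset as pa_ds

    partitioning = pa_ds.partitioning(pa.schema([("date", pa.string())]), flavor="hive")
    table = pa.table({"date": ["2024-01-02", "2024-01-03"], "close": [1.0, 2.0]})

    with tempfile.TemporaryDirectory() as base_dir:
        pa_ds.write_dataset(
            table, base_dir=base_dir, partitioning=partitioning, format="parquet"
        )
        ds = pa_ds.dataset(base_dir, format="parquet", partitioning=partitioning)
        # Each fragment carries its partition values in partition_expression.
        existing_dates = {
            pa_ds.get_partition_keys(frag.partition_expression).get("date")
            for frag in ds.get_fragments()
        }
        print(sorted(existing_dates))
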
@@ -366,17 +156,17 @@ def generate_csv_trades_tables(
366
156
  config: PolygonConfig, overwrite: bool = False
367
157
  ) -> Iterator[Tuple[datetime.date, pa.Table]]:
368
158
  """Generator for trades tables from flatfile CSVs."""
369
- custom_aggs_dates = set()
159
+ existing_aggs_dates = set()
370
160
  if not overwrite:
371
- custom_aggs_dates = get_custom_aggs_dates(config)
161
+ existing_aggs_dates = get_aggs_dates(config)
372
162
  schedule = config.calendar.trading_index(
373
163
  start=config.start_timestamp, end=config.end_timestamp, period="1D"
374
164
  )
375
165
  for timestamp in schedule:
376
- date = timestamp.to_pydatetime().date()
377
- if date in custom_aggs_dates:
166
+ date: datetime.date = timestamp.to_pydatetime().date()
167
+ if date in existing_aggs_dates:
378
168
  continue
379
- trades_csv_path = f"{config.trades_dir}/{date_to_path(date)}"
169
+ trades_csv_path = config.date_to_csv_file_path(date)
380
170
  convert_options = pa_csv.ConvertOptions(column_types=trades_schema(raw=True))
381
171
  trades = pa_csv.read_csv(trades_csv_path, convert_options=convert_options)
382
172
  trades = trades.cast(trades_schema())
@@ -402,7 +192,8 @@ def trades_to_custom_aggs(
402
192
  table: pa.Table,
403
193
  include_trf: bool = False,
404
194
  ) -> pa.Table:
405
- print(f"{datetime.datetime.now()=} {date=} {pa.default_memory_pool()=}")
195
+ print(f"{date=} {pa.default_memory_pool()=}")
196
+ # print(f"{datetime.datetime.now()=} {date=} {pa.default_memory_pool()=}")
406
197
  # print(f"{resource.getrusage(resource.RUSAGE_SELF).ru_maxrss=}")
407
198
  table = table.filter(pa_compute.greater(table["size"], 0))
408
199
  table = table.filter(pa_compute.equal(table["correction"], "0"))
@@ -452,37 +243,25 @@ def trades_to_custom_aggs(
452
243
  table = table.append_column(
453
244
  "month", pa.array(np.full(len(table), date.month), type=pa.uint8())
454
245
  )
246
+ table = table.append_column(
247
+ PARTITION_COLUMN_NAME,
248
+ pa.array(
249
+ [to_partition_key(ticker) for ticker in table.column("ticker").to_pylist()]
250
+ ),
251
+ )
455
252
  table = table.sort_by([("window_start", "ascending"), ("ticker", "ascending")])
253
+ # print(f"aggs {date=} {table.to_pandas().head()=}")
456
254
  return table
457
255
 
458
256
 
459
- # def generate_custom_agg_batches_from_tables(config: PolygonConfig) -> pa.RecordBatch:
257
+ # def generate_custom_agg_batches_from_tables(config: PolygonConfig):
460
258
  # for date, trades_table in generate_csv_trades_tables(config):
461
- # for batch in trades_to_custom_aggs(config, date, trades_table).to_batches():
462
- # yield batch
259
+ # aggs_table = trades_to_custom_aggs(config, date, trades_table)
260
+ # yield aggs_table
261
+ # del aggs_table
463
262
  # del trades_table
464
263
 
465
264
 
466
- # def generate_custom_agg_tables(config: PolygonConfig) -> pa.Table:
467
- # for date, trades_table in generate_csv_trades_tables(config):
468
- # yield trades_to_custom_aggs(config, date, trades_table)
469
-
470
-
471
- # def configure_write_custom_aggs_to_dataset(config: PolygonConfig):
472
- # def write_custom_aggs_to_dataset(args: Tuple[datetime.date, pa.Table]):
473
- # date, table = args
474
- # pa_ds.write_dataset(
475
- # trades_to_custom_aggs(config, date, table),
476
- # filesystem=config.filesystem,
477
- # base_dir=config.custom_aggs_dir,
478
- # partitioning=custom_aggs_partitioning(),
479
- # format="parquet",
480
- # existing_data_behavior="overwrite_or_ignore",
481
- # )
482
-
483
- # return write_custom_aggs_to_dataset
484
-
485
-
486
265
  def file_visitor(written_file):
487
266
  print(f"{written_file.path=}")
488
267
 
@@ -504,26 +283,25 @@ def convert_trades_to_custom_aggs(
504
283
  # generate_custom_agg_batches_from_tables(config),
505
284
  # schema=custom_aggs_schema(),
506
285
  # filesystem=config.filesystem,
507
- # base_dir=config.custom_aggs_dir,
286
+ # base_dir=config.aggs_dir,
508
287
  # partitioning=custom_aggs_partitioning(),
509
288
  # format="parquet",
510
289
  # existing_data_behavior="overwrite_or_ignore",
511
- # max_open_files = MAX_FILES_OPEN,
512
- # min_rows_per_group = MIN_ROWS_PER_GROUP,
290
+ # # max_open_files = MAX_FILES_OPEN,
291
+ # # min_rows_per_group = MIN_ROWS_PER_GROUP,
513
292
  # )
514
293
 
515
294
  for date, trades_table in generate_csv_trades_tables(config):
516
295
  aggs_table = trades_to_custom_aggs(config, date, trades_table)
517
296
  pa_ds.write_dataset(
518
297
  aggs_table,
519
- # schema=custom_aggs_schema(),
520
298
  filesystem=config.filesystem,
521
299
  base_dir=config.aggs_dir,
522
300
  partitioning=custom_aggs_partitioning(),
523
301
  format="parquet",
524
302
  existing_data_behavior="overwrite_or_ignore",
525
303
  file_visitor=file_visitor,
526
- # max_open_files=MAX_FILES_OPEN,
304
+ # max_open_files=10,
527
305
  # min_rows_per_group=MIN_ROWS_PER_GROUP,
528
306
  )
529
307
  del aggs_table
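
trades_to_custom_aggs above appends a partition-key column derived from each ticker via to_partition_key. That helper lives in the package's config module and its bucketing rule is not shown in this diff, so the sketch below substitutes a toy first-letter rule purely for illustration:

    import pyarrow as pa

    def toy_partition_key(ticker: str) -> str:
        # Stand-in for to_partition_key: bucket by the ticker's first character.
        return ticker[:1].upper() if ticker else "_"

    table = pa.table({"ticker": ["AAPL", "MSFT", "BRK.A"], "volume": [100, 200, 300]})
    table = table.append_column(
        "part",  # stand-in for PARTITION_COLUMN_NAME
        pa.array([toy_partition_key(t) for t in table.column("ticker").to_pylist()]),
    )
    print(table)
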
@@ -559,386 +337,199 @@ def convert_trades_to_custom_aggs(
559
337
  # return mfi
560
338
 
561
339
 
562
- # def generate_custom_agg_tables(
563
- # config: PolygonConfig,
564
- # ) -> Tuple[pa.Schema, Iterator[pa.Table]]:
565
- # """zipline does bundle ingestion one ticker at a time."""
566
-
567
- # # Polygon Aggregate flatfile timestamps are in nanoseconds (like trades), not milliseconds as the docs say.
568
- # # I make the timestamp timezone-aware because that's how Unix timestamps work and it may help avoid mistakes.
569
- # timestamp_type = pa.timestamp("ns", tz="UTC")
570
-
571
- # # But we can't use the timestamp type in the schema here because it's not supported by the CSV reader.
572
- # # So we'll use int64 and cast it after reading the CSV file.
573
- # # https://github.com/apache/arrow/issues/44030
574
-
575
- # # strptime(3) (used by CSV reader for timestamps in ConvertOptions.timestamp_parsers) supports Unix timestamps (%s) and milliseconds (%f) but not nanoseconds.
576
- # # https://www.geeksforgeeks.org/how-to-use-strptime-with-milliseconds-in-python/
577
- # # Actually that's the wrong strptime (it's Python's). C++ strptime(3) doesn't even support %f.
578
- # # https://github.com/apache/arrow/issues/39839#issuecomment-1915981816
579
- # # Also I don't think you can use those in a format string without a separator.
580
-
581
- # # Polygon price scale is 4 decimal places (i.e. hundredths of a penny), but we'll use 10 because we have precision to spare.
582
- # # price_type = pa.decimal128(precision=38, scale=10)
583
- # # 64bit float a little overkill but avoids any plausible truncation error.
584
- # price_type = pa.float64()
585
-
586
- # custom_aggs_schema = pa.schema(
587
- # [
588
- # pa.field("ticker", pa.string(), nullable=False),
589
- # pa.field("volume", pa.int64(), nullable=False),
590
- # pa.field("open", price_type, nullable=False),
591
- # pa.field("close", price_type, nullable=False),
592
- # pa.field("high", price_type, nullable=False),
593
- # pa.field("low", price_type, nullable=False),
594
- # pa.field("window_start", timestamp_type, nullable=False),
595
- # pa.field("transactions", pa.int64(), nullable=False),
596
- # pa.field(PARTITION_COLUMN_NAME, pa.string(), nullable=False),
597
- # ]
598
- # )
340
+ def get_by_ticker_aggs_dates(config: PolygonConfig) -> set[datetime.date]:
341
+ file_info = config.filesystem.get_file_info(config.by_ticker_aggs_arrow_dir)
342
+ if file_info.type == pa_fs.FileType.NotFound:
343
+ return set()
344
+ by_ticker_aggs_ds = pa_ds.dataset(
345
+ config.by_ticker_aggs_arrow_dir,
346
+ format="parquet",
347
+ schema=custom_aggs_schema(),
348
+ partitioning=custom_aggs_partitioning(),
349
+ )
350
+ return set(
351
+ [
352
+ pa_ds.get_partition_keys(fragment.partition_expression).get("date")
353
+ for fragment in by_ticker_aggs_ds.get_fragments()
354
+ ]
355
+ )
599
356
 
600
- # # TODO: Use generator like os.walk for paths.
601
- # return (
602
- # custom_aggs_schema,
603
- # generate_tables_from_custom_aggs(
604
- # paths=config.csv_paths(),
605
- # schema=custom_aggs_schema,
606
- # start_timestamp=config.start_timestamp,
607
- # limit_timestamp=config.end_timestamp + pd.to_timedelta(1, unit="day"),
608
- # ),
609
- # )
610
357
 
611
- # def get_custom_aggs_dates(config: PolygonConfig) -> set[datetime.date]:
612
- # file_info = config.filesystem.get_file_info(config.custom_aggs_dir)
613
- # if file_info.type == pa_fs.FileType.NotFound:
614
- # return set()
615
- # aggs_ds = pa_ds.dataset(
616
- # config.custom_aggs_dir,
617
- # format="parquet",
618
- # schema=custom_aggs_schema(),
619
- # partitioning=custom_aggs_partitioning(),
620
- # )
621
- # return set(
622
- # [
623
- # pa_ds.get_partition_keys(fragment.partition_expression).get("date")
624
- # for fragment in aggs_ds.get_fragments()
625
- # ]
626
- # )
358
+ def batches_for_date(aggs_ds: pa_ds.Dataset, date: pd.Timestamp):
359
+ date_filter_expr = (
360
+ (pa_compute.field("year") == date.year)
361
+ & (pa_compute.field("month") == date.month)
362
+ & (pa_compute.field("date") == date.date())
363
+ )
364
+ print(f"table for {date=}")
365
+ # return aggs_ds.scanner(filter=date_filter_expr).to_batches()
366
+ table = aggs_ds.scanner(filter=date_filter_expr).to_table()
367
+ table = table.sort_by([("part", "ascending"), ("ticker", "ascending"), ("window_start", "ascending"), ])
368
+ return table.to_batches()
627
369
 
628
370
 
629
- def generate_batches_from_custom_aggs_ds(
630
- aggs_ds: pa_ds.Dataset, schedule: pd.DatetimeIndex
631
- ) -> Iterator[pa.RecordBatch]:
371
+ def generate_batches_for_schedule(config, aggs_ds):
372
+ schedule = config.calendar.trading_index(
373
+ start=config.start_timestamp, end=config.end_timestamp, period="1D"
374
+ )
632
375
  for timestamp in schedule:
633
- date = timestamp.to_pydatetime().date()
634
- date_filter_expr = (
635
- (pa_compute.field("year") == date.year)
636
- & (pa_compute.field("month") == date.month)
637
- & (pa_compute.field("date") == date)
638
- )
639
- for batch in aggs_ds.to_batches(filter=date_filter_expr):
640
- # TODO: Check that these rows are within range for this file's date (not just the whole session).
641
- # And if we're doing that (figuring date for each file), we can just skip reading the file.
642
- # Might able to do a single comparison using compute.days_between.
643
- # https://arrow.apache.org/docs/python/generated/pyarrow.compute.days_between.html
644
- batch = batch.append_column(
645
- PARTITION_COLUMN_NAME,
646
- pa.array(
647
- [
648
- to_partition_key(ticker)
649
- for ticker in batch.column("ticker").to_pylist()
650
- ]
651
- ),
652
- )
653
- yield batch
376
+ # print(f"{timestamp=}")
377
+ yield from batches_for_date(aggs_ds=aggs_ds, date=timestamp)
378
+
379
+
380
+ # def scatter_custom_aggs_to_by_ticker(
381
+ # config: PolygonConfig,
382
+ # overwrite: bool = False,
383
+ # ) -> str:
384
+ # lock = FileLock(config.lock_file_path, blocking=False)
385
+ # with lock:
386
+ # if not lock.is_locked:
387
+ # raise IOError("Failed to acquire lock for updating custom assets.")
388
+ # with open(config.by_ticker_dates_path, "a") as f:
389
+ # f.write("I have a bad feeling about this.")
390
+ # by_ticker_aggs_arrow_dir = scatter_custom_aggs_to_by_ticker_(config, overwrite)
391
+
392
+ # print(f"Scattered custom aggregates by ticker to {by_ticker_aggs_arrow_dir=}")
393
+ # return by_ticker_aggs_arrow_dir
394
+
395
+
396
+ def filter_by_date(config: PolygonConfig) -> pa_compute.Expression:
397
+ start_date = config.start_timestamp.tz_localize(config.calendar.tz.key).date()
398
+ limit_date = (
399
+ (config.end_timestamp + pd.Timedelta(days=1))
400
+ .tz_localize(config.calendar.tz.key)
401
+ .date()
402
+ )
403
+ return (pa_compute.field("date") >= start_date) & (
404
+ pa_compute.field("date") <= limit_date
405
+ )
654
406
 
655
407
 
656
- def scatter_custom_aggs_to_by_ticker(
408
+ # def generate_batches_with_partition(
409
+ # config: PolygonConfig,
410
+ # aggs_ds: pa_ds.Dataset,
411
+ # ) -> Iterator[pa.Table]:
412
+ # for fragment in aggs_ds.sort_by("date").get_fragments(
413
+ # filter=filter_by_date(config),
414
+ # ):
415
+ # for batch in fragment.to_batches():
416
+ # # batch = batch.append_column(
417
+ # # PARTITION_COLUMN_NAME,
418
+ # # pa.array(
419
+ # # [
420
+ # # to_partition_key(ticker)
421
+ # # for ticker in batch.column("ticker").to_pylist()
422
+ # # ]
423
+ # # ),
424
+ # # )
425
+ # yield batch.sort_by(
426
+ # [("ticker", "ascending"), ("window_start", "ascending")]
427
+ # )
428
+ # del batch
429
+ # del fragment
430
+
431
+
432
+ def generate_batches_with_partition(
657
433
  config: PolygonConfig,
658
- overwrite: bool = False,
659
- ) -> str:
660
- file_info = config.filesystem.get_file_info(config.custom_aggs_dir)
661
- if file_info.type == pa_fs.FileType.NotFound:
662
- raise FileNotFoundError(f"{config.custom_aggs_dir=} not found.")
434
+ aggs_ds: pa_ds.Dataset,
435
+ ) -> Iterator[pa.Table]:
436
+ for fragment in (
437
+ aggs_ds.filter(filter_by_date(config))
438
+ .sort_by([(PARTITION_COLUMN_NAME, "ascending"), ("date", "ascending")])
439
+ .get_fragments()
440
+ ):
441
+ for batch in fragment.to_batches():
442
+ yield batch.sort_by(
443
+ [("ticker", "ascending"), ("window_start", "ascending")]
444
+ )
445
+ del batch
446
+ del fragment
663
447
 
664
- by_ticker_aggs_arrow_dir = config.by_ticker_aggs_arrow_dir
665
- if os.path.exists(by_ticker_aggs_arrow_dir):
666
- if overwrite:
667
- print(f"Removing {by_ticker_aggs_arrow_dir=}")
668
- shutil.rmtree(by_ticker_aggs_arrow_dir)
669
- else:
670
- print(f"Found existing {by_ticker_aggs_arrow_dir=}")
671
- return by_ticker_aggs_arrow_dir
672
448
 
449
+ def scatter_custom_aggs_to_by_ticker(config, overwrite=False) -> str:
673
450
  aggs_ds = pa_ds.dataset(
674
- config.custom_aggs_dir,
451
+ config.aggs_dir,
675
452
  format="parquet",
676
453
  schema=custom_aggs_schema(),
677
454
  partitioning=custom_aggs_partitioning(),
678
455
  )
679
- schedule = config.calendar.trading_index(
680
- start=config.start_timestamp, end=config.end_timestamp, period="1D"
681
- )
682
- assert type(schedule) is pd.DatetimeIndex
456
+ by_ticker_schema = aggs_ds.schema
683
457
  partitioning = pa_ds.partitioning(
684
- pa.schema([(PARTITION_COLUMN_NAME, pa.string())]), flavor="hive"
458
+ pa.schema([(PARTITION_COLUMN_NAME, pa.string())]),
459
+ flavor="hive",
685
460
  )
686
- schema = aggs_ds.schema
687
- schema = schema.append(pa.field(PARTITION_COLUMN_NAME, pa.string(), nullable=False))
688
-
461
+ by_ticker_aggs_arrow_dir = config.by_ticker_aggs_arrow_dir
462
+ print(f"Scattering custom aggregates by ticker to {by_ticker_aggs_arrow_dir=}")
689
463
  pa_ds.write_dataset(
690
- generate_batches_from_custom_aggs_ds(aggs_ds, schedule),
691
- schema=schema,
464
+ # generate_batches_with_partition(config=config, aggs_ds=aggs_ds),
465
+ generate_batches_for_schedule(config=config, aggs_ds=aggs_ds),
466
+ schema=by_ticker_schema,
692
467
  base_dir=by_ticker_aggs_arrow_dir,
693
468
  partitioning=partitioning,
694
469
  format="parquet",
695
470
  existing_data_behavior="overwrite_or_ignore",
696
471
  )
697
- print(f"Scattered custom aggregates by ticker to {by_ticker_aggs_arrow_dir=}")
472
+ print(f"Scattered aggregates by ticker to {by_ticker_aggs_arrow_dir=}")
698
473
  return by_ticker_aggs_arrow_dir
699
474
 
700
475
 
701
- def calculate_mfi(typical_price: pd.Series, money_flow: pd.Series, period: int):
702
- mf_sign = np.where(typical_price > np.roll(typical_price, shift=1), 1, -1)
703
- signed_mf = money_flow * mf_sign
704
-
705
- # Calculate gain and loss using vectorized operations
706
- positive_mf = np.maximum(signed_mf, 0)
707
- negative_mf = np.maximum(-signed_mf, 0)
708
-
709
- mf_avg_gain = (
710
- np.convolve(positive_mf, np.ones(period), mode="full")[: len(positive_mf)]
711
- / period
712
- )
713
- mf_avg_loss = (
714
- np.convolve(negative_mf, np.ones(period), mode="full")[: len(negative_mf)]
715
- / period
716
- )
717
-
718
- epsilon = 1e-10 # Small epsilon value to avoid division by zero
719
- mfi = 100 - (100 / (1 + mf_avg_gain / (mf_avg_loss + epsilon)))
720
- return mfi
721
-
722
-
723
- # https://github.com/twopirllc/pandas-ta/blob/main/pandas_ta/momentum/stoch.py
724
- # https://github.com/twopirllc/pandas-ta/blob/development/pandas_ta/momentum/stoch.py
725
- # `k` vs `fast_k` arg names.
726
- # https://github.com/twopirllc/pandas-ta/issues/726
727
- # Results affected by values outside range
728
- # https://github.com/twopirllc/pandas-ta/issues/535
729
-
730
-
731
- def calculate_stoch(
732
- high: pd.Series,
733
- low: pd.Series,
734
- close: pd.Series,
735
- k: int = 14,
736
- d: int = 3,
737
- smooth_k: int = 3,
738
- mamode: str = "sma",
739
- ):
740
- """Indicator: Stochastic Oscillator (STOCH)"""
741
- lowest_low = low.rolling(k).min()
742
- highest_high = high.rolling(k).max()
476
+ # def scatter_custom_aggs_to_by_ticker(config, overwrite=False) -> str:
477
+ # file_info = config.filesystem.get_file_info(config.aggs_dir)
478
+ # if file_info.type == pa_fs.FileType.NotFound:
479
+ # raise FileNotFoundError(f"{config.aggs_dir=} not found.")
743
480
 
744
- stoch = 100 * (close - lowest_low)
745
- stoch /= ta.utils.non_zero_range(highest_high, lowest_low)
481
+ # by_ticker_aggs_arrow_dir = config.by_ticker_aggs_arrow_dir
482
+ # if os.path.exists(by_ticker_aggs_arrow_dir):
483
+ # if overwrite:
484
+ # print(f"Removing {by_ticker_aggs_arrow_dir=}")
485
+ # shutil.rmtree(by_ticker_aggs_arrow_dir)
746
486
 
747
- stoch_k = ta.overlap.ma(
748
- mamode, stoch.loc[stoch.first_valid_index() :,], length=smooth_k
749
- )
750
- stoch_d = (
751
- ta.overlap.ma(mamode, stoch_k.loc[stoch_k.first_valid_index() :,], length=d)
752
- if stoch_k is not None
753
- else None
754
- )
755
- # Histogram
756
- stoch_h = stoch_k - stoch_d if stoch_d is not None else None
487
+ # schedule = config.calendar.trading_index(
488
+ # start=config.start_timestamp, end=config.end_timestamp, period="1D"
489
+ # )
490
+ # assert type(schedule) is pd.DatetimeIndex
757
491
 
758
- return stoch_k, stoch_d, stoch_h
492
+ # print(f"Scattering custom aggregates by ticker to {by_ticker_aggs_arrow_dir=}")
493
+ # aggs_ds = pa_ds.dataset(
494
+ # config.aggs_dir,
495
+ # format="parquet",
496
+ # schema=custom_aggs_schema(),
497
+ # partitioning=custom_aggs_partitioning(),
498
+ # )
499
+ # by_ticker_partitioning = pa_ds.partitioning(
500
+ # pa.schema([(PARTITION_COLUMN_NAME, pa.string())]),
501
+ # # pa.schema(
502
+ # # [
503
+ # # (PARTITION_COLUMN_NAME, pa.string()),
504
+ # # ("year", pa.uint16()),
505
+ # # ("month", pa.uint8()),
506
+ # # ("date", pa.date32()),
507
+ # # ]
508
+ # # ),
509
+ # flavor="hive",
510
+ # )
511
+ # by_ticker_schema = custom_aggs_schema()
512
+ # by_ticker_schema = by_ticker_schema.append(
513
+ # pa.field(PARTITION_COLUMN_NAME, pa.string(), nullable=False),
514
+ # )
759
515
 
516
+ # # TODO: Collect the dates we've scattered and write a special partition key with them.
517
+ # pa_ds.write_dataset(
518
+ # generate_batches_for_schedule(schedule, aggs_ds),
519
+ # schema=by_ticker_schema,
520
+ # base_dir=by_ticker_aggs_arrow_dir,
521
+ # partitioning=by_ticker_partitioning,
522
+ # format="parquet",
523
+ # existing_data_behavior="overwrite_or_ignore",
524
+ # # max_open_files=250,
525
+ # # file_visitor=file_visitor,
526
+ # )
760
527
 
761
- def compute_per_ticker_signals(df: pd.DataFrame, period: int = 14) -> pd.DataFrame:
762
- df = df.set_index("window_start").sort_index()
763
- session_index = pd.date_range(
764
- start=df.index[0], end=df.index[-1], freq=pd.Timedelta(seconds=60)
765
- )
766
- df = df.reindex(session_index)
767
- df.index.rename("window_start", inplace=True)
768
-
769
- # df["minute_of_day"] = (df.index.hour * 60) + df.index.minute
770
- # df["day_of_week"] = df.index.day_of_week
771
-
772
- df.transactions = df.transactions.fillna(0)
773
- df.volume = df.volume.fillna(0)
774
- df.total = df.total.fillna(0)
775
- df.close = df.close.ffill()
776
- close = df.close
777
- df.vwap = df.vwap.fillna(close)
778
- df.high = df.high.fillna(close)
779
- df.low = df.low.fillna(close)
780
- df.open = df.open.fillna(close)
781
- price_open = df.open
782
- high = df.high
783
- low = df.low
784
- vwap = df.vwap
785
- # volume = df.volume
786
- total = df.total
787
- next_close = close.shift()
788
-
789
- # TODO: Odometer rollover signal. Relative difference to nearest power of 10.
790
- # Something about log10 being a whole number? When is $50 the rollover vs $100 or $10?
791
-
792
- # "True (Typical?) Price" which I think is an approximation of VWAP.
793
- # Trouble with both is that if there are no trades in a bar we get NaN.
794
- # That then means we get NaN for averages for the next period-1 bars too.
795
- # Question is whether to ffill the price for these calculations.
796
- df["TP"] = (high + low + close) / 3
797
-
798
- # Gain/loss in this bar.
799
- df["ret1bar"] = close.div(price_open).sub(1)
800
-
801
- for t in range(2, period):
802
- df[f"ret{t}bar"] = close.div(price_open.shift(t - 1)).sub(1)
803
-
804
- # Average True Range (ATR)
805
- true_range = pd.concat(
806
- [high.sub(low), high.sub(next_close).abs(), low.sub(next_close).abs()], axis=1
807
- ).max(1)
808
- # Normalized ATR (NATR) or Average of Normalized TR.
809
- # Choice of NATR operations ordering discussion: https://www.macroption.com/normalized-atr/
810
- # He doesn't talk about VWAP but I think that is a better normalizing price for a bar.
811
- # atr = true_range.ewm(span=period).mean()
812
- # df["natr_c"] = atr / close
813
- # df["antr_c"] = (true_range / close).ewm(span=period).mean()
814
- # df["natr_v"] = atr / vwap
815
- # df["antr_v"] = (true_range / vwap).ewm(span=period).mean()
816
- df["NATR"] = (true_range / vwap).ewm(span=period).mean()
817
-
818
- # True Price as HLC average VS VWAP.
819
- # VWAP is better I think but is quite different than standard CCI.
820
- # Three ways to compute CCI, all give the same value using TP.
821
- # tp = (high + low + close) / 3
822
- # df['SMA'] = ta.sma(tp, length=period)
823
- # df['sma_r'] = tp.rolling(period).mean()
824
- # df['MAD'] = ta.mad(tp, length=period)
825
- # # Series.mad deprecated. mad = (s - s.mean()).abs().mean()
826
- # df['mad_r'] = tp.rolling(period).apply(lambda x: (pd.Series(x) - pd.Series(x).mean()).abs().mean())
827
-
828
- # df['cci_r'] = (tp - df['sma_r']) / (0.015 * df['mad_r'])
829
- # df['CCI'] = (tp - df['SMA']) / (0.015 * df['MAD'])
830
- # df['cci_ta'] = ta.cci(high=high, low=low, close=close, length=period)
831
-
832
- df["taCCI"] = ta.cci(high=high, low=low, close=close, length=period)
833
-
834
- # https://gist.github.com/quantra-go-algo/1b37bfb74d69148f0dfbdb5a2c7bdb25
835
- # https://medium.com/@huzaifazahoor654/how-to-calculate-cci-in-python-a-step-by-step-guide-9a3f61698be6
836
- sma = pd.Series(ta.sma(vwap, length=period))
837
- mad = pd.Series(ta.mad(vwap, length=period))
838
- df["CCI"] = (vwap - sma) / (0.015 * mad)
839
-
840
- # df['MFI'] = calculate_mfi(high=high, low=low, close=close, volume=volume, period=period)
841
- df["MFI"] = calculate_mfi(typical_price=vwap, money_flow=total, period=period)
842
-
843
- # We use Stochastic (rather than MACD because we need a ticker independent indicator.
844
- # IOW a percentage price oscillator (PPO) rather than absolute price oscillator (APO).
845
- # https://www.alpharithms.com/moving-average-convergence-divergence-macd-031217/
846
- # We're using 14/3 currently rather than the usual 26/12 popular for MACD though.
847
- stoch_k, stoch_d, stoch_h = calculate_stoch(high, low, close, k=period)
848
- df["STOCHk"] = stoch_k
849
- df["STOCHd"] = stoch_d
850
- df["STOCHh"] = stoch_h
851
-
852
- return df
853
-
854
-
855
- def iterate_all_aggs_tables(
856
- config: PolygonConfig,
857
- valid_tickers: pa.Array,
858
- ):
859
- schedule = config.calendar.trading_index(
860
- start=config.start_timestamp, end=config.end_timestamp, period="1D"
861
- )
862
- for timestamp in schedule:
863
- date = timestamp.to_pydatetime().date()
864
- aggs_ds = pa_ds.dataset(
865
- config.custom_aggs_dir,
866
- format="parquet",
867
- schema=custom_aggs_schema(),
868
- partitioning=custom_aggs_partitioning(),
869
- )
870
- date_filter_expr = (
871
- (pa_compute.field("year") == date.year)
872
- & (pa_compute.field("month") == date.month)
873
- & (pa_compute.field("date") == date)
874
- )
875
- # print(f"{date_filter_expr=}")
876
- for fragment in aggs_ds.get_fragments(filter=date_filter_expr):
877
- session_filter = (
878
- (pa_compute.field("window_start") >= start_dt)
879
- & (pa_compute.field("window_start") < end_dt)
880
- & pa_compute.is_in(pa_compute.field("ticker"), valid_tickers)
881
- )
882
- # Sorting table doesn't seem to avoid needing to sort the df. Maybe use_threads=False on to_pandas would help?
883
- # table = fragment.to_table(filter=session_filter).sort_by([('ticker', 'ascending'), ('window_start', 'descending')])
884
- table = fragment.to_table(filter=session_filter)
885
- if table.num_rows > 0:
886
- metadata = (
887
- dict(table.schema.metadata) if table.schema.metadata else dict()
888
- )
889
- metadata["date"] = date.isoformat()
890
- table = table.replace_schema_metadata(metadata)
891
- yield table
892
-
893
-
894
- # def iterate_all_aggs_with_signals(config: PolygonConfig):
895
- # for table in iterate_all_aggs_tables(config):
896
- # df = table.to_pandas()
897
- # df = df.groupby("ticker").apply(
898
- # compute_per_ticker_signals, include_groups=False
899
- # )
900
- # yield pa.Table.from_pandas(df)
901
-
902
-
903
- def compute_signals_for_all_custom_aggs(
904
- from_config: PolygonConfig,
905
- to_config: PolygonConfig,
906
- valid_tickers: pa.Array,
907
- overwrite: bool = False,
908
- ) -> str:
909
- if overwrite:
910
- print("WARNING: overwrite not implemented/ignored.")
528
+ # return by_ticker_aggs_arrow_dir
911
529
 
912
- print(f"{to_config.custom_aggs_dir=}")
913
530
 
914
- for aggs_table in iterate_all_aggs_tables(from_config, valid_tickers):
915
- metadata = aggs_table.schema.metadata
916
- date = datetime.date.fromisoformat(metadata[b"date"].decode("utf-8"))
917
- print(f"{date=}")
918
- df = aggs_table.to_pandas()
919
- df = df.groupby("ticker").apply(
920
- compute_per_ticker_signals, include_groups=False
921
- )
922
- table = pa.Table.from_pandas(df)
923
- if table.num_rows > 0:
924
- table = table.replace_schema_metadata(metadata)
925
- table = table.append_column("date", pa.array(np.full(len(table), date)))
926
- table = table.append_column(
927
- "year", pa.array(np.full(len(table), date.year), type=pa.uint16())
928
- )
929
- table = table.append_column(
930
- "month", pa.array(np.full(len(table), date.month), type=pa.uint8())
931
- )
932
- table = table.sort_by(
933
- [("ticker", "ascending"), ("window_start", "ascending")]
934
- )
935
- pa_ds.write_dataset(
936
- table,
937
- filesystem=to_config.filesystem,
938
- base_dir=to_config.custom_aggs_dir,
939
- partitioning=custom_aggs_partitioning(),
940
- format="parquet",
941
- existing_data_behavior="overwrite_or_ignore",
942
- file_visitor=file_visitor,
943
- )
944
- return to_config.custom_aggs_dir
531
+ # def generate_tables_from_custom_aggs_ds(
532
+ # aggs_ds: pa_ds.Dataset, schedule: pd.DatetimeIndex
533
+ # ):
534
+ # for timestamp in schedule:
535
+ # yield table_for_date(aggs_ds=aggs_ds, date=timestamp.to_pydatetime().date())
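
The rewritten scatter_custom_aggs_to_by_ticker above writes the aggregates out as a Hive-partitioned Parquet dataset keyed by the partition column. A self-contained sketch of that write-and-read-back round trip, with invented rows and a temporary directory standing in for the bundle's directories:

    import tempfile
    import pyarrow as pa
    import pyarrow.dataset as pa_ds

    table = pa.table(
        {
            "part": ["A", "A", "M"],
            "ticker": ["AAPL", "AMZN", "MSFT"],
            "close": [190.1, 155.2, 420.3],
        }
    )
    partitioning = pa_ds.partitioning(pa.schema([("part", pa.string())]), flavor="hive")

    with tempfile.TemporaryDirectory() as base_dir:
        pa_ds.write_dataset(
            table,
            base_dir=base_dir,
            partitioning=partitioning,
            format="parquet",
            existing_data_behavior="overwrite_or_ignore",
        )
        # Reading back with the same partitioning restores the "part" column.
        ds = pa_ds.dataset(base_dir, format="parquet", partitioning=partitioning)
        print(ds.to_table(filter=pa_ds.field("part") == "A").to_pydict())

Declaring the same partitioning when reopening the dataset is what restores the partition column, which is dropped from the data files and encoded in the directory names, for downstream filtering.
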