zipline_polygon_bundle 0.1.7__py3-none-any.whl → 0.1.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -7,7 +7,14 @@ from .bundle import (
 
 from .config import PolygonConfig
 from .concat_all_aggs import concat_all_aggs_from_csv, generate_csv_agg_tables
-from .adjustments import load_splits, load_dividends
+from .adjustments import load_splits, load_dividends, load_conditions
+from .trades import trades_schema, trades_dataset, cast_trades, date_to_path
+from .trades import custom_aggs_partitioning, custom_aggs_schema, trades_to_custom_aggs, convert_all_to_custom_aggs
+from .trades import get_custom_aggs_dates, generate_csv_trades_tables, compute_signals_for_all_custom_aggs
+from .quotes import quotes_schema, quotes_dataset, cast_quotes
+# from .tickers_and_names import load_all_tickers, merge_tickers, ticker_names_from_merged_tickers, get_ticker_universe
+from .tickers_and_names import PolygonAssets, get_ticker_universe
+
 
 __all__ = [
     "register_polygon_equities_bundle",
@@ -19,4 +26,24 @@ __all__ = [
     "generate_csv_agg_tables",
     "load_splits",
     "load_dividends",
+    "load_conditions",
+    "trades_schema",
+    "trades_dataset",
+    "cast_trades",
+    "date_to_path",
+    "get_custom_aggs_dates",
+    "generate_csv_trades_tables",
+    "custom_aggs_partitioning",
+    "custom_aggs_schema",
+    "trades_to_custom_aggs",
+    "convert_all_to_custom_aggs",
+    "compute_signals_for_all_custom_aggs",
+    "quotes_schema",
+    "quotes_dataset",
+    "cast_quotes",
+    # "load_all_tickers",
+    # "merge_tickers",
+    # "ticker_names_from_merged_tickers",
+    "PolygonAssets",
+    "get_ticker_universe",
 ]
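
Taken together, the `__init__.py` hunks widen the package's public surface considerably. A quick import smoke test, using only names that appear in the new `__all__` above:

```python
# All of these names are re-exported by zipline_polygon_bundle 0.1.8 per the new __all__.
from zipline_polygon_bundle import (
    load_conditions,
    trades_schema,
    cast_trades,
    quotes_schema,
    cast_quotes,
    PolygonAssets,
    get_ticker_universe,
)
```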
@@ -153,3 +153,37 @@ def load_dividends(
     return dividends[
         ["sid", "ex_date", "declared_date", "record_date", "pay_date", "amount"]
     ]
+
+
+def load_conditions(config: PolygonConfig) -> pd.DataFrame:
+    # The API doesn't use dates for the condition codes, but this is a way to provide control over caching.
+    # The main thing is to get the current conditions list, but we don't want to call the API more than once a day.
+    conditions_path = config.api_cache_path(
+        start_date=config.start_timestamp.date(), end_date=config.end_timestamp.date(), filename="conditions"
+    )
+    expected_conditions_count = 100
+    if not os.path.exists(conditions_path):
+        client = polygon.RESTClient(api_key=config.api_key)
+        conditions_response = client.list_conditions(
+            limit=1000,
+        )
+        if isinstance(conditions_response, HTTPResponse):
+            raise ValueError(f"Polygon list_conditions bad HTTPResponse: {conditions_response}")
+        conditions = pd.DataFrame(conditions_response)
+        print(f"Got {len(conditions)=} from Polygon list_conditions.")
+        os.makedirs(os.path.dirname(conditions_path), exist_ok=True)
+        conditions.to_parquet(conditions_path)
+        if len(conditions) < expected_conditions_count:
+            logging.warning(
+                f"Only got {len(conditions)=} from Polygon list_conditions (expected {expected_conditions_count=})."
+            )
+    # We always load from the file to avoid any chance of weird errors.
+    if os.path.exists(conditions_path):
+        conditions = pd.read_parquet(conditions_path)
+        print(f"Loaded {len(conditions)=} from {conditions_path}")
+        if len(conditions) < expected_conditions_count:
+            logging.warning(
+                f"Only got {len(conditions)=} from cached conditions (expected {expected_conditions_count=})."
+            )
+        return conditions
+    raise ValueError(f"Failed to load conditions from {conditions_path}")
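
A minimal usage sketch for the new `load_conditions`. The `POLYGON_API_KEY` and `POLYGON_DATA_DIR` environment variables come from `config.py`; per the comment in the code, the date range only controls the cache key, not the API call:

```python
import os

from zipline_polygon_bundle import load_conditions
from zipline_polygon_bundle.config import PolygonConfig

config = PolygonConfig(
    environ=os.environ,  # expects POLYGON_API_KEY (and optionally POLYGON_DATA_DIR)
    calendar_name="XNYS",
    start_date="2023-01-01",
    end_date="2023-12-31",
)
# First call hits the Polygon REST API and writes the parquet cache;
# later calls for the same date range read the cached file instead.
conditions = load_conditions(config)
print(conditions.head())
```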
@@ -1,18 +1,21 @@
+import os
 from zipline.data.bundles import register
 from zipline.data.resample import minute_frame_to_session_frame
 
+from exchange_calendars.calendar_helpers import parse_date
+from zipline.utils.calendar_utils import get_calendar
+
 from .config import PolygonConfig
 from .concat_all_aggs import concat_all_aggs_from_csv, generate_csv_agg_tables
 from .adjustments import load_splits, load_dividends
 
 import pyarrow
 import pyarrow.compute
+import pyarrow.dataset
 
 import pandas as pd
 import logging
 
-import concurrent.futures
-
 
 # TODO: Change warnings to be relative to number of days in the range.
 
@@ -175,8 +178,8 @@ def polygon_equities_bundle_day(
     daily_bar_writer,
     adjustment_writer,
     calendar,
-    start_session,
-    end_session,
+    start_date,
+    end_date,
     cache,
     show_progress,
     output_dir,
@@ -184,8 +187,8 @@ def polygon_equities_bundle_day(
     config = PolygonConfig(
         environ=environ,
         calendar_name=calendar.name,
-        start_session=start_session,
-        end_session=end_session,
+        start_date=start_date,
+        end_date=end_date,
         agg_time="day",
     )
 
@@ -219,7 +222,7 @@ def polygon_equities_bundle_day(
     daily_bar_writer.write(
         process_day_aggregates(
             table=table,
-            sessions=calendar.sessions_in_range(start_session, end_session),
+            sessions=calendar.sessions_in_range(start_date, end_date),
             metadata=metadata,
             calendar=calendar,
             symbol_to_sid=symbol_to_sid,
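
The renamed arguments are still plain dates; `sessions_in_range` turns them into the trading sessions the writers expect. A quick check of what that call returns, using only `get_calendar` as already imported in this file:

```python
from zipline.utils.calendar_utils import get_calendar

calendar = get_calendar("XNYS")
# Four NYSE trading sessions: Tue 2023-01-03 through Fri 2023-01-06.
sessions = calendar.sessions_in_range("2023-01-03", "2023-01-06")
print(len(sessions), sessions[0].date(), sessions[-1].date())
```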
@@ -400,8 +403,8 @@ def polygon_equities_bundle_minute(
     daily_bar_writer,
     adjustment_writer,
     calendar,
-    start_session,
-    end_session,
+    start_date,
+    end_date,
     cache,
     show_progress,
     output_dir,
@@ -409,8 +412,8 @@ def polygon_equities_bundle_minute(
     config = PolygonConfig(
         environ=environ,
         calendar_name=calendar.name,
-        start_session=start_session,
-        end_session=end_session,
+        start_date=start_date,
+        end_date=end_date,
         agg_time="minute",
     )
 
@@ -444,8 +447,8 @@ def polygon_equities_bundle_minute(
     daily_bar_writer.write(
         process_minute_aggregates(
             fragments=aggregates.get_fragments(),
-            sessions=calendar.sessions_in_range(start_session, end_session),
-            minutes=calendar.sessions_minutes(start_session, end_session),
+            sessions=calendar.sessions_in_range(start_date, end_date),
+            minutes=calendar.sessions_minutes(start_date, end_date),
             metadata=metadata,
             calendar=calendar,
             symbol_to_sid=symbol_to_sid,
@@ -459,8 +462,8 @@ def polygon_equities_bundle_minute(
     minute_bar_writer.write(
         process_minute_aggregates(
             fragments=aggregates.get_fragments(),
-            sessions=calendar.sessions_in_range(start_session, end_session),
-            minutes=calendar.sessions_minutes(start_session, end_session),
+            sessions=calendar.sessions_in_range(start_date, end_date),
+            minutes=calendar.sessions_minutes(start_date, end_date),
             metadata=metadata,
             calendar=calendar,
             symbol_to_sid=symbol_to_sid,
@@ -485,8 +488,8 @@ def polygon_equities_bundle_minute(
 
 def register_polygon_equities_bundle(
     bundlename,
-    start_session=None,
-    end_session=None,
+    start_date=None,
+    end_date=None,
     calendar_name="XNYS",
     agg_time="day",
     # ticker_list=None,
@@ -495,6 +498,25 @@ def register_polygon_equities_bundle(
 ):
     if agg_time not in ["day", "minute"]:
         raise ValueError(f"agg_time must be 'day' or 'minute', not '{agg_time}'")
+    # We need to know the start and end dates before the bundle is registered
+    # because even though we only need them for ingest, the metadata in the
+    # writer is initialized and written before our ingest function is called.
+    if start_date is None or end_date is None:
+        config = PolygonConfig(
+            environ=os.environ,
+            calendar_name=calendar_name,
+            start_date=start_date,
+            end_date=end_date,
+            agg_time=agg_time,
+        )
+        first_aggs_date, last_aggs_date = config.find_first_and_last_aggs()
+        if start_date is None:
+            start_date = first_aggs_date
+        if end_date is None:
+            end_date = last_aggs_date
+
+    calendar = get_calendar(calendar_name)
+
     register(
         bundlename,
         (
@@ -502,8 +524,8 @@ def register_polygon_equities_bundle(
             if agg_time == "minute"
             else polygon_equities_bundle_day
         ),
-        start_session=start_session,
-        end_session=end_session,
+        start_session=parse_date(start_date, calendar=calendar),
+        end_session=parse_date(end_date, calendar=calendar),
         calendar_name=calendar_name,
         # minutes_per_day=390,
         # create_writers=True,
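
`register` still receives `start_session`/`end_session`, but they are now produced by `parse_date` from `exchange_calendars`, which normalizes a date-like value to a midnight `Timestamp` and validates it against the calendar's bounds:

```python
from exchange_calendars.calendar_helpers import parse_date
from zipline.utils.calendar_utils import get_calendar

calendar = get_calendar("XNYS")
# -> 2023-01-03 00:00:00 (a tz-naive midnight Timestamp)
print(parse_date("2023-01-03", calendar=calendar))
```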
@@ -517,12 +539,12 @@ def register_polygon_equities_bundle(
 # config = PolygonConfig(
 #     environ=os.environ,
 #     calendar_name="XNYS",
-#     # start_session="2003-10-01",
-#     # start_session="2018-01-01",
-#     start_session="2023-01-01",
-#     # end_session="2023-01-12",
-#     end_session="2023-12-31",
-#     # end_session="2024-06-30",
+#     # start_date="2003-10-01",
+#     # start_date="2018-01-01",
+#     start_date="2023-01-01",
+#     # end_date="2023-01-12",
+#     end_date="2023-12-31",
+#     # end_date="2024-06-30",
 # )
 # splits = load_polygon_splits(config)
 # splits.info()
@@ -1,7 +1,7 @@
 from .config import PolygonConfig
 
 import shutil
-from typing import Iterator, Tuple
+from typing import Iterator, Tuple, List, Union
 
 import argparse
 import glob
@@ -10,6 +10,7 @@ import os
 import pyarrow as pa
 from pyarrow import dataset as pa_ds
 from pyarrow import csv as pa_csv
+from pyarrow import compute as pa_compute
 
 import pandas as pd
 
@@ -34,7 +35,7 @@ def to_partition_key(s: str) -> str:
 
 
 def generate_tables_from_csv_files(
-    paths: list,
+    paths: Iterator[Union[str, os.PathLike]],
     schema: pa.Schema,
     start_timestamp: pd.Timestamp,
     limit_timestamp: pd.Timestamp,
@@ -57,7 +58,7 @@ def generate_tables_from_csv_files(
             quoted_strings_can_be_null=False,
         )
 
-        table = pa.csv.read_csv(path, convert_options=convert_options)
+        table = pa_csv.read_csv(path, convert_options=convert_options)
         tables_read_count += 1
         table = table.set_column(
             table.column_names.index("window_start"),
@@ -75,10 +76,10 @@ def generate_tables_from_csv_files(
             ),
         )
         expr = (
-            pa.compute.field("window_start")
+            pa_compute.field("window_start")
             >= pa.scalar(start_timestamp, type=schema.field("window_start").type)
         ) & (
-            pa.compute.field("window_start")
+            pa_compute.field("window_start")
             < pa.scalar(
                 limit_timestamp,
                 type=schema.field("window_start").type,
@@ -101,22 +102,8 @@ def generate_tables_from_csv_files(
 
 def generate_csv_agg_tables(
     config: PolygonConfig,
-) -> Tuple[list[str], pa.Schema, Iterator[pa.Table]]:
+) -> Tuple[pa.Schema, Iterator[pa.Table]]:
     """zipline does bundle ingestion one ticker at a time."""
-    # We sort by path because they have the year and month in the dir names and the date in the filename.
-    paths = sorted(
-        list(
-            glob.glob(
-                os.path.join(config.aggs_dir, config.csv_paths_pattern),
-                recursive="**" in config.csv_paths_pattern,
-            )
-        )
-    )
-
-    print(f"{len(paths)=}")
-    if len(paths) > 0:
-        print(f"{paths[0]=}")
-        print(f"{paths[-1]=}")
 
     # Polygon Aggregate flatfile timestamps are in nanoseconds (like trades), not milliseconds as the docs say.
     # I make the timestamp timezone-aware because that's how Unix timestamps work and it may help avoid mistakes.
@@ -154,11 +141,11 @@ def generate_csv_agg_tables(
         pa.field(PARTITION_COLUMN_NAME, pa.string(), nullable=False)
     )
 
+    # TODO: Use a generator like os.walk for paths.
     return (
-        paths,
         polygon_aggs_schema,
         generate_tables_from_csv_files(
-            paths=paths,
+            paths=config.csv_paths(),
            schema=polygon_aggs_schema,
             start_timestamp=config.start_timestamp,
             limit_timestamp=config.end_timestamp + pd.to_timedelta(1, unit="day"),
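
Callers now unpack two values instead of three, and the tables are generated lazily from `config.csv_paths()` rather than an eagerly globbed path list. A consumption sketch, assuming a config whose `aggs_dir` contains downloaded flat files:

```python
import os

from zipline_polygon_bundle import generate_csv_agg_tables
from zipline_polygon_bundle.config import PolygonConfig

config = PolygonConfig(
    environ=os.environ, calendar_name="XNYS", start_date="2023-01-01", end_date="2023-12-31"
)
schema, tables = generate_csv_agg_tables(config)  # was: paths, schema, tables = ...
row_count = 0
for table in tables:  # one pyarrow.Table per flat file, in lexical path order
    row_count += table.num_rows
print(f"{row_count=}")
```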
@@ -176,11 +163,9 @@ def concat_all_aggs_from_csv(
     config: PolygonConfig,
     overwrite: bool = False,
 ) -> str:
-    paths, schema, tables = generate_csv_agg_tables(config)
+    schema, tables = generate_csv_agg_tables(config)
 
-    if len(paths) < 1:
-        raise ValueError(f"No Polygon CSV flat files found in {config.aggs_dir=}")
-    by_ticker_aggs_arrow_dir = config.by_ticker_aggs_arrow_dir(paths[0], paths[-1])
+    by_ticker_aggs_arrow_dir = config.by_ticker_aggs_arrow_dir
     if os.path.exists(by_ticker_aggs_arrow_dir):
         if overwrite:
             print(f"Removing {by_ticker_aggs_arrow_dir=}")
@@ -212,10 +197,10 @@ if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("--calendar_name", default="XNYS")
 
-    parser.add_argument("--start_session", default="2014-06-16")
-    parser.add_argument("--end_session", default="2024-09-06")
-    # parser.add_argument("--start_session", default="2020-01-01")
-    # parser.add_argument("--end_session", default="2020-12-31")
+    parser.add_argument("--start_date", default="2014-06-16")
+    parser.add_argument("--end_date", default="2024-09-06")
+    # parser.add_argument("--start_date", default="2020-01-01")
+    # parser.add_argument("--end_date", default="2020-12-31")
 
     parser.add_argument("--agg_time", default="day")
 
@@ -235,8 +220,8 @@ if __name__ == "__main__":
     config = PolygonConfig(
         environ=os.environ,
         calendar_name=args.calendar_name,
-        start_session=args.start_session,
-        end_session=args.end_session,
+        start_date=args.start_date,
+        end_date=args.end_date,
         agg_time=args.agg_time,
     )
 
@@ -138,10 +138,10 @@ if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("--calendar_name", default="XNYS")
 
-    parser.add_argument("--start_session", default="2014-06-16")
-    parser.add_argument("--end_session", default="2024-09-06")
-    # parser.add_argument("--start_session", default="2020-10-07")
-    # parser.add_argument("--end_session", default="2020-10-15")
+    parser.add_argument("--start_date", default="2014-06-16")
+    parser.add_argument("--end_date", default="2024-09-06")
+    # parser.add_argument("--start_date", default="2020-10-07")
+    # parser.add_argument("--end_date", default="2020-10-15")
     # parser.add_argument("--aggs_pattern", default="2020/10/**/*.csv.gz")
     parser.add_argument("--aggs_pattern", default="**/*.csv.gz")
 
@@ -163,8 +163,8 @@ if __name__ == "__main__":
     config = PolygonConfig(
         environ=os.environ,
         calendar_name=args.calendar_name,
-        start_session=args.start_session,
-        end_session=args.end_session,
+        start_date=args.start_date,
+        end_date=args.end_date,
     )
 
     concat_all_aggs_from_csv(
@@ -1,8 +1,13 @@
 from exchange_calendars.calendar_helpers import Date, parse_date, parse_timestamp
 from zipline.utils.calendar_utils import get_calendar
 
-import os
+from typing import Iterator, Tuple, List
+
 import pandas as pd
+from pyarrow.fs import LocalFileSystem
+import os
+import re
+import fnmatch
 
 
 class PolygonConfig:
@@ -10,33 +15,35 @@ class PolygonConfig:
         self,
         environ: dict,
         calendar_name: str,
-        start_session: Date,
-        end_session: Date,
+        start_date: Date,
+        end_date: Date,
         agg_time: str = "day",
+        custom_aggs_format: str = "{config.agg_timedelta.seconds}sec_aggs",
     ):
-        if agg_time not in ["minute", "day"]:
-            raise ValueError(f"agg_time must be 'minute' or 'day', got '{agg_time}'")
         self.calendar_name = calendar_name
+        self.start_date = start_date
+        self.end_date = end_date
         self.start_timestamp = (
-            parse_date(start_session, calendar=self.calendar)
-            if start_session
+            parse_date(start_date, calendar=self.calendar)
+            if start_date
             else self.calendar.first_session
         )
         self.end_timestamp = (
-            parse_date(end_session, calendar=self.calendar)
-            if end_session
+            parse_date(end_date, calendar=self.calendar)
+            if end_date
             else self.calendar.last_session
         )
         self.max_workers = None
         if environ.get("POLYGON_MAX_WORKERS", "").strip() != "":
             self.max_workers = int(environ.get("POLYGON_MAX_WORKERS"))
         self.api_key = environ.get("POLYGON_API_KEY")
+        self.filesystem = LocalFileSystem()
         self.data_dir = environ.get("POLYGON_DATA_DIR", "data/files.polygon.io")
         self.cik_cusip_mapping_csv_path = environ.get(
             "CIK_CUSIP_MAPS_CSV", os.path.join(self.data_dir, "cik-cusip-maps.csv")
         )
-        self.asset_subdir = environ.get("POLYGON_ASSET_SUBDIR", "us_stocks_sip")
         self.market = environ.get("POLYGON_MARKET", "stocks")
+        self.asset_subdir = environ.get("POLYGON_ASSET_SUBDIR", "us_stocks_sip")
         self.tickers_dir = environ.get(
             "POLYGON_TICKERS_DIR",
             os.path.join(os.path.join(self.data_dir, "tickers"), self.asset_subdir),
51
58
  self.flat_files_dir = environ.get(
52
59
  "POLYGON_FLAT_FILES_DIR", os.path.join(self.data_dir, "flatfiles")
53
60
  )
54
- self.csv_paths_pattern = environ.get("POLYGON_FLAT_FILES_CSV_PATTERN", "**/*.csv.gz")
55
- self.agg_time = agg_time
61
+ # TODO: Restore non-recusive option. Always recursive for now.
62
+ self.csv_paths_pattern = environ.get(
63
+ # "POLYGON_FLAT_FILES_CSV_PATTERN", "**/*.csv.gz"
64
+ "POLYGON_FLAT_FILES_CSV_PATTERN", "*.csv.gz"
65
+ )
56
66
  self.asset_files_dir = os.path.join(self.flat_files_dir, self.asset_subdir)
57
67
  self.minute_aggs_dir = os.path.join(self.asset_files_dir, "minute_aggs_v1")
58
68
  self.day_aggs_dir = os.path.join(self.asset_files_dir, "day_aggs_v1")
59
- self.aggs_dir = (
60
- self.minute_aggs_dir if self.agg_time == "minute" else self.day_aggs_dir
61
- )
69
+ self.trades_dir = os.path.join(self.asset_files_dir, "trades_v1")
70
+ self.quotes_dir = os.path.join(self.asset_files_dir, "quotes_v1")
71
+
62
72
  # TODO: The "by ticker" files are temporary/intermediate and should/could be in the zipline data dir.
63
73
  self.minute_by_ticker_dir = os.path.join(
64
74
  self.asset_files_dir, "minute_by_ticker_v1"
65
75
  )
66
76
  self.day_by_ticker_dir = os.path.join(self.asset_files_dir, "day_by_ticker_v1")
67
- self.by_ticker_dir = (
68
- self.minute_by_ticker_dir
69
- if self.agg_time == "minute"
70
- else self.day_by_ticker_dir
77
+
78
+ if bool(re.match(r"^\d", agg_time)):
79
+ self.agg_timedelta = pd.to_timedelta(agg_time)
80
+ self.custom_asset_files_dir = environ.get(
81
+ "CUSTOM_ASSET_FILES_DIR", self.asset_files_dir
82
+ )
83
+ self.custom_aggs_dir = os.path.join(
84
+ self.custom_asset_files_dir, custom_aggs_format.format(config=self)
85
+ )
86
+ self.custom_aggs_by_ticker_dir = os.path.join(
87
+ self.custom_asset_files_dir,
88
+ (custom_aggs_format + "_by_ticker").format(config=self),
89
+ )
90
+ self.aggs_dir = self.custom_aggs_dir
91
+ self.by_ticker_dir = self.custom_aggs_by_ticker_dir
92
+ elif agg_time == "minute":
93
+ self.agg_timedelta = pd.to_timedelta("1minute")
94
+ self.aggs_dir = self.minute_aggs_dir
95
+ self.by_ticker_dir = self.minute_by_ticker_dir
96
+ elif agg_time == "day":
97
+ self.agg_timedelta = pd.to_timedelta("1day")
98
+ self.aggs_dir = self.day_aggs_dir
99
+ self.by_ticker_dir = self.day_by_ticker_dir
100
+ else:
101
+ raise ValueError(
102
+ f"agg_time must be 'minute', 'day', or a timedelta string; got '{agg_time=}'"
103
+ )
104
+ self.agg_time = agg_time
105
+
106
+ self.arrow_format = environ.get(
107
+ "POLYGON_ARROW_FORMAT", "parquet" if self.agg_time == "day" else "hive"
71
108
  )
72
- self.arrow_format = environ.get("POLYGON_ARROW_FORMAT", "parquet" if self.agg_time == "day" else "hive")
73
109
  # self.by_ticker_hive_dir = os.path.join(
74
110
  # self.by_ticker_dir,
75
111
  # f"{self.agg_time}_{self.start_timestamp.date().isoformat()}_{self.end_timestamp.date().isoformat()}.hive",
@@ -88,15 +124,15 @@ class PolygonConfig:
88
124
  return os.path.join(
89
125
  ticker_year_dir, f"tickers_{date.date().isoformat()}.parquet"
90
126
  )
91
-
127
+
92
128
  def file_path_to_name(self, path: str):
129
+ # TODO: Use csv_paths_pattern to remove the suffixes
93
130
  return os.path.basename(path).removesuffix(".gz").removesuffix(".csv")
94
131
 
95
- def by_ticker_aggs_arrow_dir(self, first_path: str, last_path: str):
96
- return os.path.join(
97
- self.by_ticker_dir,
98
- f"{self.file_path_to_name(first_path)}_{self.file_path_to_name(last_path)}.arrow",
99
- )
132
+ @property
133
+ def by_ticker_aggs_arrow_dir(self):
134
+ return os.path.join(self.by_ticker_dir,
135
+ f"{self.start_timestamp.date().isoformat()}_{self.end_timestamp.date().isoformat()}.arrow")
100
136
 
101
137
  def api_cache_path(
102
138
  self, start_date: Date, end_date: Date, filename: str, extension=".parquet"
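
Since the arrow directory is now keyed on the config's own date range rather than the first and last file names, it becomes a property and call sites drop the arguments (see the `concat_all_aggs_from_csv` hunk above). A sketch of the new access pattern:

```python
import os

from zipline_polygon_bundle.config import PolygonConfig

config = PolygonConfig(
    environ=os.environ, calendar_name="XNYS", start_date="2023-01-01", end_date="2023-12-31"
)
# Was: config.by_ticker_aggs_arrow_dir(paths[0], paths[-1])
print(config.by_ticker_aggs_arrow_dir)  # ends with "2023-01-01_2023-12-31.arrow"
```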
@@ -107,6 +143,43 @@ class PolygonConfig:
             self.cache_dir, f"{start_str}_{end_str}/{filename}{extension}"
         )
 
+    def csv_paths(self) -> Iterator[str]:
+        for root, dirnames, filenames in os.walk(self.aggs_dir, topdown=True):
+            if dirnames:
+                dirnames[:] = sorted(dirnames)
+            # Filter out filenames that don't match the pattern.
+            filenames = fnmatch.filter(filenames, self.csv_paths_pattern)
+            if filenames:
+                for filename in sorted(filenames):
+                    yield os.path.join(root, filename)
+
+    def find_first_and_last_aggs(self) -> Tuple[str, str]:
+        # Find the lexically first and last paths in aggs_dir that match csv_paths_pattern.
+        # Would like to use Path.walk(top_down=True) but it is only available in Python 3.12+.
+        # This needs to be efficient because it is called on every init, even though we only need it for ingest.
+        # But we can't call it in ingest because the writer initializes and writes the metadata before it is called.
+        paths = []
+        for root, dirnames, filenames in os.walk(self.aggs_dir, topdown=True):
+            if dirnames:
+                # We only want the first and last in each directory.
+                sorted_dirs = sorted(dirnames)
+                dirnames[:] = (
+                    [sorted_dirs[0], sorted_dirs[-1]]
+                    if len(sorted_dirs) > 1
+                    else sorted_dirs
+                )
+            # Filter out filenames that don't match the pattern.
+            filenames = fnmatch.filter(filenames, self.csv_paths_pattern)
+            if filenames:
+                filenames = sorted(filenames)
+                paths.append(os.path.join(root, filenames[0]))
+                if len(filenames) > 1:
+                    paths.append(os.path.join(root, filenames[-1]))
+        paths = sorted(paths)
+        if len(paths) < 2:
+            raise ValueError(f"Need more than one aggs file but found {len(paths)} paths in {self.aggs_dir}")
+        return self.file_path_to_name(paths[0]), self.file_path_to_name(paths[-1])
+
 
 if __name__ == "__main__":
     config = PolygonConfig(os.environ, "XNYS", "2003-10-01", "2023-01-01")
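
Both new methods rely on the same trick: with `topdown=True`, sorting (or pruning) `dirnames` in place steers `os.walk`, so results come back in lexical order without materializing the whole tree. A standalone sketch of the pattern, independent of `PolygonConfig`:

```python
import fnmatch
import os
from typing import Iterator


def walk_matching(root_dir: str, pattern: str = "*.csv.gz") -> Iterator[str]:
    for root, dirnames, filenames in os.walk(root_dir, topdown=True):
        dirnames[:] = sorted(dirnames)  # in-place assignment controls descent order
        for filename in sorted(fnmatch.filter(filenames, pattern)):
            yield os.path.join(root, filename)


# For just the first and last files, prune dirnames to its extremes instead
# (as find_first_and_last_aggs does) so the walk visits only a few directories.
```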
@@ -99,6 +99,6 @@ def process_all_csv_to_parquet(
 if __name__ == "__main__":
     # os.environ["POLYGON_DATA_DIR"] = "/Volumes/Oahu/Mirror/files.polygon.io"
     config = PolygonConfig(
-        environ=os.environ, calendar_name="XNYS", start_session=None, end_session=None
+        environ=os.environ, calendar_name="XNYS", start_date=None, end_date=None
     )
     process_all_csv_to_parquet(config.aggs_dir)
@@ -74,8 +74,8 @@ if __name__ == "__main__":
     config = PolygonConfig(
         environ=os.environ,
         calendar_name="XNYS",
-        start_session="2020-10-07",
-        end_session="2020-10-15",
+        start_date="2020-10-07",
+        end_date="2020-10-15",
     )
     print(f"{config.aggs_dir=}")
     max_ticker_lens = apply_to_all_aggs(