zipline_polygon_bundle 0.1.7__py3-none-any.whl → 0.1.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,101 @@
1
+ from .config import PolygonConfig
2
+ from .trades import cast_strings_to_list
3
+
4
+ import os
5
+
6
+ import pyarrow as pa
7
+ from pyarrow import dataset as pa_ds
8
+ from pyarrow import compute as pa_compute
9
+ from pyarrow import fs as pa_fs
10
+ from fsspec.implementations.arrow import ArrowFSWrapper
11
+ from pyarrow import csv as pa_csv
12
+
13
+
14
def quotes_schema(raw: bool = False) -> pa.Schema:
    """Return the pyarrow schema for Polygon quotes (NBBO) flat files.

    When ``raw`` is True the timestamp columns are typed as plain int64,
    matching how the CSVs are read (they cannot be parsed directly as
    timestamps); otherwise they are tz-aware UTC nanosecond timestamps.
    """
    # Unix epoch nanos are inherently UTC; making the type tz-aware helps
    # avoid naive-timestamp mistakes downstream.
    ts_type = pa.timestamp("ns", tz="UTC") if not raw else pa.int64()
    # Polygon price scale is 4 decimal places (hundredths of a penny);
    # float64 is a little overkill but avoids any plausible truncation error.
    px_type = pa.float64()
    field_types = [
        ("ticker", pa.string()),
        ("ask_exchange", pa.int8()),
        ("ask_price", px_type),
        ("ask_size", pa.int64()),
        ("bid_exchange", pa.int8()),
        ("bid_price", px_type),
        ("bid_size", pa.int64()),
        ("conditions", pa.string()),
        ("indicators", pa.string()),
        ("participant_timestamp", ts_type),
        ("sequence_number", pa.int64()),
        ("sip_timestamp", ts_type),
        ("tape", pa.int8()),
        ("trf_timestamp", ts_type),
    ]
    # Every column is declared non-nullable.
    return pa.schema(
        [pa.field(name, typ, nullable=False) for name, typ in field_types]
    )
58
+
59
+
60
def quotes_dataset(config: PolygonConfig) -> pa_ds.Dataset:
    """Build a pyarrow Dataset over the quotes CSV files under ``config.quotes_dir``.

    Files are globbed through an fsspec wrapper around the configured Arrow
    filesystem and sorted by path, which orders them chronologically because
    the year/month are in the directory names and the date is in the filename.
    """
    # https://arrow.apache.org/docs/python/filesystems.html#using-arrow-filesystems-with-fsspec
    # https://filesystem-spec.readthedocs.io/en/latest/_modules/fsspec/spec.html#AbstractFileSystem.glob.
    wrapped_fs = ArrowFSWrapper(config.filesystem)
    glob_pattern = os.path.join(config.quotes_dir, config.csv_paths_pattern)
    csv_paths = sorted(wrapped_fs.glob(glob_pattern))
    # Read with the raw (int64-timestamp) schema; cast_quotes applies the
    # typed schema afterwards.
    return pa_ds.FileSystemDataset.from_paths(
        csv_paths,
        format=pa_ds.CsvFileFormat(),
        schema=quotes_schema(raw=True),
        filesystem=config.filesystem,
    )
78
+
79
+
80
def cast_strings_to_list(string_array, separator=",", default="0", value_type=pa.uint8()):
    """Cast a PyArrow StringArray of separator-delimited numbers to a ListArray.

    Rows that are empty (or whitespace-only) are first replaced with
    ``default`` so every row splits into at least one value.

    NOTE(review): a function of this same name is imported from ``.trades``
    at the top of this module; this definition shadows it — confirm which
    one is intended.
    """
    # Mask rows that are empty after trimming surrounding whitespace.
    trimmed = pa_compute.utf8_trim_whitespace(string_array)
    empty_mask = pa_compute.equal(trimmed, "")

    # Substitute the default (e.g. "0") wherever the mask is set.
    defaulted = pa_compute.replace_with_mask(
        string_array, empty_mask, pa.scalar(default)
    )

    # Split each string on the separator, then cast the resulting string
    # lists to lists of the requested numeric type.
    pieces = pa_compute.split_pattern(defaulted, pattern=separator)
    return pa_compute.cast(pieces, pa.list_(value_type))
96
+
97
+
98
def cast_quotes(quotes):
    """Cast a raw quotes table to the typed schema and append parsed conditions.

    Returns the table with an extra ``condition_values`` column holding the
    numeric-list form of the comma-separated ``conditions`` strings.
    """
    typed = quotes.cast(quotes_schema())
    conditions = typed.column("conditions").combine_chunks()
    return typed.append_column("condition_values", cast_strings_to_list(conditions))
@@ -3,6 +3,7 @@ from .config import PolygonConfig
3
3
  import datetime
4
4
  import os
5
5
  import pandas as pd
6
+ import csv
6
7
  import polygon
7
8
  import logging
8
9
  from concurrent.futures import ProcessPoolExecutor
@@ -383,40 +384,3 @@ def get_ticker_universe(config: PolygonConfig, fetch_missing: bool = False):
383
384
  merged_tickers = pd.read_parquet(parquet_path)
384
385
  merged_tickers.info()
385
386
  return merged_tickers
386
-
387
-
388
- # Initialize ticker files in __main__. Use CLI args to specify start and end dates.
389
- if __name__ == "__main__":
390
- import argparse
391
-
392
- parser = argparse.ArgumentParser(description="Initialize ticker files.")
393
- parser.add_argument(
394
- "--start-date",
395
- type=str,
396
- help="Start date in ISO format (YYYY-MM-DD)",
397
- default="2014-05-01",
398
- )
399
- parser.add_argument(
400
- "--end-date",
401
- type=str,
402
- help="End date in ISO format (YYYY-MM-DD)",
403
- default="2024-04-01",
404
- )
405
- args = parser.parse_args()
406
-
407
- start_date = (
408
- datetime.datetime.strptime(args.start_date, "%Y-%m-%d").date()
409
- if args.start_date
410
- else datetime.date.today()
411
- )
412
- end_date = (
413
- datetime.datetime.strptime(args.end_date, "%Y-%m-%d").date()
414
- if args.end_date
415
- else datetime.date.today()
416
- )
417
-
418
- all_tickers = load_all_tickers(start_date, end_date, fetch_missing=True)
419
- merged_tickers = merge_tickers(all_tickers)
420
- merged_tickers.to_csv(f"data/tickers/us_tickers_{start_date}-{end_date}.csv")
421
- ticker_names = ticker_names_from_merged_tickers(merged_tickers)
422
- print(ticker_names)