gnomepy 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
gnomepy/__init__.py ADDED
@@ -0,0 +1,6 @@
1
+ from gnomepy.registry.api import *
2
+ from gnomepy.constants import *
3
+ from gnomepy.config import *
4
+ from gnomepy.data.client import *
5
+ from gnomepy.data.common import *
6
+ from gnomepy.data.types import *
File without changes
gnomepy/config.py ADDED
@@ -0,0 +1,23 @@
1
+ import os
2
+
3
class Config:
    """Settings shared by every deployment environment."""

    # Base URL of the registry REST API.
    REGISTRY_API_URL = 'https://i3116oczxe.execute-api.us-east-1.amazonaws.com/api'


class DevConfig(Config):
    """Development settings; currently inherits everything from `Config`."""


class StagingConfig(Config):
    """Staging settings; currently inherits everything from `Config`."""


class ProdConfig(Config):
    """Production settings; currently inherits everything from `Config`."""


# Environment name, read once at import time and lower-cased.
# "prod" is used when the ENVIRONMENT variable is not set.
_ENV = os.environ.get("ENVIRONMENT", "prod").lower()

# Lookup table from environment name to its settings class.
_CONFIG_MAP = dict(dev=DevConfig, staging=StagingConfig, prod=ProdConfig)

# NOTE(review): an unrecognised ENVIRONMENT value selects DevConfig, while an
# unset variable selects ProdConfig -- confirm this asymmetry is intended.
config = _CONFIG_MAP[_ENV] if _ENV in _CONFIG_MAP else DevConfig
gnomepy/constants.py ADDED
@@ -0,0 +1,7 @@
1
+ from enum import StrEnum
2
+
3
+
4
+ class Environment(StrEnum):
5
+ DEV = "dev",
6
+ STAGING = "staging",
7
+ PROD = "prod",
File without changes
gnomepy/data/client.py ADDED
@@ -0,0 +1,72 @@
1
+ import datetime
2
+ import re
3
+ from typing import Optional
4
+
5
+ import boto3.session
6
+ import pandas as pd
7
+
8
+ from gnomepy.data.common import DataStore
9
+ from gnomepy.data.types import SchemaType
10
+
11
# Matches "<exchange_id>/<listing_id>/<date>_<hour>" at the start of an object
# key; group 1 holds the date digits and group 2 the hour digits. Both id
# components accept one or more digits so that ids of 10 and above match too.
_KEY_REGEX = re.compile(r"[0-9]+/[0-9]+/([0-9]+)_([0-9]+)/*")
12
+
13
class MarketDataClient:
    """
    Client that loads stored market data from an S3 bucket.

    Objects are listed under the ``"<exchange_id>/<listing_id>/"`` prefix and
    selected by the date and hour encoded in their keys.
    """

    def __init__(
            self,
            bucket: str = "market-data-collector",
            aws_profile_name: Optional[str] = None,
    ):
        """
        Create the client and its underlying S3 client.

        Parameters
        ----------
        bucket : str, default "market-data-collector"
            Name of the bucket to read from.
        aws_profile_name : str, optional
            AWS profile passed to the boto3 session; `None` lets boto3 use its
            default credential resolution.
        """
        session = boto3.session.Session(profile_name=aws_profile_name)
        self.s3 = session.client('s3')
        self.bucket = bucket

    def get_data(
            self,
            *,
            exchange_id: int,
            listing_id: int,
            start_datetime: datetime.datetime | pd.Timestamp,
            end_datetime: datetime.datetime | pd.Timestamp,
            schema_type: SchemaType = SchemaType.MBO,
    ) -> DataStore:
        """
        Download the data in the given time range and wrap it in a `DataStore`.

        Parameters
        ----------
        exchange_id : int
            Exchange id; first component of the object key prefix.
        listing_id : int
            Listing id; second component of the object key prefix.
        start_datetime : datetime.datetime or pd.Timestamp
            Inclusive lower bound on the timestamp encoded in the object key.
        end_datetime : datetime.datetime or pd.Timestamp
            Inclusive upper bound on the timestamp encoded in the object key.
        schema_type : SchemaType, default SchemaType.MBO
            Schema used to decode the downloaded bytes.

        Returns
        -------
        DataStore
        """
        total = self._get_raw_history(exchange_id, listing_id, start_datetime, end_datetime)
        return DataStore.from_bytes(total, schema_type)

    def _get_raw_history(
            self,
            exchange_id: int,
            listing_id: int,
            start_datetime: datetime.datetime | pd.Timestamp,
            end_datetime: datetime.datetime | pd.Timestamp,
    ) -> bytes:
        """
        Return the bodies of all matching objects, concatenated in listing order.

        An empty `bytes` object is returned when no key matches.
        """
        keys = self._get_available_keys(exchange_id, listing_id, start_datetime, end_datetime)
        bodies = (
            self.s3.get_object(Bucket=self.bucket, Key=key)["Body"].read()
            for key in keys
        )
        # A single join avoids re-copying the accumulated payload per object.
        return b"".join(bodies)

    def _get_available_keys(
            self,
            exchange_id: int,
            listing_id: int,
            start_datetime: datetime.datetime | pd.Timestamp,
            end_datetime: datetime.datetime | pd.Timestamp,
    ):
        """
        List the object keys whose encoded date/hour lies within the range.

        The timestamp parsed from each key is a naive `datetime`, so the
        bounds need to be comparable with naive datetimes. Keys that do not
        match the expected layout are skipped.

        Returns
        -------
        list of str
            Matching keys, in the order returned by the listing.
        """
        prefix = f"{exchange_id}/{listing_id}/"
        paginator = self.s3.get_paginator('list_objects_v2')
        pages = paginator.paginate(Bucket=self.bucket, Prefix=prefix)

        keys = []
        for page in pages:
            # A page has no 'Contents' entry when nothing matches the prefix.
            for obj in page.get('Contents', []):
                key = obj['Key']
                parsed = _KEY_REGEX.match(key)
                if parsed is None:
                    continue
                date, hour = parsed.group(1), parsed.group(2)
                parsed_dt = datetime.datetime.strptime(f"{date} {hour}", "%Y%m%d %H")
                if start_datetime <= parsed_dt <= end_datetime:
                    keys.append(key)

        return keys
gnomepy/data/common.py ADDED
@@ -0,0 +1,373 @@
1
+ from __future__ import annotations
2
+
3
+ import decimal
4
+ import enum
5
+ import logging
6
+ import warnings
7
+ from abc import ABC
8
+ from io import BytesIO
9
+ from typing import IO, Generator, Protocol, Any, Iterator, Callable
10
+
11
+ import importlib_resources
12
+ import numpy as np
13
+ import pandas as pd
14
+ import pytz
15
+ import zstandard
16
+
17
+ from gnomepy.data.types import SchemaBase, SchemaType, get_schema_base, DecimalType, FIXED_PRICE_SCALE, FIXED_SIZE_SCALE
18
+ from gnomepy.data.sbe import Schema, Message
19
+
20
+
21
+ logger = logging.getLogger(__name__)
22
+
23
def _is_zstandard(reader: IO[bytes]) -> bool:
    """
    Report whether a byte stream starts with a valid zstandard frame header.

    The stream is rewound to its start before the check and is left
    positioned just after the bytes that were inspected.

    Parameters
    ----------
    reader : IO[bytes]
        Seekable stream to inspect.

    Returns
    -------
    bool
        `True` if the leading bytes parse as zstandard frame parameters.
    """
    reader.seek(0)
    # Only the first 18 bytes of the stream are handed to the header parser.
    header = reader.read(18)
    try:
        zstandard.get_frame_parameters(header)
    except zstandard.ZstdError:
        return False
    return True
43
+
44
class Compression(enum.Enum):
    """Compression format of the bytes held by a data store."""

    # Payload is zstandard compressed.
    ZSTD = 0
    # Payload is stored uncompressed.
    NONE = 1
47
+
48
class DataSource(ABC):
    """
    Base class for objects that hold encoded schema data.

    Subclasses are expected to override both properties; the defaults raise
    `NotImplementedError`.
    """

    @property
    def reader(self) -> IO[bytes]:
        """Binary stream over the held data."""
        raise NotImplementedError

    @property
    def bytes(self) -> memoryview:
        """View over the held data."""
        raise NotImplementedError
59
+
60
class MemoryDataSource(DataSource):
    """Data source that keeps its whole payload in an in-memory buffer."""

    def __init__(self, source: BytesIO | bytes | IO[bytes]):
        """
        Copy the payload out of `source` into a private buffer.

        Parameters
        ----------
        source : BytesIO, bytes or IO[bytes]
            Either the raw bytes, or a stream that is rewound and read to the end.

        Raises
        ------
        ValueError
            If `source` yields no data.
        """
        if not isinstance(source, bytes):
            # Streams are always consumed from their beginning.
            source.seek(0)
            payload = source.read()
        else:
            payload = source

        if len(payload) == 0:
            raise ValueError(f"Cannot create data source from empty {type(source).__name__}")
        self.__buffer = BytesIO(payload)

    @property
    def reader(self) -> BytesIO:
        """The internal buffer, rewound to its start on every access."""
        stream = self.__buffer
        stream.seek(0)
        return stream

    @property
    def bytes(self) -> memoryview:
        """A view over the internal buffer's contents, without copying."""
        return self.__buffer.getbuffer()
80
+
81
class DataStore:
    """
    Store of encoded records for a single schema type.

    The payload may be uncompressed or zstandard compressed; which one is
    detected once, at construction. Records can be iterated as decoded
    objects, replayed to a callback, or converted to numpy arrays and pandas
    DataFrames.
    """

    def __init__(
            self,
            data_source: DataSource,
            schema_type: SchemaType,
            schema_file_module = "gnomepy.data.sbe",
            schema_file_name = "schema.xml"
    ):
        """
        Parameters
        ----------
        data_source : DataSource
            Source holding the encoded (optionally zstandard compressed) data.
        schema_type : SchemaType
            Schema of the records held by `data_source`.
        schema_file_module : str, default "gnomepy.data.sbe"
            Package from which the schema definition file is loaded.
        schema_file_name : str, default "schema.xml"
            Name of the schema definition file inside `schema_file_module`.
        """
        self._data_source = data_source
        self._schema_type = schema_type
        self._schema_base_type = get_schema_base(self._schema_type)

        buffer = self._data_source.reader
        if _is_zstandard(buffer):
            self._compression = Compression.ZSTD
        else:
            self._compression = Compression.NONE

        with importlib_resources.open_text(schema_file_module, schema_file_name) as f:
            self.schema = Schema.parse(f)
        self._header_size = self.schema.types[self.schema.header_type_name].size()

    def __iter__(self) -> Generator[SchemaBase, None, None]:
        """
        Yield every record decoded into the schema's record type.

        Records are stepped over as fixed-size units: the header size plus the
        body size of this store's schema message.
        """
        mem = self.bytes
        offset = 0
        body_size = self._schema_metadata.body_size
        while offset < len(mem):
            message = self.schema.decode(mem[offset:])
            parsed = self._schema_base_type.from_message(message)

            yield parsed

            offset += body_size + self._header_size

    def replay(self, callback: Callable[[Any], None]) -> None:
        """
        Replay data by passing records sequentially to the given callback.

        An exception raised by the callback is logged and then re-raised,
        which stops the replay.

        Parameters
        ----------
        callback : callable
            The callback to the data handler.

        """
        for record in self:
            try:
                callback(record)
            except Exception:
                logger.exception("exception while replaying to user callback")
                raise

    def __repr__(self):
        return f"<{self.__class__.__name__}(type={self._schema_type})>"

    @property
    def _schema_metadata(self) -> Message:
        """
        The schema message whose description equals this store's schema type value.

        Raises
        ------
        Exception
            If no message in the schema matches.
        """
        for message in self.schema.messages.values():
            if message.description == self._schema_type.value:
                return message
        raise Exception(f"Invalid schema type: {self._schema_type}")

    @property
    def schema_dtype(self) -> np.dtype:
        """
        Structured numpy dtype describing one encoded record.

        The message header is exposed as a single unsigned integer field named
        ``header``, followed by the fields of the schema message.
        """
        metadata = self._schema_metadata
        header_format = f"u{self._header_size}"
        return np.dtype({'names': ['header'] + metadata.field_names, 'formats': [header_format] + metadata.formats})

    @property
    def bytes(self) -> memoryview:
        """
        The uncompressed payload.

        Compressed data is fully decompressed on every access.
        """
        if self._compression == Compression.ZSTD:
            # The payload can consist of several zstd frames back to back (for
            # example several compressed objects concatenated together). By
            # default the reader stops at the first frame boundary, which
            # would silently drop everything after the first frame.
            stream = zstandard.ZstdDecompressor().stream_reader(
                self._data_source.bytes,
                read_across_frames=True,
            )
            return memoryview(stream.readall())
        return self._data_source.bytes

    @property
    def reader(self) -> IO[bytes]:
        """A binary stream over the uncompressed payload."""
        if self._compression == Compression.ZSTD:
            # read_across_frames: see `bytes`; keep reading past the first frame.
            return zstandard.ZstdDecompressor().stream_reader(
                self._data_source.reader,
                read_across_frames=True,
            )
        return self._data_source.reader

    def to_ndarray(self, count: int | None = None) -> np.ndarray[Any, Any] | NDArrayIterator:
        """
        Return the data as a numpy `ndarray`.

        Parameters
        ----------
        count : int, optional
            If set, instead of returning a single `np.ndarray` a `NDArrayIterator`
            instance will be returned. When iterated, this object will yield
            a `np.ndarray` with at most `count` elements until the entire contents
            of the data store is exhausted. This can be used to process a large
            data store in pieces instead of all at once.

        Returns
        -------
        np.ndarray
        NDArrayIterator
        """
        ndarray_iter = NDArrayIterator(
            reader=self.reader,
            dtype=self.schema_dtype,
            count=count,
        )

        if count is None:
            # An empty array is returned when the stream yields nothing.
            return next(ndarray_iter, np.empty([0, 1], dtype=self.schema_dtype))

        return ndarray_iter

    def to_df(
            self,
            price_type: DecimalType | str = DecimalType.FLOAT,
            size_type: DecimalType | str = DecimalType.FLOAT,
            pretty_ts: bool = True,
            tz: pytz.BaseTzInfo | str = pytz.UTC,
            replace_nulls: bool = True,
            count: int | None = None,
    ) -> pd.DataFrame | DataFrameIterator:
        """
        Return the data as a `pd.DataFrame`.

        Parameters
        ----------
        price_type : DecimalType or str, default "float"
            The price type to use for price fields.
            If "fixed", prices will have a type of `int` in fixed decimal format; each unit representing 1e-9 or 0.000000001.
            If "float", prices will have a type of `float`.
            If "decimal", prices will be instances of `decimal.Decimal`.
        size_type : DecimalType or str, default "float"
            The size type to use for size fields.
            If "fixed", sizes will have a type of `int` in fixed decimal format; each unit representing 1e-6 or 0.000001.
            If "float", sizes will have a type of `float`.
            If "decimal", sizes will be instances of `decimal.Decimal`.
        pretty_ts : bool, default True
            If all timestamp columns should be converted from UNIX nanosecond
            `int` to tz-aware `pd.Timestamp`. The timezone can be specified using the `tz` parameter.
        tz : pytz.BaseTzInfo or str, default UTC
            If `pretty_ts` is `True`, all timestamps will be converted to the specified timezone.
        replace_nulls : bool, default True
            Replace the null values in the `DataFrame` with `np.nan`.
        count : int, optional
            If set, instead of returning a single `DataFrame` a `DataFrameIterator`
            instance will be returned. When iterated, this object will yield
            a `DataFrame` with at most `count` elements until the entire contents
            of the data store are exhausted. This can be used to process a large
            data store in pieces instead of all at once.

        Returns
        -------
        pd.DataFrame
        DataFrameIterator
        """
        if not isinstance(tz, pytz.BaseTzInfo):
            tz = pytz.timezone(tz)
        if count is None:
            # Wrap the single array so both paths hand over an iterator.
            records = iter([self.to_ndarray()])
        else:
            records = self.to_ndarray(count)

        df_iter = DataFrameIterator(
            records=records,
            schema_metadata=self._schema_metadata,
            count=count,
            tz=tz,
            price_type=price_type,
            size_type=size_type,
            replace_nulls=replace_nulls,
            pretty_ts=pretty_ts,
        )
        if count is None:
            return next(df_iter)

        return df_iter

    @classmethod
    def from_bytes(cls, data: BytesIO | bytes | IO[bytes], schema_type: SchemaType) -> DataStore:
        """
        Build a store from in-memory bytes or a readable binary stream.

        Parameters
        ----------
        data : BytesIO, bytes or IO[bytes]
            The encoded (optionally zstandard compressed) data.
        schema_type : SchemaType
            Schema of the records in `data`.

        Returns
        -------
        DataStore
        """
        return cls(MemoryDataSource(data), schema_type)
258
+
259
+
260
class NDArrayIterator:
    """
    Iterator that reads fixed-size records from a byte stream as numpy arrays.

    This is a concrete class that is instantiated directly, so it is a plain
    class rather than a `typing.Protocol` (protocol classes describe
    structural types and are not meant to be instantiated).
    """

    def __init__(
            self,
            reader: IO[bytes],
            dtype: np.typing.DTypeLike,
            count: int | None = None,
    ) -> None:
        """
        Parameters
        ----------
        reader : IO[bytes]
            Stream positioned at the first record.
        dtype : dtype-like
            Record layout; its item size is the size of one record in bytes.
        count : int, optional
            Maximum number of records per yielded array. When `None`, the whole
            remaining stream is read in a single step.
        """
        self._reader = reader
        self._dtype: np.typing.DTypeLike = np.dtype(dtype)
        self._count = count
        # Set once a truncated tail has been seen, so iteration ends afterwards.
        self._close_on_next = False

    def __iter__(self) -> NDArrayIterator:
        return self

    def __next__(self) -> np.ndarray[Any, Any]:
        """
        Return the next array of records.

        Raises
        ------
        StopIteration
            When the stream is exhausted, or after a truncated chunk was returned.
        Exception
            If the bytes read cannot be interpreted with the record dtype.
        """
        if self._close_on_next:
            raise StopIteration

        if self._count is None:
            read_size = -1
        else:
            # Always request at least one whole record.
            read_size = self._dtype.itemsize * max(self._count, 1)

        if buffer := self._reader.read(read_size):
            loose_bytes = len(buffer) % self._dtype.itemsize
            if loose_bytes != 0:
                # Drop the incomplete trailing record and stop after this chunk.
                warnings.warn("Data store file is truncated or contains an incomplete record")
                buffer = buffer[:-loose_bytes]
                self._close_on_next = True

            try:
                return np.frombuffer(
                    buffer=buffer,
                    dtype=self._dtype,
                )
            except ValueError as exc:
                raise Exception("Cannot decode data stream") from exc

        raise StopIteration
301
+
302
+
303
class DataFrameIterator:
    """
    Iterator that turns arrays of records into formatted `pd.DataFrame` objects.

    Each array taken from `records` becomes one DataFrame, on which null
    replacement, decimal scaling and timestamp formatting are applied
    according to the options given at construction.
    """

    def __init__(
            self,
            records: Iterator[np.ndarray[Any, Any]],
            schema_metadata: Message,
            count: int | None,
            tz: pytz.BaseTzInfo,
            price_type: DecimalType = DecimalType.FLOAT,
            size_type: DecimalType = DecimalType.FLOAT,
            replace_nulls: bool = True,
            pretty_ts: bool = True,
    ):
        """
        Parameters
        ----------
        records : iterator of np.ndarray
            Source of record arrays; one DataFrame is produced per array.
        schema_metadata : Message
            Schema message providing field names, null values and field types.
        count : int, optional
            Requested chunk size; stored but not used by this class itself.
        tz : pytz.BaseTzInfo
            Timezone timestamps are converted to when `pretty_ts` is set.
        price_type : DecimalType, default DecimalType.FLOAT
            Representation used for fields of type ``price``.
        size_type : DecimalType, default DecimalType.FLOAT
            Representation used for fields of type ``size`` and ``volume``.
        replace_nulls : bool, default True
            Whether schema null values are replaced with `np.nan`.
        pretty_ts : bool, default True
            Whether timestamp fields are converted to tz-aware datetimes.
        """
        self._records = records
        self._schema_metadata = schema_metadata
        self._count = count
        self._price_type = price_type
        self._size_type = size_type
        self._replace_nulls = replace_nulls
        self._pretty_ts = pretty_ts
        self._tz = tz

    def __iter__(self) -> DataFrameIterator:
        return self

    def __next__(self) -> pd.DataFrame:
        """
        Build and format the DataFrame for the next array of records.

        `StopIteration` from the underlying `records` iterator propagates
        unchanged, which ends this iterator too.
        """
        df = pd.DataFrame(
            next(self._records),
            columns=self._schema_metadata.field_names,
        )
        if self._replace_nulls:
            self._format_nulls(df)

        self._format_decimal(df, 'price', self._price_type, FIXED_PRICE_SCALE)
        self._format_decimal(df, 'size', self._size_type, FIXED_SIZE_SCALE)
        self._format_decimal(df, 'volume', self._size_type, FIXED_SIZE_SCALE)

        if self._pretty_ts:
            # Timezone conversion needs tz-aware datetimes, so parse first.
            self._format_pretty_ts(df)
            self._format_timezone(df)

        return df

    def _format_nulls(self, df: pd.DataFrame):
        """Replace each field's schema null value with `np.nan`, in place."""
        for field, na_val in self._schema_metadata.null_fields.items():
            df[field] = df[field].replace(na_val, np.nan)

    def _format_timezone(self, df: pd.DataFrame) -> None:
        """Convert every timestamp field to the configured timezone, in place."""
        for field in self._schema_metadata.fields_by_type('timestamp'):
            df[field] = df[field].dt.tz_convert(self._tz)

    def _format_decimal(
            self,
            df: pd.DataFrame,
            type_name: str,
            decimal_type: DecimalType,
            scale: int,
    ):
        """
        Rescale the fields of the given schema type, in place.

        Parameters
        ----------
        df : pd.DataFrame
            Frame to modify.
        type_name : str
            Schema type name selecting the fields to rescale.
        decimal_type : DecimalType
            `DECIMAL` converts values to `decimal.Decimal` and divides by
            `scale`; `FLOAT` divides by `scale`; any other value leaves the
            fields untouched.
        scale : int
            Divisor turning the stored fixed-point values into real units.
        """
        fields = self._schema_metadata.fields_by_type(type_name)

        if decimal_type == DecimalType.DECIMAL:
            # DataFrame.applymap is deprecated since pandas 2.1 in favour of
            # DataFrame.map; use the new name when available and fall back to
            # the old one on earlier pandas versions.
            elementwise = getattr(pd.DataFrame, "map", None) or pd.DataFrame.applymap
            df[fields] = (
                elementwise(df[fields], decimal.Decimal) / scale
            )
        elif decimal_type == DecimalType.FLOAT:
            df[fields] /= scale
        else:
            return  # do nothing

    def _format_pretty_ts(self, df: pd.DataFrame) -> None:
        """
        Parse every timestamp field into UTC datetimes, in place.

        Values that cannot be parsed become `NaT` (``errors="coerce"``).
        """
        for field in self._schema_metadata.fields_by_type('timestamp'):
            df[field] = pd.to_datetime(df[field], utc=True, errors="coerce")