duck_client 0.1.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,156 @@
1
+ Metadata-Version: 2.3
2
+ Name: duck_client
3
+ Version: 0.1.2
4
+ Summary:
5
+ Author: River.Shi
6
+ Author-email: nachuan.shi.quant@gmail.com
7
+ Requires-Python: >=3.10,<3.14
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: Programming Language :: Python :: 3.10
10
+ Classifier: Programming Language :: Python :: 3.11
11
+ Classifier: Programming Language :: Python :: 3.12
12
+ Classifier: Programming Language :: Python :: 3.13
13
+ Requires-Dist: duckdb (>=1.1.3,<2.0.0)
14
+ Requires-Dist: fireducks (>=1.1.8,<2.0.0)
15
+ Requires-Dist: mpire (>=2.10.2,<3.0.0)
16
+ Requires-Dist: numpy (>=2.2.2,<3.0.0)
17
+ Requires-Dist: pandas (>=2.2.3,<3.0.0)
18
+ Requires-Dist: tqdm (>=4.67.1,<5.0.0)
19
+ Description-Content-Type: text/markdown
20
+
21
+ # DuckDB 连接器文档
22
+
23
+ ## DataBase 类
24
+
25
+ ### 类说明
26
+ 用于连接和管理DuckDB数据库,处理币安交易数据的存储、因子计算和查询操作
27
+
28
+ ---
29
+
30
+ ### 初始化方法
31
+ ```python
32
+ def __init__(
33
+ self,
34
+ cache_path: str | None = None,
35
+ db_path: str | None = None,
36
+ read_only: bool = True
37
+ )
38
+ ```
39
+ 参数说明:
40
+ - `cache_path`: 本地缓存路径(可选)
41
+ - `db_path`: DuckDB数据库文件路径(可选,未指定时使用内存数据库)
42
+ - `read_only`: 是否以只读模式打开数据库(默认True)
43
+
44
+ ---
45
+
46
+ ### 主要公有方法
47
+
48
+ #### 更新K线数据
49
+ ```python
50
+ def update_klines(self)
51
+ ```
52
+ 功能:
53
+ - 遍历所有资产类别和时间频率,创建/更新K线数据表
54
+ - 自动处理不同时间粒度的数据聚合
55
+ - 进度条显示处理进度
56
+
57
+ #### 更新因子数据
58
+ ```python
59
+ def update_factors(self)
60
+ ```
61
+ 功能:
62
+ - 计算动量(momentum)、波动率(volatility)、贝塔(beta)等因子
63
+ - 使用滑动窗口计算(窗口大小自动适配不同时间频率)
64
+ - 进度条显示处理进度
65
+
66
+ #### 获取因子数据
67
+ ```python
68
+ def df_factors(
69
+ self,
70
+ symbols: list[str] | None = None,
71
+ freq: Literal["1m", "3m", "15m", "30m", "1h", "4h", "12h", "1d"] = "1m",
72
+ asset_class: Literal["spot", "um"] = "um",
73
+ start_date: str | None = None,
74
+ end_date: str | None = None,
75
+ order_by_timestamp: bool = False,
76
+ ) -> pd.DataFrame
77
+ ```
78
+ 参数说明:
79
+ - `symbols`: 交易对列表(可选,默认全部)
80
+ - `freq`: 时间频率(默认1分钟)
81
+ - `asset_class`: 资产类别(默认永续合约)
82
+ - `start_date/end_date`: 时间范围(ISO格式字符串)
83
+ - `order_by_timestamp`: 是否按时间排序
84
+
85
+ 返回:
86
+ - 包含以下字段的DataFrame:
87
+ timestamp, symbol, close, return, momentum, volatility, beta
88
+
89
+ #### 获取原始K线数据
90
+ ```python
91
+ def df_klines(
92
+ self,
93
+ symbols: list[str] | None = None,
94
+ freq: Literal["1m", "3m", "15m", "30m", "1h", "4h", "12h", "1d"] = "1m",
95
+ asset_class: Literal["spot", "um"] = "um",
96
+ start_date: str | None = None,
97
+ end_date: str | None = None,
98
+ order_by_timestamp: bool = False,
99
+ ) -> pd.DataFrame
100
+ ```
101
+ 参数同`df_factors`
102
+
103
+ 返回:
104
+ - 包含以下字段的DataFrame:
105
+ symbol, timestamp, open, high, low, close, volume, quote_volume,
106
+ taker_buy_volume, taker_buy_quote_volume
107
+
108
+ #### 获取因子矩阵
109
+ ```python
110
+ def factors_matrix(
111
+ self,
112
+ symbols: list[str] | None = None,
113
+ factor: Literal["return", "momentum", "volatility", "beta"] = "return",
114
+ freq: Literal["1m", "3m", "15m", "30m", "1h", "4h", "12h", "1d"] = "1m",
115
+ asset_class: Literal["spot", "um"] = "um",
116
+ start_date: str | None = None,
117
+ end_date: str | None = None,
118
+ ) -> pd.DataFrame
119
+ ```
120
+ 特殊说明:
121
+ - 返回以时间为索引、交易对为列名的二维矩阵
122
+ - 使用PIVOT操作将纵向数据转换为横向矩阵
123
+ - 适合量化分析中的因子研究
124
+
125
+ ---
126
+
127
+ ### 异常类型
128
+ - `DBReadOnlyError`: 尝试在只读模式下执行写操作时抛出
129
+ - `DBError`: 参数错误或无效操作时抛出
130
+
131
+ ---
132
+
133
+ ### 使用示例
134
+ ```python
135
+ # 初始化数据库连接
136
+ db = DataBase(db_path="binance.db", read_only=False)
137
+
138
+ # 更新数据
139
+ db.update_klines()
140
+ db.update_factors()
141
+
142
+ # 查询数据
143
+ df = db.df_factors(
144
+ symbols=["BTCUSDT", "ETHUSDT"],
145
+ freq="1h",
146
+ start_date="2023-01-01",
147
+ end_date="2023-01-31"
148
+ )
149
+
150
+ # 获取因子矩阵
151
+ matrix = db.factors_matrix(
152
+ factor="momentum",
153
+ freq="4h",
154
+ asset_class="um"
155
+ )
156
+
@@ -0,0 +1,135 @@
1
+ # DuckDB 连接器文档
2
+
3
+ ## DataBase 类
4
+
5
+ ### 类说明
6
+ 用于连接和管理DuckDB数据库,处理币安交易数据的存储、因子计算和查询操作
7
+
8
+ ---
9
+
10
+ ### 初始化方法
11
+ ```python
12
+ def __init__(
13
+ self,
14
+ cache_path: str | None = None,
15
+ db_path: str | None = None,
16
+ read_only: bool = True
17
+ )
18
+ ```
19
+ 参数说明:
20
+ - `cache_path`: 本地缓存路径(可选)
21
+ - `db_path`: DuckDB数据库文件路径(可选,未指定时使用内存数据库)
22
+ - `read_only`: 是否以只读模式打开数据库(默认True)
23
+
24
+ ---
25
+
26
+ ### 主要公有方法
27
+
28
+ #### 更新K线数据
29
+ ```python
30
+ def update_klines(self)
31
+ ```
32
+ 功能:
33
+ - 遍历所有资产类别和时间频率,创建/更新K线数据表
34
+ - 自动处理不同时间粒度的数据聚合
35
+ - 进度条显示处理进度
36
+
37
+ #### 更新因子数据
38
+ ```python
39
+ def update_factors(self)
40
+ ```
41
+ 功能:
42
+ - 计算动量(momentum)、波动率(volatility)、贝塔(beta)等因子
43
+ - 使用滑动窗口计算(窗口大小自动适配不同时间频率)
44
+ - 进度条显示处理进度
45
+
46
+ #### 获取因子数据
47
+ ```python
48
+ def df_factors(
49
+ self,
50
+ symbols: list[str] | None = None,
51
+ freq: Literal["1m", "3m", "15m", "30m", "1h", "4h", "12h", "1d"] = "1m",
52
+ asset_class: Literal["spot", "um"] = "um",
53
+ start_date: str | None = None,
54
+ end_date: str | None = None,
55
+ order_by_timestamp: bool = False,
56
+ ) -> pd.DataFrame
57
+ ```
58
+ 参数说明:
59
+ - `symbols`: 交易对列表(可选,默认全部)
60
+ - `freq`: 时间频率(默认1分钟)
61
+ - `asset_class`: 资产类别(默认永续合约)
62
+ - `start_date/end_date`: 时间范围(ISO格式字符串)
63
+ - `order_by_timestamp`: 是否按时间排序
64
+
65
+ 返回:
66
+ - 包含以下字段的DataFrame:
67
+ timestamp, symbol, close, return, momentum, volatility, beta
68
+
69
+ #### 获取原始K线数据
70
+ ```python
71
+ def df_klines(
72
+ self,
73
+ symbols: list[str] | None = None,
74
+ freq: Literal["1m", "3m", "15m", "30m", "1h", "4h", "12h", "1d"] = "1m",
75
+ asset_class: Literal["spot", "um"] = "um",
76
+ start_date: str | None = None,
77
+ end_date: str | None = None,
78
+ order_by_timestamp: bool = False,
79
+ ) -> pd.DataFrame
80
+ ```
81
+ 参数同`df_factors`
82
+
83
+ 返回:
84
+ - 包含以下字段的DataFrame:
85
+ symbol, timestamp, open, high, low, close, volume, quote_volume,
86
+ taker_buy_volume, taker_buy_quote_volume
87
+
88
+ #### 获取因子矩阵
89
+ ```python
90
+ def factors_matrix(
91
+ self,
92
+ symbols: list[str] | None = None,
93
+ factor: Literal["return", "momentum", "volatility", "beta"] = "return",
94
+ freq: Literal["1m", "3m", "15m", "30m", "1h", "4h", "12h", "1d"] = "1m",
95
+ asset_class: Literal["spot", "um"] = "um",
96
+ start_date: str | None = None,
97
+ end_date: str | None = None,
98
+ ) -> pd.DataFrame
99
+ ```
100
+ 特殊说明:
101
+ - 返回以时间为索引、交易对为列名的二维矩阵
102
+ - 使用PIVOT操作将纵向数据转换为横向矩阵
103
+ - 适合量化分析中的因子研究
104
+
105
+ ---
106
+
107
+ ### 异常类型
108
+ - `DBReadOnlyError`: 尝试在只读模式下执行写操作时抛出
109
+ - `DBError`: 参数错误或无效操作时抛出
110
+
111
+ ---
112
+
113
+ ### 使用示例
114
+ ```python
115
+ # 初始化数据库连接
116
+ db = DataBase(db_path="binance.db", read_only=False)
117
+
118
+ # 更新数据
119
+ db.update_klines()
120
+ db.update_factors()
121
+
122
+ # 查询数据
123
+ df = db.df_factors(
124
+ symbols=["BTCUSDT", "ETHUSDT"],
125
+ freq="1h",
126
+ start_date="2023-01-01",
127
+ end_date="2023-01-31"
128
+ )
129
+
130
+ # 获取因子矩阵
131
+ matrix = db.factors_matrix(
132
+ factor="momentum",
133
+ freq="4h",
134
+ asset_class="um"
135
+ )
@@ -0,0 +1,11 @@
1
"""Public package surface for the DuckDB connector.

Re-exports :class:`DataBase` and resolves the installed package version,
falling back to ``"0.0.0"`` when the distribution metadata is unavailable
(e.g. when running from a source checkout).
"""
from importlib.metadata import PackageNotFoundError, version

from duckdb_connector.db import DataBase

try:
    __version__ = version("duckdb-connector")
except PackageNotFoundError:
    # Not installed as a distribution — development checkout.
    __version__ = "0.0.0"

__all__ = ["DataBase"]
@@ -0,0 +1,708 @@
1
+ import duckdb
2
+ import fireducks.pandas as pd
3
+ from typing import Literal
4
+ from urllib.parse import urljoin
5
+ from pathlib import Path
6
+ from tqdm import tqdm
7
+ import datetime
8
+ import numpy as np
9
+
10
+
11
+ class DataBase:
12
+ FREQ_MAP = {
13
+ "1m": 1,
14
+ "3m": 3,
15
+ "5m": 5,
16
+ "15m": 15,
17
+ "30m": 30,
18
+ "1h": 60,
19
+ "4h": 240,
20
+ "6h": 360,
21
+ "12h": 720,
22
+ "1d": 1440,
23
+ }
24
+
25
+ ASSET_CLASS_MAP = {
26
+ "spot": "spot",
27
+ "um": "futures/um",
28
+ "cm": "futures/cm",
29
+ }
30
+
31
+ MOMENTUM_WINDOW_MAP = {
32
+ "1d": 7,
33
+ "12h": 6,
34
+ "4h": 6,
35
+ "1h": 4,
36
+ "30m": 4,
37
+ "15m": 4,
38
+ "3m": 5,
39
+ "1m": 5,
40
+ }
41
+
42
+ VOLATILITY_WINDOW_MAP = {
43
+ "1d": 14,
44
+ "12h": 14,
45
+ "4h": 18,
46
+ "1h": 24,
47
+ "30m": 12,
48
+ "15m": 16,
49
+ "3m": 20,
50
+ "1m": 30,
51
+ }
52
+
53
+ BETA_WINDOW_MAP = {
54
+ "1d": 90,
55
+ "12h": 180,
56
+ "4h": 180,
57
+ "1h": 720,
58
+ "30m": 1440,
59
+ "15m": 2880,
60
+ "3m": 7 * 24 * 20,
61
+ "1m": 7 * 24 * 60,
62
+ }
63
+
64
+ def __init__(
65
+ self,
66
+ cache_base_path: str = "/usr/local/share/binance_data/cache/",
67
+ db_path: str | None = None,
68
+ read_only: bool = True,
69
+ ):
70
+ """
71
+ Initialize the query class
72
+
73
+ Args:
74
+ cache_base_path: The base path of the cache data
75
+ """
76
+ self._db_path = db_path
77
+ self.cache_base_path = cache_base_path
78
+
79
+ if db_path:
80
+ self.conn = duckdb.connect(database=db_path, read_only=read_only)
81
+ else:
82
+ self.conn = duckdb.connect()
83
+
84
+ self._read_only = read_only
85
+
86
+ def _check_table_exists(self, table_name: str):
87
+ check_sql = f"SELECT COUNT(*) FROM information_schema.tables WHERE table_name = '{table_name}'"
88
+ table_exists = self.conn.query(check_sql).fetchone()[0] > 0
89
+ return table_exists
90
+
91
+ def klines_2_db(
92
+ self,
93
+ symbol: str,
94
+ freq: str = "1m",
95
+ asset_class: Literal["spot", "um", "cm"] = "um",
96
+ ):
97
+ if freq not in self.FREQ_MAP:
98
+ raise ValueError(
99
+ f"Invalid frequency: {freq}. Must be one of: {', '.join(self.FREQ_MAP.keys())}"
100
+ )
101
+
102
+ table_name = f"binance_{asset_class}_klines_{freq}_{symbol}"
103
+ table_exists = self._check_table_exists(table_name)
104
+
105
+ monthly_path = urljoin(
106
+ self.cache_base_path,
107
+ f"{self.ASSET_CLASS_MAP[asset_class]}/monthly/klines/{symbol}/1m/*.parquet",
108
+ )
109
+ daily_path = urljoin(
110
+ self.cache_base_path,
111
+ f"{self.ASSET_CLASS_MAP[asset_class]}/daily/klines/{symbol}/1m/*.parquet",
112
+ )
113
+
114
+ def get_create_table_sql(paths: str) -> str:
115
+ if freq == "1m":
116
+ return f"""
117
+ CREATE TABLE IF NOT EXISTS {table_name} AS
118
+ SELECT DISTINCT
119
+ to_timestamp(timestamp / 1000) as timestamp,
120
+ open, high, low, close,
121
+ volume, quote_volume,
122
+ taker_buy_volume, taker_buy_quote_volume
123
+ FROM read_parquet({paths})
124
+ ORDER BY timestamp
125
+ """
126
+ else:
127
+ return f"""
128
+ CREATE TABLE IF NOT EXISTS {table_name} AS
129
+ SELECT DISTINCT
130
+ time_bucket(INTERVAL '{self.FREQ_MAP[freq]} minutes', to_timestamp(timestamp / 1000)) as timestamp,
131
+ FIRST(open) as open,
132
+ MAX(high) as high,
133
+ MIN(low) as low,
134
+ LAST(close) as close,
135
+ SUM(volume) as volume,
136
+ SUM(quote_volume) as quote_volume,
137
+ SUM(taker_buy_volume) as taker_buy_volume,
138
+ SUM(taker_buy_quote_volume) as taker_buy_quote_volume
139
+ FROM read_parquet({paths})
140
+ GROUP BY time_bucket(INTERVAL '{self.FREQ_MAP[freq]} minutes', to_timestamp(timestamp / 1000))
141
+ ORDER BY timestamp
142
+ """
143
+
144
+ def get_insert_table_sql(paths: str, last_ts: datetime.datetime) -> str:
145
+ last_ts = datetime.datetime(
146
+ last_ts.year, last_ts.month, last_ts.day, 0, 0, 0
147
+ ) + datetime.timedelta(days=1)
148
+
149
+ last_ts_ms = int(last_ts.timestamp() * 1000)
150
+ if freq == "1m":
151
+ return f"""
152
+ WITH filtered_data AS (
153
+ SELECT *
154
+ FROM read_parquet({paths})
155
+ WHERE timestamp >= {last_ts_ms}
156
+ )
157
+ INSERT INTO {table_name}
158
+ SELECT DISTINCT
159
+ to_timestamp(timestamp / 1000) as timestamp,
160
+ open, high, low, close,
161
+ volume, quote_volume,
162
+ taker_buy_volume, taker_buy_quote_volume
163
+ FROM filtered_data
164
+ ORDER BY timestamp
165
+ """
166
+ else:
167
+ return f"""
168
+ WITH filtered_data AS (
169
+ SELECT *
170
+ FROM read_parquet({paths})
171
+ WHERE timestamp >= {last_ts_ms}
172
+ )
173
+ INSERT INTO {table_name}
174
+ SELECT DISTINCT
175
+ time_bucket(INTERVAL '{self.FREQ_MAP[freq]} minutes', to_timestamp(timestamp / 1000)) as timestamp,
176
+ FIRST(open) as open,
177
+ MAX(high) as high,
178
+ MIN(low) as low,
179
+ LAST(close) as close,
180
+ SUM(volume) as volume,
181
+ SUM(quote_volume) as quote_volume,
182
+ SUM(taker_buy_volume) as taker_buy_volume,
183
+ SUM(taker_buy_quote_volume) as taker_buy_quote_volume
184
+ FROM filtered_data
185
+ GROUP BY time_bucket(INTERVAL '{self.FREQ_MAP[freq]} minutes', to_timestamp(timestamp / 1000))
186
+ ORDER BY timestamp
187
+ """
188
+
189
+ path_options = [
190
+ f"['{monthly_path}', '{daily_path}']",
191
+ f"['{daily_path}']",
192
+ f"['{monthly_path}']",
193
+ ]
194
+
195
+ for paths in path_options:
196
+ try:
197
+ if not table_exists:
198
+ self.conn.execute(get_create_table_sql(paths))
199
+ else:
200
+ last_ts = self.conn.query(
201
+ f"SELECT MAX(timestamp) FROM {table_name}"
202
+ ).fetchone()[0]
203
+ self.conn.execute(get_insert_table_sql(paths, last_ts))
204
+ break
205
+ except duckdb.duckdb.IOException:
206
+ continue
207
+
208
+ def _list_klines_symbols(self, asset_class: Literal["spot", "um", "cm"] = "um"):
209
+ monthly_path = urljoin(
210
+ self.cache_base_path, f"{self.ASSET_CLASS_MAP[asset_class]}/monthly/klines/"
211
+ )
212
+ daily_path = urljoin(
213
+ self.cache_base_path, f"{self.ASSET_CLASS_MAP[asset_class]}/daily/klines/"
214
+ )
215
+
216
+ monthly_symbols = [x.name for x in Path(monthly_path).iterdir() if x.is_dir()]
217
+ daily_symbols = [x.name for x in Path(daily_path).iterdir() if x.is_dir()]
218
+
219
+ return list(set(monthly_symbols + daily_symbols))
220
+
221
+ def _list_metrics_symbols(self, asset_class: Literal["um"] = "um"):
222
+ daily_path = urljoin(
223
+ self.cache_base_path, f"{self.ASSET_CLASS_MAP[asset_class]}/daily/metrics/"
224
+ )
225
+ daily_symbols = [x.name for x in Path(daily_path).iterdir() if x.is_dir()]
226
+ return daily_symbols
227
+
228
+ def list_all_symbols(
229
+ self,
230
+ asset_class: Literal["spot", "um", "cm"] = "um",
231
+ data_type: Literal["klines", "metrics"] = "klines",
232
+ ):
233
+ if data_type == "klines":
234
+ return self._list_klines_symbols(asset_class)
235
+ elif data_type == "metrics":
236
+ if asset_class != "um":
237
+ raise ValueError("`metrics` only support `um`")
238
+ return self._list_metrics_symbols(asset_class)
239
+
240
+ def klines(
241
+ self,
242
+ symbol: str,
243
+ start_date: str | None = None,
244
+ end_date: str | None = None,
245
+ freq: str = "1m",
246
+ asset_class: Literal["spot", "um", "cm"] = "um",
247
+ ):
248
+ if freq not in self.FREQ_MAP:
249
+ raise ValueError(
250
+ f"Invalid frequency: {freq}. Must be one of: {', '.join(self.FREQ_MAP.keys())}"
251
+ )
252
+
253
+ table_klines = f"binance_{asset_class}_klines_{freq}_{symbol}"
254
+ table_factors = f"binance_{asset_class}_factors_{freq}_{symbol}"
255
+ sql = f"""
256
+ WITH klines AS (
257
+ SELECT * FROM {table_klines}
258
+ ),
259
+ factors AS (
260
+ SELECT * FROM {table_factors}
261
+ )
262
+ SELECT klines.*, factors.return, factors.momentum, factors.volatility, factors.beta
263
+ FROM klines
264
+ LEFT JOIN factors ON klines.timestamp = factors.timestamp
265
+ """
266
+
267
+ if start_date:
268
+ start_date = pd.to_datetime(start_date).timestamp()
269
+ sql += f" WHERE klines.timestamp >= TO_TIMESTAMP('{start_date}')"
270
+ if end_date:
271
+ end_date = pd.to_datetime(end_date).timestamp()
272
+ sql += f" AND klines.timestamp < TO_TIMESTAMP('{end_date}')"
273
+
274
+ df = self.conn.query(sql).to_df()
275
+
276
+ df.loc[:self.MOMENTUM_WINDOW_MAP[freq], 'momentum'] = np.nan
277
+ df.loc[:self.VOLATILITY_WINDOW_MAP[freq], 'volatility'] = np.nan
278
+ df.loc[:self.BETA_WINDOW_MAP[freq], 'beta'] = np.nan
279
+ return df
280
+
281
+ def query_data(
282
+ self,
283
+ symbol: str,
284
+ freq: str | None = None,
285
+ start_date: str | None = None,
286
+ end_date: str | None = None,
287
+ fields: list[str] | None = None,
288
+ asset_class: Literal["spot", "futures/um", "futures/cm"] = "futures/um",
289
+ data_type: Literal["klines", "metrics"] = "klines",
290
+ ):
291
+ if data_type == "metrics" and asset_class != "futures/um":
292
+ raise ValueError("`metrics` only support `futures/um`")
293
+
294
+ # 检查 klines 数据类型是否提供了 freq 参数
295
+ if data_type == "klines" and not freq:
296
+ raise ValueError("`klines` data type must provide `freq` parameter")
297
+
298
+ if start_date:
299
+ start_date = pd.to_datetime(start_date).timestamp() * 1000
300
+ if end_date:
301
+ end_date = pd.to_datetime(end_date).timestamp() * 1000
302
+
303
+ # 构建 SELECT 语句的列部分
304
+ columns = "*" if not fields else ", ".join(fields)
305
+
306
+ if data_type == "klines":
307
+ try:
308
+ monthly_path = urljoin(
309
+ self.cache_base_path,
310
+ f"{asset_class}/monthly/klines/{symbol}/{freq}/*.parquet",
311
+ )
312
+ daily_path = urljoin(
313
+ self.cache_base_path,
314
+ f"{asset_class}/daily/klines/{symbol}/{freq}/*.parquet",
315
+ )
316
+
317
+ if start_date and end_date:
318
+ df = self.conn.query(
319
+ f"SELECT {columns} FROM read_parquet(['{monthly_path}', '{daily_path}']) "
320
+ f"WHERE open_time >= {start_date} AND open_time < {end_date}"
321
+ ).to_df()
322
+ else:
323
+ df = self.conn.query(
324
+ f"SELECT {columns} FROM read_parquet(['{monthly_path}', '{daily_path}'])"
325
+ ).to_df()
326
+ except duckdb.duckdb.IOException:
327
+ if start_date and end_date:
328
+ df = self.conn.query(
329
+ f"SELECT {columns} FROM read_parquet(['{daily_path}']) "
330
+ f"WHERE open_time >= {start_date} AND open_time < {end_date}"
331
+ ).to_df()
332
+ else:
333
+ df = self.conn.query(
334
+ f"SELECT {columns} FROM read_parquet(['{daily_path}'])"
335
+ ).to_df()
336
+ elif data_type == "metrics":
337
+ daily_path = urljoin(
338
+ self.cache_base_path, f"{asset_class}/daily/metrics/{symbol}/*.parquet"
339
+ )
340
+
341
+ if start_date and end_date:
342
+ df = self.conn.query(
343
+ f"SELECT {columns} FROM read_parquet(['{daily_path}']) "
344
+ f"WHERE open_time >= {start_date} AND open_time < {end_date}"
345
+ ).to_df()
346
+ else:
347
+ df = self.conn.query(
348
+ f"SELECT {columns} FROM read_parquet(['{daily_path}'])"
349
+ ).to_df()
350
+
351
+ return df
352
+
353
+ def factors_2_db(
354
+ self, symbol: str, freq: str, asset_class: Literal["spot", "um", "cm"] = "um"
355
+ ):
356
+
357
+
358
+ if freq not in self.MOMENTUM_WINDOW_MAP:
359
+ raise ValueError(f"Not support freq: {freq}")
360
+
361
+ m_window = self.MOMENTUM_WINDOW_MAP[freq]
362
+ v_window = self.VOLATILITY_WINDOW_MAP[freq]
363
+ b_window = self.BETA_WINDOW_MAP[freq]
364
+
365
+ table_name = f"binance_{asset_class}_klines_{freq}_{symbol}"
366
+ btc_table = f"binance_{asset_class}_klines_{freq}_BTCUSDT"
367
+
368
+ factors_table_name = f"binance_{asset_class}_factors_{freq}_{symbol}"
369
+
370
+ sql = f"""
371
+ CREATE OR REPLACE TABLE {factors_table_name} AS
372
+ WITH base_returns AS (
373
+ SELECT DISTINCT
374
+ t1.timestamp,
375
+ (t1.close/LAG(t1.close) OVER (ORDER BY t1.timestamp) - 1) AS return,
376
+ (t2.close/LAG(t2.close) OVER (ORDER BY t2.timestamp) - 1) AS btc_return
377
+ FROM {table_name} t1
378
+ LEFT JOIN {btc_table} t2 ON t1.timestamp = t2.timestamp
379
+ ),
380
+ momentum AS (
381
+ SELECT
382
+ *,
383
+ PRODUCT(1 + return) OVER (
384
+ ORDER BY timestamp
385
+ ROWS BETWEEN {m_window - 1} PRECEDING AND CURRENT ROW
386
+ ) - 1 AS momentum
387
+ FROM base_returns
388
+ ),
389
+ volatility AS (
390
+ SELECT
391
+ *,
392
+ STDDEV(return) OVER (
393
+ ORDER BY timestamp
394
+ ROWS BETWEEN {v_window - 1} PRECEDING AND CURRENT ROW
395
+ ) AS volatility
396
+ FROM momentum
397
+ ),
398
+ beta AS (
399
+ SELECT
400
+ *,
401
+ REGR_SLOPE(btc_return, return) OVER (
402
+ ORDER BY timestamp
403
+ ROWS BETWEEN {b_window - 1} PRECEDING AND CURRENT ROW
404
+ ) AS beta
405
+ FROM volatility
406
+ )
407
+ SELECT
408
+ timestamp,
409
+ return,
410
+ momentum,
411
+ volatility,
412
+ beta
413
+ FROM beta
414
+ ORDER BY timestamp
415
+ """
416
+ self.conn.execute(sql)
417
+
418
+ def update_klines(
419
+ self,
420
+ asset_class: Literal["spot", "um", "cm"] = "um",
421
+ symbols: list[str] | None = None,
422
+ ):
423
+ if self._read_only:
424
+ raise ValueError("`update_klines` is not allowed in read-only mode, please set `read_only=False`")
425
+
426
+ if not symbols:
427
+ symbols = self.list_all_symbols(asset_class=asset_class, data_type="klines")
428
+ freqs = ["1m", "3m", "15m", "30m", "1h", "4h", "12h", "1d"]
429
+ for symbol in tqdm(symbols):
430
+ try:
431
+ for freq in tqdm(freqs, leave=False):
432
+ self.klines_2_db(symbol=symbol, freq=freq, asset_class=asset_class)
433
+ except Exception as e:
434
+ print(f"Error: {e} {symbol} {freq}")
435
+
436
+ def update_factors(
437
+ self, asset_class: Literal["um"] = "um", symbols: list[str] | None = None
438
+ ):
439
+ if self._read_only:
440
+ raise ValueError("`update_factors` is not allowed in read-only mode, please set `read_only=False`")
441
+
442
+ if not symbols:
443
+ symbols = self.list_all_symbols(asset_class=asset_class, data_type="klines")
444
+ freqs = ["1m", "3m", "15m", "30m", "1h", "4h", "12h", "1d"]
445
+ for symbol in tqdm(symbols):
446
+ for freq in tqdm(freqs, leave=False):
447
+ try:
448
+ self.factors_2_db(symbol=symbol, freq=freq, asset_class=asset_class)
449
+ except Exception as e:
450
+ print(f"Error: {e} {symbol} {freq}")
451
+
452
+
453
+ def return_matrix(
454
+ self,
455
+ symbols: list[str],
456
+ start_date: str | None = None,
457
+ end_date: str | None = None,
458
+ freq: str = "1m",
459
+ asset_class: Literal["spot", "um", "cm"] = "um",
460
+ ) -> pd.DataFrame:
461
+ # 构建基础SQL查询 - 使用UNION ALL获取所有唯一时间戳
462
+ base_sql = """
463
+ WITH all_timestamps AS (
464
+ SELECT DISTINCT timestamp
465
+ FROM (
466
+ """
467
+
468
+ # 为每个交易对添加时间戳
469
+ for i, symbol in enumerate(symbols):
470
+ base_sql += f"""
471
+ SELECT timestamp
472
+ FROM binance_{asset_class}_factors_{freq}_{symbol}
473
+ {f"WHERE timestamp >= TIMESTAMP '{start_date}' AND timestamp < TIMESTAMP '{end_date}'" if start_date and end_date else ""}
474
+ {'UNION ALL' if i < len(symbols)-1 else ''}
475
+ """
476
+
477
+ base_sql += """
478
+ )
479
+ ORDER BY timestamp
480
+ )
481
+ SELECT at.timestamp"""
482
+
483
+ # 为每个交易对添加return列
484
+ for symbol in symbols:
485
+ base_sql += f"""
486
+ , f_{symbol}.return as '{symbol}'
487
+ """
488
+
489
+ base_sql += "\nFROM all_timestamps at\n"
490
+
491
+ # 添加所有FULL OUTER JOIN语句
492
+ for symbol in symbols:
493
+ base_sql += f"""
494
+ FULL OUTER JOIN binance_{asset_class}_factors_{freq}_{symbol} f_{symbol}
495
+ ON at.timestamp = f_{symbol}.timestamp
496
+ """
497
+
498
+ base_sql += "\nORDER BY at.timestamp"
499
+
500
+ # 执行查询并返回DataFrame
501
+ return self.conn.query(base_sql).to_df()
502
+
503
+ def return_matrix_v2(
504
+ self,
505
+ symbols: list[str],
506
+ start_date: str | None = None,
507
+ end_date: str | None = None,
508
+ freq: str = "1m",
509
+ asset_class: Literal["spot", "um", "cm"] = "um",
510
+ ) -> pd.DataFrame:
511
+
512
+ dfs = []
513
+ for symbol in tqdm(symbols):
514
+ sql = f"""
515
+ SELECT timestamp, return as '{symbol}'
516
+ FROM binance_{asset_class}_factors_{freq}_{symbol}
517
+ """
518
+
519
+ if start_date and end_date:
520
+ sql += f" WHERE timestamp >= TIMESTAMP '{start_date}' AND timestamp < TIMESTAMP '{end_date}'"
521
+
522
+ df = self.conn.query(sql).to_df()
523
+ df.set_index("timestamp", inplace=True)
524
+ dfs.append(df)
525
+
526
+ return pd.concat(dfs, axis=1)
527
+
528
+ def create_merged_klines(
529
+ self,
530
+ symbols: list[str],
531
+ freq: str = "1m",
532
+ asset_class: Literal["spot", "um", "cm"] = "um",
533
+ ):
534
+ if self._read_only:
535
+ raise ValueError("`create_merged_klines` is not allowed in read-only mode, please set `read_only=False`")
536
+
537
+ merged_table = f"binance_{asset_class}_klines_{freq}"
538
+
539
+ # 使用UNION ALL一次性合并所有数据
540
+ create_sql = f"""
541
+ CREATE OR REPLACE TABLE {merged_table} AS
542
+ SELECT * FROM (
543
+ """
544
+
545
+ for i, symbol in enumerate(symbols):
546
+ source_table = f"binance_{asset_class}_klines_{freq}_{symbol}"
547
+ create_sql += f"""
548
+ SELECT
549
+ timestamp,
550
+ '{symbol}' as symbol,
551
+ open,
552
+ high,
553
+ low,
554
+ close,
555
+ volume,
556
+ quote_volume,
557
+ taker_buy_volume,
558
+ taker_buy_quote_volume
559
+ FROM {source_table}
560
+ {'UNION ALL' if i < len(symbols)-1 else ''}
561
+ """
562
+
563
+ create_sql += ")"
564
+
565
+ try:
566
+ self.conn.execute(create_sql)
567
+ except Exception as e:
568
+ print(f"Error creating merged table: {e}")
569
+
570
+ def create_merged_factors(
571
+ self,
572
+ symbols: list[str],
573
+ freq: str = "1m",
574
+ asset_class: Literal["spot", "um", "cm"] = "um",
575
+ ):
576
+ if self._read_only:
577
+ raise ValueError("`create_merged_klines` is not allowed in read-only mode, please set `read_only=False`")
578
+
579
+ merged_table = f"binance_{asset_class}_factors_{freq}"
580
+
581
+ # 使用UNION ALL一次性合并所有数据
582
+ create_sql = f"""
583
+ CREATE OR REPLACE TABLE {merged_table} AS
584
+ SELECT * FROM (
585
+ """
586
+
587
+ for i, symbol in enumerate(symbols):
588
+ source_table = f"binance_{asset_class}_factors_{freq}_{symbol}"
589
+ create_sql += f"""
590
+ SELECT
591
+ timestamp,
592
+ '{symbol}' as symbol,
593
+ return,
594
+ momentum,
595
+ volatility,
596
+ beta
597
+ FROM {source_table}
598
+ {'UNION ALL' if i < len(symbols)-1 else ''}
599
+ """
600
+
601
+ create_sql += ")"
602
+
603
+ try:
604
+ self.conn.execute(create_sql)
605
+ except Exception as e:
606
+ print(f"Error creating merged table: {e}")
607
+
608
+ def klines_matrix(
609
+ self,
610
+ symbols: list[str] | None = None,
611
+ freq: str = "1m",
612
+ asset_class: Literal["spot", "um", "cm"] = "um",
613
+ value_col: Literal["open", "high", "low", "close", "volume", "quote_volume", "taker_buy_volume", "taker_buy_quote_volume"] = "close",
614
+ start_date: str | None = None,
615
+ end_date: str | None = None,
616
+ ) -> pd.DataFrame:
617
+ table_name = f"binance_{asset_class}_klines_{freq}"
618
+
619
+ sql = f"""
620
+ WITH filtered_klines AS (
621
+ SELECT timestamp, symbol, {value_col}
622
+ FROM {table_name}
623
+ WHERE 1=1
624
+ {f"AND symbol IN ({','.join([f''''{s}' ''' for s in symbols])})" if symbols else ''}
625
+ {f"AND timestamp >= TIMESTAMP '{start_date}'" if start_date else ''}
626
+ {f"AND timestamp < TIMESTAMP '{end_date}'" if end_date else ''}
627
+ )
628
+ PIVOT filtered_klines ON symbol USING min({value_col})
629
+ ORDER BY timestamp
630
+ """
631
+
632
+ return self.conn.query(sql).to_df()
633
+
634
+ def factors_matrix(
635
+ self,
636
+ symbols: list[str] | None = None,
637
+ freq: str = "1m",
638
+ asset_class: Literal["spot", "um", "cm"] = "um",
639
+ value_col: Literal["return", "momentum", "volatility", "beta"] = "return",
640
+ start_date: str | None = None,
641
+ end_date: str | None = None,
642
+ ) -> pd.DataFrame:
643
+ table_name = f"binance_{asset_class}_factors_{freq}"
644
+
645
+ sql = f"""
646
+ WITH filtered_factors AS (
647
+ SELECT timestamp, symbol, {value_col}
648
+ FROM {table_name}
649
+ WHERE 1=1
650
+ {f"AND symbol IN ({','.join([f''''{s}' ''' for s in symbols])})" if symbols else ''}
651
+ {f"AND timestamp >= TIMESTAMP '{start_date}'" if start_date else ''}
652
+ {f"AND timestamp < TIMESTAMP '{end_date}'" if end_date else ''}
653
+ )
654
+ PIVOT filtered_factors ON symbol USING min({value_col})
655
+ ORDER BY timestamp
656
+ """
657
+
658
+ return self.conn.query(sql).to_df()
659
+
660
+ def klines_table(
661
+ self,
662
+ symbols: list[str] | None = None,
663
+ freq: str = "1m",
664
+ asset_class: Literal["spot", "um", "cm"] = "um",
665
+ start_date: str | None = None,
666
+ end_date: str | None = None,
667
+ ):
668
+ sql = f"""
669
+ SELECT * FROM binance_{asset_class}_klines_{freq}
670
+ WHERE 1=1
671
+ {f"AND symbol IN ({','.join([f''''{s}' ''' for s in symbols])})" if symbols else ''}
672
+ {f"AND timestamp >= TIMESTAMP '{start_date}'" if start_date else ''}
673
+ {f"AND timestamp < TIMESTAMP '{end_date}'" if end_date else ''}
674
+ """
675
+ return self.conn.query(sql).to_df()
676
+
677
+ def factors_table(
678
+ self,
679
+ symbols: list[str] | None = None,
680
+ freq: str = "1m",
681
+ asset_class: Literal["spot", "um", "cm"] = "um",
682
+ start_date: str | None = None,
683
+ end_date: str | None = None,
684
+ ):
685
+ sql = f"""
686
+ SELECT * FROM binance_{asset_class}_factors_{freq}
687
+ WHERE 1=1
688
+ {f"AND symbol IN ({','.join([f''''{s}' ''' for s in symbols])})" if symbols else ''}
689
+ {f"AND timestamp >= TIMESTAMP '{start_date}'" if start_date else ''}
690
+ {f"AND timestamp < TIMESTAMP '{end_date}'" if end_date else ''}
691
+ """
692
+ return self.conn.query(sql).to_df()
693
+
694
+
695
+
696
+ if __name__ == "__main__":
697
+ path = "/usr/local/share/binance_data/cache/"
698
+ db_path = "/usr/local/share/binance_data/data.db"
699
+ db = DataBase(path, db_path, read_only=True)
700
+
701
+ # db.update_klines(asset_class="spot", symbols=["BIOUSDT"])
702
+ # db.update_factors(asset_class="spot", symbols=["BIOUSDT"])
703
+
704
+ df = db.klines_table(symbols=["BTCUSDT", "ETHUSDT", "SOLUSDT"], freq="1m", asset_class="spot", start_date="2023-12-31 23:59:00")
705
+ print(df)
706
+
707
+ df = db.factors_table(symbols=["BTCUSDT", "ETHUSDT", "SOLUSDT"], freq="1m", asset_class="spot", start_date="2023-12-31 23:59:00")
708
+ print(df)
@@ -0,0 +1,304 @@
1
+ import duckdb
2
+ from typing import Literal
3
+ from urllib.parse import urljoin
4
+ from tqdm import tqdm
5
+ from duckdb_connector.error import DBReadOnlyError, DBError
6
+
7
+
8
+ class DataBase:
9
+ FREQ_MAP = {
10
+ "1m": 1,
11
+ "3m": 3,
12
+ "15m": 15,
13
+ "30m": 30,
14
+ "1h": 60,
15
+ "4h": 240,
16
+ "12h": 720,
17
+ "1d": 1440,
18
+ }
19
+
20
+ ASSET_CLASS_MAP = {
21
+ "spot": "spot",
22
+ "um": "futures/um",
23
+ # "cm": "/futures/cm", #NOTE: not supported yet
24
+ }
25
+
26
+ MOMENTUM_WINDOW_MAP = {
27
+ "1d": 7,
28
+ "12h": 6,
29
+ "4h": 6,
30
+ "1h": 4,
31
+ "30m": 4,
32
+ "15m": 4,
33
+ "3m": 5,
34
+ "1m": 5,
35
+ }
36
+
37
+ VOLATILITY_WINDOW_MAP = {
38
+ "1d": 14,
39
+ "12h": 14,
40
+ "4h": 18,
41
+ "1h": 24,
42
+ "30m": 12,
43
+ "15m": 16,
44
+ "3m": 20,
45
+ "1m": 30,
46
+ }
47
+
48
+ BETA_WINDOW_MAP = {
49
+ "1d": 90,
50
+ "12h": 180,
51
+ "4h": 180,
52
+ "1h": 720,
53
+ "30m": 1440,
54
+ "15m": 2880,
55
+ "3m": 7 * 24 * 20,
56
+ "1m": 7 * 24 * 60,
57
+ }
58
+
59
+ def __init__(
60
+ self,
61
+ cache_path: str | None = None,
62
+ db_path: str | None = None,
63
+ read_only: bool = True,
64
+ ):
65
+ self._db_path = db_path
66
+ self._cache_path = cache_path
67
+ self._read_only = read_only
68
+ if db_path:
69
+ self.conn = duckdb.connect(database=db_path, read_only=read_only)
70
+ else:
71
+ self.conn = duckdb.connect()
72
+
73
+ def _read_only_check(self):
74
+ if self._read_only:
75
+ raise DBReadOnlyError(
76
+ "Cannot create table in `read-only` mode, please set `read_only=False`"
77
+ )
78
+
79
+ def _asset_class_check(self, asset_class: Literal["spot", "um"]):
80
+ if asset_class not in self.ASSET_CLASS_MAP:
81
+ raise DBError(
82
+ f"Invalid asset class: {asset_class}. Must be one of: {', '.join(self.ASSET_CLASS_MAP.keys())}"
83
+ )
84
+
85
+ def _freq_check(
86
+ self, freq: Literal["1m", "3m", "15m", "30m", "1h", "4h", "12h", "1d"]
87
+ ):
88
+ if freq not in self.FREQ_MAP:
89
+ raise DBError(
90
+ f"Invalid frequency: {freq}. Must be one of: {', '.join(self.FREQ_MAP.keys())}"
91
+ )
92
+
93
+ def _create_klines_table(
94
+ self,
95
+ asset_class: Literal["spot", "um"],
96
+ freq: Literal["1m", "3m", "15m", "30m", "1h", "4h", "12h", "1d"],
97
+ ):
98
+ self._read_only_check()
99
+ self._asset_class_check(asset_class)
100
+ self._freq_check(freq)
101
+ path = urljoin(self._cache_path, self.ASSET_CLASS_MAP[asset_class])
102
+ if freq == "1m":
103
+ sql = f"""
104
+ CREATE OR REPLACE TABLE binance_{asset_class}_klines_{freq} AS
105
+ SELECT DISTINCT
106
+ SPLIT_PART(filename, '/', -3) as symbol,
107
+ timestamp,
108
+ open,
109
+ high,
110
+ low,
111
+ close,
112
+ volume,
113
+ quote_volume,
114
+ taker_buy_volume,
115
+ taker_buy_quote_volume
116
+ FROM read_parquet(
117
+ '{path}/*/klines/*/1m/*.parquet',
118
+ FILENAME = true
119
+ )
120
+ """
121
+ else:
122
+ sql = f"""
123
+ CREATE OR REPLACE TABLE binance_{asset_class}_klines_{freq} AS
124
+ SELECT DISTINCT
125
+ SPLIT_PART(filename, '/', -3) as symbol,
126
+ time_bucket(INTERVAL {self.FREQ_MAP[freq]} minutes, to_timestamp(timestamp / 1000)) as timestamp,
127
+ FIRST(open) as open,
128
+ MAX(high) as high,
129
+ MIN(low) as low,
130
+ LAST(close) as close,
131
+ SUM(volume) as volume,
132
+ SUM(quote_volume) as quote_volume,
133
+ SUM(taker_buy_volume) as taker_buy_volume,
134
+ SUM(taker_buy_quote_volume) as taker_buy_quote_volume
135
+ FROM read_parquet(
136
+ '{path}/*/klines/*/1m/*.parquet',
137
+ FILENAME = true
138
+ )
139
+ GROUP BY symbol, time_bucket(INTERVAL {self.FREQ_MAP[freq]} minutes, to_timestamp(timestamp / 1000))
140
+ """
141
+ self.conn.execute(sql)
142
+
143
+ def _create_factors_table(
144
+ self,
145
+ asset_class: Literal["spot", "um"],
146
+ freq: Literal["1m", "3m", "15m", "30m", "1h", "4h", "12h", "1d"],
147
+ ):
148
+ self._read_only_check()
149
+ self._asset_class_check(asset_class)
150
+ self._freq_check(freq)
151
+
152
+ m_window = self.MOMENTUM_WINDOW_MAP[freq]
153
+ v_window = self.VOLATILITY_WINDOW_MAP[freq]
154
+ b_window = self.BETA_WINDOW_MAP[freq]
155
+
156
+ sql = f"""
157
+ CREATE OR REPLACE TABLE binance_{asset_class}_factors_{freq} AS
158
+ WITH base_table AS (
159
+ SELECT DISTINCT
160
+ a.symbol,
161
+ a.timestamp,
162
+ a.close,
163
+ a.return,
164
+ b.return AS btc_return
165
+ FROM (
166
+ SELECT DISTINCT
167
+ symbol,
168
+ timestamp,
169
+ close,
170
+ close / LAG(close) OVER (PARTITION BY symbol ORDER BY timestamp) - 1 AS return
171
+ FROM binance_{asset_class}_klines_{freq}
172
+ ) a
173
+ LEFT JOIN (
174
+ SELECT
175
+ timestamp,
176
+ close / LAG(close) OVER (ORDER BY timestamp) - 1 AS return
177
+ FROM binance_{asset_class}_klines_{freq}
178
+ WHERE symbol = 'BTCUSDT'
179
+ ) b ON a.timestamp = b.timestamp
180
+ ),
181
+ momentum_table AS (
182
+ SELECT
183
+ *,
184
+ CASE
185
+ WHEN ROW_NUMBER() OVER (PARTITION BY symbol ORDER BY timestamp) >= {m_window}
186
+ THEN PRODUCT(1 + return) OVER (PARTITION BY symbol ORDER BY timestamp ROWS BETWEEN {m_window - 1} PRECEDING AND CURRENT ROW) - 1
187
+ ELSE NULL
188
+ END AS momentum
189
+ FROM base_table
190
+ ),
191
+ vol_table AS (
192
+ SELECT
193
+ *,
194
+ CASE
195
+ WHEN ROW_NUMBER() OVER (PARTITION BY symbol ORDER BY timestamp) >= {v_window}
196
+ THEN STDDEV(return) OVER (PARTITION BY symbol ORDER BY timestamp ROWS BETWEEN {v_window - 1} PRECEDING AND CURRENT ROW)
197
+ ELSE NULL
198
+ END AS volatility
199
+ FROM momentum_table
200
+ ),
201
+ beta_table AS (
202
+ SELECT
203
+ *,
204
+ CASE
205
+ WHEN ROW_NUMBER() OVER (PARTITION BY symbol ORDER BY timestamp) >= {b_window}
206
+ THEN REGR_SLOPE(btc_return, return) OVER (PARTITION BY symbol ORDER BY timestamp ROWS BETWEEN {b_window - 1} PRECEDING AND CURRENT ROW)
207
+ ELSE NULL
208
+ END AS beta
209
+ FROM vol_table
210
+ )
211
+
212
+ SELECT
213
+ timestamp,
214
+ symbol,
215
+ close,
216
+ return,
217
+ momentum,
218
+ volatility,
219
+ beta
220
+ FROM beta_table;
221
+ """
222
+
223
+ self.conn.execute(sql)
224
+
225
+ def update_klines(self):
226
+ for asset_class in tqdm(self.ASSET_CLASS_MAP):
227
+ for freq in tqdm(self.FREQ_MAP, leave=False):
228
+ try:
229
+ self._create_klines_table(asset_class, freq)
230
+ except Exception as e:
231
+ print(f"Error creating klines table for {asset_class} {freq}: {e}")
232
+
233
+ def update_factors(self):
234
+ for asset_class in tqdm(self.ASSET_CLASS_MAP):
235
+ for freq in tqdm(self.FREQ_MAP, leave=False):
236
+ try:
237
+ self._create_factors_table(asset_class, freq)
238
+ except Exception as e:
239
+ print(f"Error creating factors table for {asset_class} {freq}: {e}")
240
+
241
+ def df_factors(
242
+ self,
243
+ symbols: list[str] | None = None,
244
+ freq: Literal["1m", "3m", "15m", "30m", "1h", "4h", "12h", "1d"] = "1m",
245
+ asset_class: Literal["spot", "um"] = "um",
246
+ start_date: str | None = None,
247
+ end_date: str | None = None,
248
+ order_by_timestamp: bool = False,
249
+ ):
250
+ self._asset_class_check(asset_class)
251
+ self._freq_check(freq)
252
+
253
+ sql = f"""
254
+ SELECT * FROM binance_{asset_class}_factors_{freq}
255
+ WHERE 1=1
256
+ {f"AND symbol IN ({','.join([f''''{s}' ''' for s in symbols])})" if symbols else ""}
257
+ {f"AND timestamp >= TIMESTAMP '{start_date}'" if start_date else ""}
258
+ {f"AND timestamp < TIMESTAMP '{end_date}'" if end_date else ""}
259
+ """
260
+ if order_by_timestamp:
261
+ sql += "ORDER BY timestamp"
262
+ return self.conn.query(sql).to_df()
263
+
264
+ def df_klines(
265
+ self,
266
+ symbols: list[str] | None = None,
267
+ freq: Literal["1m", "3m", "15m", "30m", "1h", "4h", "12h", "1d"] = "1m",
268
+ asset_class: Literal["spot", "um"] = "um",
269
+ start_date: str | None = None,
270
+ end_date: str | None = None,
271
+ order_by_timestamp: bool = False,
272
+ ):
273
+ self._asset_class_check(asset_class)
274
+ self._freq_check(freq)
275
+
276
+ sql = f"""
277
+ SELECT * FROM binance_{asset_class}_klines_{freq}
278
+ WHERE 1=1
279
+ {f"AND symbol IN ({','.join([f''''{s}' ''' for s in symbols])})" if symbols else ""}
280
+ {f"AND timestamp >= TIMESTAMP '{start_date}'" if start_date else ""}
281
+ {f"AND timestamp < TIMESTAMP '{end_date}'" if end_date else ""}
282
+ """
283
+ if order_by_timestamp:
284
+ sql += "ORDER BY timestamp"
285
+ return self.conn.query(sql).to_df()
286
+
287
+ def factors_matrix(self, symbols: list[str] | None = None, factor: Literal["return", "momentum", "volatility", "beta"] = "return", freq: Literal["1m", "3m", "15m", "30m", "1h", "4h", "12h", "1d"] = "1m", asset_class: Literal["spot", "um"] = "um", start_date: str | None = None, end_date: str | None = None):
288
+ self._asset_class_check(asset_class)
289
+ self._freq_check(freq)
290
+
291
+ sql = f"""
292
+ WITH filtered_factors AS (
293
+ SELECT timestamp, symbol, {factor}
294
+ FROM binance_{asset_class}_factors_{freq}
295
+ WHERE 1=1
296
+ {f"AND symbol IN ({','.join([f''''{s}' ''' for s in symbols])})" if symbols else ''}
297
+ {f"AND timestamp >= TIMESTAMP '{start_date}'" if start_date else ''}
298
+ {f"AND timestamp < TIMESTAMP '{end_date}'" if end_date else ''}
299
+ )
300
+ PIVOT filtered_factors ON symbol USING min({factor})
301
+ ORDER BY timestamp
302
+ """
303
+
304
+ return self.conn.query(sql).to_df()
@@ -0,0 +1,5 @@
1
+ class DBError(Exception):
2
+ pass
3
+
4
+ class DBReadOnlyError(DBError):
5
+ pass
@@ -0,0 +1,21 @@
1
+ [project]
2
+ name = "duck_client"
3
+ version = "0.1.2"
4
+ description = ""
5
+ authors = [
6
+ {name = "River.Shi",email = "nachuan.shi.quant@gamil.com"}
7
+ ]
8
+ readme = "README.md"
9
+ requires-python = ">=3.10,<3.14"
10
+ dependencies = [
11
+ "duckdb (>=1.1.3,<2.0.0)",
12
+ "numpy (>=2.2.2,<3.0.0)",
13
+ "pandas (>=2.2.3,<3.0.0)",
14
+ "tqdm (>=4.67.1,<5.0.0)",
15
+ "fireducks (>=1.1.8,<2.0.0)",
16
+ "mpire (>=2.10.2,<3.0.0)",
17
+ ]
18
+
19
+ [build-system]
20
+ requires = ["poetry-core>=2.0.0,<3.0.0"]
21
+ build-backend = "poetry.core.masonry.api"