signalflow-trading 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- signalflow/__init__.py +21 -0
- signalflow/analytics/__init__.py +0 -0
- signalflow/core/__init__.py +46 -0
- signalflow/core/base_mixin.py +232 -0
- signalflow/core/containers/__init__.py +21 -0
- signalflow/core/containers/order.py +216 -0
- signalflow/core/containers/portfolio.py +211 -0
- signalflow/core/containers/position.py +296 -0
- signalflow/core/containers/raw_data.py +167 -0
- signalflow/core/containers/raw_data_view.py +169 -0
- signalflow/core/containers/signals.py +198 -0
- signalflow/core/containers/strategy_state.py +147 -0
- signalflow/core/containers/trade.py +112 -0
- signalflow/core/decorators.py +103 -0
- signalflow/core/enums.py +270 -0
- signalflow/core/registry.py +322 -0
- signalflow/core/rolling_aggregator.py +362 -0
- signalflow/core/signal_transforms/__init__.py +5 -0
- signalflow/core/signal_transforms/base_signal_transform.py +186 -0
- signalflow/data/__init__.py +11 -0
- signalflow/data/raw_data_factory.py +225 -0
- signalflow/data/raw_store/__init__.py +7 -0
- signalflow/data/raw_store/base.py +271 -0
- signalflow/data/raw_store/duckdb_stores.py +696 -0
- signalflow/data/source/__init__.py +10 -0
- signalflow/data/source/base.py +300 -0
- signalflow/data/source/binance.py +442 -0
- signalflow/data/strategy_store/__init__.py +8 -0
- signalflow/data/strategy_store/base.py +278 -0
- signalflow/data/strategy_store/duckdb.py +409 -0
- signalflow/data/strategy_store/schema.py +36 -0
- signalflow/detector/__init__.py +7 -0
- signalflow/detector/adapter/__init__.py +5 -0
- signalflow/detector/adapter/pandas_detector.py +46 -0
- signalflow/detector/base.py +390 -0
- signalflow/detector/sma_cross.py +105 -0
- signalflow/feature/__init__.py +16 -0
- signalflow/feature/adapter/__init__.py +5 -0
- signalflow/feature/adapter/pandas_feature_extractor.py +54 -0
- signalflow/feature/base.py +330 -0
- signalflow/feature/feature_set.py +286 -0
- signalflow/feature/oscillator/__init__.py +5 -0
- signalflow/feature/oscillator/rsi_extractor.py +42 -0
- signalflow/feature/pandasta/__init__.py +10 -0
- signalflow/feature/pandasta/pandas_ta_extractor.py +141 -0
- signalflow/feature/pandasta/top_pandasta_extractors.py +64 -0
- signalflow/feature/smoother/__init__.py +5 -0
- signalflow/feature/smoother/sma_extractor.py +46 -0
- signalflow/strategy/__init__.py +9 -0
- signalflow/strategy/broker/__init__.py +15 -0
- signalflow/strategy/broker/backtest.py +172 -0
- signalflow/strategy/broker/base.py +186 -0
- signalflow/strategy/broker/executor/__init__.py +9 -0
- signalflow/strategy/broker/executor/base.py +35 -0
- signalflow/strategy/broker/executor/binance_spot.py +12 -0
- signalflow/strategy/broker/executor/virtual_spot.py +81 -0
- signalflow/strategy/broker/realtime_spot.py +12 -0
- signalflow/strategy/component/__init__.py +9 -0
- signalflow/strategy/component/base.py +65 -0
- signalflow/strategy/component/entry/__init__.py +7 -0
- signalflow/strategy/component/entry/fixed_size.py +57 -0
- signalflow/strategy/component/entry/signal.py +127 -0
- signalflow/strategy/component/exit/__init__.py +5 -0
- signalflow/strategy/component/exit/time_based.py +47 -0
- signalflow/strategy/component/exit/tp_sl.py +80 -0
- signalflow/strategy/component/metric/__init__.py +8 -0
- signalflow/strategy/component/metric/main_metrics.py +181 -0
- signalflow/strategy/runner/__init__.py +8 -0
- signalflow/strategy/runner/backtest_runner.py +208 -0
- signalflow/strategy/runner/base.py +19 -0
- signalflow/strategy/runner/optimized_backtest_runner.py +178 -0
- signalflow/strategy/runner/realtime_runner.py +0 -0
- signalflow/target/__init__.py +14 -0
- signalflow/target/adapter/__init__.py +5 -0
- signalflow/target/adapter/pandas_labeler.py +45 -0
- signalflow/target/base.py +409 -0
- signalflow/target/fixed_horizon_labeler.py +93 -0
- signalflow/target/static_triple_barrier.py +162 -0
- signalflow/target/triple_barrier.py +188 -0
- signalflow/utils/__init__.py +7 -0
- signalflow/utils/import_utils.py +11 -0
- signalflow/utils/tune_utils.py +19 -0
- signalflow/validator/__init__.py +6 -0
- signalflow/validator/base.py +139 -0
- signalflow/validator/sklearn_validator.py +527 -0
- signalflow_trading-0.2.1.dist-info/METADATA +149 -0
- signalflow_trading-0.2.1.dist-info/RECORD +90 -0
- signalflow_trading-0.2.1.dist-info/WHEEL +5 -0
- signalflow_trading-0.2.1.dist-info/licenses/LICENSE +21 -0
- signalflow_trading-0.2.1.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,696 @@
|
|
|
1
|
+
# IMPORTANT
|
|
2
|
+
import duckdb
|
|
3
|
+
import polars as pl
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass, field
|
|
6
|
+
from datetime import datetime, timedelta
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import Optional, Iterable
|
|
9
|
+
from loguru import logger
|
|
10
|
+
import pandas as pd
|
|
11
|
+
|
|
12
|
+
from signalflow.core import sf_component
|
|
13
|
+
from signalflow.data.raw_store.base import RawDataStore
|
|
14
|
+
|
|
15
|
+
@dataclass
@sf_component(name="duckdb/spot")
class DuckDbSpotStore(RawDataStore):
    """DuckDB storage backend for fixed-timeframe OHLCV spot data.

    Stores candlestick (OHLCV) bars in a single ``ohlcv`` table keyed by
    (pair, timestamp). The timeframe is configured once per database and
    recorded in the ``meta`` table — it is NOT stored per row.

    Key features:
        - Automatic schema migration from legacy formats
          (``open_time`` / ``timeframe`` / ``quote_volume`` columns)
        - Efficient batch upserts (INSERT OR REPLACE); Arrow bulk insert
          for large batches
        - Gap detection for data continuity checks
        - Multi-pair batch loading
        - Index on (pair, timestamp DESC) for fast retrieval

    Schema:
        - pair (VARCHAR): Trading pair
        - timestamp (TIMESTAMP): Bar open time (timezone-naive)
        - open, high, low, close (DOUBLE): OHLC prices
        - volume (DOUBLE): Trading volume
        - trades (INTEGER): Number of trades (nullable)

    Attributes:
        db_path (Path): Path to the DuckDB file.
        timeframe (str): Fixed timeframe for all data (e.g. "1m", "5m").
            Default: "1m".
        _con (duckdb.DuckDBPyConnection): Connection opened in
            ``__post_init__``.

    Example:
        ```python
        from pathlib import Path
        from datetime import datetime
        from signalflow.data.raw_store import DuckDbSpotStore

        # Context-manager usage guarantees the connection is closed.
        with DuckDbSpotStore(db_path=Path("data/binance_spot.duckdb"),
                             timeframe="1m") as store:
            store.insert_klines("BTCUSDT", [
                {
                    "timestamp": datetime(2024, 1, 1, 10, 0),
                    "open": 45000.0, "high": 45100.0,
                    "low": 44900.0, "close": 45050.0,
                    "volume": 100.5, "trades": 150,
                }
            ])
            df = store.load("BTCUSDT", hours=24)
            min_ts, max_ts = store.get_time_bounds("BTCUSDT")
            stats = store.get_stats()
        ```

    Note:
        Timeframe is fixed per database, not per row.
        Legacy schemas are migrated automatically on open.
        Always release the connection: use a ``with`` block or call
        ``close()`` in a ``finally`` clause.

    See Also:
        RawDataStore: Base class with interface definition.
        RawDataFactory: Factory for creating RawData from stores.
    """

    db_path: Path
    timeframe: str = "1m"
    _con: duckdb.DuckDBPyConnection = field(init=False)

    def __post_init__(self) -> None:
        """Initialize database connection and ensure schema."""
        self._con = duckdb.connect(str(self.db_path))
        self._ensure_tables()

    def __enter__(self) -> "DuckDbSpotStore":
        """Enter a ``with`` block; returns the store itself.

        Enables the context-manager usage advertised by ``close()``.
        """
        return self

    def __exit__(self, exc_type, exc, tb) -> None:
        """Exit a ``with`` block; always closes the connection."""
        self.close()

    def _ensure_tables(self) -> None:
        """Create tables and migrate from legacy schema if needed.

        Automatically detects and migrates from:
            - Legacy schema with 'timeframe' column
            - Legacy schema with 'open_time' instead of 'timestamp'
            - Legacy schema with 'quote_volume' instead of 'volume'

        Creates:
            - ohlcv table with PRIMARY KEY (pair, timestamp)
            - Index on (pair, timestamp DESC) for fast queries
            - meta table storing the configured timeframe
        """
        existing = self._con.execute("""
            SELECT column_name
            FROM information_schema.columns
            WHERE table_name = 'ohlcv'
        """).fetchall()
        existing_cols = {row[0] for row in existing}

        # Legacy table detected: copy into the new fixed-timeframe layout,
        # renaming columns where the old names differ.
        if existing_cols and ("timeframe" in existing_cols or "open_time" in existing_cols):
            logger.info("Migrating schema -> fixed-timeframe table (no timeframe column)...")

            self._con.execute("""
                CREATE TABLE IF NOT EXISTS ohlcv_new (
                    pair VARCHAR NOT NULL,
                    timestamp TIMESTAMP NOT NULL,
                    open DOUBLE NOT NULL,
                    high DOUBLE NOT NULL,
                    low DOUBLE NOT NULL,
                    close DOUBLE NOT NULL,
                    volume DOUBLE NOT NULL,
                    trades INTEGER,
                    PRIMARY KEY (pair, timestamp)
                )
            """)

            if "open_time" in existing_cols:
                self._con.execute("""
                    INSERT OR REPLACE INTO ohlcv_new
                    SELECT
                        pair,
                        open_time AS timestamp,
                        open, high, low, close,
                        quote_volume AS volume,
                        trades
                    FROM ohlcv
                """)
            else:
                self._con.execute("""
                    INSERT OR REPLACE INTO ohlcv_new
                    SELECT
                        pair,
                        timestamp,
                        open, high, low, close,
                        volume,
                        trades
                    FROM ohlcv
                """)

            self._con.execute("DROP TABLE ohlcv")
            self._con.execute("ALTER TABLE ohlcv_new RENAME TO ohlcv")
            logger.info("Migration complete")

        self._con.execute("""
            CREATE TABLE IF NOT EXISTS ohlcv (
                pair VARCHAR NOT NULL,
                timestamp TIMESTAMP NOT NULL,
                open DOUBLE NOT NULL,
                high DOUBLE NOT NULL,
                low DOUBLE NOT NULL,
                close DOUBLE NOT NULL,
                volume DOUBLE NOT NULL,
                trades INTEGER,
                PRIMARY KEY (pair, timestamp)
            )
        """)

        self._con.execute("""
            CREATE INDEX IF NOT EXISTS idx_ohlcv_pair_ts
            ON ohlcv(pair, timestamp DESC)
        """)

        self._con.execute("""
            CREATE TABLE IF NOT EXISTS meta (
                key VARCHAR PRIMARY KEY,
                value VARCHAR NOT NULL
            )
        """)
        self._con.execute("""
            INSERT OR REPLACE INTO meta(key, value) VALUES ('timeframe', ?)
        """, [self.timeframe])

        logger.info(f"Database initialized: {self.db_path} (timeframe={self.timeframe})")

    @staticmethod
    def _normalize_ts(ts: datetime) -> datetime:
        """Normalize a bar timestamp for storage.

        Drops timezone info and, when sub-minute precision is present,
        rounds UP to the next whole minute (matching the behavior the
        bulk-insert path has always applied).

        Args:
            ts (datetime): Raw bar open time, naive or aware.

        Returns:
            datetime: Timezone-naive timestamp on a minute boundary.
        """
        if ts.second != 0 or ts.microsecond != 0:
            return ts.replace(tzinfo=None, second=0, microsecond=0) + timedelta(minutes=1)
        return ts.replace(tzinfo=None)

    def insert_klines(self, pair: str, klines: list[dict]) -> None:
        """Upsert klines (INSERT OR REPLACE).

        Efficient batch insertion with automatic upsert on (pair, timestamp)
        conflict. Uses executemany for <=10 rows and Arrow-based bulk insert
        for larger batches.

        Timestamp normalization (applied to every row on BOTH paths):
            - Removes timezone info
            - Rounds to minute (removes seconds/microseconds)
            - If second != 0, rounds up to next minute

        Args:
            pair (str): Trading pair (e.g., "BTCUSDT").
            klines (list[dict]): Kline dictionaries, each containing:
                - timestamp (datetime): Bar open time
                - open, high, low, close (float): OHLC prices
                - volume (float): Trading volume
                - trades (int, optional): Number of trades

        Example:
            ```python
            store.insert_klines("BTCUSDT", [
                {
                    "timestamp": datetime(2024, 1, 1, 10, 0),
                    "open": 45000.0, "high": 45100.0,
                    "low": 44900.0, "close": 45050.0,
                    "volume": 100.5, "trades": 150,
                }
            ])
            # Re-inserting the same (pair, timestamp) updates the row.
            ```

        Note:
            An empty klines list is silently ignored.
            Insert count is logged at debug level.
        """
        if not klines:
            return

        if len(klines) <= 10:
            # Fix: the small-batch path previously skipped timestamp
            # normalization, diverging from the bulk path and from the
            # documented contract.
            self._con.executemany(
                "INSERT OR REPLACE INTO ohlcv VALUES (?, ?, ?, ?, ?, ?, ?, ?)",
                [
                    (
                        pair,
                        self._normalize_ts(k["timestamp"]),
                        k["open"],
                        k["high"],
                        k["low"],
                        k["close"],
                        k["volume"],
                        k.get("trades"),
                    )
                    for k in klines
                ],
            )
        else:
            df = pl.DataFrame(
                {
                    "pair": [pair] * len(klines),
                    "timestamp": [self._normalize_ts(k["timestamp"]) for k in klines],
                    "open": [k["open"] for k in klines],
                    "high": [k["high"] for k in klines],
                    "low": [k["low"] for k in klines],
                    "close": [k["close"] for k in klines],
                    "volume": [k["volume"] for k in klines],
                    "trades": [k.get("trades") for k in klines],
                }
            )
            # Register the Arrow table as a temp view so DuckDB can ingest
            # it in one statement, then drop the view.
            self._con.register("temp_klines", df.to_arrow())
            self._con.execute("INSERT OR REPLACE INTO ohlcv SELECT * FROM temp_klines")
            self._con.unregister("temp_klines")

        logger.debug(f"Inserted {len(klines):,} rows for {pair}")

    def get_time_bounds(self, pair: str) -> tuple[Optional[datetime], Optional[datetime]]:
        """Get earliest and latest timestamps for a pair.

        Useful for checking data availability, planning incremental
        updates, and validating date ranges.

        Args:
            pair (str): Trading pair (e.g., "BTCUSDT").

        Returns:
            tuple[datetime | None, datetime | None]: (min_timestamp,
            max_timestamp). Both None if no data exists for the pair.

        Example:
            ```python
            min_ts, max_ts = store.get_time_bounds("BTCUSDT")
            if max_ts:
                fetch_data(start=max_ts, end=datetime.now())
            ```
        """
        result = self._con.execute("""
            SELECT MIN(timestamp), MAX(timestamp)
            FROM ohlcv
            WHERE pair = ?
        """, [pair]).fetchone()
        return (result[0], result[1]) if result and result[0] else (None, None)

    def find_gaps(
        self,
        pair: str,
        start: datetime,
        end: datetime,
        tf_minutes: int,
    ) -> list[tuple[datetime, datetime]]:
        """Find gaps in data coverage for a pair.

        Detects missing bars in the expected continuous sequence based on
        the timeframe. Useful for data quality checks and backfilling.

        Args:
            pair (str): Trading pair (e.g., "BTCUSDT").
            start (datetime): Start of expected range.
            end (datetime): End of expected range.
            tf_minutes (int): Timeframe in minutes (e.g., 1 for 1m).

        Returns:
            list[tuple[datetime, datetime]]: (gap_start, gap_end) tuples,
            inclusive on both ends. Empty list if no gaps found.

        Example:
            ```python
            gaps = store.find_gaps(
                "BTCUSDT",
                start=datetime(2024, 1, 1),
                end=datetime(2024, 1, 31),
                tf_minutes=1,
            )
            for gap_start, gap_end in gaps:
                backfill_data(pair="BTCUSDT", start=gap_start, end=gap_end)
            ```

        Note:
            Returns the full range [(start, end)] if no data exists.
            Walks the range bar by bar in Python — expensive for large
            date ranges, use sparingly.
        """
        existing = self._con.execute("""
            SELECT timestamp
            FROM ohlcv
            WHERE pair = ? AND timestamp BETWEEN ? AND ?
            ORDER BY timestamp
        """, [pair, start, end]).fetchall()

        if not existing:
            return [(start, end)]

        existing_times = {row[0] for row in existing}
        gaps: list[tuple[datetime, datetime]] = []

        gap_start: Optional[datetime] = None
        current = start

        # Scan expected bar positions; open a gap on the first missing bar
        # and close it one step before the next present bar.
        while current <= end:
            if current not in existing_times:
                if gap_start is None:
                    gap_start = current
            else:
                if gap_start is not None:
                    gaps.append((gap_start, current - timedelta(minutes=tf_minutes)))
                    gap_start = None
            current += timedelta(minutes=tf_minutes)

        # Range ended inside a gap: close it at `end`.
        if gap_start is not None:
            gaps.append((gap_start, end))

        return gaps

    def load(
        self,
        pair: str,
        hours: Optional[int] = None,
        start: Optional[datetime] = None,
        end: Optional[datetime] = None,
    ) -> pl.DataFrame:
        """Load data for a single trading pair.

        Output columns: pair, timestamp, open, high, low, close, volume,
        trades.

        Args:
            pair (str): Trading pair (e.g., "BTCUSDT").
            hours (int | None): Load last N hours of data. Takes
                precedence over start/end.
            start (datetime | None): Start datetime (inclusive).
            end (datetime | None): End datetime (inclusive).

        Returns:
            pl.DataFrame: OHLCV data sorted by timestamp, with
            timezone-naive timestamps.

        Example:
            ```python
            df = store.load("BTCUSDT", hours=24)
            df = store.load("BTCUSDT",
                            start=datetime(2024, 1, 1),
                            end=datetime(2024, 1, 31))
            ```
        """
        query = """
            SELECT
                ? AS pair,
                timestamp, open, high, low, close, volume, trades
            FROM ohlcv
            WHERE pair = ?
        """
        params: list[object] = [pair, pair]

        if hours is not None:
            # int() guards the interpolated value; interval literals can't
            # be bound as parameters here.
            query += f" AND timestamp > NOW() - INTERVAL '{int(hours)}' HOUR"
        elif start and end:
            query += " AND timestamp BETWEEN ? AND ?"
            params.extend([start, end])
        elif start:
            query += " AND timestamp >= ?"
            params.append(start)
        elif end:
            query += " AND timestamp <= ?"
            params.append(end)

        query += " ORDER BY timestamp"
        df = self._con.execute(query, params).pl()

        # Normalize to timezone-naive regardless of session settings.
        if 'timestamp' in df.columns:
            df = df.with_columns(
                pl.col('timestamp').dt.replace_time_zone(None)
            )

        return df

    def load_many_pandas(
        self,
        pairs: list[str],
        start: datetime | None = None,
        end: datetime | None = None,
    ) -> pd.DataFrame:
        """Load data for multiple pairs as a Pandas DataFrame.

        Convenience wrapper around load_many() for Pandas compatibility.

        Args:
            pairs (list[str]): List of trading pairs.
            start (datetime | None): Start datetime (inclusive).
            end (datetime | None): End datetime (inclusive).

        Returns:
            pd.DataFrame: Combined OHLCV data as a Pandas DataFrame.

        Example:
            ```python
            df = store.load_many_pandas(
                pairs=["BTCUSDT", "ETHUSDT"],
                start=datetime(2024, 1, 1),
                end=datetime(2024, 1, 31),
            )
            df["returns"] = df.groupby("pair")["close"].pct_change()
            ```
        """
        df_pl = self.load_many(pairs=pairs, start=start, end=end)
        return df_pl.to_pandas()

    def load_many(
        self,
        pairs: Iterable[str],
        hours: Optional[int] = None,
        start: Optional[datetime] = None,
        end: Optional[datetime] = None,
    ) -> pl.DataFrame:
        """Batch load for multiple pairs.

        Output columns: pair, timestamp, open, high, low, close, volume,
        trades. More efficient than multiple load() calls — single query.

        Args:
            pairs (Iterable[str]): Trading pairs to load.
            hours (int | None): Load last N hours of data. Takes
                precedence over start/end.
            start (datetime | None): Start datetime (inclusive).
            end (datetime | None): End datetime (inclusive).

        Returns:
            pl.DataFrame: Combined OHLCV data sorted by (pair, timestamp).
            Empty DataFrame with the correct schema if no pairs given.

        Example:
            ```python
            df = store.load_many(
                pairs=["BTCUSDT", "ETHUSDT", "BNBUSDT"],
                start=datetime(2024, 1, 1),
                end=datetime(2024, 1, 31),
            )
            for pair in df["pair"].unique():
                pair_df = df.filter(pl.col("pair") == pair)
                print(f"{pair}: {len(pair_df)} bars")
            ```
        """
        pairs = list(pairs)
        if not pairs:
            # Preserve the schema so downstream code can operate on the
            # empty frame without special-casing.
            return pl.DataFrame(
                schema={
                    "pair": pl.Utf8,
                    "timestamp": pl.Datetime,
                    "open": pl.Float64,
                    "high": pl.Float64,
                    "low": pl.Float64,
                    "close": pl.Float64,
                    "volume": pl.Float64,
                    "trades": pl.Int64,
                }
            )

        placeholders = ",".join(["?"] * len(pairs))
        query = f"""
            SELECT
                pair,
                timestamp, open, high, low, close, volume, trades
            FROM ohlcv
            WHERE pair IN ({placeholders})
        """
        params: list[object] = [*pairs]

        if hours is not None:
            query += f" AND timestamp > NOW() - INTERVAL '{int(hours)}' HOUR"
        elif start and end:
            query += " AND timestamp BETWEEN ? AND ?"
            params.extend([start, end])
        elif start:
            query += " AND timestamp >= ?"
            params.append(start)
        elif end:
            query += " AND timestamp <= ?"
            params.append(end)

        query += " ORDER BY pair, timestamp"

        df = self._con.execute(query, params).pl()

        if 'timestamp' in df.columns:
            df = df.with_columns(
                pl.col('timestamp').dt.replace_time_zone(None)
            )

        return df

    def get_stats(self) -> pl.DataFrame:
        """Get database statistics per pair.

        Returns:
            pl.DataFrame: One row per pair, sorted alphabetically:
                - pair (str): Trading pair
                - rows (int): Number of bars
                - first_candle (datetime): Earliest timestamp
                - last_candle (datetime): Latest timestamp
                - total_volume (float): Sum of volume, rounded to 2 dp

        Example:
            ```python
            stats = store.get_stats()
            for row in stats.iter_rows(named=True):
                days = (row["last_candle"] - row["first_candle"]).days
                print(f"{row['pair']}: {row['rows']:,} bars over {days} days")
            ```

        Note:
            Timeframe not included in output (stored in meta table).
        """
        return self._con.execute("""
            SELECT
                pair,
                COUNT(*) as rows,
                MIN(timestamp) as first_candle,
                MAX(timestamp) as last_candle,
                ROUND(SUM(volume), 2) as total_volume
            FROM ohlcv
            GROUP BY pair
            ORDER BY pair
        """).pl()

    def close(self) -> None:
        """Close database connection and cleanup resources.

        Prefer the context-manager form, which calls this automatically;
        otherwise call it in a ``finally`` block.

        Example:
            ```python
            with DuckDbSpotStore(Path("data/binance.duckdb")) as store:
                df = store.load("BTCUSDT", hours=24)
            ```
        """
        self._con.close()
|