gapless-crypto-clickhouse 7.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gapless_crypto_clickhouse/__init__.py +147 -0
- gapless_crypto_clickhouse/__probe__.py +349 -0
- gapless_crypto_clickhouse/api.py +1032 -0
- gapless_crypto_clickhouse/clickhouse/__init__.py +17 -0
- gapless_crypto_clickhouse/clickhouse/config.py +119 -0
- gapless_crypto_clickhouse/clickhouse/connection.py +269 -0
- gapless_crypto_clickhouse/clickhouse/schema.sql +98 -0
- gapless_crypto_clickhouse/clickhouse/schema_validator.py +312 -0
- gapless_crypto_clickhouse/clickhouse_query.py +642 -0
- gapless_crypto_clickhouse/collectors/__init__.py +21 -0
- gapless_crypto_clickhouse/collectors/binance_public_data_collector.py +1994 -0
- gapless_crypto_clickhouse/collectors/clickhouse_bulk_loader.py +446 -0
- gapless_crypto_clickhouse/collectors/concurrent_collection_orchestrator.py +407 -0
- gapless_crypto_clickhouse/collectors/csv_format_detector.py +123 -0
- gapless_crypto_clickhouse/collectors/httpx_downloader.py +395 -0
- gapless_crypto_clickhouse/collectors/hybrid_url_generator.py +316 -0
- gapless_crypto_clickhouse/exceptions.py +145 -0
- gapless_crypto_clickhouse/gap_filling/__init__.py +1 -0
- gapless_crypto_clickhouse/gap_filling/safe_file_operations.py +439 -0
- gapless_crypto_clickhouse/gap_filling/universal_gap_filler.py +757 -0
- gapless_crypto_clickhouse/llms.txt +268 -0
- gapless_crypto_clickhouse/probe.py +235 -0
- gapless_crypto_clickhouse/py.typed +0 -0
- gapless_crypto_clickhouse/query_api.py +374 -0
- gapless_crypto_clickhouse/resume/__init__.py +12 -0
- gapless_crypto_clickhouse/resume/intelligent_checkpointing.py +383 -0
- gapless_crypto_clickhouse/utils/__init__.py +29 -0
- gapless_crypto_clickhouse/utils/error_handling.py +202 -0
- gapless_crypto_clickhouse/utils/etag_cache.py +194 -0
- gapless_crypto_clickhouse/utils/timeframe_constants.py +90 -0
- gapless_crypto_clickhouse/utils/timestamp_format_analyzer.py +256 -0
- gapless_crypto_clickhouse/utils/timestamp_utils.py +130 -0
- gapless_crypto_clickhouse/validation/__init__.py +36 -0
- gapless_crypto_clickhouse/validation/csv_validator.py +677 -0
- gapless_crypto_clickhouse/validation/models.py +220 -0
- gapless_crypto_clickhouse/validation/storage.py +502 -0
- gapless_crypto_clickhouse-7.1.0.dist-info/METADATA +1277 -0
- gapless_crypto_clickhouse-7.1.0.dist-info/RECORD +40 -0
- gapless_crypto_clickhouse-7.1.0.dist-info/WHEEL +4 -0
- gapless_crypto_clickhouse-7.1.0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,1277 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: gapless-crypto-clickhouse
|
|
3
|
+
Version: 7.1.0
|
|
4
|
+
Summary: ClickHouse-based cryptocurrency data collection with zero-gap guarantee. 22x faster via Binance public repository with persistent database storage, USDT-margined futures support, and production-ready ReplacingMergeTree schema.
|
|
5
|
+
Project-URL: Homepage, https://github.com/terrylica/gapless-crypto-clickhouse
|
|
6
|
+
Project-URL: Documentation, https://github.com/terrylica/gapless-crypto-clickhouse#readme
|
|
7
|
+
Project-URL: Repository, https://github.com/terrylica/gapless-crypto-clickhouse.git
|
|
8
|
+
Project-URL: Issues, https://github.com/terrylica/gapless-crypto-clickhouse/issues
|
|
9
|
+
Project-URL: Changelog, https://github.com/terrylica/gapless-crypto-clickhouse/blob/main/CHANGELOG.md
|
|
10
|
+
Author-email: Eon Labs <terry@eonlabs.com>
|
|
11
|
+
Maintainer-email: Terry Li <terry@eonlabs.com>
|
|
12
|
+
License: MIT
|
|
13
|
+
License-File: LICENSE
|
|
14
|
+
Keywords: 22x-faster,OHLCV,USDT-margined,binance,clickhouse,crypto,cryptocurrency,data-warehouse,database,deterministic-versioning,financial-data,futures,gap-filling,gapless,liquidity,microstructure,order-flow,perpetual,persistent-storage,production-ready,replacing-mergetree,sql-queries,time-series,trading,zero-gaps
|
|
15
|
+
Classifier: Development Status :: 4 - Beta
|
|
16
|
+
Classifier: Intended Audience :: Developers
|
|
17
|
+
Classifier: Intended Audience :: Financial and Insurance Industry
|
|
18
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
19
|
+
Classifier: Operating System :: OS Independent
|
|
20
|
+
Classifier: Programming Language :: Python :: 3
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
22
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
23
|
+
Classifier: Topic :: Database
|
|
24
|
+
Classifier: Topic :: Office/Business :: Financial :: Investment
|
|
25
|
+
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
26
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
27
|
+
Requires-Python: >=3.12
|
|
28
|
+
Requires-Dist: binance-futures-availability>=1.1.0
|
|
29
|
+
Requires-Dist: clickhouse-connect>=0.7.0
|
|
30
|
+
Requires-Dist: duckdb>=1.1.0
|
|
31
|
+
Requires-Dist: httpx>=0.25.0
|
|
32
|
+
Requires-Dist: numpy<2.0.0,>=1.23.2
|
|
33
|
+
Requires-Dist: pandas<2.2.0,>=2.0.0
|
|
34
|
+
Requires-Dist: pyarrow>=14.0.0
|
|
35
|
+
Requires-Dist: pydantic>=2.0.0
|
|
36
|
+
Description-Content-Type: text/markdown
|
|
37
|
+
|
|
38
|
+
# Gapless Crypto ClickHouse
|
|
39
|
+
|
|
40
|
+
[PyPI version](https://pypi.org/project/gapless-crypto-clickhouse/)
|
|
41
|
+
[Latest release](https://github.com/terrylica/gapless-crypto-clickhouse/releases/latest)
|
|
42
|
+
[Python versions](https://pypi.org/project/gapless-crypto-clickhouse/)
|
|
43
|
+
[Downloads](https://pypi.org/project/gapless-crypto-clickhouse/)
|
|
44
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
45
|
+
[Managed with uv](https://github.com/astral-sh/uv)
|
|
46
|
+
[CI status](https://github.com/terrylica/gapless-crypto-clickhouse/actions)
|
|
47
|
+
[AI agent probe usage](https://github.com/terrylica/gapless-crypto-clickhouse/blob/main/PROBE_USAGE_EXAMPLE.md)
|
|
48
|
+
|
|
49
|
+
ClickHouse-based cryptocurrency data collection with zero-gap guarantee. 22x faster via Binance public repository with persistent database storage, USDT-margined futures support, and production-ready ReplacingMergeTree schema.
|
|
50
|
+
|
|
51
|
+
## When to Use This Package
|
|
52
|
+
|
|
53
|
+
**Choose `gapless-crypto-clickhouse`** (this package) when you need:
|
|
54
|
+
|
|
55
|
+
- **Persistent database storage** for multi-symbol, multi-timeframe datasets
|
|
56
|
+
- **Advanced SQL queries** for time-series analysis, aggregations, and joins
|
|
57
|
+
- **USDT-margined futures** support (perpetual contracts)
|
|
58
|
+
- **Production data pipelines** with deterministic versioning and deduplication
|
|
59
|
+
- **Python 3.12+** modern runtime environment
|
|
60
|
+
|
|
61
|
+
**Choose [`gapless-crypto-data`](https://github.com/terrylica/gapless-crypto-data)** (file-based) when you need:
|
|
62
|
+
|
|
63
|
+
- **Simple file-based workflows** with CSV output
|
|
64
|
+
- **Single-symbol analysis** without database overhead
|
|
65
|
+
- **Python 3.9-3.13** broader compatibility
|
|
66
|
+
- **Lightweight dependency footprint** (no database required)
|
|
67
|
+
|
|
68
|
+
Both packages share the same 22x performance advantage (via the Binance public data repository) and the same zero-gap guarantee.
|
|
69
|
+
|
|
70
|
+
## Features
|
|
71
|
+
|
|
72
|
+
- **22x faster** data collection via Binance public data repository
|
|
73
|
+
- **2x faster queries** with Apache Arrow optimization (v6.0.0+, 41K+ rows/s at scale)
|
|
74
|
+
- **Auto-ingestion**: Unified `query_ohlcv()` API downloads missing data automatically
|
|
75
|
+
- **ClickHouse database** with ReplacingMergeTree for deterministic deduplication
|
|
76
|
+
- **USDT-margined futures** support (perpetual contracts via `instrument_type` column)
|
|
77
|
+
- **Zero gaps guarantee** through intelligent monthly-to-daily fallback
|
|
78
|
+
- **Complete 13-timeframe support**: 1s, 1m, 3m, 5m, 15m, 30m, 1h, 2h, 4h, 6h, 8h, 12h, 1d
|
|
79
|
+
- **11-column microstructure format** (spot) and 12-column format (futures with funding rate)
|
|
80
|
+
- **Advanced SQL queries** for time-series analysis, multi-symbol joins, aggregations
|
|
81
|
+
- **Persistent storage** with compression (DoubleDelta timestamps, Gorilla OHLCV)
|
|
82
|
+
- **AI agent ready**: llms.txt + probe.py for capability discovery
|
|
83
|
+
- **UV-based Python tooling** for modern dependency management
|
|
84
|
+
- **Production-ready** with comprehensive test coverage
|
|
85
|
+
|
|
86
|
+
## Quick Start
|
|
87
|
+
|
|
88
|
+
### Installation (UV)
|
|
89
|
+
|
|
90
|
+
```bash
|
|
91
|
+
# Install via UV
|
|
92
|
+
uv add gapless-crypto-clickhouse
|
|
93
|
+
|
|
94
|
+
# Or install globally
|
|
95
|
+
uv tool install gapless-crypto-clickhouse
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
### Installation (pip)
|
|
99
|
+
|
|
100
|
+
```bash
|
|
101
|
+
pip install gapless-crypto-clickhouse
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
### Database Setup (ClickHouse)
|
|
105
|
+
|
|
106
|
+
For persistent storage and advanced query capabilities, set up ClickHouse:
|
|
107
|
+
|
|
108
|
+
```bash
|
|
109
|
+
# Start ClickHouse using Docker Compose
|
|
110
|
+
docker-compose up -d
|
|
111
|
+
|
|
112
|
+
# Verify ClickHouse is running
|
|
113
|
+
docker-compose ps
|
|
114
|
+
|
|
115
|
+
# View logs
|
|
116
|
+
docker-compose logs -f clickhouse
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
See [Database Integration](#-database-integration) for complete setup guide and usage examples.
|
|
120
|
+
|
|
121
|
+
### Python API (Recommended)
|
|
122
|
+
|
|
123
|
+
#### Function-based API
|
|
124
|
+
|
|
125
|
+
```python
|
|
126
|
+
import gapless_crypto_clickhouse as gcd
|
|
127
|
+
|
|
128
|
+
# Fetch recent data with date range (CCXT-compatible timeframe parameter)
|
|
129
|
+
df = gcd.download("BTCUSDT", timeframe="1h", start="2024-01-01", end="2024-06-30")
|
|
130
|
+
|
|
131
|
+
# Or with limit
|
|
132
|
+
df = gcd.fetch_data("ETHUSDT", timeframe="4h", limit=1000)
|
|
133
|
+
|
|
134
|
+
# Backward compatibility (legacy interval parameter)
|
|
135
|
+
df = gcd.fetch_data("ETHUSDT", interval="4h", limit=1000) # DeprecationWarning
|
|
136
|
+
|
|
137
|
+
# Get available symbols and timeframes
|
|
138
|
+
symbols = gcd.get_supported_symbols()
|
|
139
|
+
timeframes = gcd.get_supported_timeframes()
|
|
140
|
+
|
|
141
|
+
# Fill gaps in existing data
|
|
142
|
+
results = gcd.fill_gaps("./data")
|
|
143
|
+
|
|
144
|
+
# Multi-symbol batch download (concurrent execution - 10-20x faster)
|
|
145
|
+
results = gcd.download_multiple(
|
|
146
|
+
symbols=["BTCUSDT", "ETHUSDT", "BNBUSDT", "XRPUSDT", "SOLUSDT"],
|
|
147
|
+
timeframe="1h",
|
|
148
|
+
start_date="2024-01-01",
|
|
149
|
+
end_date="2024-06-30",
|
|
150
|
+
max_workers=5 # Configure concurrency
|
|
151
|
+
)
|
|
152
|
+
# Returns: dict[str, pd.DataFrame]
|
|
153
|
+
# Example: btc_df = results["BTCUSDT"]
|
|
154
|
+
```
|
|
155
|
+
|
|
156
|
+
#### Class-based API
|
|
157
|
+
|
|
158
|
+
```python
|
|
159
|
+
from gapless_crypto_clickhouse import BinancePublicDataCollector, UniversalGapFiller
|
|
160
|
+
|
|
161
|
+
# Custom collection with full control
|
|
162
|
+
collector = BinancePublicDataCollector(
|
|
163
|
+
symbol="SOLUSDT",
|
|
164
|
+
start_date="2023-01-01",
|
|
165
|
+
end_date="2023-12-31"
|
|
166
|
+
)
|
|
167
|
+
|
|
168
|
+
result = collector.collect_timeframe_data("1h")
|
|
169
|
+
df = result["dataframe"]
|
|
170
|
+
|
|
171
|
+
# Manual gap filling
|
|
172
|
+
gap_filler = UniversalGapFiller()
|
|
173
|
+
gaps = gap_filler.detect_all_gaps("SOLUSDT_1h.csv", "1h")  # path to an existing CSV file
|
|
174
|
+
```
|
|
175
|
+
|
|
176
|
+
> **Note**: This package has never included a CLI (unlike its parent package `gapless-crypto-data`). It provides only a Python API for programmatic access. See the examples above for usage patterns.
|
|
177
|
+
|
|
178
|
+
## Data Structure
|
|
179
|
+
|
|
180
|
+
All functions return pandas DataFrames with complete microstructure data:
|
|
181
|
+
|
|
182
|
+
```python
|
|
183
|
+
import gapless_crypto_clickhouse as gcd
|
|
184
|
+
|
|
185
|
+
# Fetch data
|
|
186
|
+
df = gcd.download("BTCUSDT", timeframe="1h", start="2024-01-01", end="2024-06-30")
|
|
187
|
+
|
|
188
|
+
# DataFrame columns (11-column microstructure format)
|
|
189
|
+
print(df.columns.tolist())
|
|
190
|
+
# ['date', 'open', 'high', 'low', 'close', 'volume',
|
|
191
|
+
# 'close_time', 'quote_asset_volume', 'number_of_trades',
|
|
192
|
+
# 'taker_buy_base_asset_volume', 'taker_buy_quote_asset_volume']
|
|
193
|
+
|
|
194
|
+
# Professional microstructure analysis
|
|
195
|
+
buy_pressure = df['taker_buy_base_asset_volume'].sum() / df['volume'].sum()
|
|
196
|
+
avg_trade_size = df['volume'].sum() / df['number_of_trades'].sum()
|
|
197
|
+
market_impact = df['quote_asset_volume'].std() / df['quote_asset_volume'].mean()
|
|
198
|
+
|
|
199
|
+
print(f"Taker buy pressure: {buy_pressure:.1%}")
|
|
200
|
+
print(f"Average trade size: {avg_trade_size:.4f} BTC")
|
|
201
|
+
print(f"Market impact volatility: {market_impact:.3f}")
|
|
202
|
+
```
|
|
203
|
+
|
|
204
|
+
## Data Sources
|
|
205
|
+
|
|
206
|
+
The package supports two data collection methods:
|
|
207
|
+
|
|
208
|
+
- **Binance Public Repository**: Pre-generated monthly ZIP files for historical data
|
|
209
|
+
- **Binance API**: Real-time data for gap filling and recent data collection
|
|
210
|
+
|
|
211
|
+
## 🏗️ Architecture
|
|
212
|
+
|
|
213
|
+
### Core Components
|
|
214
|
+
|
|
215
|
+
- **BinancePublicDataCollector**: Data collection with full 11-column microstructure format
|
|
216
|
+
- **UniversalGapFiller**: Intelligent gap detection and filling with authentic API-first validation
|
|
217
|
+
- **AtomicCSVOperations**: Corruption-proof file operations with atomic writes
|
|
218
|
+
- **SafeCSVMerger**: Safe merging of data files with integrity validation
|
|
219
|
+
|
|
220
|
+
### Data Flow
|
|
221
|
+
|
|
222
|
+
```
|
|
223
|
+
Binance Public Data Repository → BinancePublicDataCollector → 11-Column Microstructure Format
|
|
224
|
+
↓
|
|
225
|
+
Gap Detection → UniversalGapFiller → Authentic API-First Validation
|
|
226
|
+
↓
|
|
227
|
+
AtomicCSVOperations → Final Gapless Dataset with Order Flow Metrics
|
|
228
|
+
```
|
|
229
|
+
|
|
230
|
+
## 🗄️ Database Integration
|
|
231
|
+
|
|
232
|
+
ClickHouse is a **required component** for this package. The database-first architecture enables persistent storage, advanced query capabilities, and multi-symbol analysis.
|
|
233
|
+
|
|
234
|
+
**When to use**:
|
|
235
|
+
|
|
236
|
+
- **File-based approach**: Simple workflows, single symbols, CSV output compatibility
|
|
237
|
+
- **Database approach**: Multi-symbol analysis, time-series queries, aggregations, production pipelines (recommended)
|
|
238
|
+
|
|
239
|
+
### Quick Start with Docker Compose
|
|
240
|
+
|
|
241
|
+
The repository includes a production-ready `docker-compose.yml` for local development:
|
|
242
|
+
|
|
243
|
+
```bash
|
|
244
|
+
# Start ClickHouse (runs in background)
|
|
245
|
+
docker-compose up -d
|
|
246
|
+
|
|
247
|
+
# Verify container is healthy
|
|
248
|
+
docker-compose ps
|
|
249
|
+
|
|
250
|
+
# View initialization logs
|
|
251
|
+
docker-compose logs clickhouse
|
|
252
|
+
|
|
253
|
+
# Access ClickHouse client (optional)
|
|
254
|
+
docker exec -it gapless-clickhouse clickhouse-client
|
|
255
|
+
```
|
|
256
|
+
|
|
257
|
+
**What happens on first start**:
|
|
258
|
+
|
|
259
|
+
1. Downloads ClickHouse 24.1-alpine image (~200 MB)
|
|
260
|
+
2. Creates `ohlcv` table with ReplacingMergeTree engine (from `schema.sql`)
|
|
261
|
+
3. Configures compression (DoubleDelta for timestamps, Gorilla for OHLCV)
|
|
262
|
+
4. Sets up health checks and automatic restart
|
|
263
|
+
|
|
264
|
+
**Schema auto-initialization**: The `schema.sql` file is automatically executed via Docker's `initdb.d` mechanism.
|
|
265
|
+
|
|
266
|
+
### Quick Start: Unified Query API (v6.0.0+)
|
|
267
|
+
|
|
268
|
+
The **recommended way** to query data in v6.0.0+ is using `query_ohlcv()` with auto-ingestion and Apache Arrow optimization:
|
|
269
|
+
|
|
270
|
+
```python
|
|
271
|
+
from gapless_crypto_clickhouse import query_ohlcv
|
|
272
|
+
|
|
273
|
+
# Query with auto-ingestion (downloads data if missing)
|
|
274
|
+
df = query_ohlcv(
|
|
275
|
+
"BTCUSDT",
|
|
276
|
+
"1h",
|
|
277
|
+
"2024-01-01",
|
|
278
|
+
"2024-01-31"
|
|
279
|
+
)
|
|
280
|
+
print(f"Retrieved {len(df)} rows") # 744 rows (31 days * 24 hours)
|
|
281
|
+
|
|
282
|
+
# Multi-symbol query
|
|
283
|
+
df = query_ohlcv(
|
|
284
|
+
["BTCUSDT", "ETHUSDT", "SOLUSDT"],
|
|
285
|
+
"1h",
|
|
286
|
+
"2024-01-01",
|
|
287
|
+
"2024-01-31"
|
|
288
|
+
)
|
|
289
|
+
|
|
290
|
+
# Futures data
|
|
291
|
+
df = query_ohlcv(
|
|
292
|
+
"BTCUSDT",
|
|
293
|
+
"1h",
|
|
294
|
+
"2024-01-01",
|
|
295
|
+
"2024-01-31",
|
|
296
|
+
instrument_type="futures-um"
|
|
297
|
+
)
|
|
298
|
+
|
|
299
|
+
# Query without auto-ingestion (faster, raises if data missing)
|
|
300
|
+
df = query_ohlcv(
|
|
301
|
+
"BTCUSDT",
|
|
302
|
+
"1h",
|
|
303
|
+
"2024-01-01",
|
|
304
|
+
"2024-01-31",
|
|
305
|
+
auto_ingest=False
|
|
306
|
+
)
|
|
307
|
+
```
|
|
308
|
+
|
|
309
|
+
**Performance (Apache Arrow optimization)**:
|
|
310
|
+
|
|
311
|
+
- **2x faster** at scale: 41,272 rows/s vs 20,534 rows/s for large datasets (>8000 rows)
|
|
312
|
+
- **43-57% less memory**: Arrow buffers reduce memory usage for medium/large queries
|
|
313
|
+
- **Auto-ingestion**: Downloads missing data automatically on first query
|
|
314
|
+
- **Best for**: Analytical queries, backtesting, multi-symbol analysis (typical use case)
|
|
315
|
+
|
|
316
|
+
**When to use lower-level APIs**: Advanced use cases requiring custom SQL, bulk loading, or connection management.
|
|
317
|
+
|
|
318
|
+
### Basic Usage Examples
|
|
319
|
+
|
|
320
|
+
#### Connection and Health Check
|
|
321
|
+
|
|
322
|
+
```python
|
|
323
|
+
from gapless_crypto_clickhouse.clickhouse import ClickHouseConnection
|
|
324
|
+
|
|
325
|
+
# Connect to ClickHouse (reads from .env or uses defaults)
|
|
326
|
+
with ClickHouseConnection() as conn:
|
|
327
|
+
# Verify connection
|
|
328
|
+
health = conn.health_check()
|
|
329
|
+
print(f"ClickHouse connected: {health}")
|
|
330
|
+
|
|
331
|
+
# Execute simple query
|
|
332
|
+
result = conn.execute("SELECT count() FROM ohlcv")
|
|
333
|
+
print(f"Total rows in database: {result[0][0]:,}")
|
|
334
|
+
```
|
|
335
|
+
|
|
336
|
+
#### Bulk Data Ingestion
|
|
337
|
+
|
|
338
|
+
```python
|
|
339
|
+
from gapless_crypto_clickhouse.clickhouse import ClickHouseConnection
|
|
340
|
+
from gapless_crypto_clickhouse.collectors.clickhouse_bulk_loader import ClickHouseBulkLoader
|
|
341
|
+
|
|
342
|
+
# Ingest historical data from Binance public repository
|
|
343
|
+
with ClickHouseConnection() as conn:
|
|
344
|
+
loader = ClickHouseBulkLoader(conn, instrument_type="spot")
|
|
345
|
+
|
|
346
|
+
# Ingest single month (e.g., January 2024)
|
|
347
|
+
rows_inserted = loader.ingest_month("BTCUSDT", "1h", year=2024, month=1)
|
|
348
|
+
print(f"Inserted {rows_inserted:,} rows for BTCUSDT 1h (Jan 2024)")
|
|
349
|
+
|
|
350
|
+
# Ingest date range (e.g., Q1 2024)
|
|
351
|
+
total_rows = loader.ingest_date_range(
|
|
352
|
+
symbol="ETHUSDT",
|
|
353
|
+
timeframe="4h",
|
|
354
|
+
start_date="2024-01-01",
|
|
355
|
+
end_date="2024-03-31"
|
|
356
|
+
)
|
|
357
|
+
print(f"Inserted {total_rows:,} rows for ETHUSDT 4h (Q1 2024)")
|
|
358
|
+
```
|
|
359
|
+
|
|
360
|
+
**Zero-gap guarantee (idempotent ingestion)**: Rows are versioned deterministically (SHA256 hash), so ReplacingMergeTree deduplicates repeated ingestion safely. Re-running ingestion commands won't create duplicates.
|
|
361
|
+
|
|
362
|
+
#### Querying Data
|
|
363
|
+
|
|
364
|
+
```python
|
|
365
|
+
from gapless_crypto_clickhouse.clickhouse import ClickHouseConnection
|
|
366
|
+
from gapless_crypto_clickhouse.clickhouse_query import OHLCVQuery
|
|
367
|
+
|
|
368
|
+
with ClickHouseConnection() as conn:
|
|
369
|
+
query = OHLCVQuery(conn)
|
|
370
|
+
|
|
371
|
+
# Get latest data (last 10 bars)
|
|
372
|
+
df = query.get_latest("BTCUSDT", "1h", limit=10)
|
|
373
|
+
print(f"Latest 10 bars:\n{df[['timestamp', 'close']]}")
|
|
374
|
+
|
|
375
|
+
# Get specific date range
|
|
376
|
+
df = query.get_range(
|
|
377
|
+
symbol="BTCUSDT",
|
|
378
|
+
timeframe="1h",
|
|
379
|
+
start_date="2024-01-01",
|
|
380
|
+
end_date="2024-01-31",
|
|
381
|
+
instrument_type="spot"
|
|
382
|
+
)
|
|
383
|
+
print(f"January 2024: {len(df):,} bars")
|
|
384
|
+
|
|
385
|
+
# Multi-symbol comparison
|
|
386
|
+
df = query.get_multi_symbol(
|
|
387
|
+
symbols=["BTCUSDT", "ETHUSDT", "SOLUSDT"],
|
|
388
|
+
timeframe="1d",
|
|
389
|
+
start_date="2024-01-01",
|
|
390
|
+
end_date="2024-12-31"
|
|
391
|
+
)
|
|
392
|
+
print(f"Multi-symbol dataset: {df.shape}")
|
|
393
|
+
```
|
|
394
|
+
|
|
395
|
+
**FINAL keyword**: All queries automatically use `FINAL` to ensure deduplicated results. This adds ~10-30% overhead but guarantees data correctness.
|
|
396
|
+
|
|
397
|
+
#### Futures Support (ADR-0004)
|
|
398
|
+
|
|
399
|
+
```python
|
|
400
|
+
# Ingest futures data (12-column format with funding rate)
|
|
401
|
+
with ClickHouseConnection() as conn:
|
|
402
|
+
loader = ClickHouseBulkLoader(conn, instrument_type="futures-um")
|
|
403
|
+
rows = loader.ingest_month("BTCUSDT", "1h", 2024, 1)
|
|
404
|
+
print(f"Futures data: {rows:,} rows")
|
|
405
|
+
|
|
406
|
+
# Query futures data (isolated from spot)
|
|
407
|
+
query = OHLCVQuery(conn)
|
|
408
|
+
df_spot = query.get_latest("BTCUSDT", "1h", instrument_type="spot", limit=10)
|
|
409
|
+
df_futures = query.get_latest("BTCUSDT", "1h", instrument_type="futures-um", limit=10)
|
|
410
|
+
|
|
411
|
+
print(f"Spot data: {len(df_spot)} bars")
|
|
412
|
+
print(f"Futures data: {len(df_futures)} bars")
|
|
413
|
+
```
|
|
414
|
+
|
|
415
|
+
**Spot/Futures isolation**: The `instrument_type` column ensures spot and futures data coexist without conflicts.
|
|
416
|
+
|
|
417
|
+
### Configuration
|
|
418
|
+
|
|
419
|
+
**Environment Variables** (`.env` file or system environment):
|
|
420
|
+
|
|
421
|
+
```bash
|
|
422
|
+
CLICKHOUSE_HOST=localhost # ClickHouse server hostname
|
|
423
|
+
CLICKHOUSE_PORT=9000 # Native protocol port (default: 9000)
|
|
424
|
+
CLICKHOUSE_HTTP_PORT=8123 # HTTP interface port (default: 8123)
|
|
425
|
+
CLICKHOUSE_USER=default # Username (default: 'default')
|
|
426
|
+
CLICKHOUSE_PASSWORD= # Password (empty for local dev)
|
|
427
|
+
CLICKHOUSE_DB=default # Database name (default: 'default')
|
|
428
|
+
```
|
|
429
|
+
|
|
430
|
+
**Docker Compose defaults**: The included `docker-compose.yml` uses these defaults, no `.env` file required for local development.
|
|
431
|
+
|
|
432
|
+
### Local Visualization Tools
|
|
433
|
+
|
|
434
|
+
Comprehensive toolchain for ClickHouse data exploration and monitoring (100% open source):
|
|
435
|
+
|
|
436
|
+
**Web Interfaces**:
|
|
437
|
+
|
|
438
|
+
- **CH-UI** (modern TypeScript UI): http://localhost:5521
|
|
439
|
+
```bash
|
|
440
|
+
docker-compose up -d ch-ui
|
|
441
|
+
```
|
|
442
|
+
- **ClickHouse Play** (built-in): http://localhost:8123/play
|
|
443
|
+
|
|
444
|
+
**CLI Tools**:
|
|
445
|
+
|
|
446
|
+
- **clickhouse-client** (official CLI with 70+ formats):
|
|
447
|
+
```bash
|
|
448
|
+
docker exec -it gapless-clickhouse clickhouse-client
|
|
449
|
+
```
|
|
450
|
+
- **clickhouse-local** (file analysis without server):
|
|
451
|
+
```bash
|
|
452
|
+
clickhouse-local --query "SELECT * FROM file('data.csv', CSV)"
|
|
453
|
+
```
|
|
454
|
+
|
|
455
|
+
**Performance Monitoring**:
|
|
456
|
+
|
|
457
|
+
- **chdig** (TUI with flamegraph visualization):
|
|
458
|
+
```bash
|
|
459
|
+
brew install chdig
|
|
460
|
+
chdig --host localhost --port 9000
|
|
461
|
+
```
|
|
462
|
+
|
|
463
|
+
**Validation**: Run automated validation suite:
|
|
464
|
+
|
|
465
|
+
```bash
|
|
466
|
+
bash scripts/validate-clickhouse-tools.sh
|
|
467
|
+
```
|
|
468
|
+
|
|
469
|
+
**Comprehensive guides**: See [`docs/development/`](https://github.com/terrylica/gapless-crypto-clickhouse/tree/main/docs/development) for detailed usage guides, examples, and troubleshooting.
|
|
470
|
+
|
|
471
|
+
### Migration Guide
|
|
472
|
+
|
|
473
|
+
**Migrating from `gapless-crypto-data` (file-based) to `gapless-crypto-clickhouse` (database-first)**:
|
|
474
|
+
|
|
475
|
+
See [`docs/CLICKHOUSE_MIGRATION.md`](https://github.com/terrylica/gapless-crypto-clickhouse/blob/main/docs/CLICKHOUSE_MIGRATION.md) for:
|
|
476
|
+
|
|
477
|
+
- Architecture changes (file-based → ClickHouse)
|
|
478
|
+
- Code migration examples (drop-in replacement)
|
|
479
|
+
- Deployment guide (Docker Compose, production)
|
|
480
|
+
- Performance characteristics (ingestion, query, deduplication)
|
|
481
|
+
- Troubleshooting common issues
|
|
482
|
+
|
|
483
|
+
**Key Changes**:
|
|
484
|
+
|
|
485
|
+
- Package name: `gapless-crypto-data` → `gapless-crypto-clickhouse`
|
|
486
|
+
- Import paths: `gapless_crypto_data` → `gapless_crypto_clickhouse`
|
|
487
|
+
- ClickHouse requirement: ClickHouse database required (Docker Compose provided)
|
|
488
|
+
- Python version: 3.12+ (was 3.9-3.13)
|
|
489
|
+
- API signatures: **Unchanged** (backwards compatible)
|
|
490
|
+
|
|
491
|
+
**Rollback strategy**: Continue using `gapless-crypto-data` for file-based workflows. Both packages maintained independently.
|
|
492
|
+
|
|
493
|
+
### Production Deployment
|
|
494
|
+
|
|
495
|
+
**Recommended setup**:
|
|
496
|
+
|
|
497
|
+
1. **Persistent storage**: Mount volumes for data durability
|
|
498
|
+
2. **Authentication**: Set `CLICKHOUSE_PASSWORD` for non-localhost deployments
|
|
499
|
+
3. **TLS**: Enable TLS for remote connections
|
|
500
|
+
4. **Monitoring**: ClickHouse exports Prometheus metrics on port 9363
|
|
501
|
+
5. **Backups**: Use ClickHouse Backup tool or volume snapshots
|
|
502
|
+
|
|
503
|
+
**Scaling**:
|
|
504
|
+
|
|
505
|
+
- Single-node: Validated at 53.7M rows (ADR-0003), headroom to ~200M rows
|
|
506
|
+
- Distributed: ClickHouse supports sharding and replication for larger datasets
|
|
507
|
+
|
|
508
|
+
See ClickHouse documentation for production deployment best practices.
|
|
509
|
+
|
|
510
|
+
## 🔧 Advanced Usage
|
|
511
|
+
|
|
512
|
+
### Batch Processing
|
|
513
|
+
|
|
514
|
+
#### Simple API (Recommended)
|
|
515
|
+
|
|
516
|
+
```python
|
|
517
|
+
import gapless_crypto_clickhouse as gcd
|
|
518
|
+
|
|
519
|
+
# Process multiple symbols with simple loops
|
|
520
|
+
symbols = ["BTCUSDT", "ETHUSDT", "SOLUSDT", "ADAUSDT"]
|
|
521
|
+
timeframes = ["1h", "4h"]
|
|
522
|
+
|
|
523
|
+
for symbol in symbols:
|
|
524
|
+
for timeframe in timeframes:
|
|
525
|
+
df = gcd.fetch_data(symbol, timeframe, start="2023-01-01", end="2023-12-31")
|
|
526
|
+
print(f"{symbol} {timeframe}: {len(df)} bars collected")
|
|
527
|
+
```
|
|
528
|
+
|
|
529
|
+
#### Advanced API (Complex Workflows)
|
|
530
|
+
|
|
531
|
+
```python
|
|
532
|
+
from gapless_crypto_clickhouse import BinancePublicDataCollector
|
|
533
|
+
|
|
534
|
+
# Initialize with custom settings
|
|
535
|
+
collector = BinancePublicDataCollector(
|
|
536
|
+
start_date="2023-01-01",
|
|
537
|
+
end_date="2023-12-31",
|
|
538
|
+
output_dir="./crypto_data"
|
|
539
|
+
)
|
|
540
|
+
|
|
541
|
+
# Process multiple symbols with detailed control
|
|
542
|
+
symbols = ["BTCUSDT", "ETHUSDT", "SOLUSDT"]
|
|
543
|
+
for symbol in symbols:
|
|
544
|
+
collector.symbol = symbol
|
|
545
|
+
results = collector.collect_multiple_timeframes(["1m", "5m", "1h", "4h"])
|
|
546
|
+
for timeframe, result in results.items():
|
|
547
|
+
print(f"{symbol} {timeframe}: {result['stats']}")
|
|
548
|
+
```
|
|
549
|
+
|
|
550
|
+
### Gap Analysis
|
|
551
|
+
|
|
552
|
+
#### Simple API (Recommended)
|
|
553
|
+
|
|
554
|
+
```python
|
|
555
|
+
import gapless_crypto_clickhouse as gcd
|
|
556
|
+
|
|
557
|
+
# Quick gap filling for entire directory
|
|
558
|
+
results = gcd.fill_gaps("./data")
|
|
559
|
+
print(f"Processed {results['files_processed']} files")
|
|
560
|
+
print(f"Filled {results['gaps_filled']}/{results['gaps_detected']} gaps")
|
|
561
|
+
print(f"Success rate: {results['success_rate']:.1f}%")
|
|
562
|
+
|
|
563
|
+
# Gap filling for specific symbols only
|
|
564
|
+
results = gcd.fill_gaps("./data", symbols=["BTCUSDT", "ETHUSDT"])
|
|
565
|
+
```
|
|
566
|
+
|
|
567
|
+
#### Advanced API (Detailed Control)
|
|
568
|
+
|
|
569
|
+
```python
|
|
570
|
+
from gapless_crypto_clickhouse import UniversalGapFiller
|
|
571
|
+
|
|
572
|
+
gap_filler = UniversalGapFiller()
|
|
573
|
+
|
|
574
|
+
# Manual gap detection and analysis
|
|
575
|
+
gaps = gap_filler.detect_all_gaps("BTCUSDT_1h.csv", "1h")
|
|
576
|
+
print(f"Found {len(gaps)} gaps")
|
|
577
|
+
|
|
578
|
+
for gap in gaps:
|
|
579
|
+
duration_hours = gap['duration'].total_seconds() / 3600
|
|
580
|
+
print(f"Gap: {gap['start_time']} → {gap['end_time']} ({duration_hours:.1f}h)")
|
|
581
|
+
|
|
582
|
+
# Fill specific gaps
|
|
583
|
+
result = gap_filler.process_file("BTCUSDT_1h.csv", "1h")
|
|
584
|
+
```
|
|
585
|
+
|
|
586
|
+
### Database Query Examples
|
|
587
|
+
|
|
588
|
+
For users leveraging ClickHouse database integration:
|
|
589
|
+
|
|
590
|
+
#### Bulk Ingestion Pipeline
|
|
591
|
+
|
|
592
|
+
```python
|
|
593
|
+
from gapless_crypto_clickhouse.clickhouse import ClickHouseConnection
|
|
594
|
+
from gapless_crypto_clickhouse.collectors.clickhouse_bulk_loader import ClickHouseBulkLoader
|
|
595
|
+
|
|
596
|
+
# Multi-symbol bulk ingestion for backtesting datasets
|
|
597
|
+
symbols = ["BTCUSDT", "ETHUSDT", "SOLUSDT", "ADAUSDT", "DOGEUSDT"]
|
|
598
|
+
timeframes = ["1h", "4h", "1d"]
|
|
599
|
+
|
|
600
|
+
with ClickHouseConnection() as conn:
|
|
601
|
+
loader = ClickHouseBulkLoader(conn, instrument_type="spot")
|
|
602
|
+
|
|
603
|
+
for symbol in symbols:
|
|
604
|
+
for timeframe in timeframes:
|
|
605
|
+
# Ingest Q1 2024 data
|
|
606
|
+
rows = loader.ingest_date_range(
|
|
607
|
+
symbol=symbol,
|
|
608
|
+
timeframe=timeframe,
|
|
609
|
+
start_date="2024-01-01",
|
|
610
|
+
end_date="2024-03-31"
|
|
611
|
+
)
|
|
612
|
+
print(f"{symbol} {timeframe}: {rows:,} rows ingested")
|
|
613
|
+
|
|
614
|
+
# Zero-gap guarantee: Re-running this script won't create duplicates
|
|
615
|
+
```
|
|
616
|
+
|
|
617
|
+
#### Multi-Symbol Analysis
|
|
618
|
+
|
|
619
|
+
```python
|
|
620
|
+
from gapless_crypto_clickhouse.clickhouse import ClickHouseConnection
|
|
621
|
+
from gapless_crypto_clickhouse.clickhouse_query import OHLCVQuery
|
|
622
|
+
|
|
623
|
+
with ClickHouseConnection() as conn:
|
|
624
|
+
query = OHLCVQuery(conn)
|
|
625
|
+
|
|
626
|
+
# Get synchronized data for all symbols (same time range)
|
|
627
|
+
df = query.get_multi_symbol(
|
|
628
|
+
symbols=["BTCUSDT", "ETHUSDT", "SOLUSDT"],
|
|
629
|
+
timeframe="1h",
|
|
630
|
+
start_date="2024-01-01",
|
|
631
|
+
end_date="2024-01-31"
|
|
632
|
+
)
|
|
633
|
+
|
|
634
|
+
# Analyze cross-asset correlations
|
|
635
|
+
pivot = df.pivot_table(index="timestamp", columns="symbol", values="close")
|
|
636
|
+
correlation = pivot.corr()
|
|
637
|
+
print(f"Correlation matrix:\n{correlation}")
|
|
638
|
+
|
|
639
|
+
# Relative strength analysis
|
|
640
|
+
for symbol in ["BTCUSDT", "ETHUSDT", "SOLUSDT"]:
|
|
641
|
+
symbol_data = df[df["symbol"] == symbol]
|
|
642
|
+
returns = symbol_data["close"].iloc[-1] / symbol_data["close"].iloc[0] - 1
|
|
643
|
+
print(f"{symbol} total return: {returns:.2%}")
|
|
644
|
+
```
|
|
645
|
+
|
|
646
|
+
#### Advanced Time-Series Queries
|
|
647
|
+
|
|
648
|
+
```python
|
|
649
|
+
from gapless_crypto_clickhouse.clickhouse import ClickHouseConnection
|
|
650
|
+
|
|
651
|
+
with ClickHouseConnection() as conn:
|
|
652
|
+
# Custom SQL for advanced analytics (ClickHouse functions)
|
|
653
|
+
query = """
|
|
654
|
+
SELECT
|
|
655
|
+
symbol,
|
|
656
|
+
timeframe,
|
|
657
|
+
toStartOfDay(timestamp) AS day,
|
|
658
|
+
avg(close) AS avg_price,
|
|
659
|
+
stddevPop(close) AS volatility,
|
|
660
|
+
sum(volume) AS total_volume,
|
|
661
|
+
count() AS bar_count
|
|
662
|
+
FROM ohlcv FINAL
|
|
663
|
+
WHERE symbol IN ('BTCUSDT', 'ETHUSDT')
|
|
664
|
+
AND timeframe = '1h'
|
|
665
|
+
AND timestamp >= '2024-01-01'
|
|
666
|
+
AND timestamp < '2024-02-01'
|
|
667
|
+
GROUP BY symbol, timeframe, day
|
|
668
|
+
ORDER BY day ASC, symbol ASC
|
|
669
|
+
"""
|
|
670
|
+
|
|
671
|
+
result = conn.execute(query)
|
|
672
|
+
|
|
673
|
+
# Process results
|
|
674
|
+
for row in result:
|
|
675
|
+
symbol, timeframe, day, avg_price, volatility, volume, bars = row
|
|
676
|
+
print(f"{day} {symbol}: avg=${avg_price:.2f}, vol={volatility:.2f}, volume={volume:,.0f}")
|
|
677
|
+
```
|
|
678
|
+
|
|
679
|
+
#### Hybrid Approach (File + Database)
|
|
680
|
+
|
|
681
|
+
Combine file-based collection with database querying:
|
|
682
|
+
|
|
683
|
+
```python
|
|
684
|
+
import gapless_crypto_clickhouse as gcd
|
|
685
|
+
from gapless_crypto_clickhouse.clickhouse import ClickHouseConnection
|
|
686
|
+
from gapless_crypto_clickhouse.clickhouse_query import OHLCVQuery
from gapless_crypto_clickhouse.collectors.clickhouse_bulk_loader import ClickHouseBulkLoader
|
|
687
|
+
|
|
688
|
+
# Step 1: Collect to CSV files (22x faster, portable format)
|
|
689
|
+
df = gcd.download("BTCUSDT", timeframe="1h", start="2024-01-01", end="2024-03-31")
|
|
690
|
+
print(f"Downloaded {len(df):,} bars to CSV")
|
|
691
|
+
|
|
692
|
+
# Step 2: Ingest CSV to ClickHouse for analysis
|
|
693
|
+
with ClickHouseConnection() as conn:
|
|
694
|
+
loader = ClickHouseBulkLoader(conn)
|
|
695
|
+
loader.ingest_from_dataframe(df, symbol="BTCUSDT", timeframe="1h")
|
|
696
|
+
|
|
697
|
+
# Step 3: Run advanced queries
|
|
698
|
+
query = OHLCVQuery(conn)
|
|
699
|
+
gaps = query.detect_gaps("BTCUSDT", "1h", "2024-01-01", "2024-03-31")
|
|
700
|
+
print(f"Gap detection: {len(gaps)} gaps found")
|
|
701
|
+
```
|
|
702
|
+
|
|
703
|
+
**When to use hybrid approach**:
|
|
704
|
+
|
|
705
|
+
- Initial data collection: use file-based collection (faster, no database required)
|
|
706
|
+
- Post-processing: Load into ClickHouse for aggregations, joins, time-series analytics
|
|
707
|
+
- Archival: Keep CSV files for portability, use database for active analysis
|
|
708
|
+
|
|
709
|
+
## AI Agent Integration
|
|
710
|
+
|
|
711
|
+
This package includes probe hooks (`gapless_crypto_clickhouse.__probe__`) that enable AI coding agents to discover functionality programmatically.
|
|
712
|
+
|
|
713
|
+
### For AI Coding Agent Users
|
|
714
|
+
|
|
715
|
+
To have your AI coding agent analyze this package, use this prompt:
|
|
716
|
+
|
|
717
|
+
```
|
|
718
|
+
Analyze gapless-crypto-clickhouse using: import gapless_crypto_clickhouse; probe = gapless_crypto_clickhouse.__probe__
|
|
719
|
+
|
|
720
|
+
Execute: probe.discover_api(), probe.get_capabilities(), probe.get_task_graph()
|
|
721
|
+
|
|
722
|
+
Provide insights about cryptocurrency data collection capabilities and usage patterns.
|
|
723
|
+
```
|
|
724
|
+
|
|
725
|
+
## 🛠️ Development
|
|
726
|
+
|
|
727
|
+
### Prerequisites
|
|
728
|
+
|
|
729
|
+
- **UV Package Manager** - [Install UV](https://docs.astral.sh/uv/getting-started/installation/)
|
|
730
|
+
- **Python 3.9+** - UV will manage Python versions automatically
|
|
731
|
+
- **Git** - For repository cloning and version control
|
|
732
|
+
- **Docker & Docker Compose** (Optional) - For ClickHouse database development
|
|
733
|
+
|
|
734
|
+
### Development Installation Workflow
|
|
735
|
+
|
|
736
|
+
**IMPORTANT**: This project uses **mandatory pre-commit hooks** to prevent broken code from being committed. All commits are automatically validated for formatting, linting, and basic quality checks.
|
|
737
|
+
|
|
738
|
+
#### Step 1: Clone Repository
|
|
739
|
+
|
|
740
|
+
```bash
|
|
741
|
+
git clone https://github.com/terrylica/gapless-crypto-clickhouse.git
|
|
742
|
+
cd gapless-crypto-clickhouse
|
|
743
|
+
```
|
|
744
|
+
|
|
745
|
+
#### Step 2: Development Environment Setup
|
|
746
|
+
|
|
747
|
+
```bash
|
|
748
|
+
# Create isolated virtual environment
|
|
749
|
+
uv venv
|
|
750
|
+
|
|
751
|
+
# Activate virtual environment
|
|
752
|
+
source .venv/bin/activate # macOS/Linux
|
|
753
|
+
# .venv\Scripts\activate # Windows
|
|
754
|
+
|
|
755
|
+
# Install all dependencies (production + development)
|
|
756
|
+
uv sync --dev
|
|
757
|
+
```
|
|
758
|
+
|
|
759
|
+
#### Step 3: Verify Installation
|
|
760
|
+
|
|
761
|
+
```bash
|
|
762
|
+
# Run test suite
|
|
763
|
+
uv run pytest
|
|
764
|
+
```
|
|
765
|
+
|
|
766
|
+
#### Step 3a: Database Setup (Optional - ClickHouse)
|
|
767
|
+
|
|
768
|
+
If you want to develop with ClickHouse database features:
|
|
769
|
+
|
|
770
|
+
```bash
|
|
771
|
+
# Start ClickHouse container
|
|
772
|
+
docker-compose up -d
|
|
773
|
+
|
|
774
|
+
# Verify ClickHouse is running and healthy
|
|
775
|
+
docker-compose ps
|
|
776
|
+
docker-compose logs clickhouse | grep "Ready for connections"
|
|
777
|
+
|
|
778
|
+
# Test ClickHouse connection
|
|
779
|
+
docker exec gapless-clickhouse clickhouse-client --query "SELECT 1"
|
|
780
|
+
|
|
781
|
+
# View ClickHouse schema
|
|
782
|
+
docker exec gapless-clickhouse clickhouse-client --query "SHOW CREATE TABLE ohlcv"
|
|
783
|
+
```
|
|
784
|
+
|
|
785
|
+
**What gets initialized**:
|
|
786
|
+
|
|
787
|
+
- ClickHouse 24.1-alpine container on ports 9000 (native) and 8123 (HTTP)
|
|
788
|
+
- `ohlcv` table with ReplacingMergeTree engine (from `schema.sql`)
|
|
789
|
+
- Persistent volume for data (`clickhouse-data`)
|
|
790
|
+
- Health checks and automatic restart
|
|
791
|
+
|
|
792
|
+
**Test database ingestion**:
|
|
793
|
+
|
|
794
|
+
```python
|
|
795
|
+
# Create a test script: test_clickhouse.py
|
|
796
|
+
from gapless_crypto_clickhouse.clickhouse import ClickHouseConnection
|
|
797
|
+
from gapless_crypto_clickhouse.collectors.clickhouse_bulk_loader import ClickHouseBulkLoader
|
|
798
|
+
|
|
799
|
+
with ClickHouseConnection() as conn:
|
|
800
|
+
# Health check
|
|
801
|
+
print(f"ClickHouse connected: {conn.health_check()}")
|
|
802
|
+
|
|
803
|
+
# Test ingestion (small dataset)
|
|
804
|
+
loader = ClickHouseBulkLoader(conn, instrument_type="spot")
|
|
805
|
+
rows = loader.ingest_month("BTCUSDT", "1d", year=2024, month=1)
|
|
806
|
+
print(f"Test ingestion: {rows} rows")
|
|
807
|
+
|
|
808
|
+
# Run test
|
|
809
|
+
# uv run python test_clickhouse.py
|
|
810
|
+
```
|
|
811
|
+
|
|
812
|
+
**Teardown**:
|
|
813
|
+
|
|
814
|
+
```bash
|
|
815
|
+
# Stop ClickHouse (keeps data)
|
|
816
|
+
docker-compose down
|
|
817
|
+
|
|
818
|
+
# Stop and delete all data (fresh start)
|
|
819
|
+
docker-compose down -v
|
|
820
|
+
```
|
|
821
|
+
|
|
822
|
+
#### Step 4: Set Up Pre-Commit Hooks (Mandatory)
|
|
823
|
+
|
|
824
|
+
```bash
|
|
825
|
+
# Install pre-commit hooks (prevents broken code from being committed)
|
|
826
|
+
uv run pre-commit install
|
|
827
|
+
|
|
828
|
+
# Test pre-commit hooks
|
|
829
|
+
uv run pre-commit run --all-files
|
|
830
|
+
```
|
|
831
|
+
|
|
832
|
+
#### Step 5: Development Tools
|
|
833
|
+
|
|
834
|
+
```bash
|
|
835
|
+
# Code formatting
|
|
836
|
+
uv run ruff format .
|
|
837
|
+
|
|
838
|
+
# Linting and auto-fixes
|
|
839
|
+
uv run ruff check --fix .
|
|
840
|
+
|
|
841
|
+
# Type checking
|
|
842
|
+
uv run mypy src/
|
|
843
|
+
|
|
844
|
+
# Run specific tests
|
|
845
|
+
uv run pytest tests/test_binance_collector.py -v
|
|
846
|
+
|
|
847
|
+
# Manual pre-commit validation
|
|
848
|
+
uv run pre-commit run --all-files
|
|
849
|
+
```
|
|
850
|
+
|
|
851
|
+
### Development Commands Reference
|
|
852
|
+
|
|
853
|
+
| Task | Command |
|
|
854
|
+
| ---------------------- | ----------------------------------------------------------------------------------- |
|
|
855
|
+
| Install dependencies | `uv sync --dev` |
|
|
856
|
+
| Setup pre-commit hooks | `uv run pre-commit install` |
|
|
857
|
+
| Add new dependency | `uv add package-name` |
|
|
858
|
+
| Add dev dependency | `uv add --dev package-name` |
|
|
859
|
+
| Run Python API | `uv run python -c "import gapless_crypto_clickhouse as gcd; print(gcd.get_info())"` |
|
|
860
|
+
| Run tests | `uv run pytest` |
|
|
861
|
+
| Format code | `uv run ruff format .` |
|
|
862
|
+
| Lint code | `uv run ruff check --fix .` |
|
|
863
|
+
| Type check | `uv run mypy src/` |
|
|
864
|
+
| Validate pre-commit | `uv run pre-commit run --all-files` |
|
|
865
|
+
| Build package | `uv build` |
|
|
866
|
+
|
|
867
|
+
### E2E Validation Framework
|
|
868
|
+
|
|
869
|
+
Autonomous end-to-end validation of ClickHouse web interfaces with Playwright 1.56+ and screenshot evidence.
|
|
870
|
+
|
|
871
|
+
**Validate Web Interfaces**:
|
|
872
|
+
|
|
873
|
+
```bash
|
|
874
|
+
# Full validation (static + unit + integration + e2e)
|
|
875
|
+
uv run scripts/run_validation.py
|
|
876
|
+
|
|
877
|
+
# E2E tests only
|
|
878
|
+
uv run scripts/run_validation.py --e2e-only
|
|
879
|
+
|
|
880
|
+
# CI mode (headless, no interactive prompts)
|
|
881
|
+
uv run scripts/run_validation.py --ci
|
|
882
|
+
```
|
|
883
|
+
|
|
884
|
+
**First-Time Setup**:
|
|
885
|
+
|
|
886
|
+
```bash
|
|
887
|
+
# Install Playwright browsers (one-time)
|
|
888
|
+
uv run playwright install chromium --with-deps
|
|
889
|
+
|
|
890
|
+
# Verify installation
|
|
891
|
+
uv run playwright --version
|
|
892
|
+
```
|
|
893
|
+
|
|
894
|
+
**Test Targets**:
|
|
895
|
+
|
|
896
|
+
- **CH-UI Dashboard**: localhost:5521
|
|
897
|
+
- **ClickHouse Play**: localhost:8123/play
|
|
898
|
+
|
|
899
|
+
**Features**:
|
|
900
|
+
|
|
901
|
+
- Zero manual intervention (PEP 723 self-contained)
|
|
902
|
+
- Screenshot capture for visual regression detection
|
|
903
|
+
- Comprehensive coverage (happy path, errors, edge cases, timeouts)
|
|
904
|
+
- CI/CD optimized with browser caching (30-60s speedup)
|
|
905
|
+
|
|
906
|
+
**Documentation**:
|
|
907
|
+
|
|
908
|
+
- [E2E Testing Guide](docs/validation/E2E_TESTING_GUIDE.md)
|
|
909
|
+
- [Screenshot Baseline Management](docs/validation/SCREENSHOT_BASELINE.md)
|
|
910
|
+
- [ADR-0013: Autonomous Validation Framework](docs/architecture/decisions/0013-autonomous-validation-framework.md)
|
|
911
|
+
|
|
912
|
+
### Project Structure for Development
|
|
913
|
+
|
|
914
|
+
```
|
|
915
|
+
gapless-crypto-clickhouse/
|
|
916
|
+
├── src/gapless_crypto_clickhouse/ # Main package
|
|
917
|
+
│ ├── __init__.py # Package exports
|
|
918
|
+
│ ├── collectors/ # Data collection modules
|
|
919
|
+
│ └── gap_filling/ # Gap detection/filling
|
|
920
|
+
├── tests/ # Test suite
|
|
921
|
+
├── docs/ # Documentation
|
|
922
|
+
├── examples/ # Usage examples
|
|
923
|
+
├── pyproject.toml # Project configuration
|
|
924
|
+
└── uv.lock # Dependency lock file
|
|
925
|
+
```
|
|
926
|
+
|
|
927
|
+
### Building and Publishing
|
|
928
|
+
|
|
929
|
+
```bash
|
|
930
|
+
# Build package
|
|
931
|
+
uv build
|
|
932
|
+
|
|
933
|
+
# Publish to PyPI (requires API token)
|
|
934
|
+
uv publish
|
|
935
|
+
```
|
|
936
|
+
|
|
937
|
+
## 📁 Project Structure
|
|
938
|
+
|
|
939
|
+
```
|
|
940
|
+
gapless-crypto-clickhouse/
|
|
941
|
+
├── src/
|
|
942
|
+
│ └── gapless_crypto_clickhouse/
|
|
943
|
+
│ ├── __init__.py # Package exports
|
|
944
|
+
│ ├── collectors/
|
|
945
|
+
│ │ ├── __init__.py
|
|
946
|
+
│ │ └── binance_public_data_collector.py
|
|
947
|
+
│ ├── gap_filling/
|
|
948
|
+
│ │ ├── __init__.py
|
|
949
|
+
│ │ ├── universal_gap_filler.py
|
|
950
|
+
│ │ └── safe_file_operations.py
|
|
951
|
+
│ └── utils/
|
|
952
|
+
│ └── __init__.py
|
|
953
|
+
├── tests/ # Test suite
|
|
954
|
+
├── docs/ # Documentation
|
|
955
|
+
├── pyproject.toml # Project configuration
|
|
956
|
+
├── README.md # This file
|
|
957
|
+
└── LICENSE # MIT License
|
|
958
|
+
```
|
|
959
|
+
|
|
960
|
+
## 🔍 Supported Timeframes
|
|
961
|
+
|
|
962
|
+
All 13 Binance timeframes are supported for complete market coverage:
|
|
963
|
+
|
|
964
|
+
| Timeframe | Code | Description | Use Case |
|
|
965
|
+
| ---------- | ----- | ------------------------ | ---------------------------- |
|
|
966
|
+
| 1 second | `1s` | Ultra-high frequency | HFT, microstructure analysis |
|
|
967
|
+
| 1 minute | `1m` | High resolution | Scalping, order flow |
|
|
968
|
+
| 3 minutes | `3m` | Short-term analysis | Quick trend detection |
|
|
969
|
+
| 5 minutes | `5m` | Common trading timeframe | Day trading signals |
|
|
970
|
+
| 15 minutes | `15m` | Medium-term signals | Swing trading entry |
|
|
971
|
+
| 30 minutes | `30m` | Longer-term patterns | Position management |
|
|
972
|
+
| 1 hour | `1h` | Popular for backtesting | Strategy development |
|
|
973
|
+
| 2 hours | `2h` | Extended analysis | Multi-timeframe confluence |
|
|
974
|
+
| 4 hours | `4h` | Daily cycle patterns | Trend following |
|
|
975
|
+
| 6 hours | `6h` | Quarter-day analysis | Position sizing |
|
|
976
|
+
| 8 hours | `8h` | Third-day cycles | Risk management |
|
|
977
|
+
| 12 hours | `12h` | Half-day patterns | Overnight positions |
|
|
978
|
+
| 1 day | `1d` | Daily analysis | Long-term trends |
|
|
979
|
+
|
|
980
|
+
## ⚠️ Requirements
|
|
981
|
+
|
|
982
|
+
- Python 3.9+
|
|
983
|
+
- pandas >= 2.0.0
|
|
984
|
+
- requests >= 2.25.0
|
|
985
|
+
- Stable internet connection for data downloads
|
|
986
|
+
|
|
987
|
+
## 🤝 Contributing
|
|
988
|
+
|
|
989
|
+
1. Fork the repository
|
|
990
|
+
2. Create a feature branch (`git checkout -b feature/amazing-feature`)
|
|
991
|
+
3. Install development dependencies (`uv sync --dev`)
|
|
992
|
+
4. Make your changes
|
|
993
|
+
5. Run tests (`uv run pytest`)
|
|
994
|
+
6. Format code (`uv run ruff format .`)
|
|
995
|
+
7. Commit changes (`git commit -m 'Add amazing feature'`)
|
|
996
|
+
8. Push to branch (`git push origin feature/amazing-feature`)
|
|
997
|
+
9. Open a Pull Request
|
|
998
|
+
|
|
999
|
+
## 📚 API Reference
|
|
1000
|
+
|
|
1001
|
+
### BinancePublicDataCollector
|
|
1002
|
+
|
|
1003
|
+
Cryptocurrency spot data collection from Binance's public data repository using pre-generated monthly ZIP files.
|
|
1004
|
+
|
|
1005
|
+
#### Key Methods
|
|
1006
|
+
|
|
1007
|
+
**`__init__(symbol, start_date, end_date, output_dir)`**
|
|
1008
|
+
|
|
1009
|
+
Initialize the collector with trading pair and date range.
|
|
1010
|
+
|
|
1011
|
+
```python
|
|
1012
|
+
collector = BinancePublicDataCollector(
|
|
1013
|
+
symbol="BTCUSDT", # USDT spot pair
|
|
1014
|
+
start_date="2023-01-01", # Start date (YYYY-MM-DD)
|
|
1015
|
+
end_date="2023-12-31", # End date (YYYY-MM-DD)
|
|
1016
|
+
output_dir="./crypto_data" # Output directory (optional)
|
|
1017
|
+
)
|
|
1018
|
+
```
|
|
1019
|
+
|
|
1020
|
+
**`collect_timeframe_data(trading_timeframe) -> Dict[str, Any]`**
|
|
1021
|
+
|
|
1022
|
+
Collect complete historical data for a single timeframe with full 11-column microstructure format.
|
|
1023
|
+
|
|
1024
|
+
```python
|
|
1025
|
+
result = collector.collect_timeframe_data("1h")
|
|
1026
|
+
df = result["dataframe"] # pandas DataFrame with OHLCV + microstructure
|
|
1027
|
+
filepath = result["filepath"] # Path to saved CSV file
|
|
1028
|
+
stats = result["stats"] # Collection statistics
|
|
1029
|
+
|
|
1030
|
+
# Access microstructure data
|
|
1031
|
+
total_trades = df["number_of_trades"].sum()
|
|
1032
|
+
taker_buy_ratio = df["taker_buy_base_asset_volume"].sum() / df["volume"].sum()
|
|
1033
|
+
```
|
|
1034
|
+
|
|
1035
|
+
**`collect_multiple_timeframes(timeframes) -> Dict[str, Dict[str, Any]]`**
|
|
1036
|
+
|
|
1037
|
+
Collect data for multiple timeframes with comprehensive progress tracking.
|
|
1038
|
+
|
|
1039
|
+
```python
|
|
1040
|
+
results = collector.collect_multiple_timeframes(["1h", "4h"])
|
|
1041
|
+
for timeframe, result in results.items():
|
|
1042
|
+
df = result["dataframe"]
|
|
1043
|
+
print(f"{timeframe}: {len(df):,} bars")
|
|
1044
|
+
```
|
|
1045
|
+
|
|
1046
|
+
### UniversalGapFiller
|
|
1047
|
+
|
|
1048
|
+
Gap detection and filling for various timeframes with 11-column microstructure format using Binance API data.
|
|
1049
|
+
|
|
1050
|
+
#### Key Methods
|
|
1051
|
+
|
|
1052
|
+
**`detect_all_gaps(csv_file) -> List[Dict]`**
|
|
1053
|
+
|
|
1054
|
+
Automatically detect timestamp gaps in CSV files.
|
|
1055
|
+
|
|
1056
|
+
```python
|
|
1057
|
+
gap_filler = UniversalGapFiller()
|
|
1058
|
+
gaps = gap_filler.detect_all_gaps("BTCUSDT_1h_data.csv")
|
|
1059
|
+
print(f"Found {len(gaps)} gaps to fill")
|
|
1060
|
+
```
|
|
1061
|
+
|
|
1062
|
+
**`fill_gap(csv_file, gap_info) -> bool`**
|
|
1063
|
+
|
|
1064
|
+
Fill a specific gap with authentic Binance API data.
|
|
1065
|
+
|
|
1066
|
+
```python
|
|
1067
|
+
# Fill first detected gap
|
|
1068
|
+
success = gap_filler.fill_gap("BTCUSDT_1h_data.csv", gaps[0])
|
|
1069
|
+
print(f"Gap filled successfully: {success}")
|
|
1070
|
+
```
|
|
1071
|
+
|
|
1072
|
+
**`process_file(directory) -> Dict[str, Dict]`**
|
|
1073
|
+
|
|
1074
|
+
Batch process all CSV files in a directory for gap detection and filling.
|
|
1075
|
+
|
|
1076
|
+
```python
|
|
1077
|
+
results = gap_filler.process_file("./crypto_data/")
|
|
1078
|
+
for filename, result in results.items():
|
|
1079
|
+
print(f"{filename}: {result['gaps_filled']} gaps filled")
|
|
1080
|
+
```
|
|
1081
|
+
|
|
1082
|
+
### AtomicCSVOperations
|
|
1083
|
+
|
|
1084
|
+
Safe atomic operations for CSV files with header preservation and corruption prevention. Uses temporary files and atomic rename operations to ensure data integrity.
|
|
1085
|
+
|
|
1086
|
+
#### Key Methods
|
|
1087
|
+
|
|
1088
|
+
**`create_backup() -> Path`**
|
|
1089
|
+
|
|
1090
|
+
Create timestamped backup of original file before modifications.
|
|
1091
|
+
|
|
1092
|
+
```python
|
|
1093
|
+
from pathlib import Path
|
|
1094
|
+
atomic_ops = AtomicCSVOperations(Path("data.csv"))
|
|
1095
|
+
backup_path = atomic_ops.create_backup()
|
|
1096
|
+
```
|
|
1097
|
+
|
|
1098
|
+
**`write_dataframe_atomic(df) -> bool`**
|
|
1099
|
+
|
|
1100
|
+
Atomically write DataFrame to CSV with integrity validation.
|
|
1101
|
+
|
|
1102
|
+
```python
|
|
1103
|
+
success = atomic_ops.write_dataframe_atomic(df)
|
|
1104
|
+
if not success:
|
|
1105
|
+
atomic_ops.rollback_from_backup()
|
|
1106
|
+
```
|
|
1107
|
+
|
|
1108
|
+
### SafeCSVMerger
|
|
1109
|
+
|
|
1110
|
+
Safe CSV data merging with gap filling capabilities and data integrity validation. Handles temporal data insertion while maintaining chronological order.
|
|
1111
|
+
|
|
1112
|
+
#### Key Methods
|
|
1113
|
+
|
|
1114
|
+
**`merge_gap_data_safe(gap_data, gap_start, gap_end) -> bool`**
|
|
1115
|
+
|
|
1116
|
+
Safely merge gap data into existing CSV using atomic operations.
|
|
1117
|
+
|
|
1118
|
+
```python
|
|
1119
|
+
from datetime import datetime
|
|
1120
|
+
merger = SafeCSVMerger(Path("eth_data.csv"))
|
|
1121
|
+
success = merger.merge_gap_data_safe(
|
|
1122
|
+
gap_data, # DataFrame with gap data
|
|
1123
|
+
datetime(2024, 1, 1, 12), # Gap start time
|
|
1124
|
+
datetime(2024, 1, 1, 15) # Gap end time
|
|
1125
|
+
)
|
|
1126
|
+
```
|
|
1127
|
+
|
|
1128
|
+
## Output Formats
|
|
1129
|
+
|
|
1130
|
+
### DataFrame Structure (Python API)
|
|
1131
|
+
|
|
1132
|
+
Returns pandas DataFrame with 11-column microstructure format:
|
|
1133
|
+
|
|
1134
|
+
| Column | Type | Description | Example |
|
|
1135
|
+
| ------------------------------ | -------------- | ---------------------- | --------------------- |
|
|
1136
|
+
| `date` | datetime64[ns] | Open timestamp | `2024-01-01 12:00:00` |
|
|
1137
|
+
| `open` | float64 | Opening price | `42150.50` |
|
|
1138
|
+
| `high` | float64 | Highest price | `42200.00` |
|
|
1139
|
+
| `low` | float64 | Lowest price | `42100.25` |
|
|
1140
|
+
| `close` | float64 | Closing price | `42175.75` |
|
|
1141
|
+
| `volume` | float64 | Base asset volume | `15.250000` |
|
|
1142
|
+
| `close_time` | datetime64[ns] | Close timestamp | `2024-01-01 12:59:59` |
|
|
1143
|
+
| `quote_asset_volume` | float64 | Quote asset volume | `643238.125` |
|
|
1144
|
+
| `number_of_trades` | int64 | Trade count | `1547` |
|
|
1145
|
+
| `taker_buy_base_asset_volume` | float64 | Taker buy base volume | `7.825000` |
|
|
1146
|
+
| `taker_buy_quote_asset_volume` | float64 | Taker buy quote volume | `329891.750` |
|
|
1147
|
+
|
|
1148
|
+
### CSV File Structure
|
|
1149
|
+
|
|
1150
|
+
CSV files include header comments with metadata followed by data:
|
|
1151
|
+
|
|
1152
|
+
```csv
|
|
1153
|
+
# Binance Spot Market Data v2.5.0
|
|
1154
|
+
# Generated: 2025-09-18T23:09:25.391126+00:00Z
|
|
1155
|
+
# Source: Binance Public Data Repository
|
|
1156
|
+
# Market: SPOT | Symbol: BTCUSDT | Timeframe: 1h
|
|
1157
|
+
# Coverage: 48 bars
|
|
1158
|
+
# Period: 2024-01-01 00:00:00 to 2024-01-02 23:00:00
|
|
1159
|
+
# Collection: direct_download in 0.0s
|
|
1160
|
+
# Data Hash: 5fba9d2e5d3db849...
|
|
1161
|
+
# Compliance: Zero-Magic-Numbers, Temporal-Integrity, Official-Binance-Source
|
|
1162
|
+
#
|
|
1163
|
+
date,open,high,low,close,volume,close_time,quote_asset_volume,number_of_trades,taker_buy_base_asset_volume,taker_buy_quote_asset_volume
|
|
1164
|
+
2024-01-01 00:00:00,42283.58,42554.57,42261.02,42475.23,1271.68108,2024-01-01 00:59:59,53957248.973789,47134,682.57581,28957416.819645
|
|
1165
|
+
```
|
|
1166
|
+
|
|
1167
|
+
### Metadata JSON Structure
|
|
1168
|
+
|
|
1169
|
+
Each CSV file is accompanied by a sidecar `.metadata.json` file containing comprehensive metadata:
|
|
1170
|
+
|
|
1171
|
+
```json
|
|
1172
|
+
{
|
|
1173
|
+
"version": "v2.5.0",
|
|
1174
|
+
"generator": "BinancePublicDataCollector",
|
|
1175
|
+
"data_source": "Binance Public Data Repository",
|
|
1176
|
+
"symbol": "BTCUSDT",
|
|
1177
|
+
"timeframe": "1h",
|
|
1178
|
+
"enhanced_microstructure_format": {
|
|
1179
|
+
"total_columns": 11,
|
|
1180
|
+
"analysis_capabilities": [
|
|
1181
|
+
"order_flow_analysis",
|
|
1182
|
+
"liquidity_metrics",
|
|
1183
|
+
"market_microstructure",
|
|
1184
|
+
"trade_weighted_prices",
|
|
1185
|
+
"institutional_data_patterns"
|
|
1186
|
+
]
|
|
1187
|
+
},
|
|
1188
|
+
"gap_analysis": {
|
|
1189
|
+
"total_gaps_detected": 0,
|
|
1190
|
+
"data_completeness_score": 1.0,
|
|
1191
|
+
"gap_filling_method": "authentic_binance_api"
|
|
1192
|
+
},
|
|
1193
|
+
"data_integrity": {
|
|
1194
|
+
"chronological_order": true,
|
|
1195
|
+
"corruption_detected": false
|
|
1196
|
+
}
|
|
1197
|
+
}
|
|
1198
|
+
```
|
|
1199
|
+
|
|
1200
|
+
### Streaming Output (Memory-Efficient)
|
|
1201
|
+
|
|
1202
|
+
For large datasets, Polars streaming provides constant memory usage:
|
|
1203
|
+
|
|
1204
|
+
```python
|
|
1205
|
+
from gapless_crypto_clickhouse.streaming import StreamingDataProcessor
|
|
1206
|
+
|
|
1207
|
+
processor = StreamingDataProcessor(chunk_size=10_000, memory_limit_mb=100)
|
|
1208
|
+
for chunk in processor.stream_csv_chunks("large_dataset.csv"):
|
|
1209
|
+
# Process chunk with constant memory usage
|
|
1210
|
+
print(f"Chunk shape: {chunk.shape}")
|
|
1211
|
+
```
|
|
1212
|
+
|
|
1213
|
+
### File Naming Convention
|
|
1214
|
+
|
|
1215
|
+
Output files follow consistent naming pattern:
|
|
1216
|
+
|
|
1217
|
+
```
|
|
1218
|
+
binance_spot_{SYMBOL}-{TIMEFRAME}_{START_DATE}-{END_DATE}_v{VERSION}.csv
|
|
1219
|
+
binance_spot_{SYMBOL}-{TIMEFRAME}_{START_DATE}-{END_DATE}_v{VERSION}.metadata.json
|
|
1220
|
+
```
|
|
1221
|
+
|
|
1222
|
+
Examples:
|
|
1223
|
+
|
|
1224
|
+
- `binance_spot_BTCUSDT-1h_20240101-20240102_v2.5.0.csv`
|
|
1225
|
+
- `binance_spot_ETHUSDT-4h_20240101-20240201_v2.5.0.csv`
|
|
1226
|
+
- `binance_spot_SOLUSDT-1d_20240101-20241231_v2.5.0.csv`
|
|
1227
|
+
|
|
1228
|
+
### Error Handling
|
|
1229
|
+
|
|
1230
|
+
All classes implement robust error handling with meaningful exceptions:
|
|
1231
|
+
|
|
1232
|
+
```python
|
|
1233
|
+
try:
|
|
1234
|
+
collector = BinancePublicDataCollector(symbol="INVALIDPAIR")
|
|
1235
|
+
result = collector.collect_timeframe_data("1h")
|
|
1236
|
+
except ValueError as e:
|
|
1237
|
+
print(f"Invalid symbol format: {e}")
|
|
1238
|
+
except ConnectionError as e:
|
|
1239
|
+
print(f"Network error: {e}")
|
|
1240
|
+
except FileNotFoundError as e:
|
|
1241
|
+
print(f"Output directory error: {e}")
|
|
1242
|
+
```
|
|
1243
|
+
|
|
1244
|
+
### Type Hints
|
|
1245
|
+
|
|
1246
|
+
All public APIs include comprehensive type hints for better IDE support:
|
|
1247
|
+
|
|
1248
|
+
```python
|
|
1249
|
+
from typing import Dict, List, Optional, Any
|
|
1250
|
+
from pathlib import Path
|
|
1251
|
+
import pandas as pd
|
|
1252
|
+
|
|
1253
|
+
def collect_timeframe_data(self, trading_timeframe: str) -> Dict[str, Any]:
|
|
1254
|
+
# Returns dict with 'dataframe', 'filepath', and 'stats' keys
|
|
1255
|
+
pass
|
|
1256
|
+
|
|
1257
|
+
def collect_multiple_timeframes(
|
|
1258
|
+
self,
|
|
1259
|
+
timeframes: Optional[List[str]] = None
|
|
1260
|
+
) -> Dict[str, Dict[str, Any]]:
|
|
1261
|
+
# Returns nested dict by timeframe
|
|
1262
|
+
pass
|
|
1263
|
+
```
|
|
1264
|
+
|
|
1265
|
+
## 📄 License
|
|
1266
|
+
|
|
1267
|
+
This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
|
|
1268
|
+
|
|
1269
|
+
## 🏢 About Eon Labs
|
|
1270
|
+
|
|
1271
|
+
Gapless Crypto ClickHouse is developed by [Eon Labs](https://github.com/terrylica), specializing in quantitative trading infrastructure and machine learning for financial markets.
|
|
1272
|
+
|
|
1273
|
+
---
|
|
1274
|
+
|
|
1275
|
+
**UV-based** - Python dependency management
|
|
1276
|
+
**📊 11-Column Format** - Microstructure data with order flow metrics
|
|
1277
|
+
**🔒 Gap Detection** - Data completeness validation and filling
|