gapless-crypto-clickhouse 7.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. gapless_crypto_clickhouse/__init__.py +147 -0
  2. gapless_crypto_clickhouse/__probe__.py +349 -0
  3. gapless_crypto_clickhouse/api.py +1032 -0
  4. gapless_crypto_clickhouse/clickhouse/__init__.py +17 -0
  5. gapless_crypto_clickhouse/clickhouse/config.py +119 -0
  6. gapless_crypto_clickhouse/clickhouse/connection.py +269 -0
  7. gapless_crypto_clickhouse/clickhouse/schema.sql +98 -0
  8. gapless_crypto_clickhouse/clickhouse/schema_validator.py +312 -0
  9. gapless_crypto_clickhouse/clickhouse_query.py +642 -0
  10. gapless_crypto_clickhouse/collectors/__init__.py +21 -0
  11. gapless_crypto_clickhouse/collectors/binance_public_data_collector.py +1994 -0
  12. gapless_crypto_clickhouse/collectors/clickhouse_bulk_loader.py +446 -0
  13. gapless_crypto_clickhouse/collectors/concurrent_collection_orchestrator.py +407 -0
  14. gapless_crypto_clickhouse/collectors/csv_format_detector.py +123 -0
  15. gapless_crypto_clickhouse/collectors/httpx_downloader.py +395 -0
  16. gapless_crypto_clickhouse/collectors/hybrid_url_generator.py +316 -0
  17. gapless_crypto_clickhouse/exceptions.py +145 -0
  18. gapless_crypto_clickhouse/gap_filling/__init__.py +1 -0
  19. gapless_crypto_clickhouse/gap_filling/safe_file_operations.py +439 -0
  20. gapless_crypto_clickhouse/gap_filling/universal_gap_filler.py +757 -0
  21. gapless_crypto_clickhouse/llms.txt +268 -0
  22. gapless_crypto_clickhouse/probe.py +235 -0
  23. gapless_crypto_clickhouse/py.typed +0 -0
  24. gapless_crypto_clickhouse/query_api.py +374 -0
  25. gapless_crypto_clickhouse/resume/__init__.py +12 -0
  26. gapless_crypto_clickhouse/resume/intelligent_checkpointing.py +383 -0
  27. gapless_crypto_clickhouse/utils/__init__.py +29 -0
  28. gapless_crypto_clickhouse/utils/error_handling.py +202 -0
  29. gapless_crypto_clickhouse/utils/etag_cache.py +194 -0
  30. gapless_crypto_clickhouse/utils/timeframe_constants.py +90 -0
  31. gapless_crypto_clickhouse/utils/timestamp_format_analyzer.py +256 -0
  32. gapless_crypto_clickhouse/utils/timestamp_utils.py +130 -0
  33. gapless_crypto_clickhouse/validation/__init__.py +36 -0
  34. gapless_crypto_clickhouse/validation/csv_validator.py +677 -0
  35. gapless_crypto_clickhouse/validation/models.py +220 -0
  36. gapless_crypto_clickhouse/validation/storage.py +502 -0
  37. gapless_crypto_clickhouse-7.1.0.dist-info/METADATA +1277 -0
  38. gapless_crypto_clickhouse-7.1.0.dist-info/RECORD +40 -0
  39. gapless_crypto_clickhouse-7.1.0.dist-info/WHEEL +4 -0
  40. gapless_crypto_clickhouse-7.1.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,268 @@
1
+ # gapless-crypto-clickhouse v6.0.0
2
+
3
+ ClickHouse-based cryptocurrency data collection with zero-gap guarantee and Apache Arrow optimization.
4
+
5
+ ## Quick Start
6
+
7
+ ```python
8
+ from gapless_crypto_clickhouse import query_ohlcv
9
+
10
+ # Query with auto-ingestion (downloads data if missing)
11
+ df = query_ohlcv("BTCUSDT", "1h", "2024-01-01", "2024-01-31")
12
+ print(f"Rows: {len(df)}") # 744 rows (31 days * 24 hours)
13
+ ```
14
+
15
+ ## Core API
16
+
17
+ ### query_ohlcv() - Unified Query with Auto-Ingestion (NEW in v6.0.0)
18
+
19
+ **Signature**:
20
+ ```python
21
+ query_ohlcv(
22
+ symbol: str | List[str],
23
+ timeframe: str,
24
+ start_date: str,
25
+ end_date: str,
26
+ instrument_type: Literal["spot", "futures-um"] = "spot",
27
+ auto_ingest: bool = True,
28
+ fill_gaps: bool = True,
29
+ clickhouse_config: Optional[ClickHouseConfig] = None,
30
+ ) -> pd.DataFrame
31
+ ```
32
+
33
+ **Parameters**:
34
+ - `symbol`: Trading pair (e.g., "BTCUSDT") or list of symbols
35
+ - `timeframe`: Timeframe string (e.g., "1h", "4h", "1d")
36
+ - `start_date`: Start date in "YYYY-MM-DD" or "YYYY-MM-DD HH:MM:SS" format
37
+ - `end_date`: End date in "YYYY-MM-DD" or "YYYY-MM-DD HH:MM:SS" format
38
+ - `instrument_type`: "spot" (default) or "futures-um"
39
+ - `auto_ingest`: Auto-download missing data (default: True)
40
+ - `fill_gaps`: Detect and fill gaps (default: True)
41
+
42
+ **Performance**:
43
+ - First query (auto-ingest): 30-60s (download + ingest + query)
44
+ - Cached query: 0.1-2s (3x faster with Arrow)
45
+ - Memory: 75% less vs previous version (Arrow zero-copy)
46
+
47
+ **Examples**:
48
+ ```python
49
+ # Basic query
50
+ df = query_ohlcv("BTCUSDT", "1h", "2024-01-01", "2024-01-31")
51
+
52
+ # Multi-symbol query
53
+ df = query_ohlcv(
54
+ ["BTCUSDT", "ETHUSDT", "SOLUSDT"],
55
+ "1h",
56
+ "2024-01-01",
57
+ "2024-01-31"
58
+ )
59
+
60
+ # Futures data
61
+ df = query_ohlcv(
62
+ "BTCUSDT",
63
+ "1h",
64
+ "2024-01-01",
65
+ "2024-01-31",
66
+ instrument_type="futures-um"
67
+ )
68
+
69
+ # Query without auto-ingestion (faster, raises if data missing)
70
+ df = query_ohlcv(
71
+ "BTCUSDT",
72
+ "1h",
73
+ "2024-01-01",
74
+ "2024-01-31",
75
+ auto_ingest=False
76
+ )
77
+ ```
78
+
79
+ **Workflow**:
80
+ 1. Check if data exists in ClickHouse
81
+ 2. If missing and auto_ingest=True: download from Binance + ingest
82
+ 3. Query ClickHouse with FINAL keyword (deduplication)
83
+ 4. If fill_gaps=True: detect and fill gaps
84
+ 5. Return DataFrame (Arrow-optimized internally)
85
+
86
+ ### fetch_data() - File-Based Workflow (Legacy)
87
+
88
+ **Signature**:
89
+ ```python
90
+ fetch_data(
91
+ symbol: str,
92
+ timeframe: str,
93
+ start: Optional[str] = None,
94
+ end: Optional[str] = None,
95
+ limit: Optional[int] = None,
96
+ instrument_type: Literal["spot", "futures-um"] = "spot",
97
+ ) -> pd.DataFrame
98
+ ```
99
+
100
+ **Note**: Use `query_ohlcv()` for database-based workflows with auto-ingestion. `fetch_data()` is for file-based workflows (CSV/Parquet).
101
+
102
+ ## Data Coverage
103
+
104
+ **Symbols**: 713 validated perpetual symbols (spot + futures aligned)
105
+ - Examples: BTCUSDT, ETHUSDT, BNBUSDT, SOLUSDT, XRPUSDT
106
+ - Source: binance-futures-availability package (95%+ SLA)
107
+
108
+ **Timeframes**: 13 timeframes from 1 second to 1 day
109
+ - Ultra-high frequency: 1s, 1m, 3m, 5m
110
+ - Intraday: 15m, 30m, 1h, 2h, 4h
111
+ - Multi-hour and daily: 6h, 8h, 12h, 1d
112
+
113
+ **Instrument Types**:
114
+ - spot: USDT-quoted spot pairs
115
+ - futures-um: USDT-margined perpetual futures
116
+
117
+ **Data Format**: 11-column microstructure format
118
+ - OHLCV: open, high, low, close, volume
119
+ - Timestamps: timestamp (bar open), close_time (bar close)
120
+ - Microstructure: quote_asset_volume, number_of_trades, taker_buy_base_asset_volume, taker_buy_quote_asset_volume
121
+ - Futures-specific (additional column beyond the 11 above): funding_rate (NULL for spot)
122
+
123
+ ## Performance
124
+
125
+ **Arrow Optimization (v6.0.0)**:
126
+ - Query speedup: 3x faster DataFrame creation
127
+ - Memory reduction: 75% less memory (zero-copy)
128
+ - Driver: clickhouse-connect with Apache Arrow
129
+
130
+ **Ingestion**:
131
+ - Bulk loader: >100K rows/sec
132
+ - Download: 22x faster than REST API (CloudFront CDN)
133
+
134
+ **Zero-Gap Guarantee**:
135
+ - Deterministic versioning + ReplacingMergeTree deduplication
136
+ - Query with FINAL keyword for deduplicated results
137
+
138
+ ## Configuration
139
+
140
+ **Environment Variables**:
141
+ ```bash
142
+ export CLICKHOUSE_HOST=localhost # Default: localhost
143
+ export CLICKHOUSE_HTTP_PORT=8123 # Default: 8123 (HTTP protocol)
144
+ export CLICKHOUSE_DATABASE=default # Default: default
145
+ export CLICKHOUSE_USER=default # Default: default
146
+ export CLICKHOUSE_PASSWORD= # Default: empty
147
+ ```
148
+
149
+ **Custom Configuration**:
150
+ ```python
151
+ from gapless_crypto_clickhouse.clickhouse import ClickHouseConfig
152
+
153
+ config = ClickHouseConfig(
154
+ host="clickhouse.example.com",
155
+ http_port=8123,
156
+ database="crypto",
157
+ user="admin",
158
+ password="secret"
159
+ )
160
+
161
+ df = query_ohlcv(
162
+ "BTCUSDT",
163
+ "1h",
164
+ "2024-01-01",
165
+ "2024-01-31",
166
+ clickhouse_config=config
167
+ )
168
+ ```
169
+
170
+ ## AI Agent Introspection
171
+
172
+ ```python
173
+ from gapless_crypto_clickhouse import probe
174
+
175
+ # Get all capabilities
176
+ caps = probe.get_capabilities()
177
+
178
+ # Get supported symbols
179
+ symbols = probe.get_supported_symbols() # 713 symbols
180
+
181
+ # Get supported timeframes
182
+ timeframes = probe.get_supported_timeframes() # 13 timeframes
183
+
184
+ # Get performance info
185
+ perf = probe.get_performance_info()
186
+ ```
187
+
188
+ ## Migration from v5.0.0
189
+
190
+ **Breaking Changes**:
191
+ - Protocol change: Native TCP (port 9000) → HTTP (port 8123)
192
+ - Driver change: clickhouse-driver → clickhouse-connect
193
+ - Exception types: ClickHouseError → Exception
194
+
195
+ **New Features**:
196
+ - query_ohlcv() with lazy auto-ingestion
197
+ - Apache Arrow optimization (3x faster queries, 75% less memory)
198
+ - AI discoverability (probe module)
199
+
200
+ **Migration Steps**:
201
+ 1. Update port: 9000 → 8123 in CLICKHOUSE_PORT or use CLICKHOUSE_HTTP_PORT
202
+ 2. Update exceptions: catch Exception instead of ClickHouseError
203
+ 3. Use query_ohlcv() for unified query API with auto-ingestion
204
+
205
+ ## Common Patterns
206
+
207
+ ### Backtesting
208
+ ```python
209
+ # Load historical data for backtesting
210
+ df = query_ohlcv("BTCUSDT", "1h", "2023-01-01", "2023-12-31")
211
+
212
+ # Calculate indicators
213
+ df['sma_20'] = df['close'].rolling(20).mean()
214
+ df['returns'] = df['close'].pct_change()
215
+
216
+ # Backtest strategy
217
+ df['signal'] = (df['close'] > df['sma_20']).astype(int)
218
+ df['strategy_returns'] = df['returns'] * df['signal'].shift(1)
219
+ ```
220
+
221
+ ### Multi-Symbol Analysis
222
+ ```python
223
+ # Compare multiple symbols
224
+ symbols = ["BTCUSDT", "ETHUSDT", "BNBUSDT", "SOLUSDT"]
225
+ df = query_ohlcv(symbols, "1d", "2024-01-01", "2024-12-31")
226
+
227
+ # Calculate correlation matrix
228
+ pivot = df.pivot(index='timestamp', columns='symbol', values='close')
229
+ corr = pivot.pct_change().corr()
230
+ ```
231
+
232
+ ### Real-Time Updates
233
+ ```python
234
+ # Get latest data
235
+ from datetime import datetime, timedelta
236
+
237
+ end = datetime.now()
238
+ start = end - timedelta(days=7)
239
+
240
+ df = query_ohlcv(
241
+ "BTCUSDT",
242
+ "1h",
243
+ start.strftime("%Y-%m-%d"),
244
+ end.strftime("%Y-%m-%d"),
245
+ auto_ingest=True # Automatically download latest data
246
+ )
247
+
248
+ print(f"Latest price: ${df.iloc[-1]['close']:.2f}")
249
+ ```
250
+
251
+ ## Error Handling
252
+
253
+ ```python
254
+ from gapless_crypto_clickhouse import query_ohlcv
255
+
256
+ try:
257
+ df = query_ohlcv("BTCUSDT", "1h", "2024-01-01", "2024-01-31")
258
+ except ValueError as e:
259
+ print(f"Invalid parameters: {e}")
260
+ except Exception as e:
261
+ print(f"Query failed: {e}")
262
+ ```
263
+
264
+ ## Links
265
+
266
+ - GitHub: https://github.com/terrylica/gapless-crypto-clickhouse
267
+ - PyPI: https://pypi.org/project/gapless-crypto-clickhouse/
268
+ - Documentation: See README.md
@@ -0,0 +1,235 @@
1
+ """
2
+ Probe module for AI agent discoverability (v6.0.0).
3
+
4
+ Provides introspection capabilities for AI coding agents to discover:
5
+ - Available query methods and their signatures
6
+ - Supported symbols, timeframes, instrument types
7
+ - Performance characteristics (Arrow optimization)
8
+ - Auto-ingestion capabilities
9
+
10
+ Usage (for AI agents):
11
+ from gapless_crypto_clickhouse import probe
12
+
13
+ # Get all capabilities as a JSON-serializable dict
14
+ caps = probe.get_capabilities()
15
+ print(caps["query_methods"]["query_ohlcv"])
16
+
17
+ # Get supported symbols
18
+ symbols = probe.get_supported_symbols()
19
+
20
+ # Get performance info
21
+ perf = probe.get_performance_info()
22
+ """
23
+
24
+ import json
25
+ from typing import Any, Dict
26
+
27
+ from .api import get_supported_symbols, get_supported_timeframes
28
+
29
+
30
def get_capabilities() -> Dict[str, Any]:
    """
    Get all package capabilities for AI agent discovery.

    Returns:
        JSON-serializable dictionary with package capabilities:
        - package: Distribution name, version and description
        - query_methods: Available query methods with signatures
        - data_sources: Supported data sources
        - symbols: Available trading pairs
        - timeframes: Supported timeframes
        - instrument_types: Available instrument types
        - performance: Performance characteristics
        - features: Feature flags

    Example:
        caps = probe.get_capabilities()
        print(json.dumps(caps, indent=2))
    """
    # Imported locally so the block does not depend on a file-level import.
    from importlib import metadata

    # Report the version of the installed distribution instead of a literal
    # that silently goes stale on every release. Fall back to the historical
    # literal when the package is not installed (e.g. run from a source tree).
    try:
        package_version = metadata.version("gapless-crypto-clickhouse")
    except metadata.PackageNotFoundError:
        package_version = "6.0.0"

    return {
        "package": {
            "name": "gapless-crypto-clickhouse",
            "version": package_version,
            "description": "ClickHouse-based cryptocurrency data with zero-gap guarantee and Arrow optimization",
        },
        "query_methods": {
            "query_ohlcv": {
                "signature": "query_ohlcv(symbol, timeframe, start_date, end_date, instrument_type='spot', auto_ingest=True, fill_gaps=True, clickhouse_config=None) -> pd.DataFrame",
                "description": "Query OHLCV data with lazy auto-ingestion (Arrow-optimized)",
                "performance": {
                    "first_query_with_auto_ingest": "30-60s (download + ingest + query)",
                    "cached_query": "0.1-2s (3x faster with Arrow)",
                    "memory_reduction": "75% vs clickhouse-driver",
                },
                "parameters": {
                    "symbol": {
                        "type": "str | List[str]",
                        "description": "Trading pair symbol(s), e.g. 'BTCUSDT' or ['BTCUSDT', 'ETHUSDT']",
                        "required": True,
                    },
                    "timeframe": {
                        "type": "str",
                        "description": "Timeframe string, e.g. '1h', '4h', '1d'",
                        "required": True,
                        "valid_values": get_supported_timeframes(),
                    },
                    "start_date": {
                        "type": "str",
                        "description": "Start date in 'YYYY-MM-DD' or 'YYYY-MM-DD HH:MM:SS' format",
                        "required": True,
                    },
                    "end_date": {
                        "type": "str",
                        "description": "End date in 'YYYY-MM-DD' or 'YYYY-MM-DD HH:MM:SS' format",
                        "required": True,
                    },
                    "instrument_type": {
                        "type": "Literal['spot', 'futures-um']",
                        "description": "Instrument type (default: 'spot')",
                        "required": False,
                        "default": "spot",
                    },
                    "auto_ingest": {
                        "type": "bool",
                        "description": "Automatically download and ingest missing data (default: True)",
                        "required": False,
                        "default": True,
                    },
                    "fill_gaps": {
                        "type": "bool",
                        "description": "Detect and fill gaps using REST API (default: True)",
                        "required": False,
                        "default": True,
                    },
                    # Present in the advertised signature above, so it is
                    # listed here too for agents that only read "parameters".
                    "clickhouse_config": {
                        "type": "Optional[ClickHouseConfig]",
                        "description": "Custom ClickHouse connection configuration (default: None)",
                        "required": False,
                        "default": None,
                    },
                },
                "examples": [
                    {
                        "description": "Basic query with auto-ingestion",
                        "code": 'df = query_ohlcv("BTCUSDT", "1h", "2024-01-01", "2024-01-31")',
                    },
                    {
                        "description": "Multi-symbol query",
                        "code": 'df = query_ohlcv(["BTCUSDT", "ETHUSDT"], "1h", "2024-01-01", "2024-01-31")',
                    },
                    {
                        "description": "Futures data",
                        "code": 'df = query_ohlcv("BTCUSDT", "1h", "2024-01-01", "2024-01-31", instrument_type="futures-um")',
                    },
                ],
            },
            "fetch_data": {
                "signature": "fetch_data(symbol, timeframe, start=None, end=None, limit=None, instrument_type='spot') -> pd.DataFrame",
                "description": "Fetch data from file-based workflow (CSV/Parquet, no database)",
                "note": "Use query_ohlcv() for database-based workflows with auto-ingestion",
            },
        },
        "data_sources": {
            "binance_public_data": {
                "url": "https://data.binance.vision/data/",
                "description": "Binance Public Data Repository (CloudFront CDN)",
                "performance": "22x faster than REST API",
                "markets": ["spot", "futures-um"],
            },
            "binance_rest_api": {
                "url": "https://api.binance.com/api/v3/klines",
                "description": "Binance REST API (for gap filling only)",
                "rate_limit": "2400 requests/minute",
            },
        },
        "symbols": {
            # The count is computed at call time; the description text is static.
            "count": len(get_supported_symbols()),
            "description": "713 validated perpetual symbols (spot + futures aligned)",
            "examples": ["BTCUSDT", "ETHUSDT", "BNBUSDT", "SOLUSDT", "XRPUSDT"],
            "source": "binance-futures-availability package (95%+ SLA)",
        },
        "timeframes": {
            "supported": get_supported_timeframes(),
            "description": "13 timeframes from 1 second to 1 day",
            "ultra_high_frequency": ["1s", "1m", "3m", "5m"],
            "intraday": ["15m", "30m", "1h", "2h", "4h"],
            "daily": ["6h", "8h", "12h", "1d"],
        },
        "instrument_types": {
            "spot": {
                "description": "USDT-quoted spot pairs",
                "data_format": "11-column microstructure format",
            },
            "futures-um": {
                "description": "USDT-margined perpetual futures",
                "data_format": "11-column microstructure format + funding_rate",
            },
        },
        "performance": {
            "arrow_optimization": {
                "query_speedup": "3x faster DataFrame creation",
                "memory_reduction": "75% less memory (zero-copy)",
                "driver": "clickhouse-connect with Apache Arrow",
            },
            "ingestion": {
                "bulk_loader": ">100K rows/sec",
                "download": "22x faster than REST API (CloudFront CDN)",
            },
        },
        "features": {
            "zero_gap_guarantee": {
                "description": "Deterministic versioning + ReplacingMergeTree deduplication",
                "query_keyword": "FINAL",
            },
            "auto_ingestion": {
                "description": "Lazy on-demand download and ingest when data missing",
                "enabled_by_default": True,
            },
            "gap_detection": {
                "description": "SQL-based gap detection for all 13 timeframes",
                "method": "Window functions with expected interval analysis",
            },
            "gap_filling": {
                "description": "REST API-based gap filling (v6.0.0 TODO)",
                "status": "not_implemented",
            },
        },
    }
191
+
192
+
193
def get_performance_info() -> Dict[str, Any]:
    """
    Summarize the package's advertised performance figures.

    Returns:
        Dictionary with three sections, each mapping a metric name to a
        human-readable string: "arrow", "ingestion" and "query".

    Example:
        perf = probe.get_performance_info()
        print(f"Query speedup: {perf['arrow']['query_speedup']}")
    """
    # Each section is assembled separately so the figures are easy to scan.
    arrow_metrics: Dict[str, Any] = {
        "query_speedup": "3x faster",
        "memory_reduction": "75%",
        "driver": "clickhouse-connect",
    }
    ingestion_metrics: Dict[str, Any] = {
        "bulk_loader": ">100K rows/sec",
        "download": "22x faster than REST API",
    }
    query_metrics: Dict[str, Any] = {
        "cached": "0.1-2s",
        "first_time_with_auto_ingest": "30-60s",
    }

    report: Dict[str, Any] = {}
    report["arrow"] = arrow_metrics
    report["ingestion"] = ingestion_metrics
    report["query"] = query_metrics
    return report
219
+
220
+
221
def print_capabilities() -> None:
    """
    Write the full capability report to stdout as indented JSON.

    The report is whatever get_capabilities() returns, serialized with a
    two-space indent.

    Example:
        from gapless_crypto_clickhouse import probe
        probe.print_capabilities()
    """
    print(json.dumps(get_capabilities(), indent=2))
231
+
232
+
233
# Script entry point: dump the capability report when the module is
# executed directly rather than imported.
if __name__ == "__main__":
    print_capabilities()
File without changes