rm-bdd 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rm_bdd-0.1.0/PKG-INFO +180 -0
- rm_bdd-0.1.0/README.md +167 -0
- rm_bdd-0.1.0/pyproject.toml +27 -0
- rm_bdd-0.1.0/setup.cfg +4 -0
- rm_bdd-0.1.0/src/rm_bdd/__init__.py +2 -0
- rm_bdd-0.1.0/src/rm_bdd/binance_metadata_manager.py +23 -0
- rm_bdd-0.1.0/src/rm_bdd/csvs_to_parquet.py +66 -0
- rm_bdd-0.1.0/src/rm_bdd/data_downloader.py +73 -0
- rm_bdd-0.1.0/src/rm_bdd/data_extractor.py +55 -0
- rm_bdd-0.1.0/src/rm_bdd/data_formatter.py +57 -0
- rm_bdd-0.1.0/src/rm_bdd/data_manager.py +45 -0
- rm_bdd-0.1.0/src/rm_bdd/files_parser.py +76 -0
- rm_bdd-0.1.0/src/rm_bdd/metadata_manager.py +48 -0
- rm_bdd-0.1.0/src/rm_bdd/path_loader.py +62 -0
- rm_bdd-0.1.0/src/rm_bdd.egg-info/PKG-INFO +180 -0
- rm_bdd-0.1.0/src/rm_bdd.egg-info/SOURCES.txt +17 -0
- rm_bdd-0.1.0/src/rm_bdd.egg-info/dependency_links.txt +1 -0
- rm_bdd-0.1.0/src/rm_bdd.egg-info/requires.txt +4 -0
- rm_bdd-0.1.0/src/rm_bdd.egg-info/top_level.txt +1 -0
rm_bdd-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,180 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: rm-bdd
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Binance Data Downloader
|
|
5
|
+
Author-email: Your Name <you@example.com>
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Requires-Python: >=3.10
|
|
8
|
+
Description-Content-Type: text/markdown
|
|
9
|
+
Requires-Dist: aiohttp==3.13.3
|
|
10
|
+
Requires-Dist: pyarrow==23.0.1
|
|
11
|
+
Requires-Dist: pandas==3.0.1
|
|
12
|
+
Requires-Dist: xmltodict==1.0.4
|
|
13
|
+
|
|
14
|
+
# rm-binance-data-downloader
|
|
15
|
+
|
|
16
|
+
A Python library for downloading, extracting and formatting historical
|
|
17
|
+
Binance market data.
|
|
18
|
+
|
|
19
|
+
The package provides a pipeline that:
|
|
20
|
+
|
|
21
|
+
1. Downloads data from Binance Vision
|
|
22
|
+
2. Extracts compressed archives
|
|
23
|
+
3. Formats data into a structured dataset
|
|
24
|
+
|
|
25
|
+
It is designed for fast data preparation for quantitative trading,
|
|
26
|
+
backtesting and data analysis.
|
|
27
|
+
|
|
28
|
+
------------------------------------------------------------------------
|
|
29
|
+
|
|
30
|
+
# Installation
|
|
31
|
+
|
|
32
|
+
``` bash
|
|
33
|
+
pip install rm-bdd
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
------------------------------------------------------------------------
|
|
37
|
+
|
|
38
|
+
# Features
|
|
39
|
+
|
|
40
|
+
- Download historical Binance data
|
|
41
|
+
- Automatic archive extraction
|
|
42
|
+
- Data formatting pipeline
|
|
43
|
+
- Metadata management
|
|
44
|
+
- Async architecture
|
|
45
|
+
- Easy integration into trading systems
|
|
46
|
+
|
|
47
|
+
------------------------------------------------------------------------
|
|
48
|
+
|
|
49
|
+
# Quick Example
|
|
50
|
+
|
|
51
|
+
``` python
|
|
52
|
+
import asyncio
|
|
53
|
+
import time
|
|
54
|
+
|
|
55
|
+
from rm_bdd.data_downloader import DataDownloader
|
|
56
|
+
from rm_bdd.data_extractor import DataExtractor
|
|
57
|
+
from rm_bdd.data_formatter import DataFormatter
|
|
58
|
+
from rm_bdd.binance_metadata_manager import BinanceMetadataManager
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
class DataManager:
|
|
62
|
+
|
|
63
|
+
def __init__(self, downloader, extractor, formatter):
|
|
64
|
+
self._downloader = downloader
|
|
65
|
+
self._extractor = extractor
|
|
66
|
+
self._formatter = formatter
|
|
67
|
+
|
|
68
|
+
async def download_and_save(self, symbol, timeframe, date_from=None, date_to=None):
|
|
69
|
+
|
|
70
|
+
start = time.time()
|
|
71
|
+
await self._downloader.download(symbol, timeframe, date_from=date_from, date_to=date_to)
|
|
72
|
+
print("download time:", time.time() - start)
|
|
73
|
+
|
|
74
|
+
start = time.time()
|
|
75
|
+
await self._extractor.extract(symbol, timeframe, date_from=date_from, date_to=date_to)
|
|
76
|
+
print("extract time:", time.time() - start)
|
|
77
|
+
|
|
78
|
+
start = time.time()
|
|
79
|
+
await self._formatter.format(symbol, timeframe, date_from=date_from, date_to=date_to)
|
|
80
|
+
print("format time:", time.time() - start)
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
async def main():
|
|
84
|
+
|
|
85
|
+
downloader = DataDownloader(
|
|
86
|
+
"downloads/",
|
|
87
|
+
BinanceMetadataManager("downloads/metadata.json")
|
|
88
|
+
)
|
|
89
|
+
|
|
90
|
+
extractor = DataExtractor(
|
|
91
|
+
"downloads/",
|
|
92
|
+
"extracts/",
|
|
93
|
+
BinanceMetadataManager("extracts/metadata.json")
|
|
94
|
+
)
|
|
95
|
+
|
|
96
|
+
formatter = DataFormatter(
|
|
97
|
+
"extracts/",
|
|
98
|
+
"data/",
|
|
99
|
+
BinanceMetadataManager("data/metadata.json")
|
|
100
|
+
)
|
|
101
|
+
|
|
102
|
+
manager = DataManager(downloader, extractor, formatter)
|
|
103
|
+
|
|
104
|
+
await manager.download_and_save("BTCUSDT", "1m")
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
asyncio.run(main())
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
------------------------------------------------------------------------
|
|
111
|
+
|
|
112
|
+
# Result Folder Structure
|
|
113
|
+
|
|
114
|
+
After execution the folders will look like:
|
|
115
|
+
|
|
116
|
+
downloads/
|
|
117
|
+
BTCUSDT/
|
|
118
|
+
metadata.json
|
|
119
|
+
|
|
120
|
+
extracts/
|
|
121
|
+
BTCUSDT/
|
|
122
|
+
metadata.json
|
|
123
|
+
|
|
124
|
+
data/
|
|
125
|
+
BTCUSDT/
|
|
126
|
+
metadata.json
|
|
127
|
+
|
|
128
|
+
------------------------------------------------------------------------
|
|
129
|
+
|
|
130
|
+
# Pipeline Overview
|
|
131
|
+
|
|
132
|
+
The processing pipeline consists of three stages:
|
|
133
|
+
|
|
134
|
+
### Downloader
|
|
135
|
+
|
|
136
|
+
Downloads historical data archives from Binance Vision.
|
|
137
|
+
|
|
138
|
+
### Extractor
|
|
139
|
+
|
|
140
|
+
Extracts downloaded archives.
|
|
141
|
+
|
|
142
|
+
### Formatter
|
|
143
|
+
|
|
144
|
+
Formats extracted CSV data into a structured dataset ready for analysis.
|
|
145
|
+
|
|
146
|
+
------------------------------------------------------------------------
|
|
147
|
+
|
|
148
|
+
# Metadata Manager
|
|
149
|
+
|
|
150
|
+
The library uses a metadata system to track downloaded, extracted and
|
|
151
|
+
formatted data.
|
|
152
|
+
|
|
153
|
+
This prevents duplicate downloads and processing.
|
|
154
|
+
|
|
155
|
+
------------------------------------------------------------------------
|
|
156
|
+
|
|
157
|
+
# Example Use Case
|
|
158
|
+
|
|
159
|
+
Typical workflow:
|
|
160
|
+
|
|
161
|
+
download → extract → format → analyze
|
|
162
|
+
|
|
163
|
+
Used for:
|
|
164
|
+
|
|
165
|
+
- algorithmic trading
|
|
166
|
+
- backtesting
|
|
167
|
+
- machine learning datasets
|
|
168
|
+
- market research
|
|
169
|
+
|
|
170
|
+
------------------------------------------------------------------------
|
|
171
|
+
|
|
172
|
+
# Requirements
|
|
173
|
+
|
|
174
|
+
Python 3.10+
|
|
175
|
+
|
|
176
|
+
------------------------------------------------------------------------
|
|
177
|
+
|
|
178
|
+
# License
|
|
179
|
+
|
|
180
|
+
MIT
|
rm_bdd-0.1.0/README.md
ADDED
|
@@ -0,0 +1,167 @@
|
|
|
1
|
+
# rm-binance-data-downloader
|
|
2
|
+
|
|
3
|
+
A Python library for downloading, extracting and formatting historical
|
|
4
|
+
Binance market data.
|
|
5
|
+
|
|
6
|
+
The package provides a pipeline that:
|
|
7
|
+
|
|
8
|
+
1. Downloads data from Binance Vision
|
|
9
|
+
2. Extracts compressed archives
|
|
10
|
+
3. Formats data into a structured dataset
|
|
11
|
+
|
|
12
|
+
It is designed for fast data preparation for quantitative trading,
|
|
13
|
+
backtesting and data analysis.
|
|
14
|
+
|
|
15
|
+
------------------------------------------------------------------------
|
|
16
|
+
|
|
17
|
+
# Installation
|
|
18
|
+
|
|
19
|
+
``` bash
|
|
20
|
+
pip install rm-bdd
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
------------------------------------------------------------------------
|
|
24
|
+
|
|
25
|
+
# Features
|
|
26
|
+
|
|
27
|
+
- Download historical Binance data
|
|
28
|
+
- Automatic archive extraction
|
|
29
|
+
- Data formatting pipeline
|
|
30
|
+
- Metadata management
|
|
31
|
+
- Async architecture
|
|
32
|
+
- Easy integration into trading systems
|
|
33
|
+
|
|
34
|
+
------------------------------------------------------------------------
|
|
35
|
+
|
|
36
|
+
# Quick Example
|
|
37
|
+
|
|
38
|
+
``` python
|
|
39
|
+
import asyncio
|
|
40
|
+
import time
|
|
41
|
+
|
|
42
|
+
from rm_bdd.data_downloader import DataDownloader
|
|
43
|
+
from rm_bdd.data_extractor import DataExtractor
|
|
44
|
+
from rm_bdd.data_formatter import DataFormatter
|
|
45
|
+
from rm_bdd.binance_metadata_manager import BinanceMetadataManager
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
class DataManager:
|
|
49
|
+
|
|
50
|
+
def __init__(self, downloader, extractor, formatter):
|
|
51
|
+
self._downloader = downloader
|
|
52
|
+
self._extractor = extractor
|
|
53
|
+
self._formatter = formatter
|
|
54
|
+
|
|
55
|
+
async def download_and_save(self, symbol, timeframe, date_from=None, date_to=None):
|
|
56
|
+
|
|
57
|
+
start = time.time()
|
|
58
|
+
await self._downloader.download(symbol, timeframe, date_from=date_from, date_to=date_to)
|
|
59
|
+
print("download time:", time.time() - start)
|
|
60
|
+
|
|
61
|
+
start = time.time()
|
|
62
|
+
await self._extractor.extract(symbol, timeframe, date_from=date_from, date_to=date_to)
|
|
63
|
+
print("extract time:", time.time() - start)
|
|
64
|
+
|
|
65
|
+
start = time.time()
|
|
66
|
+
await self._formatter.format(symbol, timeframe, date_from=date_from, date_to=date_to)
|
|
67
|
+
print("format time:", time.time() - start)
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
async def main():
|
|
71
|
+
|
|
72
|
+
downloader = DataDownloader(
|
|
73
|
+
"downloads/",
|
|
74
|
+
BinanceMetadataManager("downloads/metadata.json")
|
|
75
|
+
)
|
|
76
|
+
|
|
77
|
+
extractor = DataExtractor(
|
|
78
|
+
"downloads/",
|
|
79
|
+
"extracts/",
|
|
80
|
+
BinanceMetadataManager("extracts/metadata.json")
|
|
81
|
+
)
|
|
82
|
+
|
|
83
|
+
formatter = DataFormatter(
|
|
84
|
+
"extracts/",
|
|
85
|
+
"data/",
|
|
86
|
+
BinanceMetadataManager("data/metadata.json")
|
|
87
|
+
)
|
|
88
|
+
|
|
89
|
+
manager = DataManager(downloader, extractor, formatter)
|
|
90
|
+
|
|
91
|
+
await manager.download_and_save("BTCUSDT", "1m")
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
asyncio.run(main())
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
------------------------------------------------------------------------
|
|
98
|
+
|
|
99
|
+
# Result Folder Structure
|
|
100
|
+
|
|
101
|
+
After execution the folders will look like:
|
|
102
|
+
|
|
103
|
+
downloads/
|
|
104
|
+
BTCUSDT/
|
|
105
|
+
metadata.json
|
|
106
|
+
|
|
107
|
+
extracts/
|
|
108
|
+
BTCUSDT/
|
|
109
|
+
metadata.json
|
|
110
|
+
|
|
111
|
+
data/
|
|
112
|
+
BTCUSDT/
|
|
113
|
+
metadata.json
|
|
114
|
+
|
|
115
|
+
------------------------------------------------------------------------
|
|
116
|
+
|
|
117
|
+
# Pipeline Overview
|
|
118
|
+
|
|
119
|
+
The processing pipeline consists of three stages:
|
|
120
|
+
|
|
121
|
+
### Downloader
|
|
122
|
+
|
|
123
|
+
Downloads historical data archives from Binance Vision.
|
|
124
|
+
|
|
125
|
+
### Extractor
|
|
126
|
+
|
|
127
|
+
Extracts downloaded archives.
|
|
128
|
+
|
|
129
|
+
### Formatter
|
|
130
|
+
|
|
131
|
+
Formats extracted CSV data into a structured dataset ready for analysis.
|
|
132
|
+
|
|
133
|
+
------------------------------------------------------------------------
|
|
134
|
+
|
|
135
|
+
# Metadata Manager
|
|
136
|
+
|
|
137
|
+
The library uses a metadata system to track downloaded, extracted and
|
|
138
|
+
formatted data.
|
|
139
|
+
|
|
140
|
+
This prevents duplicate downloads and processing.
|
|
141
|
+
|
|
142
|
+
------------------------------------------------------------------------
|
|
143
|
+
|
|
144
|
+
# Example Use Case
|
|
145
|
+
|
|
146
|
+
Typical workflow:
|
|
147
|
+
|
|
148
|
+
download → extract → format → analyze
|
|
149
|
+
|
|
150
|
+
Used for:
|
|
151
|
+
|
|
152
|
+
- algorithmic trading
|
|
153
|
+
- backtesting
|
|
154
|
+
- machine learning datasets
|
|
155
|
+
- market research
|
|
156
|
+
|
|
157
|
+
------------------------------------------------------------------------
|
|
158
|
+
|
|
159
|
+
# Requirements
|
|
160
|
+
|
|
161
|
+
Python 3.10+
|
|
162
|
+
|
|
163
|
+
------------------------------------------------------------------------
|
|
164
|
+
|
|
165
|
+
# License
|
|
166
|
+
|
|
167
|
+
MIT
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=77"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "rm-bdd"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Binance Data Downloader"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.10"
|
|
11
|
+
license = "MIT"
|
|
12
|
+
authors = [
|
|
13
|
+
{ name = "Your Name", email = "you@example.com" }
|
|
14
|
+
]
|
|
15
|
+
dependencies = [
|
|
16
|
+
"aiohttp==3.13.3",
|
|
17
|
+
"pyarrow==23.0.1",
|
|
18
|
+
"pandas==3.0.1",
|
|
19
|
+
"xmltodict==1.0.4",
|
|
20
|
+
]
|
|
21
|
+
|
|
22
|
+
[tool.setuptools]
|
|
23
|
+
package-dir = {"" = "src"}
|
|
24
|
+
|
|
25
|
+
[tool.setuptools.packages.find]
|
|
26
|
+
where = ["src"]
|
|
27
|
+
include = ["rm_bdd*"]
|
rm_bdd-0.1.0/setup.cfg
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
from src.rm_bdd.metadata_manager import MetadataManager
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def _get_data_from_filename(filename) -> tuple[str, str, str]:
|
|
5
|
+
filename = filename.split(".", 1)[0]
|
|
6
|
+
symbol, timeframe, date = filename.split("-", 2)
|
|
7
|
+
return symbol, timeframe, date
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class BinanceMetadataManager:
    """Adapter that keys MetadataManager entries by Binance archive paths.

    Paths are expected to contain at least one "/"; the basename is parsed
    into (symbol, timeframe, date) before delegating to MetadataManager.
    """

    def __init__(self, file_path):
        self._metadata_manager = MetadataManager(file_path)

    def update(self, filename):
        """Record the (symbol, timeframe, date) parsed from *filename*."""
        basename = filename.rsplit("/", 1)[1]
        symbol, timeframe, date = _get_data_from_filename(basename)
        self._metadata_manager.update_metadata(symbol, timeframe, date)

    def check(self, filename):
        """Return True when *filename* has not been recorded yet."""
        basename = filename.rsplit("/", 1)[1]
        symbol, timeframe, date = _get_data_from_filename(basename)
        return self._metadata_manager.check_date_not_in_metadata(
            symbol, timeframe, date
        )
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
import os
|
|
2
|
+
|
|
3
|
+
import pandas as pd
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def csvs_to_parquets(files):
    """Convert CSV files with an epoch-millisecond "timestamp" column into
    Parquet files under a local "parquets/" directory.

    Each output keeps the input's basename with a ".parquet" extension.

    Fixes vs. the original: ``infer_datetime_format`` was deprecated in
    pandas 2.0 and removed later (this package pins pandas==3.0.1, where
    the kwarg raises); the column was also parsed twice (``parse_dates``
    then ``to_datetime(unit="ms")``), which is contradictory — it is now
    converted exactly once from epoch milliseconds. ``makedirs`` is
    hoisted out of the loop.
    """
    output_dir = "parquets"
    os.makedirs(output_dir, exist_ok=True)

    for file in files:
        df = pd.read_csv(file)

        # "timestamp" holds epoch milliseconds; convert once and index by it.
        df["timestamp"] = pd.to_datetime(df["timestamp"], unit="ms")
        df.set_index("timestamp", inplace=True)

        parquet_file = os.path.join(
            output_dir, os.path.basename(file).replace(".csv", ".parquet")
        )
        df.to_parquet(parquet_file, engine="pyarrow")
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def normalize_time(series):
    """Normalize an epoch-time Series to milliseconds.

    Values larger than 10**13 are assumed to be in a finer unit
    (microseconds or nanoseconds, partially reduced) and are
    floor-divided by 1000; millisecond-scale values pass through.

    Returns a new Series. Fix vs. the original: the input is no longer
    mutated in place via masked assignment (callers always assign the
    return value back, so behavior at call sites is unchanged).
    """
    return series.mask(series > 10 ** 13, series // 1000)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
async def csv_to_partitioned_parquet(files, symbol, timeframe, path="data"):
    """Concatenate headerless Binance kline CSVs and write one Parquet
    dataset partitioned by symbol/timeframe/year/month/day under *path*.

    Fix vs. the original: ``pd.to_datetime`` was invoked three times on
    the same column to derive year/month/day; it is now computed once.
    """
    columns = [
        "open_time",
        "open",
        "high",
        "low",
        "close",
        "volume",
        "close_time",
        "quote_asset_volume",
        "number_of_trades",
        "taker_buy_base_asset_volume",
        "taker_buy_quote_asset_volume",
        "ignore",
    ]
    df = pd.concat(
        [pd.read_csv(file, names=columns) for file in files], ignore_index=True
    )
    df["open_time"] = normalize_time(df["open_time"])
    df["close_time"] = normalize_time(df["close_time"])

    # Derive all partition keys from a single datetime conversion.
    open_dt = pd.to_datetime(df["open_time"], unit="ms", utc=True)
    df["year"] = open_dt.dt.year
    df["month"] = open_dt.dt.month
    df["day"] = open_dt.dt.day
    df["symbol"] = symbol
    df["timeframe"] = timeframe
    df.to_parquet(path, partition_cols=["symbol", "timeframe", "year", "month", "day"])
|
|
65
|
+
|
|
66
|
+
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import logging
|
|
3
|
+
import os
|
|
4
|
+
|
|
5
|
+
import aiohttp
|
|
6
|
+
|
|
7
|
+
from src.rm_bdd.binance_metadata_manager import BinanceMetadataManager
|
|
8
|
+
from src.rm_bdd.files_parser import get_all_available_filenames_in_daterange
|
|
9
|
+
|
|
10
|
+
logger = logging.getLogger(__name__)
|
|
11
|
+
BASE_VISION_URL = "https://data.binance.vision/"
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
async def download_file(key, folder) -> str | None:
    """Download object *key* from Binance Vision into *folder*.

    Returns the local file path on success, or None on any failure
    (the error is logged, never raised to the caller).
    """
    logger.info(f"Downloading {key} to {folder}")
    url = BASE_VISION_URL + key

    os.makedirs(folder, exist_ok=True)
    local_filename = os.path.join(folder, url.split("/")[-1])
    try:
        async with aiohttp.ClientSession() as session:
            async with session.get(url) as r:
                r.raise_for_status()
                # NOTE(review): file writes here are synchronous and briefly
                # block the event loop; acceptable for 8 KiB chunks.
                with open(local_filename, "wb") as f:
                    async for chunk in r.content.iter_chunked(8192):
                        f.write(chunk)
        logger.info(f"Downloaded {key} to {local_filename}")
        return local_filename
    except Exception as e:
        # Fix: the original emitted the same failure twice (info + error);
        # a single error-level record carries the full context.
        logger.error(f"Failed to download {key}: {e}")
        return None
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class DataDownloader:
    """Downloads Binance Vision kline archives into a folder, skipping
    files already recorded by the metadata manager."""

    def __init__(self, download_folder, metadata_manager):
        self._download_folder = download_folder
        self._metadata_manager = metadata_manager

    async def download(self, symbol, timeframe, date_from=None, date_to=None):
        """Download every available archive for symbol/timeframe within
        [date_from, date_to] concurrently.

        Returns the per-file results from ``download_file`` (local path on
        success, None on failure).
        """
        filenames = await get_all_available_filenames_in_daterange(
            symbol, timeframe, date_from, date_to
        )

        # Drop files whose (symbol, timeframe, date) is already recorded.
        filenames = list(
            filter(self._metadata_manager.check, filenames)
        )

        logger.info(
            f"Downloading {timeframe} data for {symbol} from {date_from} to {date_to}"
        )

        results = await asyncio.gather(
            *(download_file(filename, self._download_folder) for filename in filenames)
        )
        logger.info(f"Finished downloading {timeframe} data for {symbol}")
        logger.debug(f"Downloaded {len(results)} results for {symbol}")

        for filename in results:
            # Fix: download_file returns None on failure; the original
            # passed None straight to update(), which crashed with
            # AttributeError when parsing the filename. Skip failures.
            if filename is not None:
                self._metadata_manager.update(filename)

        return results
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
if __name__ == '__main__':
    # Manual smoke test: download every available BTCUSDT 1m archive into
    # downloads/, tracking completed dates in downloads/metadata.json.
    async def main():
        downloader = DataDownloader("downloads/", BinanceMetadataManager("downloads/metadata.json"))
        await downloader.download("BTCUSDT", '1m')


    asyncio.run(main())
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from functools import partial
|
|
3
|
+
|
|
4
|
+
import zipfile
|
|
5
|
+
|
|
6
|
+
from src.rm_bdd.binance_metadata_manager import BinanceMetadataManager
|
|
7
|
+
from src.rm_bdd.files_parser import _is_filename_in_date_range
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class DataExtractor:
    """Extracts downloaded Binance .zip archives into an extract folder,
    tracking completed archives via the metadata manager."""

    def __init__(self, download_folder, extract_folder, metadata_manager):
        self._download_folder = download_folder
        self._extract_folder = extract_folder
        self._metadata_manager = metadata_manager

    async def extract(self, symbol, timeframe, date_from=None, date_to=None):
        """Extract all downloaded archives for symbol/timeframe whose
        embedded date lies within [date_from, date_to]."""
        downloaded_files = self._downloaded_files
        # Keep only archives for this symbol/timeframe...
        files_to_extract = filter(lambda a: a.startswith(f'{symbol.upper()}-{timeframe}'), downloaded_files)
        # ...then narrow to the requested date range.
        files_to_extract = list(filter(
            partial(_is_filename_in_date_range, date_from=date_from, date_to=date_to),
            files_to_extract,
        ))
        retval = await self.extract_files(files_to_extract)

        return retval

    async def extract_files(self, filenames):
        """Extract the given archive names (relative to the download
        folder); returns the list of archives actually extracted."""
        # Prepend the download folder so names become openable paths.
        filenames = [self._download_folder + filename for filename in filenames]
        # Skip archives already recorded as extracted.
        filenames = list(
            filter(self._metadata_manager.check, filenames)
        )
        # Presumably de-duplicates daily vs. monthly data: a file is kept
        # unless stripping its last "-" component yields the name of a
        # monthly .zip that is also queued — TODO confirm against the
        # Binance Vision naming scheme.
        filenames = list(filter(lambda a: a.count("-") == 1 or f'{a.rsplit("-", 1)[0]}.zip' not in filenames, filenames))
        retval = []
        for filename in filenames:
            with zipfile.ZipFile(filename) as zip_file:
                zip_file.extractall(self._extract_folder)
            self._metadata_manager.update(filename)
            retval.append(filename)
        return retval

    @property
    def _downloaded_files(self) -> list[str]:
        # All .zip archives currently present in the download folder.
        return list(filter(lambda a: a.endswith(".zip"), os.listdir(self._download_folder)))
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
if __name__ == '__main__':
    import asyncio

    # Manual smoke test: extract previously downloaded BTCUSDT 1m archives
    # from downloads/ into extracts/, tracked in extracts/metadata.json.
    async def main():
        extractor = DataExtractor("downloads/", "extracts/", BinanceMetadataManager("extracts/metadata.json"))
        await extractor.extract("BTCUSDT", '1m')


    asyncio.run(main())
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from functools import partial
|
|
3
|
+
|
|
4
|
+
from src.rm_bdd.csvs_to_parquet import csv_to_partitioned_parquet
|
|
5
|
+
from src.rm_bdd.files_parser import _is_filename_in_date_range
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class DataFormatter:
    """Formats extracted kline CSVs into a partitioned Parquet dataset,
    tracking completed files via the metadata manager."""

    def __init__(self, extract_folder, data_folder, metadata_manager):
        self._extract_folder = extract_folder
        self._data_folder = data_folder
        self._metadata_manager = metadata_manager

    async def format(self, symbol, timeframe, date_from=None, date_to=None):
        """Format all extracted CSVs for symbol/timeframe whose embedded
        date lies within [date_from, date_to]."""
        extracted_files = self._extracted_files
        files_to_format = filter(lambda a: a.startswith(f'{symbol.upper()}-{timeframe}'), extracted_files)
        files_to_format = list(filter(
            partial(_is_filename_in_date_range, date_from=date_from, date_to=date_to),
            files_to_format,
        ))

        await self.format_files(files_to_format)

    async def format_files(self, filenames):
        """Convert the given CSV names (relative to the extract folder)
        into the partitioned dataset; returns the files processed."""
        os.makedirs(self._data_folder, exist_ok=True)
        filenames = [self._extract_folder + filename for filename in filenames]

        # Presumably prefers monthly CSVs: a daily file is dropped when the
        # CSV covering its whole month is also queued — TODO confirm.
        filenames = list(
            filter(lambda a: a.count("-") == 1 or f'{a.rsplit("-", 1)[0]}.csv' not in filenames, filenames))

        if not filenames:
            # Fix: the original indexed filenames[0] unconditionally and
            # crashed with IndexError when nothing matched.
            return []

        # Symbol and timeframe are recovered from the first file's basename.
        symbol = filenames[0].rsplit("/", 1)[-1].split("-", 1)[0]
        timeframe = filenames[0].rsplit("/", 1)[-1].split("-", 2)[1]
        filenames = list(
            filter(self._metadata_manager.check, filenames)
        )
        print("files to format: ", filenames)
        retval = []
        if filenames:
            await csv_to_partitioned_parquet(filenames, symbol, timeframe, self._data_folder)

            for filename in filenames:
                self._metadata_manager.update(filename)
                retval.append(filename)
        return retval

    @property
    def _extracted_files(self) -> list[str]:
        # All .csv files currently present in the extract folder.
        return list(filter(lambda a: a.endswith(".csv"), os.listdir(self._extract_folder)))
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
if __name__ == '__main__':
    import asyncio

    # Manual smoke test: format previously extracted BTCUSDT 1m CSVs.
    async def main():
        from src.rm_bdd.binance_metadata_manager import BinanceMetadataManager
        formatter = DataFormatter("extracts/", "data/", BinanceMetadataManager("data/metadata.json"))
        # Fix: format() is a coroutine; the original called it from a
        # synchronous main() without awaiting, so nothing ever ran.
        await formatter.format("BTCUSDT", '1m')


    asyncio.run(main())
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
import time
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class DataManager:
    """Runs the download -> extract -> format pipeline, printing the wall
    time each stage took."""

    def __init__(self, downloader, extractor, formatter):
        self._downloader = downloader
        self._extractor = extractor
        self._formatter = formatter

    async def download_and_save(self, symbol, timeframe, date_from=None, date_to=None):
        """Execute all three pipeline stages for symbol/timeframe over
        the optional [date_from, date_to] range."""
        stages = (
            ("download", self._downloader.download),
            ("extract", self._extractor.extract),
            ("format", self._formatter.format),
        )
        for label, stage in stages:
            stage_start = time.time()
            await stage(symbol, timeframe, date_from=date_from, date_to=date_to)
            print(f"{label} time: ", time.time() - stage_start)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
if __name__ == '__main__':
    import asyncio
    from src.rm_bdd.data_downloader import DataDownloader
    from src.rm_bdd.data_extractor import DataExtractor
    from src.rm_bdd.data_formatter import DataFormatter
    from src.rm_bdd.binance_metadata_manager import BinanceMetadataManager

    # Each stage gets its own metadata file so download, extract and
    # format progress are tracked independently.
    downloader = DataDownloader("downloads/", BinanceMetadataManager("downloads/metadata.json"))
    extractor = DataExtractor("downloads/", "extracts/", BinanceMetadataManager("extracts/metadata.json"))
    formatter = DataFormatter("extracts/", "data/", BinanceMetadataManager("data/metadata.json"))

    manager = DataManager(downloader, extractor, formatter)


    async def main():
        await manager.download_and_save("BTCUSDT", '1m')


    # Time the full pipeline end to end.
    start = time.time()
    asyncio.run(main())

    print(time.time() - start)
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import datetime
|
|
3
|
+
import re
|
|
4
|
+
from functools import partial
|
|
5
|
+
|
|
6
|
+
from src.rm_bdd.binance_metadata_manager import _get_data_from_filename
|
|
7
|
+
from src.rm_bdd.path_loader import get_all_path_responses
|
|
8
|
+
|
|
9
|
+
DAY_DATE_REGEX = re.compile(r"^\d{4}-\d{2}-\d{2}$")
|
|
10
|
+
async def _get_all_available_filenames(symbol, timeframe) -> list[str]:
    """List all monthly kline archive keys for symbol/timeframe plus the
    daily archive keys newer than the last available monthly archive.

    Returns an empty list when Binance Vision has no monthly data at all.
    """
    path = f"data/spot/monthly/klines/{symbol}/{timeframe}"
    monthly = await get_all_path_responses(path)

    filenames = [item['Key'] for item in monthly[0]['ListBucketResult'].get('Contents', [])]
    if not filenames:
        # Fix: the original indexed filenames[-1] unconditionally and
        # crashed with IndexError for symbols with no monthly listing.
        return filenames

    # Last monthly key looks like ".../SYMBOL-tf-YYYY-MM.zip".
    file_path, year, month = filenames[-1].rsplit(".", 2)[0].rsplit("-", 2)
    file_path = file_path.replace("monthly", "daily")

    # Advance to the first month NOT covered by a monthly archive.
    if month == '12':
        # Fix: the original bumped the year but left month at "12", so
        # after a December archive the daily marker pointed a whole year
        # ahead and the daily files were skipped.
        year = "%02i" % (int(year) + 1)
        month = "01"
    else:
        month = "%02i" % (int(month) + 1)

    file_path = "-".join([file_path, year, month])

    path = path.replace("monthly", "daily")

    # S3 `marker` starts the daily listing just after the covered months.
    daily = await get_all_path_responses(path, marker=file_path)

    filenames.extend([item['Key'] for item in daily[0]['ListBucketResult'].get('Contents', [])])
    return filenames
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def str_to_datetime(date: str) -> datetime.datetime:
    """Parse a "YYYY-MM-DD" or "YYYY-MM" string into a naive datetime."""
    fmt = "%Y-%m-%d" if DAY_DATE_REGEX.match(date) else "%Y-%m"
    return datetime.datetime.strptime(date, fmt)
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def _is_filename_in_date_range(filename: str, date_from: str, date_to: str) -> bool:
    """Return True when the date embedded in *filename* lies within
    [date_from, date_to]; a falsy bound is treated as open-ended."""
    *_, raw_date = _get_data_from_filename(filename)
    file_date = str_to_datetime(raw_date)
    if date_from and file_date < str_to_datetime(date_from):
        return False
    if date_to and file_date > str_to_datetime(date_to):
        return False
    return True
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
async def get_all_available_filenames_in_daterange(
    symbol, timeframe, date_from=None, date_to=None
) -> list[str]:
    """List every available archive key for symbol/timeframe, optionally
    restricted to the [date_from, date_to] range."""
    filenames = await _get_all_available_filenames(symbol, timeframe)
    if not (date_from or date_to):
        return filenames
    in_range = partial(_is_filename_in_date_range, date_from=date_from, date_to=date_to)
    return [name for name in filenames if in_range(name)]
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
if __name__ == '__main__':
    # Manual smoke test: list available BTCUSDT 1m archives from 2025-09.
    async def main():
        # await _get_all_available_filenames("BTCUSDT", timeframe="1m")
        # result = await get_all_available_filenames_in_daterange("BTCUSDT", '1m', '2024-03', '2024-05')
        # result = await get_all_available_filenames_in_daterange("BTCUSDT", '1m')
        result = await get_all_available_filenames_in_daterange("BTCUSDT", '1m', date_from="2025-09")
        print(result)
        ...

    asyncio.run(main())
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import logging
|
|
3
|
+
import os
|
|
4
|
+
|
|
5
|
+
logger = logging.getLogger(__name__)


class MetadataManager:
    """Tracks, per symbol and timeframe, the date strings that have
    already been processed, persisted as a JSON file.

    The mapping shape is {symbol: {timeframe: [date, ...]}}.
    """

    def __init__(self, file_path):
        # Metadata is loaded lazily on first access (see `data`).
        self._data = None
        self._file_path = file_path

    def _load_metadata(self):
        """Load the JSON file, or start with an empty mapping if absent."""
        if os.path.isfile(self._file_path):
            with open(self._file_path, "r") as f:
                self._data = json.load(f)
        else:
            self._data = {}

    @property
    def data(self):
        """Lazily-loaded metadata mapping."""
        if self._data is None:
            self._load_metadata()
        return self._data

    def _save_metadata(self):
        # Fix: ensure the parent directory exists; the original assumed it
        # did and failed with FileNotFoundError on a fresh checkout.
        parent = os.path.dirname(self._file_path)
        if parent:
            os.makedirs(parent, exist_ok=True)
        with open(self._file_path, mode="w", encoding="utf-8") as f:
            json.dump(self.data, f, indent=4)

    def update_metadata(self, symbol: str, timeframe: str, date: str):
        """Append *date* under symbol/timeframe and persist, skipping a
        consecutive duplicate of the most recent entry."""
        if symbol not in self.data:
            self._data[symbol] = {}

        if timeframe not in self.data[symbol]:
            self._data[symbol][timeframe] = []

        if not self._data[symbol][timeframe] or self._data[symbol][timeframe][-1] != date:
            self._data[symbol][timeframe].append(date)
            self._save_metadata()

    def check_date_not_in_metadata(self, symbol: str, timeframe: str, date: str):
        """Return True when *date* is NOT yet recorded.

        A daily date ("YYYY-MM-DD") also counts as recorded when its
        whole month ("YYYY-MM") is present in the metadata.
        """
        metadata = self.data.get(symbol, {}).get(timeframe, [])
        date_not_in_metadata = date not in metadata
        date_month_not_in_metadata = date.count("-") != 2 or date.rsplit("-", 1)[0] not in metadata
        result = date_not_in_metadata and date_month_not_in_metadata
        if not result:
            logger.info(f"Data for {symbol} {timeframe} {date} already exists. Skip.")
        return result
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
|
|
3
|
+
import aiohttp
|
|
4
|
+
import xmltodict
|
|
5
|
+
|
|
6
|
+
URL_TEMPLATE = "https://s3-ap-northeast-1.amazonaws.com/data.binance.vision"
|
|
7
|
+
SEMAPHORE = asyncio.Semaphore(300)
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
async def get_all_paths(prefix: str, delimiter="/"):
    """Collect every ``CommonPrefixes`` path listed under *prefix*."""
    paths = []
    for page in await get_all_path_responses(prefix, delimiter):
        found = page.get("ListBucketResult", {}).get("CommonPrefixes", [])
        # xmltodict yields a bare dict (not a one-element list) when the XML
        # contains a single CommonPrefixes element — normalize to a list.
        if not isinstance(found, list):
            found = [found]
        paths += [entry["Prefix"] for entry in found]
    return paths
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
async def get_all_path_responses(prefix: str, delimiter="/", marker=None, **params):
    """Fetch every page of the S3 listing for *prefix*, following pagination.

    Pages are requested until a response reports ``IsTruncated`` false; the
    ``NextMarker`` of each truncated page seeds the next request.
    """
    pages = []
    while True:
        page = await get_paths_response_json(prefix, delimiter, marker, **params)
        pages.append(page)
        listing = page.get("ListBucketResult", {})
        if listing.get("IsTruncated", "false") != "true":
            break
        marker = listing.get("NextMarker", None)
    return pages
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
async def get_paths_response_json(prefix: str, delimiter="/", marker=None, **params):
    """Perform one ListBucket request against Binance Vision and return the
    parsed XML response as a dict.

    Parameters
    ----------
    prefix : str
        S3 key prefix to list; a trailing "/" is appended when missing.
    delimiter : str
        S3 delimiter used to group keys into ``CommonPrefixes``.
    marker : str | None
        Pagination marker from a previous response's ``NextMarker``.
    **params
        Extra query-string parameters forwarded to the request.
    """
    if not prefix.endswith("/"):
        prefix += "/"
    # Use a distinct name instead of shadowing the **params kwarg.
    query = {"prefix": prefix, "delimiter": delimiter, "marker": marker, **params}
    # Drop None values — they would otherwise be sent as literal "None".
    query = {k: v for k, v in query.items() if v is not None}
    # Fix: URL_TEMPLATE contains no "{prefix}" placeholder, so the original
    # trailing-slash strip and .format(prefix=...) were provable no-ops;
    # the request URL is simply the bucket endpoint.
    url = URL_TEMPLATE
    async with SEMAPHORE:
        # A fresh session per request is simple but costly; heavy callers
        # could share one ClientSession (interface kept unchanged here).
        async with aiohttp.ClientSession() as session:
            async with session.get(url, params=query) as response:
                text = await response.text()
                return xmltodict.parse(text)
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
# Ad-hoc smoke test: lists all object keys for SOLUSDT 1m monthly klines.
# Requires network access to the Binance Vision S3 bucket.
if __name__ == "__main__":

    async def main():
        responses = await get_all_path_responses("data/spot/monthly/klines/SOLUSDT/1m")
        # Flatten the Contents entries of every response page into key names.
        print(
            [
                i.get("Key")
                for response in responses for i in response.get("ListBucketResult", {}).get("Contents", [])
            ]
        )

    asyncio.run(main())
|
|
@@ -0,0 +1,180 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: rm-bdd
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Binance Data Downloader
|
|
5
|
+
Author-email: Your Name <you@example.com>
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Requires-Python: >=3.10
|
|
8
|
+
Description-Content-Type: text/markdown
|
|
9
|
+
Requires-Dist: aiohttp==3.13.3
|
|
10
|
+
Requires-Dist: pyarrow==23.0.1
|
|
11
|
+
Requires-Dist: pandas==3.0.1
|
|
12
|
+
Requires-Dist: xmltodict==1.0.4
|
|
13
|
+
|
|
14
|
+
# rm-binance-data-downloader
|
|
15
|
+
|
|
16
|
+
A Python library for downloading, extracting and formatting historical
|
|
17
|
+
Binance market data.
|
|
18
|
+
|
|
19
|
+
The package provides a pipeline that:
|
|
20
|
+
|
|
21
|
+
1. Downloads data from Binance Vision
|
|
22
|
+
2. Extracts compressed archives
|
|
23
|
+
3. Formats data into a structured dataset
|
|
24
|
+
|
|
25
|
+
It is designed to prepare data quickly for quantitative trading,
|
|
26
|
+
backtesting and data analysis.
|
|
27
|
+
|
|
28
|
+
------------------------------------------------------------------------
|
|
29
|
+
|
|
30
|
+
# Installation
|
|
31
|
+
|
|
32
|
+
``` bash
|
|
33
|
+
pip install rm-bdd
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
------------------------------------------------------------------------
|
|
37
|
+
|
|
38
|
+
# Features
|
|
39
|
+
|
|
40
|
+
- Download historical Binance data
|
|
41
|
+
- Automatic archive extraction
|
|
42
|
+
- Data formatting pipeline
|
|
43
|
+
- Metadata management
|
|
44
|
+
- Async architecture
|
|
45
|
+
- Easy integration into trading systems
|
|
46
|
+
|
|
47
|
+
------------------------------------------------------------------------
|
|
48
|
+
|
|
49
|
+
# Quick Example
|
|
50
|
+
|
|
51
|
+
``` python
|
|
52
|
+
import asyncio
|
|
53
|
+
import time
|
|
54
|
+
|
|
55
|
+
from rm_bdd.data_downloader import DataDownloader
|
|
56
|
+
from rm_bdd.data_extractor import DataExtractor
|
|
57
|
+
from rm_bdd.data_formatter import DataFormatter
|
|
58
|
+
from rm_bdd.binance_metadata_manager import BinanceMetadataManager
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
class DataManager:
|
|
62
|
+
|
|
63
|
+
def __init__(self, downloader, extractor, formatter):
|
|
64
|
+
self._downloader = downloader
|
|
65
|
+
self._extractor = extractor
|
|
66
|
+
self._formatter = formatter
|
|
67
|
+
|
|
68
|
+
async def download_and_save(self, symbol, timeframe, date_from=None, date_to=None):
|
|
69
|
+
|
|
70
|
+
start = time.time()
|
|
71
|
+
await self._downloader.download(symbol, timeframe, date_from=date_from, date_to=date_to)
|
|
72
|
+
print("download time:", time.time() - start)
|
|
73
|
+
|
|
74
|
+
start = time.time()
|
|
75
|
+
await self._extractor.extract(symbol, timeframe, date_from=date_from, date_to=date_to)
|
|
76
|
+
print("extract time:", time.time() - start)
|
|
77
|
+
|
|
78
|
+
start = time.time()
|
|
79
|
+
await self._formatter.format(symbol, timeframe, date_from=date_from, date_to=date_to)
|
|
80
|
+
print("format time:", time.time() - start)
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
async def main():
|
|
84
|
+
|
|
85
|
+
downloader = DataDownloader(
|
|
86
|
+
"downloads/",
|
|
87
|
+
BinanceMetadataManager("downloads/metadata.json")
|
|
88
|
+
)
|
|
89
|
+
|
|
90
|
+
extractor = DataExtractor(
|
|
91
|
+
"downloads/",
|
|
92
|
+
"extracts/",
|
|
93
|
+
BinanceMetadataManager("extracts/metadata.json")
|
|
94
|
+
)
|
|
95
|
+
|
|
96
|
+
formatter = DataFormatter(
|
|
97
|
+
"extracts/",
|
|
98
|
+
"data/",
|
|
99
|
+
BinanceMetadataManager("data/metadata.json")
|
|
100
|
+
)
|
|
101
|
+
|
|
102
|
+
manager = DataManager(downloader, extractor, formatter)
|
|
103
|
+
|
|
104
|
+
await manager.download_and_save("BTCUSDT", "1m")
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
asyncio.run(main())
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
------------------------------------------------------------------------
|
|
111
|
+
|
|
112
|
+
# Result Folder Structure
|
|
113
|
+
|
|
114
|
+
After execution the folders will look like:
|
|
115
|
+
|
|
116
|
+
downloads/
|
|
117
|
+
BTCUSDT/
|
|
118
|
+
metadata.json
|
|
119
|
+
|
|
120
|
+
extracts/
|
|
121
|
+
BTCUSDT/
|
|
122
|
+
metadata.json
|
|
123
|
+
|
|
124
|
+
data/
|
|
125
|
+
BTCUSDT/
|
|
126
|
+
metadata.json
|
|
127
|
+
|
|
128
|
+
------------------------------------------------------------------------
|
|
129
|
+
|
|
130
|
+
# Pipeline Overview
|
|
131
|
+
|
|
132
|
+
The processing pipeline consists of three stages:
|
|
133
|
+
|
|
134
|
+
### Downloader
|
|
135
|
+
|
|
136
|
+
Downloads historical data archives from Binance Vision.
|
|
137
|
+
|
|
138
|
+
### Extractor
|
|
139
|
+
|
|
140
|
+
Extracts downloaded archives.
|
|
141
|
+
|
|
142
|
+
### Formatter
|
|
143
|
+
|
|
144
|
+
Formats extracted CSV data into a structured dataset ready for analysis.
|
|
145
|
+
|
|
146
|
+
------------------------------------------------------------------------
|
|
147
|
+
|
|
148
|
+
# Metadata Manager
|
|
149
|
+
|
|
150
|
+
The library uses a metadata system to track downloaded, extracted and
|
|
151
|
+
formatted data.
|
|
152
|
+
|
|
153
|
+
This prevents duplicate downloads and processing.
|
|
154
|
+
|
|
155
|
+
------------------------------------------------------------------------
|
|
156
|
+
|
|
157
|
+
# Example Use Case
|
|
158
|
+
|
|
159
|
+
Typical workflow:
|
|
160
|
+
|
|
161
|
+
download → extract → format → analyze
|
|
162
|
+
|
|
163
|
+
Used for:
|
|
164
|
+
|
|
165
|
+
- algorithmic trading
|
|
166
|
+
- backtesting
|
|
167
|
+
- machine learning datasets
|
|
168
|
+
- market research
|
|
169
|
+
|
|
170
|
+
------------------------------------------------------------------------
|
|
171
|
+
|
|
172
|
+
# Requirements
|
|
173
|
+
|
|
174
|
+
Python 3.10+
|
|
175
|
+
|
|
176
|
+
------------------------------------------------------------------------
|
|
177
|
+
|
|
178
|
+
# License
|
|
179
|
+
|
|
180
|
+
MIT
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
README.md
|
|
2
|
+
pyproject.toml
|
|
3
|
+
src/rm_bdd/__init__.py
|
|
4
|
+
src/rm_bdd/binance_metadata_manager.py
|
|
5
|
+
src/rm_bdd/csvs_to_parquet.py
|
|
6
|
+
src/rm_bdd/data_downloader.py
|
|
7
|
+
src/rm_bdd/data_extractor.py
|
|
8
|
+
src/rm_bdd/data_formatter.py
|
|
9
|
+
src/rm_bdd/data_manager.py
|
|
10
|
+
src/rm_bdd/files_parser.py
|
|
11
|
+
src/rm_bdd/metadata_manager.py
|
|
12
|
+
src/rm_bdd/path_loader.py
|
|
13
|
+
src/rm_bdd.egg-info/PKG-INFO
|
|
14
|
+
src/rm_bdd.egg-info/SOURCES.txt
|
|
15
|
+
src/rm_bdd.egg-info/dependency_links.txt
|
|
16
|
+
src/rm_bdd.egg-info/requires.txt
|
|
17
|
+
src/rm_bdd.egg-info/top_level.txt
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
rm_bdd
|