rm-bdd 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
rm_bdd-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,180 @@
1
+ Metadata-Version: 2.4
2
+ Name: rm-bdd
3
+ Version: 0.1.0
4
+ Summary: Binance Data Downloader
5
+ Author-email: Your Name <you@example.com>
6
+ License-Expression: MIT
7
+ Requires-Python: >=3.10
8
+ Description-Content-Type: text/markdown
9
+ Requires-Dist: aiohttp==3.13.3
10
+ Requires-Dist: pyarrow==23.0.1
11
+ Requires-Dist: pandas==3.0.1
12
+ Requires-Dist: xmltodict==1.0.4
13
+
14
+ # rm-binance-data-downloader
15
+
16
+ A Python library for downloading, extracting and formatting historical
17
+ Binance market data.
18
+
19
+ The package provides a pipeline that:
20
+
21
+ 1. Downloads data from Binance Vision
22
+ 2. Extracts compressed archives
23
+ 3. Formats data into a structured dataset
24
+
25
+ It is designed for fast data preparation for quantitative trading,
26
+ backtesting and data analysis.
27
+
28
+ ------------------------------------------------------------------------
29
+
30
+ # Installation
31
+
32
+ ``` bash
33
+ pip install rm-bdd
34
+ ```
35
+
36
+ ------------------------------------------------------------------------
37
+
38
+ # Features
39
+
40
+ - Download historical Binance data
41
+ - Automatic archive extraction
42
+ - Data formatting pipeline
43
+ - Metadata management
44
+ - Async architecture
45
+ - Easy integration into trading systems
46
+
47
+ ------------------------------------------------------------------------
48
+
49
+ # Quick Example
50
+
51
+ ``` python
52
+ import asyncio
53
+ import time
54
+
55
+ from rm_bdd.data_downloader import DataDownloader
56
+ from rm_bdd.data_extractor import DataExtractor
57
+ from rm_bdd.data_formatter import DataFormatter
58
+ from rm_bdd.binance_metadata_manager import BinanceMetadataManager
59
+
60
+
61
+ class DataManager:
62
+
63
+ def __init__(self, downloader, extractor, formatter):
64
+ self._downloader = downloader
65
+ self._extractor = extractor
66
+ self._formatter = formatter
67
+
68
+ async def download_and_save(self, symbol, timeframe, date_from=None, date_to=None):
69
+
70
+ start = time.time()
71
+ await self._downloader.download(symbol, timeframe, date_from=date_from, date_to=date_to)
72
+ print("download time:", time.time() - start)
73
+
74
+ start = time.time()
75
+ await self._extractor.extract(symbol, timeframe, date_from=date_from, date_to=date_to)
76
+ print("extract time:", time.time() - start)
77
+
78
+ start = time.time()
79
+ await self._formatter.format(symbol, timeframe, date_from=date_from, date_to=date_to)
80
+ print("format time:", time.time() - start)
81
+
82
+
83
+ async def main():
84
+
85
+ downloader = DataDownloader(
86
+ "downloads/",
87
+ BinanceMetadataManager("downloads/metadata.json")
88
+ )
89
+
90
+ extractor = DataExtractor(
91
+ "downloads/",
92
+ "extracts/",
93
+ BinanceMetadataManager("extracts/metadata.json")
94
+ )
95
+
96
+ formatter = DataFormatter(
97
+ "extracts/",
98
+ "data/",
99
+ BinanceMetadataManager("data/metadata.json")
100
+ )
101
+
102
+ manager = DataManager(downloader, extractor, formatter)
103
+
104
+ await manager.download_and_save("BTCUSDT", "1m")
105
+
106
+
107
+ asyncio.run(main())
108
+ ```
109
+
110
+ ------------------------------------------------------------------------
111
+
112
+ # Result Folder Structure
113
+
114
+ After execution the folders will look like:
115
+
116
+ downloads/
117
+ BTCUSDT/
118
+ metadata.json
119
+
120
+ extracts/
121
+ BTCUSDT/
122
+ metadata.json
123
+
124
+ data/
125
+ BTCUSDT/
126
+ metadata.json
127
+
128
+ ------------------------------------------------------------------------
129
+
130
+ # Pipeline Overview
131
+
132
+ The processing pipeline consists of three stages:
133
+
134
+ ### Downloader
135
+
136
+ Downloads historical data archives from Binance Vision.
137
+
138
+ ### Extractor
139
+
140
+ Extracts downloaded archives.
141
+
142
+ ### Formatter
143
+
144
+ Formats extracted CSV data into a structured dataset ready for analysis.
145
+
146
+ ------------------------------------------------------------------------
147
+
148
+ # Metadata Manager
149
+
150
+ The library uses a metadata system to track downloaded, extracted and
151
+ formatted data.
152
+
153
+ This prevents duplicate downloads and processing.
154
+
155
+ ------------------------------------------------------------------------
156
+
157
+ # Example Use Case
158
+
159
+ Typical workflow:
160
+
161
+ download → extract → format → analyze
162
+
163
+ Used for:
164
+
165
+ - algorithmic trading
166
+ - backtesting
167
+ - machine learning datasets
168
+ - market research
169
+
170
+ ------------------------------------------------------------------------
171
+
172
+ # Requirements
173
+
174
+ Python 3.10+
175
+
176
+ ------------------------------------------------------------------------
177
+
178
+ # License
179
+
180
+ MIT
rm_bdd-0.1.0/README.md ADDED
@@ -0,0 +1,167 @@
1
+ # rm-binance-data-downloader
2
+
3
+ A Python library for downloading, extracting and formatting historical
4
+ Binance market data.
5
+
6
+ The package provides a pipeline that:
7
+
8
+ 1. Downloads data from Binance Vision
9
+ 2. Extracts compressed archives
10
+ 3. Formats data into a structured dataset
11
+
12
+ It is designed for fast data preparation for quantitative trading,
13
+ backtesting and data analysis.
14
+
15
+ ------------------------------------------------------------------------
16
+
17
+ # Installation
18
+
19
+ ``` bash
20
+ pip install rm-bdd
21
+ ```
22
+
23
+ ------------------------------------------------------------------------
24
+
25
+ # Features
26
+
27
+ - Download historical Binance data
28
+ - Automatic archive extraction
29
+ - Data formatting pipeline
30
+ - Metadata management
31
+ - Async architecture
32
+ - Easy integration into trading systems
33
+
34
+ ------------------------------------------------------------------------
35
+
36
+ # Quick Example
37
+
38
+ ``` python
39
+ import asyncio
40
+ import time
41
+
42
+ from rm_bdd.data_downloader import DataDownloader
43
+ from rm_bdd.data_extractor import DataExtractor
44
+ from rm_bdd.data_formatter import DataFormatter
45
+ from rm_bdd.binance_metadata_manager import BinanceMetadataManager
46
+
47
+
48
+ class DataManager:
49
+
50
+ def __init__(self, downloader, extractor, formatter):
51
+ self._downloader = downloader
52
+ self._extractor = extractor
53
+ self._formatter = formatter
54
+
55
+ async def download_and_save(self, symbol, timeframe, date_from=None, date_to=None):
56
+
57
+ start = time.time()
58
+ await self._downloader.download(symbol, timeframe, date_from=date_from, date_to=date_to)
59
+ print("download time:", time.time() - start)
60
+
61
+ start = time.time()
62
+ await self._extractor.extract(symbol, timeframe, date_from=date_from, date_to=date_to)
63
+ print("extract time:", time.time() - start)
64
+
65
+ start = time.time()
66
+ await self._formatter.format(symbol, timeframe, date_from=date_from, date_to=date_to)
67
+ print("format time:", time.time() - start)
68
+
69
+
70
+ async def main():
71
+
72
+ downloader = DataDownloader(
73
+ "downloads/",
74
+ BinanceMetadataManager("downloads/metadata.json")
75
+ )
76
+
77
+ extractor = DataExtractor(
78
+ "downloads/",
79
+ "extracts/",
80
+ BinanceMetadataManager("extracts/metadata.json")
81
+ )
82
+
83
+ formatter = DataFormatter(
84
+ "extracts/",
85
+ "data/",
86
+ BinanceMetadataManager("data/metadata.json")
87
+ )
88
+
89
+ manager = DataManager(downloader, extractor, formatter)
90
+
91
+ await manager.download_and_save("BTCUSDT", "1m")
92
+
93
+
94
+ asyncio.run(main())
95
+ ```
96
+
97
+ ------------------------------------------------------------------------
98
+
99
+ # Result Folder Structure
100
+
101
+ After execution the folders will look like:
102
+
103
+ downloads/
104
+ BTCUSDT/
105
+ metadata.json
106
+
107
+ extracts/
108
+ BTCUSDT/
109
+ metadata.json
110
+
111
+ data/
112
+ BTCUSDT/
113
+ metadata.json
114
+
115
+ ------------------------------------------------------------------------
116
+
117
+ # Pipeline Overview
118
+
119
+ The processing pipeline consists of three stages:
120
+
121
+ ### Downloader
122
+
123
+ Downloads historical data archives from Binance Vision.
124
+
125
+ ### Extractor
126
+
127
+ Extracts downloaded archives.
128
+
129
+ ### Formatter
130
+
131
+ Formats extracted CSV data into a structured dataset ready for analysis.
132
+
133
+ ------------------------------------------------------------------------
134
+
135
+ # Metadata Manager
136
+
137
+ The library uses a metadata system to track downloaded, extracted and
138
+ formatted data.
139
+
140
+ This prevents duplicate downloads and processing.
141
+
142
+ ------------------------------------------------------------------------
143
+
144
+ # Example Use Case
145
+
146
+ Typical workflow:
147
+
148
+ download → extract → format → analyze
149
+
150
+ Used for:
151
+
152
+ - algorithmic trading
153
+ - backtesting
154
+ - machine learning datasets
155
+ - market research
156
+
157
+ ------------------------------------------------------------------------
158
+
159
+ # Requirements
160
+
161
+ Python 3.10+
162
+
163
+ ------------------------------------------------------------------------
164
+
165
+ # License
166
+
167
+ MIT
@@ -0,0 +1,27 @@
1
+ [build-system]
2
+ requires = ["setuptools>=77"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "rm-bdd"
7
+ version = "0.1.0"
8
+ description = "Binance Data Downloader"
9
+ readme = "README.md"
10
+ requires-python = ">=3.10"
11
+ license = "MIT"
12
+ authors = [
13
+ { name = "Your Name", email = "you@example.com" }
14
+ ]
15
+ dependencies = [
16
+ "aiohttp==3.13.3",
17
+ "pyarrow==23.0.1",
18
+ "pandas==3.0.1",
19
+ "xmltodict==1.0.4",
20
+ ]
21
+
22
+ [tool.setuptools]
23
+ package-dir = {"" = "src"}
24
+
25
+ [tool.setuptools.packages.find]
26
+ where = ["src"]
27
+ include = ["rm_bdd*"]
rm_bdd-0.1.0/setup.cfg ADDED
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,2 @@
1
# The wheel installs this package as `rm_bdd` (pyproject maps "" -> src), so
# absolute `src.rm_bdd` imports fail at runtime for installed users.
# Relative imports work both in the repo and in the installed package.
from . import data_downloader, data_formatter, data_manager
from . import data_extractor
@@ -0,0 +1,23 @@
1
+ from src.rm_bdd.metadata_manager import MetadataManager
2
+
3
+
4
+ def _get_data_from_filename(filename) -> tuple[str, str, str]:
5
+ filename = filename.split(".", 1)[0]
6
+ symbol, timeframe, date = filename.split("-", 2)
7
+ return symbol, timeframe, date
8
+
9
+
10
class BinanceMetadataManager:
    """Adapter over MetadataManager keyed by Binance archive filenames.

    Filenames look like ``BTCUSDT-1m-2024-03.zip`` (monthly) or
    ``BTCUSDT-1m-2024-03-05.zip`` (daily), optionally prefixed with a folder.
    """

    def __init__(self, file_path):
        self._metadata_manager = MetadataManager(file_path)

    def update(self, filename):
        """Record `filename`'s (symbol, timeframe, date) in the metadata store."""
        symbol, timeframe, date = _get_data_from_filename(self._basename(filename))
        self._metadata_manager.update_metadata(symbol, timeframe, date)

    def check(self, filename):
        """Return True when `filename`'s date is NOT recorded yet (should be processed)."""
        symbol, timeframe, date = _get_data_from_filename(self._basename(filename))

        return self._metadata_manager.check_date_not_in_metadata(
            symbol, timeframe, date
        )

    @staticmethod
    def _basename(filename):
        # The previous rsplit("/", 1)[1] raised IndexError for plain filenames
        # that carry no folder prefix; [-1] handles both cases.
        # NOTE(review): backslash-separated Windows paths would still not be
        # split here — confirm whether that platform matters.
        return filename.rsplit("/", 1)[-1]
@@ -0,0 +1,66 @@
1
+ import os
2
+
3
+ import pandas as pd
4
+
5
+
6
def csvs_to_parquets(files):
    """Convert each CSV in `files` into a Parquet file under ./parquets/.

    Each CSV must contain a ``timestamp`` column holding epoch milliseconds;
    it becomes the DataFrame index before writing.
    """
    # Create the output directory once, not on every loop iteration.
    output_dir = "parquets"
    os.makedirs(output_dir, exist_ok=True)

    for file in files:
        # The previous version passed parse_dates=["timestamp"] together with
        # infer_datetime_format=True: the latter was removed in pandas 2/3
        # (this package pins pandas==3.0.1), and pre-parsing the column broke
        # the explicit unit="ms" conversion below.  Read raw and convert once.
        df = pd.read_csv(file)
        df["timestamp"] = pd.to_datetime(df["timestamp"], unit="ms")
        df.set_index("timestamp", inplace=True)

        # Mirror the source filename, swapping the extension.
        parquet_file = os.path.join(
            output_dir, os.path.basename(file).replace(".csv", ".parquet")
        )
        df.to_parquet(parquet_file, engine="pyarrow")
28
+
29
+
30
def normalize_time(series):
    """Normalize epoch timestamps to milliseconds, in place.

    Values above 10**13 are treated as sub-millisecond precision (presumably
    microseconds in newer Binance dumps) and integer-divided by 1000; plain
    millisecond values pass through unchanged.  The input Series is mutated
    and returned.
    """
    too_precise = series > 10 ** 13
    series[too_precise] = series[too_precise] // 1000
    return series
37
+
38
+
39
async def csv_to_partitioned_parquet(files, symbol, timeframe, path="data"):
    """Concatenate raw Binance kline CSVs and write one partitioned Parquet dataset.

    The dataset at `path` is partitioned by symbol/timeframe/year/month/day.
    Input CSVs are headerless and use the standard Binance kline column order.
    """
    columns = [
        "open_time",
        "open",
        "high",
        "low",
        "close",
        "volume",
        "close_time",
        "quote_asset_volume",
        "number_of_trades",
        "taker_buy_base_asset_volume",
        "taker_buy_quote_asset_volume",
        "ignore",
    ]
    df = pd.concat(
        [pd.read_csv(file, names=columns) for file in files], ignore_index=True
    )
    # Bring mixed-precision epochs down to milliseconds.
    df["open_time"] = normalize_time(df["open_time"])
    df["close_time"] = normalize_time(df["close_time"])
    # Compute the datetime conversion once instead of three times.
    open_dt = pd.to_datetime(df["open_time"], unit="ms", utc=True)
    df["year"] = open_dt.dt.year
    df["month"] = open_dt.dt.month
    df["day"] = open_dt.dt.day
    df["symbol"] = symbol
    df["timeframe"] = timeframe
    df.to_parquet(path, partition_cols=["symbol", "timeframe", "year", "month", "day"])
65
+
66
+
@@ -0,0 +1,73 @@
1
+ import asyncio
2
+ import logging
3
+ import os
4
+
5
+ import aiohttp
6
+
7
+ from src.rm_bdd.binance_metadata_manager import BinanceMetadataManager
8
+ from src.rm_bdd.files_parser import get_all_available_filenames_in_daterange
9
+
10
+ logger = logging.getLogger(__name__)
11
+ BASE_VISION_URL = "https://data.binance.vision/"
12
+
13
+
14
async def download_file(key, folder) -> str | None:
    """Download a single Binance Vision object `key` into `folder`.

    Returns the local file path on success, or None on any failure —
    callers must be prepared to filter out None results.
    """
    logger.info(f"Downloading {key} to {folder}")
    url = BASE_VISION_URL + key

    os.makedirs(folder, exist_ok=True)
    try:
        # Local name is the last URL path segment (the archive filename).
        local_filename = os.path.join(folder, url.split("/")[-1])
        # A fresh session per file keeps this helper self-contained, at the
        # cost of a new connection pool per download.
        async with aiohttp.ClientSession() as session:
            async with session.get(url) as r:
                r.raise_for_status()
                # NOTE(review): synchronous file writes inside the event loop;
                # fine for modest archives, but they block other coroutines.
                with open(local_filename, "wb") as f:
                    async for chunk in r.content.iter_chunked(8192):
                        chunk: bytes
                        f.write(chunk)
        logger.info(f"Downloaded {key} to {local_filename}")
        return str(local_filename)
    except Exception as e:
        # Broad catch on purpose: one failed file must not abort the whole
        # asyncio.gather batch in DataDownloader.download.
        logger.info(f"Failed to download {key}: {e}")
        logger.error(e)
        return None
34
+
35
+
36
class DataDownloader:
    """Downloads Binance Vision kline archives that are not yet recorded in metadata."""

    def __init__(self, download_folder, metadata_manager):
        # download_folder: destination for the .zip archives
        # metadata_manager: tracks which archives were already downloaded
        self._download_folder = download_folder
        self._metadata_manager = metadata_manager

    async def download(self, symbol, timeframe, date_from=None, date_to=None):
        """Download every available archive for symbol/timeframe in the date range.

        Returns the list produced by asyncio.gather: one local path per file,
        or None for each file that failed to download.
        """
        filenames = await get_all_available_filenames_in_daterange(
            symbol, timeframe, date_from, date_to
        )

        # Skip archives whose date is already recorded in the metadata store.
        filenames = list(
            filter(self._metadata_manager.check, filenames)
        )

        logger.info(
            f"Downloading {timeframe} data for {symbol} from {date_from} to {date_to}"
        )

        results = await asyncio.gather(
            *(download_file(filename, self._download_folder) for filename in filenames)
        )
        logger.info(f"Finished downloading {timeframe} data for {symbol}")
        logger.debug(f"Downloaded {len(results)} results for {symbol}")

        # download_file returns None on failure; recording None previously
        # crashed with AttributeError inside the metadata manager.
        for filename in results:
            if filename is not None:
                self._metadata_manager.update(filename)

        return results
65
+
66
+
67
if __name__ == '__main__':
    # Manual smoke test: download all available BTCUSDT 1m archives into
    # ./downloads/ (requires network access).
    async def main():
        downloader = DataDownloader("downloads/", BinanceMetadataManager("downloads/metadata.json"))
        await downloader.download("BTCUSDT", '1m')


    asyncio.run(main())
@@ -0,0 +1,55 @@
1
+ import os
2
+ from functools import partial
3
+
4
+ import zipfile
5
+
6
+ from src.rm_bdd.binance_metadata_manager import BinanceMetadataManager
7
+ from src.rm_bdd.files_parser import _is_filename_in_date_range
8
+
9
+
10
class DataExtractor:
    """Extracts downloaded Binance .zip archives into a target folder."""

    def __init__(self, download_folder, extract_folder, metadata_manager):
        # download_folder: where DataDownloader stored the .zip archives
        # extract_folder: destination for the extracted CSV files
        # metadata_manager: tracks which archives were already extracted
        self._download_folder = download_folder
        self._extract_folder = extract_folder
        self._metadata_manager = metadata_manager

    async def extract(self, symbol, timeframe, date_from=None, date_to=None):
        """Extract all downloaded archives for symbol/timeframe within the date range.

        Returns the list of archive paths that were actually extracted.
        """
        downloaded_files = self._downloaded_files
        # Keep only archives for the requested symbol/timeframe...
        files_to_extract = filter(lambda a: a.startswith(f'{symbol.upper()}-{timeframe}'), downloaded_files)
        # ...and within the optional [date_from, date_to] window.
        files_to_extract = list(filter(
            partial(_is_filename_in_date_range, date_from=date_from, date_to=date_to),
            files_to_extract,
        ))
        retval = await self.extract_files(files_to_extract)

        return retval

    async def extract_files(self, filenames):
        """Extract `filenames` (relative to the download folder), newest-data dedup applied."""
        filenames = [self._download_folder + filename for filename in filenames]
        # Skip archives already recorded in the metadata store.
        filenames = list(
            filter(self._metadata_manager.check, filenames)
        )
        # Drop a daily archive (SYMBOL-TF-YYYY-MM-DD.zip) when the monthly
        # archive covering it (SYMBOL-TF-YYYY-MM.zip) is also queued.
        # NOTE(review): the count("-") == 1 clause looks like a never-true
        # guard for dash-free names given real Binance filenames — confirm.
        filenames = list(filter(lambda a: a.count("-") == 1 or f'{a.rsplit("-", 1)[0]}.zip' not in filenames, filenames))
        retval = []
        for filename in filenames:
            with zipfile.ZipFile(filename) as zip_file:
                zip_file.extractall(self._extract_folder)
                # Record success per archive so a crash mid-batch keeps progress.
                self._metadata_manager.update(filename)
                retval.append(filename)
        return retval

    @property
    def _downloaded_files(self) -> list[str]:
        # All .zip archives currently present in the download folder.
        return list(filter(lambda a: a.endswith(".zip"), os.listdir(self._download_folder)))
44
+
45
+
46
if __name__ == '__main__':
    # Manual smoke test: extract previously downloaded BTCUSDT 1m archives
    # from ./downloads/ into ./extracts/.
    import asyncio


    async def main():
        extractor = DataExtractor("downloads/", "extracts/", BinanceMetadataManager("extracts/metadata.json"))
        await extractor.extract("BTCUSDT", '1m')


    asyncio.run(main())
@@ -0,0 +1,57 @@
1
+ import os
2
+ from functools import partial
3
+
4
+ from src.rm_bdd.csvs_to_parquet import csv_to_partitioned_parquet
5
+ from src.rm_bdd.files_parser import _is_filename_in_date_range
6
+
7
+
8
class DataFormatter:
    """Formats extracted Binance CSV kline files into a partitioned Parquet dataset."""

    def __init__(self, extract_folder, data_folder, metadata_manager):
        # extract_folder: where DataExtractor left the CSV files
        # data_folder: root of the partitioned Parquet dataset
        # metadata_manager: tracks which CSV files were already formatted
        self._extract_folder = extract_folder
        self._data_folder = data_folder
        self._metadata_manager = metadata_manager

    async def format(self, symbol, timeframe, date_from=None, date_to=None):
        """Format every extracted CSV for symbol/timeframe within the date range.

        Returns the list of files actually formatted (previously discarded).
        """
        extracted_files = self._extracted_files
        files_to_format = filter(lambda a: a.startswith(f'{symbol.upper()}-{timeframe}'), extracted_files)
        files_to_format = list(filter(
            partial(_is_filename_in_date_range, date_from=date_from, date_to=date_to),
            files_to_format,
        ))

        return await self.format_files(files_to_format)

    async def format_files(self, filenames):
        """Convert `filenames` (relative to the extract folder) into the Parquet dataset.

        Returns the list of files actually formatted.
        """
        # Guard: the previous version indexed filenames[0] unconditionally,
        # which raised IndexError whenever the filtered list was empty.
        if not filenames:
            return []

        os.makedirs(self._data_folder, exist_ok=True)
        filenames = [self._extract_folder + filename for filename in filenames]

        # Drop a daily CSV (SYMBOL-TF-YYYY-MM-DD.csv) when the monthly CSV
        # covering it (SYMBOL-TF-YYYY-MM.csv) is also queued.
        filenames = list(
            filter(lambda a: a.count("-") == 1 or f'{a.rsplit("-", 1)[0]}.csv' not in filenames, filenames))
        retval = []
        # All files in one batch share symbol/timeframe; read them off the first name.
        symbol = filenames[0].rsplit("/", 1)[-1].split("-", 1)[0]
        timeframe = filenames[0].rsplit("/", 1)[-1].split("-", 2)[1]
        # Skip files already recorded in the metadata store.
        filenames = list(
            filter(self._metadata_manager.check, filenames)
        )
        print("files to format: ", filenames)
        if filenames:
            await csv_to_partitioned_parquet(filenames, symbol, timeframe, self._data_folder)

        for filename in filenames:
            self._metadata_manager.update(filename)
            retval.append(filename)
        return retval

    @property
    def _extracted_files(self) -> list[str]:
        # All .csv files currently present in the extract folder.
        return list(filter(lambda a: a.endswith(".csv"), os.listdir(self._extract_folder)))
48
+
49
+
50
if __name__ == '__main__':
    # Manual smoke test: format previously extracted BTCUSDT 1m data.
    import asyncio

    async def main():
        from src.rm_bdd.binance_metadata_manager import BinanceMetadataManager
        formatter = DataFormatter("extracts/", "data/", BinanceMetadataManager("data/metadata.json"))
        # DataFormatter.format is a coroutine; the previous sync main() called
        # it without awaiting, so the formatting never actually ran.
        await formatter.format("BTCUSDT", '1m')


    asyncio.run(main())
@@ -0,0 +1,45 @@
1
+ import time
2
+
3
+
4
class DataManager:
    """Runs the download -> extract -> format pipeline and reports stage timings."""

    def __init__(self, downloader, extractor, formatter):
        self._downloader = downloader
        self._extractor = extractor
        self._formatter = formatter

    async def download_and_save(self, symbol, timeframe, date_from=None, date_to=None):
        """Run all three pipeline stages for symbol/timeframe, printing wall-clock time per stage."""
        stages = (
            ("download", self._downloader.download),
            ("extract", self._extractor.extract),
            ("format", self._formatter.format),
        )
        for label, run_stage in stages:
            started = time.time()
            await run_stage(symbol, timeframe, date_from=date_from, date_to=date_to)
            print(f"{label} time: ", time.time() - started)
22
+
23
+
24
if __name__ == '__main__':
    # Manual end-to-end run of the full pipeline with the default folders.
    import asyncio
    from src.rm_bdd.data_downloader import DataDownloader
    from src.rm_bdd.data_extractor import DataExtractor
    from src.rm_bdd.data_formatter import DataFormatter
    from src.rm_bdd.binance_metadata_manager import BinanceMetadataManager

    # Each stage gets its own metadata file so progress is tracked per stage.
    downloader = DataDownloader("downloads/", BinanceMetadataManager("downloads/metadata.json"))
    extractor = DataExtractor("downloads/", "extracts/", BinanceMetadataManager("extracts/metadata.json"))
    formatter = DataFormatter("extracts/", "data/", BinanceMetadataManager("data/metadata.json"))

    manager = DataManager(downloader, extractor, formatter)


    async def main():
        await manager.download_and_save("BTCUSDT", '1m')


    # Time the whole pipeline run.
    start = time.time()
    asyncio.run(main())

    print(time.time() - start)
@@ -0,0 +1,76 @@
1
+ import asyncio
2
+ import datetime
3
+ import re
4
+ from functools import partial
5
+
6
+ from src.rm_bdd.binance_metadata_manager import _get_data_from_filename
7
+ from src.rm_bdd.path_loader import get_all_path_responses
8
+
9
# Matches a full daily date, e.g. "2024-03-05" (monthly dates are "2024-03").
DAY_DATE_REGEX = re.compile(r"^\d{4}-\d{2}-\d{2}$")


async def _get_all_available_filenames(symbol, timeframe) -> list[str]:
    """List every monthly kline archive key plus the daily archives newer than
    the last monthly one for the given symbol/timeframe.

    NOTE(review): only the first response page is consumed from each listing;
    symbols with more keys than one S3 page would be truncated — confirm.
    """
    path = f"data/spot/monthly/klines/{symbol}/{timeframe}"
    monthly = await get_all_path_responses(path)

    filenames = [item['Key'] for item in monthly[0]['ListBucketResult'].get('Contents', [])]
    # The last monthly key, e.g. .../BTCUSDT-1m-2024-03.zip -> (prefix, "2024", "03").
    file_path, year, month = filenames[-1].rsplit(".", 2)[0].rsplit("-", 2)

    # Daily archives start at the month AFTER the last monthly archive.
    file_path = file_path.replace("monthly", "daily")
    if month == '12':
        # BUG FIX: a December monthly archive must roll over to January of the
        # next year; previously only the year was bumped and month stayed "12",
        # so the daily marker pointed at a non-existent position and daily
        # data was skipped.
        year = str(int(year) + 1)
        month = '01'
    else:
        month = "%02i" % (int(month) + 1)

    file_path = "-".join([file_path, year, month])

    path = path.replace("monthly", "daily")

    # `marker` makes S3 start listing strictly after the computed key.
    daily = await get_all_path_responses(path, marker=file_path)

    filenames.extend([item['Key'] for item in daily[0]['ListBucketResult'].get('Contents', [])])
    return filenames
33
+
34
+
35
def str_to_datetime(date: str) -> datetime.datetime:
    """Parse 'YYYY-MM-DD' or 'YYYY-MM' into a datetime (day defaults to 1)."""
    fmt = "%Y-%m-%d" if DAY_DATE_REGEX.match(date) else "%Y-%m"
    return datetime.datetime.strptime(date, fmt)
39
+
40
+
41
def _is_filename_in_date_range(filename: str, date_from: str, date_to: str) -> bool:
    """Return True when the date embedded in `filename` lies in [date_from, date_to].

    Either bound may be falsy (None/empty), in which case it is not applied.
    """
    *_, raw_date = _get_data_from_filename(filename)
    file_date = str_to_datetime(raw_date)
    if date_from and file_date < str_to_datetime(date_from):
        return False
    if date_to and file_date > str_to_datetime(date_to):
        return False
    return True
53
+
54
+
55
async def get_all_available_filenames_in_daterange(
    symbol, timeframe, date_from=None, date_to=None
) -> list[str]:
    """List all available archive keys for symbol/timeframe, optionally
    restricted to the inclusive [date_from, date_to] window."""
    filenames = await _get_all_available_filenames(symbol, timeframe)
    if not (date_from or date_to):
        return filenames
    in_range = partial(_is_filename_in_date_range, date_from=date_from, date_to=date_to)
    return [name for name in filenames if in_range(name)]
65
+
66
+
67
if __name__ == '__main__':
    # Ad-hoc exploration entry point (requires network access); earlier
    # experiments kept below for reference.
    async def main():
        # await _get_all_available_filenames("BTCUSDT", timeframe="1m")
        # result = await get_all_available_filenames_in_daterange("BTCUSDT", '1m', '2024-03', '2024-05')
        # result = await get_all_available_filenames_in_daterange("BTCUSDT", '1m')
        result = await get_all_available_filenames_in_daterange("BTCUSDT", '1m', date_from="2025-09")
        print(result)
        ...

    asyncio.run(main())
@@ -0,0 +1,48 @@
1
+ import json
2
+ import logging
3
+ import os
4
+
5
+ logger = logging.getLogger(__name__)
6
+
7
+
8
class MetadataManager:
    """JSON-file-backed record of which (symbol, timeframe, date) chunks exist.

    On-disk layout: ``{symbol: {timeframe: [date, ...]}}``.
    """

    def __init__(self, file_path):
        self._data = None  # lazily loaded on first access to `data`
        self._file_path = file_path

    def _load_metadata(self):
        # A missing file simply means "nothing recorded yet".
        if not os.path.isfile(self._file_path):
            self._data = {}
            return
        with open(self._file_path, "r") as f:
            self._data = json.load(f)

    @property
    def data(self):
        """The metadata mapping, loaded from disk on first access."""
        if self._data is None:
            self._load_metadata()
        return self._data

    def _save_metadata(self):
        with open(self._file_path, mode="w", encoding="utf-8") as f:
            json.dump(self.data, f, indent=4)

    def update_metadata(self, symbol: str, timeframe: str, date: str):
        """Record `date` for symbol/timeframe and persist, unless it is already the latest entry."""
        dates = self.data.setdefault(symbol, {}).setdefault(timeframe, [])
        if dates and dates[-1] == date:
            return
        dates.append(date)
        self._save_metadata()

    def check_date_not_in_metadata(self, symbol: str, timeframe: str, date: str):
        """Return True when `date` is NOT yet recorded for symbol/timeframe.

        A daily date (YYYY-MM-DD) also counts as recorded when its whole
        month (YYYY-MM) was recorded.
        """
        known = self.data.get(symbol, {}).get(timeframe, [])
        already_have = date in known
        if not already_have and date.count("-") == 2:
            already_have = date.rsplit("-", 1)[0] in known
        if already_have:
            logger.info(f"Data for {symbol} {timeframe} {date} already exists. Skip.")
        return not already_have
@@ -0,0 +1,62 @@
1
+ import asyncio
2
+
3
+ import aiohttp
4
+ import xmltodict
5
+
6
+ URL_TEMPLATE = "https://s3-ap-northeast-1.amazonaws.com/data.binance.vision"
7
+ SEMAPHORE = asyncio.Semaphore(300)
8
+
9
+
10
async def get_all_paths(prefix: str, delimiter="/"):
    """Return every CommonPrefixes entry ("subdirectory") listed under `prefix`."""
    collected = []
    for page in await get_all_path_responses(prefix, delimiter):
        prefixes = page.get("ListBucketResult", {}).get("CommonPrefixes", [])
        if isinstance(prefixes, list):
            for entry in prefixes:
                collected.append(entry["Prefix"])
        else:
            # xmltodict collapses a single XML element into a dict, not a list.
            collected.append(prefixes["Prefix"])
    return collected
20
+
21
+
22
async def get_all_path_responses(prefix: str, delimiter="/", marker=None, **params):
    """Follow S3 ListBucket pagination for `prefix` and return every parsed page."""
    responses = []
    while True:
        response = await get_paths_response_json(prefix, delimiter, marker, **params)
        responses.append(response)
        bucket_result = response.get("ListBucketResult", {})
        # S3 reports truncation as the string "true"/"false" in XML.
        if bucket_result.get("IsTruncated", "false") != "true":
            break
        marker = bucket_result.get("NextMarker", None)

    return responses
34
+
35
+
36
async def get_paths_response_json(prefix: str, delimiter="/", marker=None, **params):
    """Perform one S3 ListBucket request against the Binance Vision bucket.

    Returns the XML response parsed into a dict by xmltodict.  `marker`, when
    given, makes S3 start listing strictly after that key.
    """
    # S3 prefixes must end with the delimiter to list a "directory"'s contents.
    if not prefix.endswith("/"):
        prefix += "/"
    params = {"prefix": prefix, "delimiter": delimiter, "marker": marker, **params}
    params = {k: v for k, v in params.items() if v is not None}
    # The bucket URL is fixed.  The previous version called
    # URL_TEMPLATE.format(prefix=...) even though the template contains no
    # placeholder, and stripped a trailing slash from a variable that was
    # never used afterwards — both were dead code and have been removed.
    async with SEMAPHORE:
        async with aiohttp.ClientSession() as session:
            async with session.get(URL_TEMPLATE, params=params) as response:
                text = await response.text()
                return xmltodict.parse(text)
49
+
50
+
51
if __name__ == "__main__":

    # Manual smoke test: print every monthly SOLUSDT 1m kline key in the
    # bucket (requires network access).
    async def main():
        responses = await get_all_path_responses("data/spot/monthly/klines/SOLUSDT/1m")
        print(
            [
                i.get("Key")
                for response in responses for i in response.get("ListBucketResult", {}).get("Contents", [])
            ]
        )

    asyncio.run(main())
@@ -0,0 +1,180 @@
1
+ Metadata-Version: 2.4
2
+ Name: rm-bdd
3
+ Version: 0.1.0
4
+ Summary: Binance Data Downloader
5
+ Author-email: Your Name <you@example.com>
6
+ License-Expression: MIT
7
+ Requires-Python: >=3.10
8
+ Description-Content-Type: text/markdown
9
+ Requires-Dist: aiohttp==3.13.3
10
+ Requires-Dist: pyarrow==23.0.1
11
+ Requires-Dist: pandas==3.0.1
12
+ Requires-Dist: xmltodict==1.0.4
13
+
14
+ # rm-binance-data-downloader
15
+
16
+ A Python library for downloading, extracting and formatting historical
17
+ Binance market data.
18
+
19
+ The package provides a pipeline that:
20
+
21
+ 1. Downloads data from Binance Vision
22
+ 2. Extracts compressed archives
23
+ 3. Formats data into a structured dataset
24
+
25
+ It is designed for fast data preparation for quantitative trading,
26
+ backtesting and data analysis.
27
+
28
+ ------------------------------------------------------------------------
29
+
30
+ # Installation
31
+
32
+ ``` bash
33
+ pip install rm-bdd
34
+ ```
35
+
36
+ ------------------------------------------------------------------------
37
+
38
+ # Features
39
+
40
+ - Download historical Binance data
41
+ - Automatic archive extraction
42
+ - Data formatting pipeline
43
+ - Metadata management
44
+ - Async architecture
45
+ - Easy integration into trading systems
46
+
47
+ ------------------------------------------------------------------------
48
+
49
+ # Quick Example
50
+
51
+ ``` python
52
+ import asyncio
53
+ import time
54
+
55
+ from rm_bdd.data_downloader import DataDownloader
56
+ from rm_bdd.data_extractor import DataExtractor
57
+ from rm_bdd.data_formatter import DataFormatter
58
+ from rm_bdd.binance_metadata_manager import BinanceMetadataManager
59
+
60
+
61
+ class DataManager:
62
+
63
+ def __init__(self, downloader, extractor, formatter):
64
+ self._downloader = downloader
65
+ self._extractor = extractor
66
+ self._formatter = formatter
67
+
68
+ async def download_and_save(self, symbol, timeframe, date_from=None, date_to=None):
69
+
70
+ start = time.time()
71
+ await self._downloader.download(symbol, timeframe, date_from=date_from, date_to=date_to)
72
+ print("download time:", time.time() - start)
73
+
74
+ start = time.time()
75
+ await self._extractor.extract(symbol, timeframe, date_from=date_from, date_to=date_to)
76
+ print("extract time:", time.time() - start)
77
+
78
+ start = time.time()
79
+ await self._formatter.format(symbol, timeframe, date_from=date_from, date_to=date_to)
80
+ print("format time:", time.time() - start)
81
+
82
+
83
+ async def main():
84
+
85
+ downloader = DataDownloader(
86
+ "downloads/",
87
+ BinanceMetadataManager("downloads/metadata.json")
88
+ )
89
+
90
+ extractor = DataExtractor(
91
+ "downloads/",
92
+ "extracts/",
93
+ BinanceMetadataManager("extracts/metadata.json")
94
+ )
95
+
96
+ formatter = DataFormatter(
97
+ "extracts/",
98
+ "data/",
99
+ BinanceMetadataManager("data/metadata.json")
100
+ )
101
+
102
+ manager = DataManager(downloader, extractor, formatter)
103
+
104
+ await manager.download_and_save("BTCUSDT", "1m")
105
+
106
+
107
+ asyncio.run(main())
108
+ ```
109
+
110
+ ------------------------------------------------------------------------
111
+
112
+ # Result Folder Structure
113
+
114
+ After execution the folders will look like:
115
+
116
+ downloads/
117
+ BTCUSDT/
118
+ metadata.json
119
+
120
+ extracts/
121
+ BTCUSDT/
122
+ metadata.json
123
+
124
+ data/
125
+ BTCUSDT/
126
+ metadata.json
127
+
128
+ ------------------------------------------------------------------------
129
+
130
+ # Pipeline Overview
131
+
132
+ The processing pipeline consists of three stages:
133
+
134
+ ### Downloader
135
+
136
+ Downloads historical data archives from Binance Vision.
137
+
138
+ ### Extractor
139
+
140
+ Extracts downloaded archives.
141
+
142
+ ### Formatter
143
+
144
+ Formats extracted CSV data into a structured dataset ready for analysis.
145
+
146
+ ------------------------------------------------------------------------
147
+
148
+ # Metadata Manager
149
+
150
+ The library uses a metadata system to track downloaded, extracted and
151
+ formatted data.
152
+
153
+ This prevents duplicate downloads and processing.
154
+
155
+ ------------------------------------------------------------------------
156
+
157
+ # Example Use Case
158
+
159
+ Typical workflow:
160
+
161
+ download → extract → format → analyze
162
+
163
+ Used for:
164
+
165
+ - algorithmic trading
166
+ - backtesting
167
+ - machine learning datasets
168
+ - market research
169
+
170
+ ------------------------------------------------------------------------
171
+
172
+ # Requirements
173
+
174
+ Python 3.10+
175
+
176
+ ------------------------------------------------------------------------
177
+
178
+ # License
179
+
180
+ MIT
@@ -0,0 +1,17 @@
1
+ README.md
2
+ pyproject.toml
3
+ src/rm_bdd/__init__.py
4
+ src/rm_bdd/binance_metadata_manager.py
5
+ src/rm_bdd/csvs_to_parquet.py
6
+ src/rm_bdd/data_downloader.py
7
+ src/rm_bdd/data_extractor.py
8
+ src/rm_bdd/data_formatter.py
9
+ src/rm_bdd/data_manager.py
10
+ src/rm_bdd/files_parser.py
11
+ src/rm_bdd/metadata_manager.py
12
+ src/rm_bdd/path_loader.py
13
+ src/rm_bdd.egg-info/PKG-INFO
14
+ src/rm_bdd.egg-info/SOURCES.txt
15
+ src/rm_bdd.egg-info/dependency_links.txt
16
+ src/rm_bdd.egg-info/requires.txt
17
+ src/rm_bdd.egg-info/top_level.txt
@@ -0,0 +1,4 @@
1
+ aiohttp==3.13.3
2
+ pyarrow==23.0.1
3
+ pandas==3.0.1
4
+ xmltodict==1.0.4
@@ -0,0 +1 @@
1
+ rm_bdd