panda-data 0.0.3__tar.gz → 0.0.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {panda_data-0.0.3 → panda_data-0.0.4}/PKG-INFO +1 -1
- panda_data-0.0.4/panda_data/test.py +30 -0
- {panda_data-0.0.3 → panda_data-0.0.4}/panda_data/transport/http.py +290 -224
- {panda_data-0.0.3 → panda_data-0.0.4}/panda_data.egg-info/PKG-INFO +1 -1
- {panda_data-0.0.3 → panda_data-0.0.4}/pyproject.toml +1 -1
- panda_data-0.0.3/panda_data/test.py +0 -27
- {panda_data-0.0.3 → panda_data-0.0.4}/README.md +0 -0
- {panda_data-0.0.3 → panda_data-0.0.4}/panda_data/__init__.py +0 -0
- {panda_data-0.0.3 → panda_data-0.0.4}/panda_data/client.py +0 -0
- {panda_data-0.0.3 → panda_data-0.0.4}/panda_data/config/__init__.py +0 -0
- {panda_data-0.0.3 → panda_data-0.0.4}/panda_data/core/__init__.py +0 -0
- {panda_data-0.0.3 → panda_data-0.0.4}/panda_data/core/service.py +0 -0
- {panda_data-0.0.3 → panda_data-0.0.4}/panda_data/exceptions.py +0 -0
- {panda_data-0.0.3 → panda_data-0.0.4}/panda_data/readers/__init__.py +0 -0
- {panda_data-0.0.3 → panda_data-0.0.4}/panda_data/readers/financial_and_factors_reader.py +0 -0
- {panda_data-0.0.3 → panda_data-0.0.4}/panda_data/readers/future_reader.py +0 -0
- {panda_data-0.0.3 → panda_data-0.0.4}/panda_data/readers/init_token.py +0 -0
- {panda_data-0.0.3 → panda_data-0.0.4}/panda_data/readers/market_reader.py +0 -0
- {panda_data-0.0.3 → panda_data-0.0.4}/panda_data/readers/market_reference_reader.py +0 -0
- {panda_data-0.0.3 → panda_data-0.0.4}/panda_data/readers/trading_tools_reader.py +0 -0
- {panda_data-0.0.3 → panda_data-0.0.4}/panda_data/transport/__init__.py +0 -0
- {panda_data-0.0.3 → panda_data-0.0.4}/panda_data/utils/common_utils.py +0 -0
- {panda_data-0.0.3 → panda_data-0.0.4}/panda_data/utils/param_check_utils.py +0 -0
- {panda_data-0.0.3 → panda_data-0.0.4}/panda_data.egg-info/SOURCES.txt +0 -0
- {panda_data-0.0.3 → panda_data-0.0.4}/panda_data.egg-info/dependency_links.txt +0 -0
- {panda_data-0.0.3 → panda_data-0.0.4}/panda_data.egg-info/requires.txt +0 -0
- {panda_data-0.0.3 → panda_data-0.0.4}/panda_data.egg-info/top_level.txt +0 -0
- {panda_data-0.0.3 → panda_data-0.0.4}/setup.cfg +0 -0
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
|
|
2
|
+
import os
import time

import panda_data

if __name__ == "__main__":
    # Manual smoke test for the minute-bar market data endpoint.
    #
    # 1. pip install panda_data
    # 2. The account is "86" + the phone number registered on the official
    #    site (e.g. 8617777777777); the password is the same as on the site.
    # 3. See the official knowledge base for the API documentation.
    #
    # Credentials are read from the environment so that no account or
    # password is ever committed to the package source.
    init_kwargs = {
        "username": os.environ.get("PANDA_DATA_USERNAME", ""),
        "password": os.environ.get("PANDA_DATA_PASSWORD", ""),
    }
    # Only override the server address when explicitly requested (e.g.
    # PANDA_DATA_BASE_URL=http://127.0.0.1:8180 for a local deployment);
    # otherwise the library's own default is used.
    base_url = os.environ.get("PANDA_DATA_BASE_URL", "").strip()
    if base_url:
        init_kwargs["base_url"] = base_url
    panda_data.init_token(**init_kwargs)

    # Start timing (perf_counter is monotonic, so the elapsed value cannot
    # be skewed by wall-clock adjustments).
    start_time = time.perf_counter()

    result = panda_data.get_market_min_data(
        symbol="000001.SZ",
        start_date="20250101",
        end_date="20250131",
        symbol_type="stock",
        fields=["symbol", "date", "num_trades", "amount", "volume"],
        frequency="1m",
        time_zone=("10:00", "11:00"),
    )
    end_time = time.perf_counter()

    print(result)
    print(f"耗时:{end_time - start_time}")
|
|
@@ -11,7 +11,7 @@ import socket
|
|
|
11
11
|
import threading
|
|
12
12
|
import time
|
|
13
13
|
from dataclasses import dataclass
|
|
14
|
-
from typing import Any, Dict, Optional
|
|
14
|
+
from typing import Any, Dict, List, Optional
|
|
15
15
|
from urllib.parse import urljoin, urlparse
|
|
16
16
|
from urllib.request import (
|
|
17
17
|
Request,
|
|
@@ -94,6 +94,7 @@ class HTTPClientConfig:
|
|
|
94
94
|
proxy_username: Optional[str] = None
|
|
95
95
|
proxy_password: Optional[str] = None
|
|
96
96
|
use_gzip: bool = False # 是否使用 gzip 压缩请求体
|
|
97
|
+
local_mode: bool = False # 本地模式:为 True 时不读 token 文件,不鉴权
|
|
97
98
|
|
|
98
99
|
|
|
99
100
|
class HTTPClient:
|
|
@@ -236,7 +237,7 @@ class HTTPClient:
|
|
|
236
237
|
if not self._establish_monitor_connection():
|
|
237
238
|
consecutive_failures += 1
|
|
238
239
|
if consecutive_failures >= max_failures:
|
|
239
|
-
self._delete_token_file()
|
|
240
|
+
# self._delete_token_file()
|
|
240
241
|
break
|
|
241
242
|
else:
|
|
242
243
|
consecutive_failures = 0
|
|
@@ -254,7 +255,7 @@ class HTTPClient:
|
|
|
254
255
|
# 连接断开
|
|
255
256
|
consecutive_failures += 1
|
|
256
257
|
if consecutive_failures >= max_failures:
|
|
257
|
-
self._delete_token_file()
|
|
258
|
+
# self._delete_token_file()
|
|
258
259
|
# 清理连接
|
|
259
260
|
try:
|
|
260
261
|
self._monitor_connection.close()
|
|
@@ -266,7 +267,7 @@ class HTTPClient:
|
|
|
266
267
|
# 其他异常,也视为连接问题
|
|
267
268
|
consecutive_failures += 1
|
|
268
269
|
if consecutive_failures >= max_failures:
|
|
269
|
-
self._delete_token_file()
|
|
270
|
+
# self._delete_token_file()
|
|
270
271
|
try:
|
|
271
272
|
self._monitor_connection.close()
|
|
272
273
|
except Exception:
|
|
@@ -277,7 +278,7 @@ class HTTPClient:
|
|
|
277
278
|
except Exception:
|
|
278
279
|
consecutive_failures += 1
|
|
279
280
|
if consecutive_failures >= max_failures:
|
|
280
|
-
self._delete_token_file()
|
|
281
|
+
# self._delete_token_file()
|
|
281
282
|
break
|
|
282
283
|
|
|
283
284
|
def start_connection_monitoring(self) -> None:
|
|
@@ -350,8 +351,8 @@ class HTTPClient:
|
|
|
350
351
|
if accept_encodings:
|
|
351
352
|
headers["Accept-Encoding"] = ", ".join(accept_encodings)
|
|
352
353
|
|
|
353
|
-
# 添加从文件读取的token到Authorization header
|
|
354
|
-
if endpoint and not endpoint.__contains__("login"):
|
|
354
|
+
# 添加从文件读取的token到Authorization header(本地模式下跳过鉴权)
|
|
355
|
+
if not self._config.local_mode and endpoint and not endpoint.__contains__("login"):
|
|
355
356
|
user_file_path = self._get_token_file_path()
|
|
356
357
|
if not user_file_path:
|
|
357
358
|
raise ServiceError("无法确定token文件路径,请重新登录!")
|
|
@@ -450,9 +451,7 @@ class HTTPClient:
|
|
|
450
451
|
if not content_encoding:
|
|
451
452
|
return content
|
|
452
453
|
|
|
453
|
-
decompress_start = time.time()
|
|
454
454
|
content_encoding_lower = content_encoding.lower()
|
|
455
|
-
compressed_size = len(content)
|
|
456
455
|
|
|
457
456
|
try:
|
|
458
457
|
if content_encoding_lower == "gzip":
|
|
@@ -499,55 +498,188 @@ class HTTPClient:
|
|
|
499
498
|
except Exception as e:
|
|
500
499
|
raise
|
|
501
500
|
|
|
501
|
+
def _stream_to_temp_file(self, response, content_encoding: str, tmp_file_path: str) -> int:
|
|
502
|
+
"""
|
|
503
|
+
将 HTTP 响应流式解压写入临时文件,避免全量数据同时驻留内存。
|
|
504
|
+
大内存机器受益于更快的 I/O 吞吐;小内存机器(如 8GB)
|
|
505
|
+
因不将全量解压数据加载到内存而避免 OOM。
|
|
506
|
+
|
|
507
|
+
Returns:
|
|
508
|
+
写入文件的总字节数
|
|
509
|
+
"""
|
|
510
|
+
_CHUNK = 512 * 1024
|
|
511
|
+
total_written = 0
|
|
512
|
+
encoding = content_encoding.lower() if content_encoding else ""
|
|
513
|
+
|
|
514
|
+
with open(tmp_file_path, 'wb') as out:
|
|
515
|
+
if encoding in ("zstd", "z-standard"):
|
|
516
|
+
if not HAS_ZSTD:
|
|
517
|
+
raise ServiceError(
|
|
518
|
+
"Response is compressed with zstd, but zstandard library is not installed. "
|
|
519
|
+
"Please install it with: pip install zstandard"
|
|
520
|
+
)
|
|
521
|
+
dctx = zstd.ZstdDecompressor()
|
|
522
|
+
reader = dctx.stream_reader(response, read_size=_CHUNK)
|
|
523
|
+
try:
|
|
524
|
+
while True:
|
|
525
|
+
chunk = reader.read(_CHUNK)
|
|
526
|
+
if not chunk:
|
|
527
|
+
break
|
|
528
|
+
out.write(chunk)
|
|
529
|
+
total_written += len(chunk)
|
|
530
|
+
finally:
|
|
531
|
+
reader.close()
|
|
532
|
+
elif encoding == "gzip":
|
|
533
|
+
import zlib
|
|
534
|
+
decompressor = zlib.decompressobj(16 + zlib.MAX_WBITS)
|
|
535
|
+
while True:
|
|
536
|
+
compressed_chunk = response.read(_CHUNK)
|
|
537
|
+
if not compressed_chunk:
|
|
538
|
+
break
|
|
539
|
+
decompressed = decompressor.decompress(compressed_chunk)
|
|
540
|
+
if decompressed:
|
|
541
|
+
out.write(decompressed)
|
|
542
|
+
total_written += len(decompressed)
|
|
543
|
+
remaining = decompressor.flush()
|
|
544
|
+
if remaining:
|
|
545
|
+
out.write(remaining)
|
|
546
|
+
total_written += len(remaining)
|
|
547
|
+
else:
|
|
548
|
+
while True:
|
|
549
|
+
chunk = response.read(_CHUNK)
|
|
550
|
+
if not chunk:
|
|
551
|
+
break
|
|
552
|
+
out.write(chunk)
|
|
553
|
+
total_written += len(chunk)
|
|
554
|
+
|
|
555
|
+
return total_written
|
|
556
|
+
|
|
557
|
+
@staticmethod
|
|
558
|
+
def _parquet_col_ci(columns: List[str], name: str) -> Optional[str]:
|
|
559
|
+
"""按不区分大小写匹配列名,返回 Parquet/DuckDB 中的实际列名。"""
|
|
560
|
+
nl = name.lower()
|
|
561
|
+
for c in columns:
|
|
562
|
+
if c.lower() == nl:
|
|
563
|
+
return c
|
|
564
|
+
return None
|
|
565
|
+
|
|
566
|
+
def _infer_parquet_columns(self, conn, tmp_file_path: str) -> List[str]:
|
|
567
|
+
"""读取 Parquet 列名;LIMIT 1 失败时用 pyarrow / DuckDB DESCRIBE 兜底,避免无 ORDER BY 时乱序。"""
|
|
568
|
+
try:
|
|
569
|
+
sample_result = conn.execute(
|
|
570
|
+
f"SELECT * FROM read_parquet('{tmp_file_path}') LIMIT 1"
|
|
571
|
+
)
|
|
572
|
+
return list(sample_result.df().columns)
|
|
573
|
+
except Exception:
|
|
574
|
+
pass
|
|
575
|
+
try:
|
|
576
|
+
import pyarrow.parquet as pq
|
|
577
|
+
|
|
578
|
+
return list(pq.read_schema(tmp_file_path).names)
|
|
579
|
+
except Exception:
|
|
580
|
+
pass
|
|
581
|
+
try:
|
|
582
|
+
desc = conn.execute(
|
|
583
|
+
f"DESCRIBE SELECT * FROM read_parquet('{tmp_file_path}')"
|
|
584
|
+
).df()
|
|
585
|
+
return desc.iloc[:, 0].astype(str).tolist()
|
|
586
|
+
except Exception:
|
|
587
|
+
pass
|
|
588
|
+
return []
|
|
589
|
+
|
|
502
590
|
def _parse_response(self, response, endpoint: str = "") -> Dict[str, Any]:
|
|
503
591
|
"""解析响应,支持标准JSON、流式响应(NDJSON格式)和Parquet格式"""
|
|
504
|
-
content = response.read()
|
|
505
592
|
content_encoding = response.headers.get("Content-Encoding", "")
|
|
506
|
-
content = self._decompress_content(content, content_encoding)
|
|
507
|
-
|
|
508
|
-
# 获取 Content-Type 头
|
|
509
593
|
content_type = response.headers.get("Content-Type", "")
|
|
510
594
|
|
|
511
|
-
#
|
|
512
|
-
|
|
595
|
+
# 已知返回 Parquet 格式的接口关键词
|
|
596
|
+
_parquet_ep_kws = (
|
|
597
|
+
"getmultimarketmindata", "getfuturetickdata", "getfinancialstatementdata",
|
|
598
|
+
"getstockmarkethkdata", "getstockmarkethkmindata",
|
|
599
|
+
"getstockmarketusdata", "getstockmarketusmindata",
|
|
600
|
+
)
|
|
601
|
+
endpoint_lower = endpoint.lower()
|
|
602
|
+
is_known_parquet_ep = (
|
|
603
|
+
"parquet" in content_type.lower()
|
|
604
|
+
or "application/x-parquet" in content_type.lower()
|
|
605
|
+
or any(kw in endpoint_lower for kw in _parquet_ep_kws)
|
|
606
|
+
)
|
|
607
|
+
|
|
608
|
+
# 对于已知 Parquet 接口,流式写入临时文件以避免全量数据驻留内存
|
|
609
|
+
_streamed_tmp_path = None
|
|
610
|
+
_streamed_size_mb = 0.0
|
|
611
|
+
if is_known_parquet_ep:
|
|
612
|
+
import tempfile
|
|
613
|
+
import os
|
|
614
|
+
_tmp_fd, _streamed_tmp_path = tempfile.mkstemp(suffix='.parquet')
|
|
615
|
+
os.close(_tmp_fd)
|
|
616
|
+
try:
|
|
617
|
+
_total_written = self._stream_to_temp_file(
|
|
618
|
+
response, content_encoding, _streamed_tmp_path
|
|
619
|
+
)
|
|
620
|
+
except Exception as _e:
|
|
621
|
+
try:
|
|
622
|
+
os.remove(_streamed_tmp_path)
|
|
623
|
+
except Exception:
|
|
624
|
+
pass
|
|
625
|
+
raise ServiceError(f"流式写入临时文件失败: {_e}") from _e
|
|
626
|
+
# 验证文件确实是 Parquet 格式(检查 "PAR1" 魔数)
|
|
627
|
+
_is_valid_parquet = False
|
|
628
|
+
if _total_written >= 4:
|
|
629
|
+
with open(_streamed_tmp_path, 'rb') as _f:
|
|
630
|
+
_is_valid_parquet = _f.read(4) == b"PAR1"
|
|
631
|
+
if _is_valid_parquet:
|
|
632
|
+
_streamed_size_mb = _total_written / (1024 * 1024)
|
|
633
|
+
content = None
|
|
634
|
+
else:
|
|
635
|
+
# 非 Parquet(可能是 JSON 错误响应),回退到内存解析
|
|
636
|
+
with open(_streamed_tmp_path, 'rb') as _f:
|
|
637
|
+
content = _f.read()
|
|
638
|
+
try:
|
|
639
|
+
os.remove(_streamed_tmp_path)
|
|
640
|
+
except Exception:
|
|
641
|
+
pass
|
|
642
|
+
_streamed_tmp_path = None
|
|
643
|
+
else:
|
|
644
|
+
content = response.read()
|
|
645
|
+
content = self._decompress_content(content, content_encoding)
|
|
646
|
+
|
|
647
|
+
# 非流式路径:通过文件魔数检测是否为 Parquet
|
|
513
648
|
is_parquet_by_content = False
|
|
514
|
-
if len(content) >= 4:
|
|
515
|
-
# 检查文件魔数 "PAR1"
|
|
649
|
+
if content is not None and len(content) >= 4:
|
|
516
650
|
is_parquet_by_content = content[:4] == b"PAR1"
|
|
517
651
|
|
|
518
|
-
is_parquet = (
|
|
519
|
-
"parquet" in content_type.lower() or
|
|
520
|
-
"application/x-parquet" in content_type.lower() or
|
|
521
|
-
"getmultimarketmindata" in endpoint.lower() or
|
|
522
|
-
"getfuturetickdata" in endpoint.lower() or
|
|
523
|
-
"getfinancialstatementdata" in endpoint.lower() or
|
|
524
|
-
"getfinancialstatementdailydata" in endpoint.lower() or
|
|
525
|
-
is_parquet_by_content
|
|
526
|
-
)
|
|
652
|
+
is_parquet = (_streamed_tmp_path is not None) or is_parquet_by_content
|
|
527
653
|
|
|
528
654
|
# 如果是 Parquet 格式,使用 DuckDB 读取
|
|
529
655
|
if is_parquet:
|
|
530
656
|
if not HAS_DUCKDB or duckdb is None:
|
|
657
|
+
if _streamed_tmp_path:
|
|
658
|
+
try:
|
|
659
|
+
os.remove(_streamed_tmp_path)
|
|
660
|
+
except Exception:
|
|
661
|
+
pass
|
|
531
662
|
raise ServiceError(
|
|
532
663
|
"响应是 Parquet 格式,但 DuckDB 库未安装。"
|
|
533
664
|
"请安装: pip install duckdb"
|
|
534
665
|
)
|
|
535
666
|
|
|
536
|
-
parquet_start = time.time()
|
|
537
667
|
global _active_parquet_reads
|
|
538
668
|
try:
|
|
539
|
-
# 将字节内容写入临时文件(DuckDB 需要文件路径)
|
|
540
669
|
import tempfile
|
|
541
|
-
with tempfile.NamedTemporaryFile(delete=False, suffix='.parquet') as tmp_file:
|
|
542
|
-
tmp_file.write(content)
|
|
543
|
-
tmp_file_path = tmp_file.name
|
|
544
670
|
|
|
545
|
-
|
|
671
|
+
if _streamed_tmp_path is not None:
|
|
672
|
+
tmp_file_path = _streamed_tmp_path
|
|
673
|
+
file_size_mb = _streamed_size_mb
|
|
674
|
+
else:
|
|
675
|
+
with tempfile.NamedTemporaryFile(delete=False, suffix='.parquet') as tmp_file:
|
|
676
|
+
tmp_file.write(content)
|
|
677
|
+
tmp_file_path = tmp_file.name
|
|
678
|
+
file_size_mb = len(content) / 1024 / 1024
|
|
679
|
+
del content
|
|
546
680
|
|
|
547
|
-
# 获取信号量,限制最大并发HTTP请求数
|
|
548
681
|
_parquet_read_semaphore.acquire()
|
|
549
682
|
|
|
550
|
-
max_concurrent = _get_max_concurrent_parquet_reads()
|
|
551
683
|
with _parquet_read_lock:
|
|
552
684
|
_active_parquet_reads += 1
|
|
553
685
|
current_concurrency = _active_parquet_reads
|
|
@@ -556,7 +688,6 @@ class HTTPClient:
|
|
|
556
688
|
# 使用 DuckDB 读取 Parquet 文件
|
|
557
689
|
conn = duckdb.connect()
|
|
558
690
|
try:
|
|
559
|
-
import os
|
|
560
691
|
try:
|
|
561
692
|
import psutil
|
|
562
693
|
HAS_PSUTIL = True
|
|
@@ -575,109 +706,69 @@ class HTTPClient:
|
|
|
575
706
|
available_memory_gb = 4.0
|
|
576
707
|
total_memory_gb = 8.0
|
|
577
708
|
|
|
709
|
+
cpu_count = os.cpu_count() or 4
|
|
578
710
|
effective_concurrency = max(current_concurrency, 1)
|
|
579
|
-
|
|
580
|
-
#
|
|
581
|
-
|
|
582
|
-
|
|
583
|
-
|
|
584
|
-
|
|
711
|
+
|
|
712
|
+
# Parquet 展开后通常是文件大小的 5~15 倍,排序还需要额外临时空间
|
|
713
|
+
estimated_expanded_gb = file_size_mb * 10 / 1024
|
|
714
|
+
estimated_need_gb = max(estimated_expanded_gb * 2.5, 1.0)
|
|
715
|
+
|
|
716
|
+
# 根据总内存渐进式分配 DuckDB 内存预算:
|
|
717
|
+
# ≤8GB → 20%(配合 temp_directory 溢写磁盘防 OOM)
|
|
718
|
+
# 8~16GB → 20%~40% 线性过渡(平衡速度与安全)
|
|
719
|
+
# >16GB → 积极分配(充分利用内存提升速度)
|
|
720
|
+
if total_memory_gb <= 8:
|
|
721
|
+
per_conn_budget_gb = total_memory_gb * 0.20 / effective_concurrency
|
|
722
|
+
elif total_memory_gb <= 16:
|
|
723
|
+
_ratio = 0.20 + (total_memory_gb - 8) / 8 * 0.20
|
|
724
|
+
per_conn_budget_gb = total_memory_gb * _ratio / effective_concurrency
|
|
585
725
|
else:
|
|
586
|
-
|
|
726
|
+
per_conn_budget_gb = max(
|
|
727
|
+
total_memory_gb * 0.5,
|
|
728
|
+
available_memory_gb * 0.9,
|
|
729
|
+
) / effective_concurrency
|
|
730
|
+
cap_ratio = float(os.environ.get("PANDA_DATA_DUCKDB_MEMORY_CAP_RATIO", "0.65"))
|
|
731
|
+
per_conn_budget_gb = min(
|
|
732
|
+
per_conn_budget_gb,
|
|
733
|
+
total_memory_gb * cap_ratio / effective_concurrency,
|
|
734
|
+
)
|
|
735
|
+
memory_limit_gb = min(estimated_need_gb, per_conn_budget_gb)
|
|
736
|
+
memory_limit_gb = max(memory_limit_gb, 0.5)
|
|
737
|
+
|
|
738
|
+
env_mem = os.environ.get("PANDA_DATA_DUCKDB_MEMORY_GB", "").strip()
|
|
739
|
+
if env_mem:
|
|
740
|
+
try:
|
|
741
|
+
memory_limit_gb = max(0.5, float(env_mem))
|
|
742
|
+
except ValueError:
|
|
743
|
+
pass
|
|
587
744
|
|
|
588
|
-
|
|
589
|
-
|
|
590
|
-
|
|
591
|
-
|
|
592
|
-
|
|
593
|
-
|
|
594
|
-
|
|
595
|
-
|
|
596
|
-
|
|
597
|
-
|
|
598
|
-
|
|
599
|
-
|
|
600
|
-
max_threads_for_file = min(max_possible_threads, 2)
|
|
601
|
-
else:
|
|
602
|
-
max_threads_for_file = 1 # 内存不足时只使用1个线程
|
|
603
|
-
num_threads = max(1, min(cpu_count, max_threads_for_file))
|
|
604
|
-
# 对于大文件,使用更保守的内存限制
|
|
605
|
-
memory_limit_gb = min(per_thread_memory_gb, 1.5, max_per_thread_memory_gb)
|
|
606
|
-
memory_limit = f"{max(memory_limit_gb, 0.5):.1f}GB"
|
|
607
|
-
elif file_size_mb > 500:
|
|
608
|
-
base_threads = 4
|
|
609
|
-
if available_memory_gb > 32:
|
|
610
|
-
max_threads_for_file = min(max_possible_threads, 8)
|
|
611
|
-
elif available_memory_gb > 16:
|
|
612
|
-
max_threads_for_file = min(max_possible_threads, 6)
|
|
613
|
-
elif available_memory_gb > 8:
|
|
614
|
-
max_threads_for_file = min(max_possible_threads, 4)
|
|
615
|
-
else:
|
|
616
|
-
max_threads_for_file = 2 # 内存不足时减少线程
|
|
617
|
-
num_threads = max(1, min(cpu_count, max_threads_for_file))
|
|
618
|
-
memory_limit_gb = min(per_thread_memory_gb, 2.0, max_per_thread_memory_gb)
|
|
619
|
-
memory_limit = f"{max(memory_limit_gb, 0.5):.1f}GB"
|
|
620
|
-
elif file_size_mb > 200:
|
|
621
|
-
base_threads = 8
|
|
622
|
-
if available_memory_gb > 64:
|
|
623
|
-
max_threads_for_file = min(max_possible_threads, 16)
|
|
624
|
-
elif available_memory_gb > 32:
|
|
625
|
-
max_threads_for_file = min(max_possible_threads, 12)
|
|
626
|
-
elif available_memory_gb > 8:
|
|
627
|
-
max_threads_for_file = min(max_possible_threads, 8)
|
|
628
|
-
else:
|
|
629
|
-
max_threads_for_file = 4 # 内存不足时减少线程
|
|
630
|
-
num_threads = max(1, min(cpu_count, max_threads_for_file))
|
|
631
|
-
memory_limit_gb = min(per_thread_memory_gb, 2.0, max_per_thread_memory_gb)
|
|
632
|
-
memory_limit = f"{max(memory_limit_gb, 0.5):.1f}GB"
|
|
745
|
+
memory_limit = f"{memory_limit_gb:.1f}GB"
|
|
746
|
+
|
|
747
|
+
# 线程:内存过小时少开线程;避免 available=4 时 int(4/4)=1 单核跑满
|
|
748
|
+
env_thr = os.environ.get("PANDA_DATA_DUCKDB_THREADS", "").strip()
|
|
749
|
+
if env_thr:
|
|
750
|
+
try:
|
|
751
|
+
num_threads = max(1, min(cpu_count, int(env_thr)))
|
|
752
|
+
except ValueError:
|
|
753
|
+
num_threads = min(
|
|
754
|
+
cpu_count,
|
|
755
|
+
max(2, min(16, int(memory_limit_gb // 1.2))),
|
|
756
|
+
)
|
|
633
757
|
else:
|
|
634
|
-
|
|
635
|
-
|
|
636
|
-
|
|
637
|
-
|
|
638
|
-
|
|
639
|
-
|
|
640
|
-
base_threads = 12
|
|
641
|
-
elif available_memory_gb > 32:
|
|
642
|
-
max_threads_for_file = min(max_possible_threads, 16)
|
|
643
|
-
base_threads = 8
|
|
644
|
-
elif available_memory_gb > 8:
|
|
645
|
-
max_threads_for_file = min(max_possible_threads, 8)
|
|
646
|
-
base_threads = 4
|
|
647
|
-
else:
|
|
648
|
-
# 低内存系统:但考虑到数据量可能很大(行数多),使用4个线程
|
|
649
|
-
# 如果内存真的紧张,会在后续的内存限制中体现
|
|
650
|
-
max_threads_for_file = min(max_possible_threads, 4)
|
|
651
|
-
base_threads = 4
|
|
652
|
-
num_threads = max(1, min(cpu_count, max_threads_for_file))
|
|
653
|
-
# 小文件不需要太多内存,限制在1GB以内
|
|
654
|
-
memory_limit_gb = min(per_thread_memory_gb, 1.0, max_per_thread_memory_gb)
|
|
655
|
-
memory_limit = f"{max(memory_limit_gb, 0.5):.1f}GB"
|
|
656
|
-
|
|
657
|
-
# 根据文件大小进一步调整内存限制(小文件不需要太多内存)
|
|
658
|
-
if file_size_mb <= 100:
|
|
659
|
-
# 对于小于100MB的文件,限制内存使用在512MB-1GB之间
|
|
660
|
-
memory_limit_gb = min(memory_limit_gb, 1.0)
|
|
661
|
-
elif file_size_mb <= 200:
|
|
662
|
-
# 对于100-200MB的文件,限制在1.5GB以内
|
|
663
|
-
memory_limit_gb = min(memory_limit_gb, 1.5)
|
|
664
|
-
|
|
665
|
-
# 确保内存限制不超过可用内存的90%(保留安全边距)
|
|
666
|
-
max_safe_memory_gb = available_memory_gb * 0.9
|
|
667
|
-
memory_limit_gb = min(memory_limit_gb, max_safe_memory_gb)
|
|
668
|
-
memory_limit = f"{max(memory_limit_gb, 0.5):.1f}GB"
|
|
669
|
-
|
|
670
|
-
num_threads = max(num_threads, 1)
|
|
758
|
+
num_threads = min(
|
|
759
|
+
cpu_count,
|
|
760
|
+
max(2, min(16, int(memory_limit_gb // 1.2))),
|
|
761
|
+
)
|
|
762
|
+
num_threads = max(1, num_threads)
|
|
763
|
+
|
|
671
764
|
conn.execute(f"SET threads TO {num_threads}")
|
|
672
765
|
conn.execute(f"SET memory_limit='{memory_limit}'")
|
|
673
|
-
|
|
674
|
-
conn.execute("SET enable_object_cache TO true")
|
|
675
|
-
else:
|
|
676
|
-
conn.execute("SET enable_object_cache TO false")
|
|
766
|
+
conn.execute("SET enable_object_cache TO true")
|
|
677
767
|
conn.execute("SET enable_progress_bar TO false")
|
|
678
768
|
conn.execute("SET preserve_insertion_order TO false")
|
|
769
|
+
_td = tempfile.gettempdir()
|
|
770
|
+
conn.execute(f"SET temp_directory='{_td}'")
|
|
679
771
|
except Exception as config_error:
|
|
680
|
-
# 回退配置:使用更保守的设置以避免内存不足
|
|
681
772
|
num_threads = 1
|
|
682
773
|
memory_limit = "512MB"
|
|
683
774
|
try:
|
|
@@ -686,93 +777,95 @@ class HTTPClient:
|
|
|
686
777
|
conn.execute("SET enable_object_cache TO false")
|
|
687
778
|
conn.execute("SET enable_progress_bar TO false")
|
|
688
779
|
conn.execute("SET preserve_insertion_order TO false")
|
|
780
|
+
try:
|
|
781
|
+
_td = tempfile.gettempdir()
|
|
782
|
+
conn.execute(f"SET temp_directory='{_td}'")
|
|
783
|
+
except Exception:
|
|
784
|
+
pass
|
|
689
785
|
except Exception:
|
|
690
786
|
pass
|
|
691
787
|
|
|
692
|
-
#
|
|
693
|
-
available_columns =
|
|
694
|
-
try:
|
|
695
|
-
sample_result = conn.execute(f"SELECT * FROM read_parquet('{tmp_file_path}') LIMIT 1")
|
|
696
|
-
sample_df = sample_result.df()
|
|
697
|
-
available_columns = list(sample_df.columns)
|
|
698
|
-
except Exception:
|
|
699
|
-
pass
|
|
788
|
+
# 读取列信息(LIMIT 1 失败时用 pyarrow schema 兜底,避免无 ORDER BY 时保持 Parquet 物理顺序)
|
|
789
|
+
available_columns = self._infer_parquet_columns(conn, tmp_file_path)
|
|
700
790
|
|
|
701
791
|
# 构建查询语句
|
|
702
|
-
# 判断 endpoint
|
|
792
|
+
# 判断 endpoint 类型并设置排序和列选择逻辑(列名大小写不敏感,ORDER BY 使用实际列名)
|
|
703
793
|
if "getfuturetickdata" in endpoint.lower():
|
|
704
794
|
# 特殊处理:按照 symbol ASC, timestamp ASC 排序,并将 symbol, date, timestamp 放在前三列
|
|
705
|
-
|
|
706
|
-
|
|
707
|
-
|
|
795
|
+
sym_col = self._parquet_col_ci(available_columns, "symbol")
|
|
796
|
+
ts_col = self._parquet_col_ci(available_columns, "timestamp")
|
|
797
|
+
date_col = self._parquet_col_ci(available_columns, "date")
|
|
708
798
|
|
|
709
799
|
order_by_clause = ""
|
|
710
|
-
if
|
|
711
|
-
order_by_clause =
|
|
712
|
-
elif
|
|
713
|
-
order_by_clause =
|
|
714
|
-
elif
|
|
715
|
-
order_by_clause =
|
|
716
|
-
|
|
717
|
-
if
|
|
718
|
-
other_columns = [
|
|
719
|
-
|
|
720
|
-
|
|
800
|
+
if sym_col and ts_col:
|
|
801
|
+
order_by_clause = f' ORDER BY "{sym_col}" ASC, "{ts_col}" ASC'
|
|
802
|
+
elif sym_col:
|
|
803
|
+
order_by_clause = f' ORDER BY "{sym_col}" ASC'
|
|
804
|
+
elif ts_col:
|
|
805
|
+
order_by_clause = f' ORDER BY "{ts_col}" ASC'
|
|
806
|
+
|
|
807
|
+
if sym_col and date_col and ts_col:
|
|
808
|
+
other_columns = [
|
|
809
|
+
col
|
|
810
|
+
for col in available_columns
|
|
811
|
+
if col not in (sym_col, date_col, ts_col)
|
|
812
|
+
]
|
|
813
|
+
select_columns = [sym_col, date_col, ts_col] + other_columns
|
|
721
814
|
select_clause = ", ".join([f'"{col}"' for col in select_columns])
|
|
722
|
-
elif
|
|
723
|
-
other_columns = [
|
|
724
|
-
|
|
815
|
+
elif sym_col and date_col:
|
|
816
|
+
other_columns = [
|
|
817
|
+
col for col in available_columns if col not in (sym_col, date_col)
|
|
818
|
+
]
|
|
819
|
+
select_columns = [sym_col, date_col] + other_columns
|
|
725
820
|
select_clause = ", ".join([f'"{col}"' for col in select_columns])
|
|
726
|
-
elif
|
|
727
|
-
other_columns = [col for col in available_columns if col !=
|
|
728
|
-
select_columns = [
|
|
821
|
+
elif sym_col:
|
|
822
|
+
other_columns = [col for col in available_columns if col != sym_col]
|
|
823
|
+
select_columns = [sym_col] + other_columns
|
|
729
824
|
select_clause = ", ".join([f'"{col}"' for col in select_columns])
|
|
730
825
|
else:
|
|
731
826
|
select_clause = "*"
|
|
732
827
|
else:
|
|
733
|
-
#
|
|
734
|
-
|
|
735
|
-
|
|
736
|
-
|
|
828
|
+
# 默认:日频/分钟等行情——优先 symbol,再 date / minute 降序
|
|
829
|
+
sym_col = self._parquet_col_ci(available_columns, "symbol")
|
|
830
|
+
date_col = self._parquet_col_ci(available_columns, "date")
|
|
831
|
+
min_col = self._parquet_col_ci(available_columns, "minute")
|
|
737
832
|
|
|
738
833
|
order_by_clause = ""
|
|
739
|
-
if
|
|
740
|
-
order_by_clause =
|
|
741
|
-
|
|
742
|
-
|
|
743
|
-
elif
|
|
744
|
-
order_by_clause =
|
|
745
|
-
elif
|
|
746
|
-
order_by_clause =
|
|
747
|
-
elif
|
|
748
|
-
order_by_clause =
|
|
749
|
-
elif
|
|
750
|
-
order_by_clause =
|
|
751
|
-
|
|
752
|
-
|
|
753
|
-
|
|
754
|
-
|
|
834
|
+
if sym_col and date_col and min_col:
|
|
835
|
+
order_by_clause = (
|
|
836
|
+
f' ORDER BY "{sym_col}" ASC, "{date_col}" DESC, "{min_col}" DESC'
|
|
837
|
+
)
|
|
838
|
+
elif sym_col and date_col:
|
|
839
|
+
order_by_clause = f' ORDER BY "{sym_col}" ASC, "{date_col}" DESC'
|
|
840
|
+
elif sym_col:
|
|
841
|
+
order_by_clause = f' ORDER BY "{sym_col}" ASC'
|
|
842
|
+
elif date_col and min_col:
|
|
843
|
+
order_by_clause = f' ORDER BY "{date_col}" DESC, "{min_col}" DESC'
|
|
844
|
+
elif date_col:
|
|
845
|
+
order_by_clause = f' ORDER BY "{date_col}" DESC'
|
|
846
|
+
elif min_col:
|
|
847
|
+
order_by_clause = f' ORDER BY "{min_col}" DESC'
|
|
848
|
+
|
|
849
|
+
if sym_col and date_col:
|
|
850
|
+
other_columns = [
|
|
851
|
+
col for col in available_columns if col not in (sym_col, date_col)
|
|
852
|
+
]
|
|
853
|
+
select_columns = [sym_col, date_col] + other_columns
|
|
755
854
|
select_clause = ", ".join([f'"{col}"' for col in select_columns])
|
|
756
|
-
elif
|
|
757
|
-
other_columns = [col for col in available_columns if col !=
|
|
758
|
-
select_columns = [
|
|
855
|
+
elif sym_col:
|
|
856
|
+
other_columns = [col for col in available_columns if col != sym_col]
|
|
857
|
+
select_columns = [sym_col] + other_columns
|
|
759
858
|
select_clause = ", ".join([f'"{col}"' for col in select_columns])
|
|
760
|
-
elif
|
|
761
|
-
other_columns = [col for col in available_columns if col !=
|
|
762
|
-
select_columns = [
|
|
859
|
+
elif date_col:
|
|
860
|
+
other_columns = [col for col in available_columns if col != date_col]
|
|
861
|
+
select_columns = [date_col] + other_columns
|
|
763
862
|
select_clause = ", ".join([f'"{col}"' for col in select_columns])
|
|
764
863
|
else:
|
|
765
864
|
select_clause = "*"
|
|
766
865
|
|
|
767
866
|
# 执行主查询
|
|
768
867
|
query = f"SELECT {select_clause} FROM read_parquet('{tmp_file_path}'){order_by_clause}"
|
|
769
|
-
result = conn.execute(query)
|
|
770
|
-
|
|
771
|
-
# 转换为DataFrame(优先使用Arrow格式优化性能)
|
|
772
|
-
use_arrow = False
|
|
773
|
-
df = None
|
|
774
868
|
|
|
775
|
-
# 检查是否可以使用Arrow格式
|
|
776
869
|
try:
|
|
777
870
|
import pyarrow as pa
|
|
778
871
|
HAS_PYARROW = True
|
|
@@ -781,47 +874,22 @@ class HTTPClient:
|
|
|
781
874
|
pa = None
|
|
782
875
|
|
|
783
876
|
if HAS_PYARROW:
|
|
784
|
-
|
|
877
|
+
result = conn.execute(query)
|
|
785
878
|
try:
|
|
786
879
|
arrow_result = result.arrow()
|
|
787
|
-
|
|
788
|
-
# DuckDB的arrow()可能返回Table或RecordBatchReader
|
|
789
880
|
if isinstance(arrow_result, pa.Table):
|
|
790
|
-
# 如果是Table,直接转换
|
|
791
881
|
df = arrow_result.to_pandas()
|
|
792
882
|
elif isinstance(arrow_result, pa.RecordBatchReader):
|
|
793
|
-
|
|
794
|
-
|
|
795
|
-
df = arrow_table.to_pandas()
|
|
796
|
-
elif hasattr(arrow_result, 'to_pandas'):
|
|
797
|
-
# 某些版本可能直接支持to_pandas
|
|
883
|
+
df = arrow_result.read_all().to_pandas()
|
|
884
|
+
elif hasattr(arrow_result, "to_pandas"):
|
|
798
885
|
df = arrow_result.to_pandas()
|
|
799
|
-
elif hasattr(arrow_result, 'read_all'):
|
|
800
|
-
# 尝试使用read_all方法
|
|
801
|
-
arrow_table = arrow_result.read_all()
|
|
802
|
-
df = arrow_table.to_pandas()
|
|
803
886
|
else:
|
|
804
|
-
|
|
805
|
-
try:
|
|
806
|
-
batches = list(arrow_result)
|
|
807
|
-
if batches:
|
|
808
|
-
arrow_table = pa.Table.from_batches(batches)
|
|
809
|
-
df = arrow_table.to_pandas()
|
|
810
|
-
else:
|
|
811
|
-
raise ValueError("No batches in RecordBatchReader")
|
|
812
|
-
except Exception as e:
|
|
813
|
-
raise ValueError(
|
|
814
|
-
f"Cannot convert arrow_result to pandas: {type(arrow_result)}, error: {e}")
|
|
815
|
-
|
|
816
|
-
use_arrow = True
|
|
887
|
+
raise TypeError(f"unexpected arrow result type: {type(arrow_result)}")
|
|
817
888
|
except Exception:
|
|
818
|
-
|
|
819
|
-
|
|
820
|
-
result_retry = conn.execute(query)
|
|
821
|
-
df = result_retry.df()
|
|
889
|
+
result = conn.execute(query)
|
|
890
|
+
df = result.df()
|
|
822
891
|
else:
|
|
823
|
-
|
|
824
|
-
df = result.df()
|
|
892
|
+
df = conn.execute(query).df()
|
|
825
893
|
|
|
826
894
|
# 确保df已正确赋值
|
|
827
895
|
if df is None:
|
|
@@ -831,7 +899,6 @@ class HTTPClient:
|
|
|
831
899
|
|
|
832
900
|
# 清理临时文件
|
|
833
901
|
try:
|
|
834
|
-
import os
|
|
835
902
|
if os.path.exists(tmp_file_path):
|
|
836
903
|
os.remove(tmp_file_path)
|
|
837
904
|
except Exception as e:
|
|
@@ -1062,7 +1129,6 @@ class HTTPClient:
|
|
|
1062
1129
|
|
|
1063
1130
|
for attempt in range(retries):
|
|
1064
1131
|
try:
|
|
1065
|
-
# 发送请求并等待响应
|
|
1066
1132
|
response = self._opener.open(request, timeout=timeout_value)
|
|
1067
1133
|
return self._parse_response(response, endpoint)
|
|
1068
1134
|
|
|
@@ -1,27 +0,0 @@
|
|
|
1
|
-
|
|
2
|
-
import panda_data
|
|
3
|
-
import time
|
|
4
|
-
|
|
5
|
-
if __name__ == "__main__":
|
|
6
|
-
# 1. pip install panda_data
|
|
7
|
-
# 2. 账号为:86+官网注册的手机号(eg:8617777777777),密码与官网同步
|
|
8
|
-
# 3. 接口文档见官方知识库
|
|
9
|
-
panda_data.init_token(username="", password="")
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
# 开始计时
|
|
13
|
-
start_time = time.time()
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
result = panda_data.get_market_data(
|
|
17
|
-
symbol="000001.SZ",
|
|
18
|
-
start_date="20200101",
|
|
19
|
-
end_date="20250101",
|
|
20
|
-
# fields = ["open"],
|
|
21
|
-
type="stock"
|
|
22
|
-
)
|
|
23
|
-
print(result)
|
|
24
|
-
end_time = time.time()
|
|
25
|
-
print(f"耗时:{end_time - start_time}")
|
|
26
|
-
|
|
27
|
-
print(result)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|