panda-data 0.0.3__tar.gz → 0.0.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (28)
  1. {panda_data-0.0.3 → panda_data-0.0.4}/PKG-INFO +1 -1
  2. panda_data-0.0.4/panda_data/test.py +30 -0
  3. {panda_data-0.0.3 → panda_data-0.0.4}/panda_data/transport/http.py +290 -224
  4. {panda_data-0.0.3 → panda_data-0.0.4}/panda_data.egg-info/PKG-INFO +1 -1
  5. {panda_data-0.0.3 → panda_data-0.0.4}/pyproject.toml +1 -1
  6. panda_data-0.0.3/panda_data/test.py +0 -27
  7. {panda_data-0.0.3 → panda_data-0.0.4}/README.md +0 -0
  8. {panda_data-0.0.3 → panda_data-0.0.4}/panda_data/__init__.py +0 -0
  9. {panda_data-0.0.3 → panda_data-0.0.4}/panda_data/client.py +0 -0
  10. {panda_data-0.0.3 → panda_data-0.0.4}/panda_data/config/__init__.py +0 -0
  11. {panda_data-0.0.3 → panda_data-0.0.4}/panda_data/core/__init__.py +0 -0
  12. {panda_data-0.0.3 → panda_data-0.0.4}/panda_data/core/service.py +0 -0
  13. {panda_data-0.0.3 → panda_data-0.0.4}/panda_data/exceptions.py +0 -0
  14. {panda_data-0.0.3 → panda_data-0.0.4}/panda_data/readers/__init__.py +0 -0
  15. {panda_data-0.0.3 → panda_data-0.0.4}/panda_data/readers/financial_and_factors_reader.py +0 -0
  16. {panda_data-0.0.3 → panda_data-0.0.4}/panda_data/readers/future_reader.py +0 -0
  17. {panda_data-0.0.3 → panda_data-0.0.4}/panda_data/readers/init_token.py +0 -0
  18. {panda_data-0.0.3 → panda_data-0.0.4}/panda_data/readers/market_reader.py +0 -0
  19. {panda_data-0.0.3 → panda_data-0.0.4}/panda_data/readers/market_reference_reader.py +0 -0
  20. {panda_data-0.0.3 → panda_data-0.0.4}/panda_data/readers/trading_tools_reader.py +0 -0
  21. {panda_data-0.0.3 → panda_data-0.0.4}/panda_data/transport/__init__.py +0 -0
  22. {panda_data-0.0.3 → panda_data-0.0.4}/panda_data/utils/common_utils.py +0 -0
  23. {panda_data-0.0.3 → panda_data-0.0.4}/panda_data/utils/param_check_utils.py +0 -0
  24. {panda_data-0.0.3 → panda_data-0.0.4}/panda_data.egg-info/SOURCES.txt +0 -0
  25. {panda_data-0.0.3 → panda_data-0.0.4}/panda_data.egg-info/dependency_links.txt +0 -0
  26. {panda_data-0.0.3 → panda_data-0.0.4}/panda_data.egg-info/requires.txt +0 -0
  27. {panda_data-0.0.3 → panda_data-0.0.4}/panda_data.egg-info/top_level.txt +0 -0
  28. {panda_data-0.0.3 → panda_data-0.0.4}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: panda_data
3
- Version: 0.0.3
3
+ Version: 0.0.4
4
4
  Summary: PandaAI DataQuant
5
5
  Requires-Python: >=3.10
6
6
  Description-Content-Type: text/markdown
@@ -0,0 +1,30 @@
1
+
2
+ import panda_data
3
+ import time
4
+
5
+ if __name__ == "__main__":
6
+ # 1. pip install panda_data
7
+ # 2. 账号为:86+官网注册的手机号(eg:8617777777777),密码与官网同步
8
+ # 3. 接口文档见官方知识库
9
+ # panda_data.init_token(username="", password="")
10
+ # panda_data.init_token(username="super_data_user", password="panda@2026^_^", base_url="http://192.168.1.3:8180")
11
+ panda_data.init_token(username="kk111", password="lijingyu.", base_url="http://127.0.0.1:8180")
12
+
13
+
14
+ # 开始计时
15
+ start_time = time.time()
16
+
17
+ result = panda_data.get_market_min_data(
18
+ symbol="000001.SZ",
19
+ start_date="20250101",
20
+ end_date="20250131",
21
+ symbol_type="stock",
22
+ fields=["symbol", "date", "num_trades", "amount", "volume"],
23
+ frequency="1m",
24
+ time_zone=("10:00", "11:00")
25
+ )
26
+ print(result)
27
+ end_time = time.time()
28
+ print(f"耗时:{end_time - start_time}")
29
+
30
+ print(result)
@@ -11,7 +11,7 @@ import socket
11
11
  import threading
12
12
  import time
13
13
  from dataclasses import dataclass
14
- from typing import Any, Dict, Optional
14
+ from typing import Any, Dict, List, Optional
15
15
  from urllib.parse import urljoin, urlparse
16
16
  from urllib.request import (
17
17
  Request,
@@ -94,6 +94,7 @@ class HTTPClientConfig:
94
94
  proxy_username: Optional[str] = None
95
95
  proxy_password: Optional[str] = None
96
96
  use_gzip: bool = False # 是否使用 gzip 压缩请求体
97
+ local_mode: bool = False # 本地模式:为 True 时不读 token 文件,不鉴权
97
98
 
98
99
 
99
100
  class HTTPClient:
@@ -236,7 +237,7 @@ class HTTPClient:
236
237
  if not self._establish_monitor_connection():
237
238
  consecutive_failures += 1
238
239
  if consecutive_failures >= max_failures:
239
- self._delete_token_file()
240
+ # self._delete_token_file()
240
241
  break
241
242
  else:
242
243
  consecutive_failures = 0
@@ -254,7 +255,7 @@ class HTTPClient:
254
255
  # 连接断开
255
256
  consecutive_failures += 1
256
257
  if consecutive_failures >= max_failures:
257
- self._delete_token_file()
258
+ # self._delete_token_file()
258
259
  # 清理连接
259
260
  try:
260
261
  self._monitor_connection.close()
@@ -266,7 +267,7 @@ class HTTPClient:
266
267
  # 其他异常,也视为连接问题
267
268
  consecutive_failures += 1
268
269
  if consecutive_failures >= max_failures:
269
- self._delete_token_file()
270
+ # self._delete_token_file()
270
271
  try:
271
272
  self._monitor_connection.close()
272
273
  except Exception:
@@ -277,7 +278,7 @@ class HTTPClient:
277
278
  except Exception:
278
279
  consecutive_failures += 1
279
280
  if consecutive_failures >= max_failures:
280
- self._delete_token_file()
281
+ # self._delete_token_file()
281
282
  break
282
283
 
283
284
  def start_connection_monitoring(self) -> None:
@@ -350,8 +351,8 @@ class HTTPClient:
350
351
  if accept_encodings:
351
352
  headers["Accept-Encoding"] = ", ".join(accept_encodings)
352
353
 
353
- # 添加从文件读取的token到Authorization header(登录接口除外)
354
- if endpoint and not endpoint.__contains__("login"):
354
+ # 添加从文件读取的token到Authorization header(本地模式下跳过鉴权)
355
+ if not self._config.local_mode and endpoint and not endpoint.__contains__("login"):
355
356
  user_file_path = self._get_token_file_path()
356
357
  if not user_file_path:
357
358
  raise ServiceError("无法确定token文件路径,请重新登录!")
@@ -450,9 +451,7 @@ class HTTPClient:
450
451
  if not content_encoding:
451
452
  return content
452
453
 
453
- decompress_start = time.time()
454
454
  content_encoding_lower = content_encoding.lower()
455
- compressed_size = len(content)
456
455
 
457
456
  try:
458
457
  if content_encoding_lower == "gzip":
@@ -499,55 +498,188 @@ class HTTPClient:
499
498
  except Exception as e:
500
499
  raise
501
500
 
501
+ def _stream_to_temp_file(self, response, content_encoding: str, tmp_file_path: str) -> int:
502
+ """
503
+ 将 HTTP 响应流式解压写入临时文件,避免全量数据同时驻留内存。
504
+ 大内存机器受益于更快的 I/O 吞吐;小内存机器(如 8GB)
505
+ 因不将全量解压数据加载到内存而避免 OOM。
506
+
507
+ Returns:
508
+ 写入文件的总字节数
509
+ """
510
+ _CHUNK = 512 * 1024
511
+ total_written = 0
512
+ encoding = content_encoding.lower() if content_encoding else ""
513
+
514
+ with open(tmp_file_path, 'wb') as out:
515
+ if encoding in ("zstd", "z-standard"):
516
+ if not HAS_ZSTD:
517
+ raise ServiceError(
518
+ "Response is compressed with zstd, but zstandard library is not installed. "
519
+ "Please install it with: pip install zstandard"
520
+ )
521
+ dctx = zstd.ZstdDecompressor()
522
+ reader = dctx.stream_reader(response, read_size=_CHUNK)
523
+ try:
524
+ while True:
525
+ chunk = reader.read(_CHUNK)
526
+ if not chunk:
527
+ break
528
+ out.write(chunk)
529
+ total_written += len(chunk)
530
+ finally:
531
+ reader.close()
532
+ elif encoding == "gzip":
533
+ import zlib
534
+ decompressor = zlib.decompressobj(16 + zlib.MAX_WBITS)
535
+ while True:
536
+ compressed_chunk = response.read(_CHUNK)
537
+ if not compressed_chunk:
538
+ break
539
+ decompressed = decompressor.decompress(compressed_chunk)
540
+ if decompressed:
541
+ out.write(decompressed)
542
+ total_written += len(decompressed)
543
+ remaining = decompressor.flush()
544
+ if remaining:
545
+ out.write(remaining)
546
+ total_written += len(remaining)
547
+ else:
548
+ while True:
549
+ chunk = response.read(_CHUNK)
550
+ if not chunk:
551
+ break
552
+ out.write(chunk)
553
+ total_written += len(chunk)
554
+
555
+ return total_written
556
+
557
+ @staticmethod
558
+ def _parquet_col_ci(columns: List[str], name: str) -> Optional[str]:
559
+ """按不区分大小写匹配列名,返回 Parquet/DuckDB 中的实际列名。"""
560
+ nl = name.lower()
561
+ for c in columns:
562
+ if c.lower() == nl:
563
+ return c
564
+ return None
565
+
566
+ def _infer_parquet_columns(self, conn, tmp_file_path: str) -> List[str]:
567
+ """读取 Parquet 列名;LIMIT 1 失败时用 pyarrow / DuckDB DESCRIBE 兜底,避免无 ORDER BY 时乱序。"""
568
+ try:
569
+ sample_result = conn.execute(
570
+ f"SELECT * FROM read_parquet('{tmp_file_path}') LIMIT 1"
571
+ )
572
+ return list(sample_result.df().columns)
573
+ except Exception:
574
+ pass
575
+ try:
576
+ import pyarrow.parquet as pq
577
+
578
+ return list(pq.read_schema(tmp_file_path).names)
579
+ except Exception:
580
+ pass
581
+ try:
582
+ desc = conn.execute(
583
+ f"DESCRIBE SELECT * FROM read_parquet('{tmp_file_path}')"
584
+ ).df()
585
+ return desc.iloc[:, 0].astype(str).tolist()
586
+ except Exception:
587
+ pass
588
+ return []
589
+
502
590
  def _parse_response(self, response, endpoint: str = "") -> Dict[str, Any]:
503
591
  """解析响应,支持标准JSON、流式响应(NDJSON格式)和Parquet格式"""
504
- content = response.read()
505
592
  content_encoding = response.headers.get("Content-Encoding", "")
506
- content = self._decompress_content(content, content_encoding)
507
-
508
- # 获取 Content-Type 头
509
593
  content_type = response.headers.get("Content-Type", "")
510
594
 
511
- # 检测是否是 Parquet 格式(通过 Content-Type、endpoint 或文件内容魔数)
512
- # Parquet 文件以 "PAR1" 开头(前4个字节)
595
+ # 已知返回 Parquet 格式的接口关键词
596
+ _parquet_ep_kws = (
597
+ "getmultimarketmindata", "getfuturetickdata", "getfinancialstatementdata",
598
+ "getstockmarkethkdata", "getstockmarkethkmindata",
599
+ "getstockmarketusdata", "getstockmarketusmindata",
600
+ )
601
+ endpoint_lower = endpoint.lower()
602
+ is_known_parquet_ep = (
603
+ "parquet" in content_type.lower()
604
+ or "application/x-parquet" in content_type.lower()
605
+ or any(kw in endpoint_lower for kw in _parquet_ep_kws)
606
+ )
607
+
608
+ # 对于已知 Parquet 接口,流式写入临时文件以避免全量数据驻留内存
609
+ _streamed_tmp_path = None
610
+ _streamed_size_mb = 0.0
611
+ if is_known_parquet_ep:
612
+ import tempfile
613
+ import os
614
+ _tmp_fd, _streamed_tmp_path = tempfile.mkstemp(suffix='.parquet')
615
+ os.close(_tmp_fd)
616
+ try:
617
+ _total_written = self._stream_to_temp_file(
618
+ response, content_encoding, _streamed_tmp_path
619
+ )
620
+ except Exception as _e:
621
+ try:
622
+ os.remove(_streamed_tmp_path)
623
+ except Exception:
624
+ pass
625
+ raise ServiceError(f"流式写入临时文件失败: {_e}") from _e
626
+ # 验证文件确实是 Parquet 格式(检查 "PAR1" 魔数)
627
+ _is_valid_parquet = False
628
+ if _total_written >= 4:
629
+ with open(_streamed_tmp_path, 'rb') as _f:
630
+ _is_valid_parquet = _f.read(4) == b"PAR1"
631
+ if _is_valid_parquet:
632
+ _streamed_size_mb = _total_written / (1024 * 1024)
633
+ content = None
634
+ else:
635
+ # 非 Parquet(可能是 JSON 错误响应),回退到内存解析
636
+ with open(_streamed_tmp_path, 'rb') as _f:
637
+ content = _f.read()
638
+ try:
639
+ os.remove(_streamed_tmp_path)
640
+ except Exception:
641
+ pass
642
+ _streamed_tmp_path = None
643
+ else:
644
+ content = response.read()
645
+ content = self._decompress_content(content, content_encoding)
646
+
647
+ # 非流式路径:通过文件魔数检测是否为 Parquet
513
648
  is_parquet_by_content = False
514
- if len(content) >= 4:
515
- # 检查文件魔数 "PAR1"
649
+ if content is not None and len(content) >= 4:
516
650
  is_parquet_by_content = content[:4] == b"PAR1"
517
651
 
518
- is_parquet = (
519
- "parquet" in content_type.lower() or
520
- "application/x-parquet" in content_type.lower() or
521
- "getmultimarketmindata" in endpoint.lower() or
522
- "getfuturetickdata" in endpoint.lower() or
523
- "getfinancialstatementdata" in endpoint.lower() or
524
- "getfinancialstatementdailydata" in endpoint.lower() or
525
- is_parquet_by_content
526
- )
652
+ is_parquet = (_streamed_tmp_path is not None) or is_parquet_by_content
527
653
 
528
654
  # 如果是 Parquet 格式,使用 DuckDB 读取
529
655
  if is_parquet:
530
656
  if not HAS_DUCKDB or duckdb is None:
657
+ if _streamed_tmp_path:
658
+ try:
659
+ os.remove(_streamed_tmp_path)
660
+ except Exception:
661
+ pass
531
662
  raise ServiceError(
532
663
  "响应是 Parquet 格式,但 DuckDB 库未安装。"
533
664
  "请安装: pip install duckdb"
534
665
  )
535
666
 
536
- parquet_start = time.time()
537
667
  global _active_parquet_reads
538
668
  try:
539
- # 将字节内容写入临时文件(DuckDB 需要文件路径)
540
669
  import tempfile
541
- with tempfile.NamedTemporaryFile(delete=False, suffix='.parquet') as tmp_file:
542
- tmp_file.write(content)
543
- tmp_file_path = tmp_file.name
544
670
 
545
- file_size_mb = len(content) / 1024 / 1024
671
+ if _streamed_tmp_path is not None:
672
+ tmp_file_path = _streamed_tmp_path
673
+ file_size_mb = _streamed_size_mb
674
+ else:
675
+ with tempfile.NamedTemporaryFile(delete=False, suffix='.parquet') as tmp_file:
676
+ tmp_file.write(content)
677
+ tmp_file_path = tmp_file.name
678
+ file_size_mb = len(content) / 1024 / 1024
679
+ del content
546
680
 
547
- # 获取信号量,限制最大并发HTTP请求数
548
681
  _parquet_read_semaphore.acquire()
549
682
 
550
- max_concurrent = _get_max_concurrent_parquet_reads()
551
683
  with _parquet_read_lock:
552
684
  _active_parquet_reads += 1
553
685
  current_concurrency = _active_parquet_reads
@@ -556,7 +688,6 @@ class HTTPClient:
556
688
  # 使用 DuckDB 读取 Parquet 文件
557
689
  conn = duckdb.connect()
558
690
  try:
559
- import os
560
691
  try:
561
692
  import psutil
562
693
  HAS_PSUTIL = True
@@ -575,109 +706,69 @@ class HTTPClient:
575
706
  available_memory_gb = 4.0
576
707
  total_memory_gb = 8.0
577
708
 
709
+ cpu_count = os.cpu_count() or 4
578
710
  effective_concurrency = max(current_concurrency, 1)
579
- # 更保守的内存分配:使用更小的百分比,并保留安全边距
580
- # 当可用内存较低时,使用更小的百分比
581
- if available_memory_gb < 4:
582
- memory_percentage = 0.5 # 只使用50%的可用内存
583
- elif available_memory_gb < 8:
584
- memory_percentage = 0.6 # 使用60%的可用内存
711
+
712
+ # Parquet 展开后通常是文件大小的 5~15 倍,排序还需要额外临时空间
713
+ estimated_expanded_gb = file_size_mb * 10 / 1024
714
+ estimated_need_gb = max(estimated_expanded_gb * 2.5, 1.0)
715
+
716
+ # 根据总内存渐进式分配 DuckDB 内存预算:
717
+ # ≤8GB → 20%(配合 temp_directory 溢写磁盘防 OOM)
718
+ # 8~16GB → 20%~40% 线性过渡(平衡速度与安全)
719
+ # >16GB → 积极分配(充分利用内存提升速度)
720
+ if total_memory_gb <= 8:
721
+ per_conn_budget_gb = total_memory_gb * 0.20 / effective_concurrency
722
+ elif total_memory_gb <= 16:
723
+ _ratio = 0.20 + (total_memory_gb - 8) / 8 * 0.20
724
+ per_conn_budget_gb = total_memory_gb * _ratio / effective_concurrency
585
725
  else:
586
- memory_percentage = 0.7 # 使用70%的可用内存
726
+ per_conn_budget_gb = max(
727
+ total_memory_gb * 0.5,
728
+ available_memory_gb * 0.9,
729
+ ) / effective_concurrency
730
+ cap_ratio = float(os.environ.get("PANDA_DATA_DUCKDB_MEMORY_CAP_RATIO", "0.65"))
731
+ per_conn_budget_gb = min(
732
+ per_conn_budget_gb,
733
+ total_memory_gb * cap_ratio / effective_concurrency,
734
+ )
735
+ memory_limit_gb = min(estimated_need_gb, per_conn_budget_gb)
736
+ memory_limit_gb = max(memory_limit_gb, 0.5)
737
+
738
+ env_mem = os.environ.get("PANDA_DATA_DUCKDB_MEMORY_GB", "").strip()
739
+ if env_mem:
740
+ try:
741
+ memory_limit_gb = max(0.5, float(env_mem))
742
+ except ValueError:
743
+ pass
587
744
 
588
- per_thread_memory_gb = (available_memory_gb / effective_concurrency) * memory_percentage
589
- max_per_thread_memory_gb = 6.0
590
- cpu_count = os.cpu_count() or 4
591
- max_threads_by_memory = max(1, int(available_memory_gb / 8))
592
- max_possible_threads = min(cpu_count, max_threads_by_memory)
593
-
594
- # 根据可用内存调整线程数和内存限制
595
- if file_size_mb > 1000:
596
- base_threads = 2
597
- if available_memory_gb > 32:
598
- max_threads_for_file = min(max_possible_threads, 4)
599
- elif available_memory_gb > 8:
600
- max_threads_for_file = min(max_possible_threads, 2)
601
- else:
602
- max_threads_for_file = 1 # 内存不足时只使用1个线程
603
- num_threads = max(1, min(cpu_count, max_threads_for_file))
604
- # 对于大文件,使用更保守的内存限制
605
- memory_limit_gb = min(per_thread_memory_gb, 1.5, max_per_thread_memory_gb)
606
- memory_limit = f"{max(memory_limit_gb, 0.5):.1f}GB"
607
- elif file_size_mb > 500:
608
- base_threads = 4
609
- if available_memory_gb > 32:
610
- max_threads_for_file = min(max_possible_threads, 8)
611
- elif available_memory_gb > 16:
612
- max_threads_for_file = min(max_possible_threads, 6)
613
- elif available_memory_gb > 8:
614
- max_threads_for_file = min(max_possible_threads, 4)
615
- else:
616
- max_threads_for_file = 2 # 内存不足时减少线程
617
- num_threads = max(1, min(cpu_count, max_threads_for_file))
618
- memory_limit_gb = min(per_thread_memory_gb, 2.0, max_per_thread_memory_gb)
619
- memory_limit = f"{max(memory_limit_gb, 0.5):.1f}GB"
620
- elif file_size_mb > 200:
621
- base_threads = 8
622
- if available_memory_gb > 64:
623
- max_threads_for_file = min(max_possible_threads, 16)
624
- elif available_memory_gb > 32:
625
- max_threads_for_file = min(max_possible_threads, 12)
626
- elif available_memory_gb > 8:
627
- max_threads_for_file = min(max_possible_threads, 8)
628
- else:
629
- max_threads_for_file = 4 # 内存不足时减少线程
630
- num_threads = max(1, min(cpu_count, max_threads_for_file))
631
- memory_limit_gb = min(per_thread_memory_gb, 2.0, max_per_thread_memory_gb)
632
- memory_limit = f"{max(memory_limit_gb, 0.5):.1f}GB"
745
+ memory_limit = f"{memory_limit_gb:.1f}GB"
746
+
747
+ # 线程:内存过小时少开线程;避免 available=4 时 int(4/4)=1 单核跑满
748
+ env_thr = os.environ.get("PANDA_DATA_DUCKDB_THREADS", "").strip()
749
+ if env_thr:
750
+ try:
751
+ num_threads = max(1, min(cpu_count, int(env_thr)))
752
+ except ValueError:
753
+ num_threads = min(
754
+ cpu_count,
755
+ max(2, min(16, int(memory_limit_gb // 1.2))),
756
+ )
633
757
  else:
634
- # 小文件(<=200MB):使用更少的线程和内存
635
- if available_memory_gb > 128:
636
- max_threads_for_file = min(max_possible_threads, cpu_count)
637
- base_threads = 16
638
- elif available_memory_gb > 64:
639
- max_threads_for_file = min(max_possible_threads, min(cpu_count, 24))
640
- base_threads = 12
641
- elif available_memory_gb > 32:
642
- max_threads_for_file = min(max_possible_threads, 16)
643
- base_threads = 8
644
- elif available_memory_gb > 8:
645
- max_threads_for_file = min(max_possible_threads, 8)
646
- base_threads = 4
647
- else:
648
- # 低内存系统:但考虑到数据量可能很大(行数多),使用4个线程
649
- # 如果内存真的紧张,会在后续的内存限制中体现
650
- max_threads_for_file = min(max_possible_threads, 4)
651
- base_threads = 4
652
- num_threads = max(1, min(cpu_count, max_threads_for_file))
653
- # 小文件不需要太多内存,限制在1GB以内
654
- memory_limit_gb = min(per_thread_memory_gb, 1.0, max_per_thread_memory_gb)
655
- memory_limit = f"{max(memory_limit_gb, 0.5):.1f}GB"
656
-
657
- # 根据文件大小进一步调整内存限制(小文件不需要太多内存)
658
- if file_size_mb <= 100:
659
- # 对于小于100MB的文件,限制内存使用在512MB-1GB之间
660
- memory_limit_gb = min(memory_limit_gb, 1.0)
661
- elif file_size_mb <= 200:
662
- # 对于100-200MB的文件,限制在1.5GB以内
663
- memory_limit_gb = min(memory_limit_gb, 1.5)
664
-
665
- # 确保内存限制不超过可用内存的90%(保留安全边距)
666
- max_safe_memory_gb = available_memory_gb * 0.9
667
- memory_limit_gb = min(memory_limit_gb, max_safe_memory_gb)
668
- memory_limit = f"{max(memory_limit_gb, 0.5):.1f}GB"
669
-
670
- num_threads = max(num_threads, 1)
758
+ num_threads = min(
759
+ cpu_count,
760
+ max(2, min(16, int(memory_limit_gb // 1.2))),
761
+ )
762
+ num_threads = max(1, num_threads)
763
+
671
764
  conn.execute(f"SET threads TO {num_threads}")
672
765
  conn.execute(f"SET memory_limit='{memory_limit}'")
673
- if file_size_mb <= 500:
674
- conn.execute("SET enable_object_cache TO true")
675
- else:
676
- conn.execute("SET enable_object_cache TO false")
766
+ conn.execute("SET enable_object_cache TO true")
677
767
  conn.execute("SET enable_progress_bar TO false")
678
768
  conn.execute("SET preserve_insertion_order TO false")
769
+ _td = tempfile.gettempdir()
770
+ conn.execute(f"SET temp_directory='{_td}'")
679
771
  except Exception as config_error:
680
- # 回退配置:使用更保守的设置以避免内存不足
681
772
  num_threads = 1
682
773
  memory_limit = "512MB"
683
774
  try:
@@ -686,93 +777,95 @@ class HTTPClient:
686
777
  conn.execute("SET enable_object_cache TO false")
687
778
  conn.execute("SET enable_progress_bar TO false")
688
779
  conn.execute("SET preserve_insertion_order TO false")
780
+ try:
781
+ _td = tempfile.gettempdir()
782
+ conn.execute(f"SET temp_directory='{_td}'")
783
+ except Exception:
784
+ pass
689
785
  except Exception:
690
786
  pass
691
787
 
692
- # 读取列信息
693
- available_columns = []
694
- try:
695
- sample_result = conn.execute(f"SELECT * FROM read_parquet('{tmp_file_path}') LIMIT 1")
696
- sample_df = sample_result.df()
697
- available_columns = list(sample_df.columns)
698
- except Exception:
699
- pass
788
+ # 读取列信息(LIMIT 1 失败时用 pyarrow schema 兜底,避免无 ORDER BY 时保持 Parquet 物理顺序)
789
+ available_columns = self._infer_parquet_columns(conn, tmp_file_path)
700
790
 
701
791
  # 构建查询语句
702
- # 判断 endpoint 类型并设置排序和列选择逻辑
792
+ # 判断 endpoint 类型并设置排序和列选择逻辑(列名大小写不敏感,ORDER BY 使用实际列名)
703
793
  if "getfuturetickdata" in endpoint.lower():
704
794
  # 特殊处理:按照 symbol ASC, timestamp ASC 排序,并将 symbol, date, timestamp 放在前三列
705
- has_symbol = 'symbol' in available_columns
706
- has_timestamp = 'timestamp' in available_columns
707
- has_date = 'date' in available_columns
795
+ sym_col = self._parquet_col_ci(available_columns, "symbol")
796
+ ts_col = self._parquet_col_ci(available_columns, "timestamp")
797
+ date_col = self._parquet_col_ci(available_columns, "date")
708
798
 
709
799
  order_by_clause = ""
710
- if has_symbol and has_timestamp:
711
- order_by_clause = " ORDER BY symbol ASC, timestamp ASC"
712
- elif has_symbol:
713
- order_by_clause = " ORDER BY symbol ASC"
714
- elif has_timestamp:
715
- order_by_clause = " ORDER BY timestamp ASC"
716
-
717
- if has_symbol and has_date and has_timestamp:
718
- other_columns = [col for col in available_columns if
719
- col not in ['symbol', 'date', 'timestamp']]
720
- select_columns = ['symbol', 'date', 'timestamp'] + other_columns
800
+ if sym_col and ts_col:
801
+ order_by_clause = f' ORDER BY "{sym_col}" ASC, "{ts_col}" ASC'
802
+ elif sym_col:
803
+ order_by_clause = f' ORDER BY "{sym_col}" ASC'
804
+ elif ts_col:
805
+ order_by_clause = f' ORDER BY "{ts_col}" ASC'
806
+
807
+ if sym_col and date_col and ts_col:
808
+ other_columns = [
809
+ col
810
+ for col in available_columns
811
+ if col not in (sym_col, date_col, ts_col)
812
+ ]
813
+ select_columns = [sym_col, date_col, ts_col] + other_columns
721
814
  select_clause = ", ".join([f'"{col}"' for col in select_columns])
722
- elif has_symbol and has_date:
723
- other_columns = [col for col in available_columns if col not in ['symbol', 'date']]
724
- select_columns = ['symbol', 'date'] + other_columns
815
+ elif sym_col and date_col:
816
+ other_columns = [
817
+ col for col in available_columns if col not in (sym_col, date_col)
818
+ ]
819
+ select_columns = [sym_col, date_col] + other_columns
725
820
  select_clause = ", ".join([f'"{col}"' for col in select_columns])
726
- elif has_symbol:
727
- other_columns = [col for col in available_columns if col != 'symbol']
728
- select_columns = ['symbol'] + other_columns
821
+ elif sym_col:
822
+ other_columns = [col for col in available_columns if col != sym_col]
823
+ select_columns = [sym_col] + other_columns
729
824
  select_clause = ", ".join([f'"{col}"' for col in select_columns])
730
825
  else:
731
826
  select_clause = "*"
732
827
  else:
733
- # 默认处理:保持原来的逻辑
734
- has_symbol = 'symbol' in available_columns
735
- has_date = 'date' in available_columns
736
- has_minute = 'minute' in available_columns
828
+ # 默认:日频/分钟等行情——优先 symbol,再 date / minute 降序
829
+ sym_col = self._parquet_col_ci(available_columns, "symbol")
830
+ date_col = self._parquet_col_ci(available_columns, "date")
831
+ min_col = self._parquet_col_ci(available_columns, "minute")
737
832
 
738
833
  order_by_clause = ""
739
- if has_symbol and has_date and has_minute:
740
- order_by_clause = " ORDER BY symbol ASC, date DESC, minute DESC"
741
- elif has_symbol and has_date:
742
- order_by_clause = " ORDER BY symbol ASC, date DESC"
743
- elif has_symbol:
744
- order_by_clause = " ORDER BY symbol ASC"
745
- elif has_date and has_minute:
746
- order_by_clause = " ORDER BY date DESC, minute DESC"
747
- elif has_date:
748
- order_by_clause = " ORDER BY date DESC"
749
- elif has_minute:
750
- order_by_clause = " ORDER BY minute DESC"
751
-
752
- if has_symbol and has_date:
753
- other_columns = [col for col in available_columns if col not in ['symbol', 'date']]
754
- select_columns = ['symbol', 'date'] + other_columns
834
+ if sym_col and date_col and min_col:
835
+ order_by_clause = (
836
+ f' ORDER BY "{sym_col}" ASC, "{date_col}" DESC, "{min_col}" DESC'
837
+ )
838
+ elif sym_col and date_col:
839
+ order_by_clause = f' ORDER BY "{sym_col}" ASC, "{date_col}" DESC'
840
+ elif sym_col:
841
+ order_by_clause = f' ORDER BY "{sym_col}" ASC'
842
+ elif date_col and min_col:
843
+ order_by_clause = f' ORDER BY "{date_col}" DESC, "{min_col}" DESC'
844
+ elif date_col:
845
+ order_by_clause = f' ORDER BY "{date_col}" DESC'
846
+ elif min_col:
847
+ order_by_clause = f' ORDER BY "{min_col}" DESC'
848
+
849
+ if sym_col and date_col:
850
+ other_columns = [
851
+ col for col in available_columns if col not in (sym_col, date_col)
852
+ ]
853
+ select_columns = [sym_col, date_col] + other_columns
755
854
  select_clause = ", ".join([f'"{col}"' for col in select_columns])
756
- elif has_symbol:
757
- other_columns = [col for col in available_columns if col != 'symbol']
758
- select_columns = ['symbol'] + other_columns
855
+ elif sym_col:
856
+ other_columns = [col for col in available_columns if col != sym_col]
857
+ select_columns = [sym_col] + other_columns
759
858
  select_clause = ", ".join([f'"{col}"' for col in select_columns])
760
- elif has_date:
761
- other_columns = [col for col in available_columns if col != 'date']
762
- select_columns = ['date'] + other_columns
859
+ elif date_col:
860
+ other_columns = [col for col in available_columns if col != date_col]
861
+ select_columns = [date_col] + other_columns
763
862
  select_clause = ", ".join([f'"{col}"' for col in select_columns])
764
863
  else:
765
864
  select_clause = "*"
766
865
 
767
866
  # 执行主查询
768
867
  query = f"SELECT {select_clause} FROM read_parquet('{tmp_file_path}'){order_by_clause}"
769
- result = conn.execute(query)
770
-
771
- # 转换为DataFrame(优先使用Arrow格式优化性能)
772
- use_arrow = False
773
- df = None
774
868
 
775
- # 检查是否可以使用Arrow格式
776
869
  try:
777
870
  import pyarrow as pa
778
871
  HAS_PYARROW = True
@@ -781,47 +874,22 @@ class HTTPClient:
781
874
  pa = None
782
875
 
783
876
  if HAS_PYARROW:
784
- # 方法1: 尝试使用Arrow格式(通常比直接df()快2-5倍)
877
+ result = conn.execute(query)
785
878
  try:
786
879
  arrow_result = result.arrow()
787
-
788
- # DuckDB的arrow()可能返回Table或RecordBatchReader
789
880
  if isinstance(arrow_result, pa.Table):
790
- # 如果是Table,直接转换
791
881
  df = arrow_result.to_pandas()
792
882
  elif isinstance(arrow_result, pa.RecordBatchReader):
793
- # 如果是RecordBatchReader,读取所有批次
794
- arrow_table = arrow_result.read_all()
795
- df = arrow_table.to_pandas()
796
- elif hasattr(arrow_result, 'to_pandas'):
797
- # 某些版本可能直接支持to_pandas
883
+ df = arrow_result.read_all().to_pandas()
884
+ elif hasattr(arrow_result, "to_pandas"):
798
885
  df = arrow_result.to_pandas()
799
- elif hasattr(arrow_result, 'read_all'):
800
- # 尝试使用read_all方法
801
- arrow_table = arrow_result.read_all()
802
- df = arrow_table.to_pandas()
803
886
  else:
804
- # 未知类型,尝试作为迭代器读取批次
805
- try:
806
- batches = list(arrow_result)
807
- if batches:
808
- arrow_table = pa.Table.from_batches(batches)
809
- df = arrow_table.to_pandas()
810
- else:
811
- raise ValueError("No batches in RecordBatchReader")
812
- except Exception as e:
813
- raise ValueError(
814
- f"Cannot convert arrow_result to pandas: {type(arrow_result)}, error: {e}")
815
-
816
- use_arrow = True
887
+ raise TypeError(f"unexpected arrow result type: {type(arrow_result)}")
817
888
  except Exception:
818
- # Arrow转换失败,需要重新执行查询
819
- # 重新执行查询(因为result已经被消耗)
820
- result_retry = conn.execute(query)
821
- df = result_retry.df()
889
+ result = conn.execute(query)
890
+ df = result.df()
822
891
  else:
823
- # 方法2: 直接使用df()(如果没有pyarrow)
824
- df = result.df()
892
+ df = conn.execute(query).df()
825
893
 
826
894
  # 确保df已正确赋值
827
895
  if df is None:
@@ -831,7 +899,6 @@ class HTTPClient:
831
899
 
832
900
  # 清理临时文件
833
901
  try:
834
- import os
835
902
  if os.path.exists(tmp_file_path):
836
903
  os.remove(tmp_file_path)
837
904
  except Exception as e:
@@ -1062,7 +1129,6 @@ class HTTPClient:
1062
1129
 
1063
1130
  for attempt in range(retries):
1064
1131
  try:
1065
- # 发送请求并等待响应
1066
1132
  response = self._opener.open(request, timeout=timeout_value)
1067
1133
  return self._parse_response(response, endpoint)
1068
1134
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: panda_data
3
- Version: 0.0.3
3
+ Version: 0.0.4
4
4
  Summary: PandaAI DataQuant
5
5
  Requires-Python: >=3.10
6
6
  Description-Content-Type: text/markdown
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "panda_data"
3
- version = "0.0.3"
3
+ version = "0.0.4"
4
4
  description = "PandaAI DataQuant"
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.10"
@@ -1,27 +0,0 @@
1
-
2
- import panda_data
3
- import time
4
-
5
- if __name__ == "__main__":
6
- # 1. pip install panda_data
7
- # 2. 账号为:86+官网注册的手机号(eg:8617777777777),密码与官网同步
8
- # 3. 接口文档见官方知识库
9
- panda_data.init_token(username="", password="")
10
-
11
-
12
- # 开始计时
13
- start_time = time.time()
14
-
15
-
16
- result = panda_data.get_market_data(
17
- symbol="000001.SZ",
18
- start_date="20200101",
19
- end_date="20250101",
20
- # fields = ["open"],
21
- type="stock"
22
- )
23
- print(result)
24
- end_time = time.time()
25
- print(f"耗时:{end_time - start_time}")
26
-
27
- print(result)
File without changes
File without changes