lidb 1.0.10__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of lidb might be problematic. Click here for more details.

lidb/__init__.py CHANGED
@@ -19,4 +19,4 @@ from .database import (
19
19
 
20
20
  from .parse import parse_hive_partition_structure
21
21
 
22
- __version__ = "1.0.10"
22
+ __version__ = "1.1.0"
lidb/parse.py CHANGED
@@ -11,7 +11,10 @@ import re
11
11
  from pathlib import Path
12
12
  from typing import Dict, Set
13
13
  from urllib.parse import unquote
14
+ from collections import defaultdict
14
15
 
16
+ import polars
17
+ import polars as pl
15
18
  import sqlparse
16
19
 
17
20
 
@@ -69,7 +72,7 @@ def extract_table_names_from_sql(sql_query):
69
72
  return table_names
70
73
 
71
74
 
72
- def parse_hive_partition_structure(root_path: Path | str, file_pattern: str = "*.parquet") -> Dict[str, Set[str]]:
75
+ def parse_hive_partition_structure(root_path: Path | str, file_pattern: str = "*.parquet") -> polars.DataFrame:
73
76
  """
74
77
  通用Hive分区结构解析器
75
78
 
@@ -78,25 +81,30 @@ def parse_hive_partition_structure(root_path: Path | str, file_pattern: str = "*
78
81
  file_pattern: 文件匹配模式 (默认 "*.parquet")
79
82
 
80
83
  Returns:
81
- Dict[str, Set[str]]: 分区键到值集合的映射
84
+ polars.DataFrame
82
85
  """
83
86
  if isinstance(root_path, str):
84
87
  root_path = Path(root_path)
85
- partitions: Dict[str, Set[str]] = {}
86
88
 
87
- # 查找所有匹配的文件
89
+ partition_combinations = set()
90
+
88
91
  for file_path in root_path.rglob(file_pattern):
89
- # 获取相对于根路径的路径
90
92
  relative_path = file_path.relative_to(root_path)
91
93
 
92
- # 解析路径中的分区信息
93
- for part in relative_path.parts[:-1]: # 排除文件名部分
94
+ # 收集分区信息
95
+ partition_dict = {}
96
+ for part in relative_path.parts[:-1]: # 排除文件名
94
97
  if '=' in part:
95
98
  key, value = part.split('=', 1)
96
99
  value = unquote(value)
97
100
 
98
- if key not in partitions:
99
- partitions[key] = set()
100
- partitions[key].add(value)
101
+ partition_dict[key] = value
102
+
103
+ # 记录分区组合
104
+ combination = tuple(sorted(partition_dict.items()))
105
+ partition_combinations.add(combination)
106
+
107
+ # 转换为普通dict
108
+ res = [dict(combo) for combo in partition_combinations]
101
109
 
102
- return partitions
110
+ return pl.DataFrame(res)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: lidb
3
- Version: 1.0.10
3
+ Version: 1.1.0
4
4
  Requires-Python: >=3.12
5
5
  Description-Content-Type: text/markdown
6
6
  Requires-Dist: dynaconf>=3.2.11
@@ -0,0 +1,8 @@
1
+ lidb/__init__.py,sha256=dCfYBd1A4-lAcGv-9kVjLxmWva2DnH8ev0s-3nJk3vQ,313
2
+ lidb/database.py,sha256=U1h80jgmkRfgQW8sdhb7B3ISvQJUvPefLurWMsbzqa0,5372
3
+ lidb/init.py,sha256=jLHpeL5mIM4YjdMYAndZlDilMiKXJMr_51Ke3ZSJWCM,1170
4
+ lidb/parse.py,sha256=5lrYtT_XyRfZYX_AwfmhDYZywPBkAbzIVwJk_7l5Nrw,3470
5
+ lidb-1.1.0.dist-info/METADATA,sha256=jkGViIS9bnzJtVGF0vzgJm4W4_MaMIb6h00rYuyezN8,303
6
+ lidb-1.1.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
7
+ lidb-1.1.0.dist-info/top_level.txt,sha256=NgXJNwt6ld6oLXtW1vOPaEh-VO5R0JEX_KmGIJR4ueE,5
8
+ lidb-1.1.0.dist-info/RECORD,,
@@ -1,8 +0,0 @@
1
- lidb/__init__.py,sha256=MG3U8f-IJ6X1EkgdHjDNqUpbJsdqX3C4mfSze0V_9xY,314
2
- lidb/database.py,sha256=U1h80jgmkRfgQW8sdhb7B3ISvQJUvPefLurWMsbzqa0,5372
3
- lidb/init.py,sha256=jLHpeL5mIM4YjdMYAndZlDilMiKXJMr_51Ke3ZSJWCM,1170
4
- lidb/parse.py,sha256=8NpWRjoVc45QDgNUoEcsAQQ-AcciBjpstar5kbSc_Lc,3364
5
- lidb-1.0.10.dist-info/METADATA,sha256=-5CuLQhl2nsgANTWYG25IsWpJlQ9tjM00NasRe1rs04,304
6
- lidb-1.0.10.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
7
- lidb-1.0.10.dist-info/top_level.txt,sha256=NgXJNwt6ld6oLXtW1vOPaEh-VO5R0JEX_KmGIJR4ueE,5
8
- lidb-1.0.10.dist-info/RECORD,,
File without changes