lidb 1.0.10__tar.gz → 1.1.0__tar.gz
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
Potentially problematic release.
This version of lidb might be problematic. Click here for more details.
- {lidb-1.0.10 → lidb-1.1.0}/PKG-INFO +1 -1
- {lidb-1.0.10 → lidb-1.1.0}/lidb/__init__.py +1 -1
- {lidb-1.0.10 → lidb-1.1.0}/lidb/parse.py +19 -11
- {lidb-1.0.10 → lidb-1.1.0}/lidb.egg-info/PKG-INFO +1 -1
- {lidb-1.0.10 → lidb-1.1.0}/pyproject.toml +1 -1
- {lidb-1.0.10 → lidb-1.1.0}/tests/test_parse.py +1 -1
- {lidb-1.0.10 → lidb-1.1.0}/README.md +0 -0
- {lidb-1.0.10 → lidb-1.1.0}/lidb/database.py +0 -0
- {lidb-1.0.10 → lidb-1.1.0}/lidb/init.py +0 -0
- {lidb-1.0.10 → lidb-1.1.0}/lidb.egg-info/SOURCES.txt +0 -0
- {lidb-1.0.10 → lidb-1.1.0}/lidb.egg-info/dependency_links.txt +0 -0
- {lidb-1.0.10 → lidb-1.1.0}/lidb.egg-info/requires.txt +0 -0
- {lidb-1.0.10 → lidb-1.1.0}/lidb.egg-info/top_level.txt +0 -0
- {lidb-1.0.10 → lidb-1.1.0}/setup.cfg +0 -0
- {lidb-1.0.10 → lidb-1.1.0}/tests/test_conf.py +0 -0
|
@@ -11,7 +11,10 @@ import re
|
|
|
11
11
|
from pathlib import Path
|
|
12
12
|
from typing import Dict, Set
|
|
13
13
|
from urllib.parse import unquote
|
|
14
|
+
from collections import defaultdict
|
|
14
15
|
|
|
16
|
+
import polars
|
|
17
|
+
import polars as pl
|
|
15
18
|
import sqlparse
|
|
16
19
|
|
|
17
20
|
|
|
@@ -69,7 +72,7 @@ def extract_table_names_from_sql(sql_query):
|
|
|
69
72
|
return table_names
|
|
70
73
|
|
|
71
74
|
|
|
72
|
-
def parse_hive_partition_structure(root_path: Path | str, file_pattern: str = "*.parquet") ->
|
|
75
|
+
def parse_hive_partition_structure(root_path: Path | str, file_pattern: str = "*.parquet") -> polars.DataFrame:
|
|
73
76
|
"""
|
|
74
77
|
通用Hive分区结构解析器
|
|
75
78
|
|
|
@@ -78,25 +81,30 @@ def parse_hive_partition_structure(root_path: Path | str, file_pattern: str = "*
|
|
|
78
81
|
file_pattern: 文件匹配模式 (默认 "*.parquet")
|
|
79
82
|
|
|
80
83
|
Returns:
|
|
81
|
-
|
|
84
|
+
polars.DataFrame
|
|
82
85
|
"""
|
|
83
86
|
if isinstance(root_path, str):
|
|
84
87
|
root_path = Path(root_path)
|
|
85
|
-
partitions: Dict[str, Set[str]] = {}
|
|
86
88
|
|
|
87
|
-
|
|
89
|
+
partition_combinations = set()
|
|
90
|
+
|
|
88
91
|
for file_path in root_path.rglob(file_pattern):
|
|
89
|
-
# 获取相对于根路径的路径
|
|
90
92
|
relative_path = file_path.relative_to(root_path)
|
|
91
93
|
|
|
92
|
-
#
|
|
93
|
-
|
|
94
|
+
# 收集分区信息
|
|
95
|
+
partition_dict = {}
|
|
96
|
+
for part in relative_path.parts[:-1]: # 排除文件名
|
|
94
97
|
if '=' in part:
|
|
95
98
|
key, value = part.split('=', 1)
|
|
96
99
|
value = unquote(value)
|
|
97
100
|
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
+
partition_dict[key] = value
|
|
102
|
+
|
|
103
|
+
# 记录分区组合
|
|
104
|
+
combination = tuple(sorted(partition_dict.items()))
|
|
105
|
+
partition_combinations.add(combination)
|
|
106
|
+
|
|
107
|
+
# 转换为普通dict
|
|
108
|
+
res = [dict(combo) for combo in partition_combinations]
|
|
101
109
|
|
|
102
|
-
return
|
|
110
|
+
return pl.DataFrame(res)
|
|
@@ -12,7 +12,7 @@ def test_parse_hive_partition_structure():
|
|
|
12
12
|
root_path = lidb.tb_path("mc")
|
|
13
13
|
file_pattern = "*.parquet"
|
|
14
14
|
result = lidb.parse.parse_hive_partition_structure(root_path, file_pattern)
|
|
15
|
-
logger.info(result)
|
|
15
|
+
logger.info(result["freq"].unique())
|
|
16
16
|
|
|
17
17
|
if __name__ == '__main__':
|
|
18
18
|
test_parse_hive_partition_structure()
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|