lidb 1.0.9__tar.gz → 1.0.10__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of lidb might be problematic. Click here for more details.

@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: lidb
3
- Version: 1.0.9
3
+ Version: 1.0.10
4
4
  Requires-Python: >=3.12
5
5
  Description-Content-Type: text/markdown
6
6
  Requires-Dist: dynaconf>=3.2.11
@@ -17,4 +17,6 @@ from .database import (
17
17
  read_ck,
18
18
  )
19
19
 
20
- __version__ = "1.0.9"
20
+ from .parse import parse_hive_partition_structure
21
+
22
+ __version__ = "1.0.10"
@@ -7,19 +7,26 @@ Created on 2024/11/6 下午7:25
7
7
  Email: yundi.xxii@outlook.com
8
8
  ---------------------------------------------
9
9
  """
10
- import sqlparse
11
10
  import re
11
+ from pathlib import Path
12
+ from typing import Dict, Set
13
+ from urllib.parse import unquote
14
+
15
+ import sqlparse
16
+
12
17
 
13
18
  def format_sql(sql_content):
14
19
  """Normalize a SQL statement (re-indent) and strip its comments; input and output are both strings."""
15
20
  parse_str = sqlparse.format(sql_content, reindent=True, strip_comments=True)
16
21
  return parse_str
17
22
 
23
+
18
24
  def extract_temp_tables(with_clause):
19
25
  """Extract temporary table names (each `<name> as (`) from a WITH clause; returns a list."""
20
26
  temp_tables = re.findall(r'\b(\w+)\s*as\s*\(', with_clause, re.IGNORECASE)
21
27
  return temp_tables
22
28
 
29
+
23
30
  def extract_table_names_from_sql(sql_query):
24
31
  """从sql中提取对应的表名称,输出为列表"""
25
32
  table_names = set()
@@ -34,7 +41,7 @@ def extract_table_names_from_sql(sql_query):
34
41
  # 遍历解析后的语句块
35
42
  for statement in parsed:
36
43
  # 转换为字符串
37
- statement_str = str(statement)# .lower()
44
+ statement_str = str(statement) # .lower()
38
45
 
39
46
  # 将字符串中的特殊语法置空
40
47
  statement_str = re.sub(r'(substring|extract)\s*\(((.|\s)*?)\)', '', statement_str)
@@ -62,4 +69,34 @@ def extract_table_names_from_sql(sql_query):
62
69
  return table_names
63
70
 
64
71
 
72
+ def parse_hive_partition_structure(root_path: Path | str, file_pattern: str = "*.parquet") -> Dict[str, Set[str]]:
73
+ """
74
+ Generic parser for Hive-style partition directory structures (key=value path components).
75
+
76
+ Args:
77
+ root_path: Root path to scan recursively (e.g. /data)
78
+ file_pattern: Glob pattern used to match files (default "*.parquet")
79
+
80
+ Returns:
81
+ Dict[str, Set[str]]: Mapping from each partition key to the set of values found for it
82
+ """
83
+ if isinstance(root_path, str):
84
+ root_path = Path(root_path)
85
+ partitions: Dict[str, Set[str]] = {}
86
+
87
+ # Find all matching files under the root, recursively
88
+ for file_path in root_path.rglob(file_pattern):
89
+ # Get the path relative to the root path
90
+ relative_path = file_path.relative_to(root_path)
91
+
92
+ # Parse partition information from the directory components of the path
93
+ for part in relative_path.parts[:-1]: # exclude the file name component
94
+ if '=' in part:
95
+ key, value = part.split('=', 1)  # split on the first '=' only, so the value may itself contain '='
96
+ value = unquote(value)  # decode %xx escapes in the partition value
97
+
98
+ if key not in partitions:
99
+ partitions[key] = set()
100
+ partitions[key].add(value)
65
101
 
102
+ return partitions
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: lidb
3
- Version: 1.0.9
3
+ Version: 1.0.10
4
4
  Requires-Python: >=3.12
5
5
  Description-Content-Type: text/markdown
6
6
  Requires-Dist: dynaconf>=3.2.11
@@ -9,4 +9,5 @@ lidb.egg-info/SOURCES.txt
9
9
  lidb.egg-info/dependency_links.txt
10
10
  lidb.egg-info/requires.txt
11
11
  lidb.egg-info/top_level.txt
12
- tests/test_conf.py
12
+ tests/test_conf.py
13
+ tests/test_parse.py
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "lidb"
3
- version = "1.0.9"
3
+ version = "1.0.10"
4
4
  description = ""
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.12"
@@ -0,0 +1,18 @@
1
+ # Copyright (c) ZhangYundi.
2
+ # Licensed under the MIT License.
3
+ # Created on 2025/8/18 11:16
4
+ # Description:
5
+
6
+ import lidb
7
+ import logair
8
+
9
+ logger = logair.get_logger("lidb.test")
10
+
11
+ def test_parse_hive_partition_structure():
12
+ root_path = lidb.tb_path("mc")  # presumably resolves the storage path of table "mc" — TODO confirm
13
+ file_pattern = "*.parquet"
14
+ result = lidb.parse.parse_hive_partition_structure(root_path, file_pattern)
15
+ logger.info(result)  # NOTE(review): the result is only logged; nothing is asserted
16
+
17
+ if __name__ == '__main__':
18
+ test_parse_hive_partition_structure()  # allows running this check directly as a script
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes