sqlh 0.3.1__tar.gz → 0.3.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sqlh-0.3.1 → sqlh-0.3.2}/PKG-INFO +1 -1
- {sqlh-0.3.1 → sqlh-0.3.2}/pyproject.toml +1 -1
- {sqlh-0.3.1 → sqlh-0.3.2}/sqlh/__init__.py +1 -1
- {sqlh-0.3.1 → sqlh-0.3.2}/sqlh/core/helper.py +1 -153
- {sqlh-0.3.1 → sqlh-0.3.2}/sqlh/tests/test_utils.py +2 -2
- {sqlh-0.3.1 → sqlh-0.3.2}/sqlh/utils.py +2 -2
- {sqlh-0.3.1 → sqlh-0.3.2}/README.md +0 -0
- {sqlh-0.3.1 → sqlh-0.3.2}/sqlh/.DS_Store +0 -0
- {sqlh-0.3.1 → sqlh-0.3.2}/sqlh/cli.py +0 -0
- {sqlh-0.3.1 → sqlh-0.3.2}/sqlh/core/graph.py +0 -0
- {sqlh-0.3.1 → sqlh-0.3.2}/sqlh/core/keywords.py +0 -0
- {sqlh-0.3.1 → sqlh-0.3.2}/sqlh/static/dagre_template.html +0 -0
- {sqlh-0.3.1 → sqlh-0.3.2}/sqlh/static/mermaid_template.html +0 -0
- {sqlh-0.3.1 → sqlh-0.3.2}/sqlh/tests/test_cli.py +0 -0
- {sqlh-0.3.1 → sqlh-0.3.2}/sqlh/tests/test_graph.py +0 -0
- {sqlh-0.3.1 → sqlh-0.3.2}/sqlh/tests/test_import.py +0 -0
- {sqlh-0.3.1 → sqlh-0.3.2}/sqlh/tests/test_sqlhelper.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: sqlh
|
|
3
|
-
Version: 0.3.1
|
|
3
|
+
Version: 0.3.2
|
|
4
4
|
Summary: A lightweight SQL lineage analysis library for tracking table dependencies in data pipelines
|
|
5
5
|
Keywords: sql,lineage,data-pipeline,dag,dependency,database,etl,data-engineering
|
|
6
6
|
Requires-Python: >=3.10
|
|
@@ -444,7 +444,7 @@ def get_source_target_tables(sql: str) -> dict[str, list[str]] | None:
|
|
|
444
444
|
ParseException: If SQL contains multiple statements
|
|
445
445
|
|
|
446
446
|
Note:
|
|
447
|
-
TODO: 不能识别join后面的 [hint] table_name
|
|
447
|
+
TODO: 不能识别join后面的 [hint] table_name
|
|
448
448
|
{
|
|
449
449
|
"source_tables": [(t1, 1), (t2, 2), (t3, 3)],
|
|
450
450
|
"target_tables": [(t4, 1)]
|
|
@@ -563,158 +563,6 @@ def get_source_target_tables(sql: str) -> dict[str, list[str]] | None:
|
|
|
563
563
|
return
|
|
564
564
|
|
|
565
565
|
|
|
566
|
-
def get_source_target_tables_v2(sql: str) -> dict[str, list[str]] | None:
|
|
567
|
-
"""
|
|
568
|
-
Extract source and target tables from a single SQL statement using a token-based approach.
|
|
569
|
-
changelog:
|
|
570
|
-
- 支持识别join后面的 [hint] table_name
|
|
571
|
-
"""
|
|
572
|
-
# 1. 预处理
|
|
573
|
-
clean_sql = trim_comment(sql).strip()
|
|
574
|
-
if not clean_sql:
|
|
575
|
-
return None
|
|
576
|
-
|
|
577
|
-
# 去除末尾分号
|
|
578
|
-
if clean_sql.endswith(";"):
|
|
579
|
-
clean_sql = clean_sql[:-1]
|
|
580
|
-
|
|
581
|
-
# 校验多语句
|
|
582
|
-
# 注意:这里直接传原始 sql 给 split_sql 可能更稳妥,或者传 clean_sql 取决于 split_sql 的实现
|
|
583
|
-
if len(split_sql(clean_sql)) > 1:
|
|
584
|
-
raise ParseException("sql脚本为多条SQL语句,需传入单条SQL语句.")
|
|
585
|
-
|
|
586
|
-
# 2. 分词 (使用正则优化性能)
|
|
587
|
-
# 匹配单词、括号、逗号。忽略空白字符。
|
|
588
|
-
# tokens = re.findall(r"[A-Za-z_][A-Za-z0-9_]*|[()]|,", clean_sql)
|
|
589
|
-
tokens = re.findall(r"\[[^\]]+\]|[A-Za-z_][A-Za-z0-9_]*|[()]|,", clean_sql)
|
|
590
|
-
|
|
591
|
-
if not tokens:
|
|
592
|
-
return None
|
|
593
|
-
|
|
594
|
-
# 3. 状态机变量
|
|
595
|
-
source_tables = set()
|
|
596
|
-
target_tables = set()
|
|
597
|
-
|
|
598
|
-
# 状态标志
|
|
599
|
-
# 状态标志
|
|
600
|
-
is_insert_context = False
|
|
601
|
-
is_from_context = False
|
|
602
|
-
is_join_context = False # 新增:专门标记 JOIN 后的状态
|
|
603
|
-
is_using_context = False
|
|
604
|
-
is_merge_context = False
|
|
605
|
-
|
|
606
|
-
# 辅助变量
|
|
607
|
-
|
|
608
|
-
for token in tokens:
|
|
609
|
-
token_upper = token.upper()
|
|
610
|
-
|
|
611
|
-
# --- 特殊处理:跳过 Hint 内容 ---
|
|
612
|
-
# 如果 token 是 [...] 格式,直接跳过,不要把它当成括号或关键字
|
|
613
|
-
if token.startswith("[") and token.endswith("]"):
|
|
614
|
-
# 如果是在 JOIN 后面遇到的 Hint,保持 join_context 为 True
|
|
615
|
-
# 如果是在 FROM 后面遇到的 Hint,保持 from_context 为 True
|
|
616
|
-
continue
|
|
617
|
-
|
|
618
|
-
# --- 上下文重置 ---
|
|
619
|
-
if token == "(":
|
|
620
|
-
# 左括号通常意味着子查询,重置 FROM/JOIN 上下文
|
|
621
|
-
# 但我们不能重置 INSERT 上下文
|
|
622
|
-
is_from_context = False
|
|
623
|
-
is_join_context = False
|
|
624
|
-
is_using_context = False
|
|
625
|
-
continue
|
|
626
|
-
|
|
627
|
-
# --- 关键字状态流转 ---
|
|
628
|
-
|
|
629
|
-
if token_upper in KeyWords.insert_keywords:
|
|
630
|
-
is_insert_context = True
|
|
631
|
-
is_from_context = False
|
|
632
|
-
is_join_context = False
|
|
633
|
-
continue
|
|
634
|
-
|
|
635
|
-
if token_upper == "FROM":
|
|
636
|
-
is_from_context = True
|
|
637
|
-
is_join_context = False
|
|
638
|
-
is_insert_context = False
|
|
639
|
-
continue
|
|
640
|
-
|
|
641
|
-
# 修复核心:处理 JOIN
|
|
642
|
-
if token_upper in ["JOIN", "INNER", "LEFT", "RIGHT", "OUTER", "CROSS"]:
|
|
643
|
-
is_from_context = True # JOIN 也是一种来源
|
|
644
|
-
is_join_context = True # 标记刚刚遇到了 JOIN,下一个非关键字即为表名
|
|
645
|
-
continue
|
|
646
|
-
|
|
647
|
-
# 【关键修复】:遇到 ON,说明表名部分结束了,后面是关联条件
|
|
648
|
-
if token_upper == "ON":
|
|
649
|
-
is_from_context = False
|
|
650
|
-
is_join_context = False
|
|
651
|
-
continue
|
|
652
|
-
|
|
653
|
-
if token_upper == "MERGE":
|
|
654
|
-
is_merge_context = True
|
|
655
|
-
continue
|
|
656
|
-
|
|
657
|
-
if token_upper == "USING":
|
|
658
|
-
is_from_context = False
|
|
659
|
-
is_join_context = False
|
|
660
|
-
if is_merge_context:
|
|
661
|
-
is_using_context = True
|
|
662
|
-
continue
|
|
663
|
-
|
|
664
|
-
# --- 表名捕获逻辑 ---
|
|
665
|
-
|
|
666
|
-
# 1. INSERT 目标表
|
|
667
|
-
if is_insert_context and token_upper not in KeyWords.keywords and token not in ("(", ")"):
|
|
668
|
-
target_tables.add(token)
|
|
669
|
-
is_insert_context = False
|
|
670
|
-
continue
|
|
671
|
-
|
|
672
|
-
# 2. MERGE 逻辑 (略,保持原有逻辑) ...
|
|
673
|
-
if is_merge_context and not is_using_context and token_upper not in KeyWords.keywords:
|
|
674
|
-
target_tables.add(token)
|
|
675
|
-
is_merge_context = False
|
|
676
|
-
continue
|
|
677
|
-
|
|
678
|
-
if is_using_context and token_upper not in KeyWords.keywords:
|
|
679
|
-
if token != "(":
|
|
680
|
-
source_tables.add(token)
|
|
681
|
-
is_using_context = False
|
|
682
|
-
is_merge_context = False
|
|
683
|
-
continue
|
|
684
|
-
|
|
685
|
-
# 3. FROM / JOIN 源表 (修复核心)
|
|
686
|
-
# 条件:处于 FROM 或 JOIN 状态,且不是关键字,且不是括号/逗号
|
|
687
|
-
if (is_from_context or is_join_context) and token_upper not in KeyWords.keywords and token not in ("(", ")", ","):
|
|
688
|
-
source_tables.add(token)
|
|
689
|
-
|
|
690
|
-
# 捕获到表名后,重置状态,等待下一个 JOIN 或逗号
|
|
691
|
-
is_join_context = False
|
|
692
|
-
# 注意:is_from_context 保持 True,以便处理 "FROM t1, t2" 这种逗号分隔的情况
|
|
693
|
-
continue
|
|
694
|
-
|
|
695
|
-
# 处理逗号:逗号意味着可能有下一个表名,重置别名状态,保持 FROM 上下文
|
|
696
|
-
if token == ",":
|
|
697
|
-
is_join_context = False
|
|
698
|
-
# is_from_context 保持不变
|
|
699
|
-
continue
|
|
700
|
-
|
|
701
|
-
# 4. 过滤 CTE 中间表
|
|
702
|
-
# 注意:_get_cte_mid_tables 需要解析 WITH 子句,这里假设它工作正常
|
|
703
|
-
cte_tables = _get_cte_mid_tables(clean_sql)
|
|
704
|
-
source_tables -= set(cte_tables)
|
|
705
|
-
|
|
706
|
-
# 5. 构建结果
|
|
707
|
-
if source_tables or target_tables:
|
|
708
|
-
return {"source_tables": list(source_tables), "target_tables": list(target_tables)}
|
|
709
|
-
|
|
710
|
-
return None
|
|
711
|
-
|
|
712
|
-
|
|
713
|
-
# ============================================================================
|
|
714
|
-
# Private helper functions
|
|
715
|
-
# ============================================================================
|
|
716
|
-
|
|
717
|
-
|
|
718
566
|
def _trim_single_line_comment(sql: str) -> str:
|
|
719
567
|
"""删除单行注释"""
|
|
720
568
|
result = []
|
|
@@ -3,9 +3,9 @@ from typing import Tuple
|
|
|
3
3
|
from sqlh import utils
|
|
4
4
|
|
|
5
5
|
# 读取目录或文件
|
|
6
|
-
sql_path = ""
|
|
6
|
+
sql_path = "/Users/dunett/codes/duperl/daas-migration/showyu_fastdata_backup_20260413"
|
|
7
7
|
sql_stmt_str = utils.read_sql_from_directory(sql_path)
|
|
8
|
-
sql_stmt_str = """insert into t3 SELECT /*+edede */ COUNT(*) FROM t2 JOIN [broadcast] t1 ON t1.c1 = t2.c2;"""
|
|
8
|
+
# sql_stmt_str = """insert into t3 SELECT /*+edede */ COUNT(*) FROM t2 JOIN [broadcast] t1 ON t1.c1 = t2.c2;"""
|
|
9
9
|
|
|
10
10
|
def test_read_sql_from_directory():
|
|
11
11
|
import timeit
|
|
@@ -26,7 +26,7 @@ from pathlib import Path
|
|
|
26
26
|
from typing import List, Literal, Tuple, Union
|
|
27
27
|
|
|
28
28
|
from .core.graph import DagGraph, NodeNotFoundException
|
|
29
|
-
from .core.helper import
|
|
29
|
+
from .core.helper import split_sql, trim_comment, get_source_target_tables
|
|
30
30
|
|
|
31
31
|
SearchResult = Union[Tuple[List[str], DagGraph], NodeNotFoundException]
|
|
32
32
|
|
|
@@ -96,7 +96,7 @@ def __build_tables_and_graph(sql_stmt_str: str) -> Tuple[list, list, DagGraph]:
|
|
|
96
96
|
dg = DagGraph()
|
|
97
97
|
|
|
98
98
|
for sql_stmt in sql_stmt_lst:
|
|
99
|
-
table_info =
|
|
99
|
+
table_info = get_source_target_tables(sql_stmt)
|
|
100
100
|
if table_info:
|
|
101
101
|
sources = [re.sub(r"`|\"", "", t) for t in table_info["source_tables"]]
|
|
102
102
|
targets = [re.sub(r"`|\"", "", t) for t in table_info["target_tables"]]
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|