PyPI - sqlh - Versions diffs - 0.2.3__py3-none-any.whl - Mend

sqlh 0.2.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18) hide show

sqlh/.DS_Store +0 -0
sqlh/__init__.py +32 -0
sqlh/cli.py +153 -0
sqlh/core/graph.py +385 -0
sqlh/core/helper.py +444 -0
sqlh/core/keywords.py +62 -0
sqlh/static/dagre_template.html +857 -0
sqlh/static/mermaid_template.html +28 -0
sqlh/tests/test_cli.py +3 -0
sqlh/tests/test_graph.py +55 -0
sqlh/tests/test_import.py +16 -0
sqlh/tests/test_sqlhelper.py +36 -0
sqlh/tests/test_utils.py +86 -0
sqlh/utils.py +365 -0
sqlh-0.2.3.dist-info/METADATA +338 -0
sqlh-0.2.3.dist-info/RECORD +18 -0
sqlh-0.2.3.dist-info/WHEEL +4 -0
sqlh-0.2.3.dist-info/entry_points.txt +3 -0

sqlh/core/helper.py ADDED Viewed

@@ -0,0 +1,444 @@
+"""
+SQL parser with token-based analysis.
+This module provides SQL parsing functionality using token-based analysis:
+|- Splitting multi-statement SQL by semicolons
+|- Removing SQL comments (single-line and multi-line)
+|- Extracting source and target tables
+|- Handling CTE (Common Table Expression) identification
+The parser uses keyword-based tokenization rather than full AST parsing,
+making it lightweight and fast for simple table/field extraction tasks.
+"""
+from .keywords import KeyWords
+class ParseException(Exception):
+    """Signal that a SQL text could not be handled by the parser helpers.
+    Within this module it is raised when a caller passes several statements
+    to a function that accepts exactly one.
+    """
+# ============================================================================
+# Module-level functions (replacing SqlHelper class)
+# ============================================================================
+def split_sql(sql: str) -> list[str]:
+    """
+    Split multi-statement SQL on top-level semicolons.
+    A semicolon ends a statement only when it is outside single/double quotes,
+    outside a ``--`` line comment and outside a (possibly nested) ``/* */``
+    block comment. Text inside a comment is inert: quotes or comment markers
+    found there never change the scanner state. Quote state is reset at the
+    start of every line; block-comment depth carries over between lines.
+    Lines that start with ``--`` are blanked before scanning.
+    Args:
+        sql: SQL text holding zero or more statements
+    Returns:
+        The statements in input order, without their terminating semicolon.
+        Whitespace around a statement is preserved; blank statements are dropped.
+    Raises:
+        AssertionError: If a block comment is still open at the end of input
+    Example:
+        >>> split_sql("SELECT 1; SELECT 2;")
+        ['SELECT 1', ' SELECT 2']
+    """
+    statements: list[str] = []
+    # Nesting depth of /* */ comments; survives line boundaries.
+    depth = 0
+    # Text collected since the last split (may span several lines).
+    pending = ""
+    # Non-blank characters seen outside comments since the last split.
+    code_chars = 0
+    # Guarantee a terminator so the last statement is emitted by the scanner.
+    appended_terminator = not sql.strip().endswith(";")
+    if appended_terminator:
+        sql += ";"
+    for line in sql.splitlines():
+        # A comment-only line is blanked. Inside an open block comment the
+        # "--" has no meaning, so a line that closes the comment must be scanned.
+        if line.strip().startswith("--") and not (depth > 0 and "*/" in line):
+            line = ""
+        if pending:
+            pending += "\n"
+        in_single_quote = False
+        in_double_quote = False
+        # Previous character, kept only while it can start a two-char marker.
+        prev = ""
+        # Offset in ``line`` where the current statement fragment begins.
+        start = 0
+        for pos, char in enumerate(line):
+            if depth > 0:
+                # Inside a block comment only "/*" and "*/" matter.
+                if prev == "/" and char == "*":
+                    depth += 1
+                    prev = ""
+                elif prev == "*" and char == "/":
+                    depth -= 1
+                    prev = ""
+                else:
+                    prev = char
+                continue
+            if in_single_quote or in_double_quote:
+                # Inside a literal only the matching quote character matters.
+                if in_single_quote and char == "'":
+                    in_single_quote = False
+                elif in_double_quote and char == '"':
+                    in_double_quote = False
+                code_chars += 1
+                prev = ""
+                continue
+            if char == ";":
+                statements.append(pending + line[start:pos])
+                pending = ""
+                code_chars = 0
+                start = pos + 1
+                prev = ""
+            elif char == "-" and prev == "-":
+                # Rest of the line is a comment; the first "-" was not code.
+                code_chars -= 1
+                break
+            elif char == "*" and prev == "/":
+                # Opening "/*"; the "/" counted a moment ago was not code.
+                depth = 1
+                code_chars -= 1
+                prev = ""
+            else:
+                if char == "'":
+                    in_single_quote = True
+                elif char == '"':
+                    in_double_quote = True
+                if not char.isspace():
+                    code_chars += 1
+                prev = char
+        pending += line[start:]
+    # Explicit raise instead of ``assert`` so the check survives ``python -O``.
+    if depth != 0:
+        raise AssertionError(
+            f"The number of nested levels of sql multi-line comments is not equal to 0: {depth}"
+        )
+    # The last semicolon may sit inside a trailing comment or an unterminated
+    # literal; keep the code in front of it instead of losing the statement.
+    if code_chars > 0:
+        tail = pending
+        if appended_terminator and tail.endswith(";"):
+            tail = tail[:-1]
+        statements.append(tail)
+    return [statement for statement in statements if statement.strip()]
+def trim_comment(sql: str) -> str:
+    """
+    Remove single-line and multi-line comments from SQL.
+    Single-line comments are removed first by ``_trim_single_line_comment``.
+    Then every outermost ``/* ... */`` span (nesting is honoured) is cut out.
+    Optimizer hints, i.e. comments that open with ``/*+``, are kept. Comment
+    markers inside single- or double-quoted text are ignored, and quotes that
+    appear inside a comment do not affect the scan. An unterminated block
+    comment is left in place.
+    Args:
+        sql: SQL statement string
+    Returns:
+        SQL string with comments removed
+    """
+    # 1. Drop single-line comments.
+    sql = _trim_single_line_comment(sql=sql)
+    # 2. Normalise line endings. Real newlines are kept: substituting a literal
+    #    backslash-n and converting back would corrupt SQL that itself contains
+    #    the two characters "\" and "n" (e.g. a '\n' delimiter literal).
+    sql = "\n".join(sql.splitlines())
+    # 3. Cut out block comments.
+    # Nesting depth of the block comment currently being skipped.
+    depth = 0
+    in_single_quote = False
+    in_double_quote = False
+    # True while inside a "/*+ ... */" hint, which must be preserved.
+    in_hint = False
+    # Previous character while it can still start a marker; "/*" right after
+    # an outermost comment opener so that a following "+" identifies a hint.
+    prev = ""
+    comment_start = 0
+    kept_parts = []
+    # Offset of the first character not yet copied to ``kept_parts``.
+    cursor = 0
+    for pos, char in enumerate(sql):
+        if in_hint:
+            if prev == "*" and char == "/":
+                in_hint = False
+                prev = ""
+            else:
+                prev = char
+        elif depth > 0:
+            if depth == 1 and prev == "/*" and char == "+":
+                # "/*+" is a hint, not a comment: forget the pending span.
+                depth = 0
+                in_hint = True
+                prev = ""
+            elif prev == "/" and char == "*":
+                depth += 1
+                prev = ""
+            elif prev == "*" and char == "/":
+                depth -= 1
+                prev = ""
+                if depth == 0:
+                    kept_parts.append(sql[cursor:comment_start])
+                    cursor = pos + 1
+            else:
+                prev = char
+        elif in_single_quote or in_double_quote:
+            if in_single_quote and char == "'":
+                in_single_quote = False
+            elif in_double_quote and char == '"':
+                in_double_quote = False
+            prev = ""
+        elif char == "*" and prev == "/":
+            depth = 1
+            comment_start = pos - 1
+            prev = "/*"
+        else:
+            # A "*/" met here has no opener and is treated as ordinary text.
+            if char == "'":
+                in_single_quote = True
+            elif char == '"':
+                in_double_quote = True
+            prev = char
+    kept_parts.append(sql[cursor:])
+    return "".join(kept_parts)
+def get_source_target_tables(sql: str) -> dict[str, list[str]] | None:
+    """
+    Extract source and target tables from a single SQL statement.
+    Comments are removed with ``trim_comment``, then the statement is scanned
+    token by token (parentheses and commas are separate tokens):
+    |- the first non-keyword token after an ``insert_keywords`` entry is a target
+    |- ``MERGE ... <target> USING <source>`` yields one target and one source
+    |- the first non-keyword token after a ``from_keywords`` entry, and after
+       each following comma, is a source; the token after it is taken as alias
+    |- the token following ``AS`` is always treated as an alias
+    |- after a table function (``table_function_keywords``) nothing is recorded
+       until the next FROM/JOIN
+    CTE names reported by ``_get_cte_mid_tables`` are removed from the sources.
+    Args:
+        sql: Single SQL statement string
+    Returns:
+        Dictionary with keys:
+            - "target_tables": list of target table names
+            - "source_tables": de-duplicated source table names in order of
+              first appearance
+        Returns None when no source table remains, even if a target was found
+    Raises:
+        ParseException: If SQL contains multiple statements
+    Note:
+        TODO: planned richer result shape
+        {
+            "source_tables": [(t1, 1), (t2, 2), (t3, 3)],
+            "target_tables": [(t4, 1)]
+        }
+    """
+    # Pre-processing: strip block and line comments.
+    sql = trim_comment(sql).strip()
+    # Drop one trailing ";".
+    sql = sql[:-1] if sql.endswith(";") else sql
+    # Validate that exactly one statement was passed.
+    if len(split_sql(sql)) > 1:
+        raise ParseException("sql脚本为多条SQL语句,需传入单条SQL语句.")
+    # Sets give O(1) membership tests inside the token loop.
+    keywords = set(KeyWords.keywords)
+    insert_keywords = set(KeyWords.insert_keywords)
+    from_keywords = set(KeyWords.from_keywords)
+    table_functions = set(KeyWords.table_function_keywords)
+    was_pre_insert = False
+    was_pre_from = False
+    was_pre_as = False
+    was_merge = False
+    was_using = False
+    was_pre_table_name = False
+    was_pre_table_function = False
+    target_tables: list[str] = []
+    source_tables: list[str] = []
+    for line in sql.splitlines():
+        spaced = line.replace("(", " ( ").replace(")", " ) ").replace(",", " , ")
+        # split() without argument also separates on tabs and other whitespace.
+        for token in spaced.split():
+            upper = token.upper()
+            if upper == "AS":
+                was_pre_as = True
+                continue
+            if upper in insert_keywords:
+                was_pre_insert = True
+                was_pre_from = False
+                continue
+            if upper == "MERGE":
+                was_merge = True
+                continue
+            if upper == "USING":
+                was_using = True
+                continue
+            if upper in from_keywords:
+                was_pre_from = True
+                was_pre_insert = False
+                was_pre_table_name = False
+                # A new FROM/JOIN ends the argument list of an earlier table
+                # function; otherwise every later table would be ignored.
+                was_pre_table_function = False
+                continue
+            is_keyword = upper in keywords
+            if was_pre_as and not is_keyword:
+                # Token right after AS is an alias (or a CAST type), not a table.
+                was_pre_as = False
+                was_pre_table_name = False
+                continue
+            if is_keyword:
+                # Any other keyword closes the table list of a FROM/JOIN.
+                was_pre_from = False
+                continue
+            if was_pre_insert:
+                target_tables.append(token)
+                was_pre_insert = False
+                was_pre_from = False
+                continue
+            if upper in table_functions and was_pre_from:
+                was_pre_table_function = True
+                continue
+            # MERGE INTO <target>
+            if was_merge and not was_using and not target_tables:
+                target_tables.append(token)
+                continue
+            # MERGE ... USING <source>; "(" means a sub-query follows.
+            if was_merge and was_using:
+                if token != "(":
+                    source_tables.append(token)
+                was_using = False
+                was_merge = False
+                continue
+            if was_pre_from:
+                if (
+                    not was_pre_table_name
+                    and token not in (",", "(")
+                    and not was_pre_table_function
+                ):
+                    source_tables.append(token)
+                    was_pre_table_name = True
+                if token == ",":
+                    # "FROM a, b": the token after the comma is a table again.
+                    was_pre_table_name = False
+    cte_names = set(_get_cte_mid_tables(sql))
+    # dict.fromkeys de-duplicates while keeping first-seen order, so the
+    # result is deterministic (a set difference is not).
+    source_tables = [name for name in dict.fromkeys(source_tables) if name not in cte_names]
+    if not source_tables:
+        return None
+    return {"target_tables": target_tables, "source_tables": source_tables}
+# ============================================================================
+# Private helper functions
+# ============================================================================
+def _trim_single_line_comment(sql: str) -> str:
+    """
+    Remove ``--`` and ``#`` single-line comments.
+    Every line is stripped; empty lines and lines that consist only of a
+    comment are dropped. On the remaining lines the text from the first
+    ``--`` or ``#`` that lies outside quotes is cut off. Comment markers inside
+    a ``/* */`` block comment are left alone so that the block comment, and in
+    particular its closing ``*/``, stays intact. Quote state is reset on each
+    line; block-comment depth carries over between lines.
+    Args:
+        sql: SQL text
+    Returns:
+        The kept lines joined with a newline
+    """
+    kept = []
+    # Nesting depth of /* */ comments; survives line boundaries.
+    depth = 0
+    for raw_line in sql.splitlines():
+        line = raw_line.strip()
+        if not line:
+            continue
+        if depth == 0 and line.startswith(("--", "#")):
+            continue
+        in_single_quote = False
+        in_double_quote = False
+        # Previous character while it can still start a two-char marker.
+        prev = ""
+        # Offset at which the line comment starts, if any.
+        cut = None
+        for pos, char in enumerate(line):
+            if depth > 0:
+                if prev == "/" and char == "*":
+                    depth += 1
+                    prev = ""
+                elif prev == "*" and char == "/":
+                    depth -= 1
+                    prev = ""
+                else:
+                    prev = char
+                continue
+            if in_single_quote or in_double_quote:
+                if in_single_quote and char == "'":
+                    in_single_quote = False
+                elif in_double_quote and char == '"':
+                    in_double_quote = False
+                prev = ""
+                continue
+            if char == "#":
+                cut = pos
+                break
+            if char == "-" and prev == "-":
+                cut = pos - 1
+                break
+            if char == "*" and prev == "/":
+                depth = 1
+                prev = ""
+                continue
+            if char == "'":
+                in_single_quote = True
+            elif char == '"':
+                in_double_quote = True
+            prev = char
+        kept.append(line if cut is None else line[:cut])
+    return "\n".join(kept)
+def _get_cte_mid_tables(sql: str) -> list:
+    """
+    Collect the names introduced by a WITH (CTE) clause.
+    The token after ``WITH`` (after an optional ``RECURSIVE``) is recorded, and
+    so is every further non-keyword token met at parenthesis level 0 while the
+    CTE list is still open. The list is considered closed by the first keyword
+    other than ``AS`` that appears at level 0 after a closing parenthesis.
+    Args:
+        sql: SQL statement string; comments are removed with ``trim_comment``
+    Returns:
+        Recorded names in order of appearance (duplicates are not removed)
+    """
+    # Current parenthesis nesting level.
+    bracket_level = 0
+    was_pre_with = False
+    is_cte = False
+    was_pre_right_bracket = False
+    result = []
+    keywords = set(KeyWords.keywords)
+    # Pre-processing: strip block and line comments.
+    sql = trim_comment(sql)
+    for line in sql.splitlines():
+        spaced = line.replace("(", " ( ").replace(")", " ) ").replace(",", " , ")
+        # split() without argument also separates on tabs and other whitespace.
+        for token in spaced.split():
+            upper = token.upper()
+            if token == "(":
+                bracket_level += 1
+            if token == ")":
+                bracket_level -= 1
+                was_pre_right_bracket = True
+            if upper == "WITH":
+                was_pre_with = True
+                is_cte = True
+                continue
+            if was_pre_with and upper == "RECURSIVE":
+                # "WITH RECURSIVE name": the modifier is not a CTE name.
+                continue
+            if upper in keywords:
+                if was_pre_right_bracket and is_cte and bracket_level == 0 and upper != "AS":
+                    is_cte = False
+                continue
+            if was_pre_with:
+                result.append(token)
+            if is_cte and bracket_level == 0 and not was_pre_with and token not in (",", "(", ")"):
+                result.append(token)
+            was_pre_with = False
+    return result

sqlh/core/keywords.py ADDED Viewed

@@ -0,0 +1,62 @@
+class KeyWords:
+    """Upper-case keyword tables for token-based SQL scanning."""
+    # General SQL keywords.
+    keywords = [
+        "SELECT", "INSERT", "DELETE", "UPDATE", "UPSERT", "REPLACE",
+        "DROP", "CREATE", "ALTER", "TRUNCATE",
+        "WHERE", "FROM", "INNER", "JOIN", "AND", "ON", "OR", "LIKE", "IN",
+        "SET", "BY", "GROUP", "ORDER",
+        "LEFT", "OUTER", "FULL", "RIGHT",
+        "IF", "END", "THEN", "AS", "ELSE", "CASE", "WHEN",
+        "DISTINCT", "OVERWRITE", "TABLE", "OVER", "INTO", "VIEW",
+        "NOT", "EXISTS", "EXTERNAL", "WITH", "DATABASE", "TEMPORARY",
+        "MERGE",
+    ]
+    # Keywords that open a write statement.
+    insert_keywords = ["INSERT", "CREATE"]
+    # Keywords that introduce a table reference.
+    from_keywords = ["FROM", "JOIN"]
+    # Table-valued function names.
+    table_function_keywords = [
+        "UNNEST",
+        "LATERAL",
+        "GENERATE_SERIES",
+        "SEQUENCE",
+    ]