sql-dag-flow 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
File without changes
sql_dag_flow/main.py ADDED
@@ -0,0 +1,203 @@
1
+ from fastapi import FastAPI, HTTPException, Body
2
+ from pydantic import BaseModel
3
+ from fastapi.middleware.cors import CORSMiddleware
4
+ from fastapi.staticfiles import StaticFiles
5
+ from fastapi.responses import FileResponse
6
+ import uvicorn
7
+ import os
8
+ import sys
9
+ import json
10
+ import webbrowser
11
+ import threading
12
+ import time
13
+ from .parser import parse_sql_files, build_graph
14
+
15
# FastAPI application instance serving both the JSON API and the static frontend.
app = FastAPI()

# Enable CORS for frontend
# NOTE(review): allow_origins=["*"] combined with allow_credentials=True is very
# permissive; acceptable for a localhost-only dev tool (served on 127.0.0.1),
# but confirm before exposing this server more widely.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
25
+
26
# Package structure
# __file__ is inside src/sql_dag_flow/main.py
BASE_DIR = os.path.dirname(os.path.abspath(__file__))  # directory containing this module
STATIC_DIR = os.path.join(BASE_DIR, "static")  # bundled frontend build output

# Global state
CURRENT_DIRECTORY = os.getcwd()  # Default, updated by start()
# Default diagram file name. NOTE(review): appears unused in this module
# (endpoints take an explicit `filename` instead) — confirm before removing.
DIAGRAM_FILE = "sql_diagram.json"
34
+
35
+ @app.get("/graph")
36
+ def get_graph(dialect: str = "bigquery"):
37
+ """Parses SQL files in the current directory and returns graph data."""
38
+ if not os.path.exists(CURRENT_DIRECTORY):
39
+ return {"nodes": [], "edges": [], "error": "Directory not found"}
40
+
41
+ tables = parse_sql_files(CURRENT_DIRECTORY, dialect=dialect)
42
+ nodes, edges = build_graph(tables)
43
+ return {"nodes": nodes, "edges": edges}
44
+
45
+ @app.post("/config/path")
46
+ def set_path(path_data: dict = Body(...)):
47
+ """Updates the directory to scan."""
48
+ global CURRENT_DIRECTORY
49
+ path = path_data.get("path")
50
+ # Basic validation
51
+ if not path or not os.path.exists(path):
52
+ raise HTTPException(status_code=400, detail="Directory does not exist")
53
+
54
+
55
+ CURRENT_DIRECTORY = path
56
+ return {"message": "Path updated", "path": CURRENT_DIRECTORY}
57
+
58
+ @app.post("/scan/folders")
59
+ def scan_folders(path_data: dict = Body(...)):
60
+ """Scans a directory and returns all subfolders (recursive, relative paths)."""
61
+ path = path_data.get("path")
62
+ if not path or not os.path.exists(path):
63
+ raise HTTPException(status_code=400, detail="Directory does not exist")
64
+
65
+ try:
66
+ subfolders = []
67
+ # Walk the directory tree
68
+ for root, dirs, files in os.walk(path):
69
+ # Skip hidden folders
70
+ dirs[:] = [d for d in dirs if not d.startswith('.')]
71
+
72
+ for d in dirs:
73
+ # Create relative path from the root path
74
+ full_path = os.path.join(root, d)
75
+ rel_path = os.path.relpath(full_path, path)
76
+ # Normalize separators to forward slashes for consistency
77
+ rel_path = rel_path.replace(os.sep, '/')
78
+ subfolders.append(rel_path)
79
+
80
+ # Sort for better UX
81
+ subfolders.sort()
82
+ return {"folders": subfolders}
83
+ except Exception as e:
84
+ raise HTTPException(status_code=500, detail=str(e))
85
+
86
+ @app.post("/graph/filtered")
87
+ def get_filtered_graph(data: dict = Body(...)):
88
+ """Parses SQL files with subfolder filtering."""
89
+ if not os.path.exists(CURRENT_DIRECTORY):
90
+ return {"nodes": [], "edges": [], "error": "Directory not found"}
91
+
92
+ subfolders = data.get("subfolders") # List of strings or None
93
+ dialect = data.get("dialect", "bigquery")
94
+ tables = parse_sql_files(CURRENT_DIRECTORY, allowed_subfolders=subfolders, dialect=dialect)
95
+ nodes, edges = build_graph(tables)
96
+ return {"nodes": nodes, "edges": edges}
97
+
98
+ @app.get("/config/path")
99
+ def get_path():
100
+ return {"path": CURRENT_DIRECTORY}
101
+
102
class SaveRequest(BaseModel):
    """Payload for /save: full diagram state plus the target file name."""
    nodes: list  # React Flow node dicts
    edges: list  # React Flow edge dicts
    viewport: dict  # camera state — presumably {"x", "y", "zoom"}; see /load's fallback shape
    metadata: dict  # free-form; /save honors metadata["path"] as the target directory
    filename: str = "sql_diagram.json"  # Default filename
108
+
109
+ @app.post("/save")
110
+ def save_graph(request: SaveRequest):
111
+ try:
112
+ # Use the path from metadata if available, otherwise default
113
+ path = request.metadata.get("path", ".")
114
+ if not os.path.isabs(path):
115
+ path = os.path.abspath(path)
116
+
117
+ filepath = os.path.join(path, request.filename)
118
+
119
+ data = {
120
+ "nodes": request.nodes,
121
+ "edges": request.edges,
122
+ "viewport": request.viewport,
123
+ "metadata": request.metadata
124
+ }
125
+ with open(filepath, "w") as f:
126
+ json.dump(data, f, indent=4)
127
+ return {"message": f"Graph saved successfully to {filepath}"}
128
+ except Exception as e:
129
+ raise HTTPException(status_code=500, detail=str(e))
130
+
131
+ @app.get("/load")
132
+ def load_graph(path: str = ".", filename: str = "sql_diagram.json"):
133
+ try:
134
+ if not os.path.isabs(path):
135
+ path = os.path.abspath(path)
136
+
137
+ filepath = os.path.join(path, filename)
138
+
139
+ if not os.path.exists(filepath):
140
+ return {"nodes": [], "edges": [], "viewport": {"x": 0, "y": 0, "zoom": 1}, "metadata": {}}
141
+
142
+ with open(filepath, "r") as f:
143
+ data = json.load(f)
144
+ return data
145
+ except Exception as e:
146
+ print(f"Error loading graph: {e}")
147
+ return {"nodes": [], "edges": [], "viewport": {"x": 0, "y": 0, "zoom": 1}, "metadata": {}}
148
+
149
+ @app.get("/config_files")
150
+ def list_config_files(path: str = "."):
151
+ try:
152
+ if not os.path.isabs(path):
153
+ path = os.path.abspath(path)
154
+
155
+ if not os.path.exists(path):
156
+ return {"files": []}
157
+
158
+ files = [f for f in os.listdir(path) if f.endswith(".json") and os.path.isfile(os.path.join(path, f))]
159
+ return {"files": files}
160
+ except Exception as e:
161
+ print(f"Error listing config files: {e}")
162
+ return {"files": []}
163
+
164
# Serve Static Files (Frontend)
if os.path.exists(STATIC_DIR):
    app.mount("/assets", StaticFiles(directory=os.path.join(STATIC_DIR, "assets")), name="assets")

    # Catch-all for SPA routing
    @app.get("/{full_path:path}")
    async def serve_spa(full_path: str):
        """Serve the requested static file, or index.html for SPA routes.

        The resolved path is confined to STATIC_DIR to prevent path-traversal
        requests (e.g. "../...") from reading files outside the bundle.
        """
        static_root = os.path.abspath(STATIC_DIR)
        file_path = os.path.abspath(os.path.join(static_root, full_path))
        if file_path.startswith(static_root + os.sep) and os.path.isfile(file_path):
            return FileResponse(file_path)
        return FileResponse(os.path.join(STATIC_DIR, "index.html"))
175
+
176
def start():
    """Entry point for the CLI tool.

    An optional first CLI argument selects the directory to scan; otherwise
    the current working directory is used. Opens a browser tab and serves
    the app on http://127.0.0.1:8000.
    """
    global CURRENT_DIRECTORY

    # CLI Argument Parsing
    cli_args = sys.argv[1:]
    if cli_args:
        candidate = cli_args[0]
        if os.path.exists(candidate):
            CURRENT_DIRECTORY = os.path.abspath(candidate)
            print(f"Setting project path from CLI: {CURRENT_DIRECTORY}")
        else:
            print(f"Warning: Path '{candidate}' does not exist. Using defaults.")
    else:
        CURRENT_DIRECTORY = os.getcwd()
        print(f"Using current directory: {CURRENT_DIRECTORY}")

    def open_browser():
        # Give uvicorn a moment to bind before opening the tab.
        time.sleep(1.5)
        webbrowser.open("http://localhost:8000")

    threading.Thread(target=open_browser, daemon=True).start()

    # Run uvicorn programmatically.
    # Note: reload=True is not easily supported when launched this way.
    uvicorn.run(app, host="127.0.0.1", port=8000)
201
+
202
+ if __name__ == "__main__":
203
+ start()
sql_dag_flow/parser.py ADDED
@@ -0,0 +1,290 @@
1
+ import os
2
+ import sqlglot
3
+ from sqlglot import exp
4
+ import networkx as nx
5
+
6
+ import os
7
+ import sqlglot
8
+ from sqlglot import exp
9
+ import networkx as nx
10
+ import re
11
+
12
def _is_traversal_allowed(rel_dir, allowed_subfolders):
    """Return True when os.walk should descend into *rel_dir*.

    A folder is traversable when it is itself selected, or when it is an
    ancestor of a selected folder (so the walk can reach the selection).
    """
    if rel_dir in allowed_subfolders:
        return True
    return any(allowed.startswith(rel_dir + '/') for allowed in allowed_subfolders)


def parse_sql_files(directory, allowed_subfolders=None, dialect="bigquery"):
    """
    Recursively scans a directory for .sql files and parses them.

    Args:
        directory: Root directory to walk.
        allowed_subfolders: Optional list of relative folder paths
            (forward-slash separated, as produced by /scan/folders). When
            given, only files directly inside one of the listed folders are
            parsed; other folders are traversed only as needed to reach them.
        dialect: sqlglot dialect name used for parsing (default "bigquery").

    Returns:
        Dict mapping each file's base name to its metadata: id, label,
        layer, type, project, dataset, path, dependencies, content
        (plus "error" when a file failed to read or parse).
    """
    tables = {}

    for root, dirs, files in os.walk(directory):
        # Default: parse everything. With a filter, only parse files when the
        # current folder is explicitly selected, and prune 'dirs' in place so
        # os.walk only descends toward selected folders.
        # (A previous second pruning pass that kept exact/parent/child matches
        # was dead code: its kept set was a strict superset of this one, so it
        # never affected the traversal and has been removed.)
        should_parse_files = True
        if allowed_subfolders is not None:
            rel_root = os.path.relpath(root, directory).replace(os.sep, '/')
            if rel_root == ".":
                rel_root = ""

            should_parse_files = rel_root in allowed_subfolders

            dirs[:] = [
                d for d in dirs
                if _is_traversal_allowed(
                    f"{rel_root}/{d}" if rel_root else d, allowed_subfolders
                )
            ]

        if not should_parse_files:
            continue

        for file in files:
            if not file.endswith(".sql"):
                continue

            filepath = os.path.join(root, file)
            # Heuristic for table name: filename without extension
            filename_base = os.path.splitext(file)[0]

            # Layer detection based on folder structure first, then filename.
            # ("bronce" covers the Spanish spelling.)
            lower_path = filepath.lower()
            layer = "other"
            if "bronze" in lower_path or "bronce" in lower_path:
                layer = "bronze"
            elif "silver" in lower_path:
                layer = "silver"
            elif "gold" in lower_path:
                layer = "gold"

            # Read inside the try so an unreadable/mis-encoded file produces
            # an error entry instead of aborting the whole scan.
            sql_content = ""
            try:
                with open(filepath, "r", encoding="utf-8") as f:
                    sql_content = f.read()

                # Parse with the requested dialect to support constructs like
                # CREATE OR REPLACE TABLE/VIEW.
                parsed = sqlglot.parse_one(sql_content, read=dialect)

                # Detect Node Type (Table or View)
                node_type = "table"  # default
                if isinstance(parsed, exp.Create):
                    if parsed.kind == "VIEW":
                        node_type = "view"

                # Attempt to extract project/dataset from the CREATE target
                # (pattern: project.dataset.table or dataset.table).
                target_table_name = filename_base
                project = "default"
                dataset = "default"

                create_node = parsed.find(exp.Create)
                if create_node and create_node.this:
                    # sqlglot represents the target as exp.Table or exp.Schema.
                    target_exp = create_node.this
                    if isinstance(target_exp, exp.Table):
                        target_table_name = target_exp.name
                        dataset = target_exp.db or "default"
                        project = target_exp.catalog or "default"

                # Fallback: extract from filename (project.dataset.table.sql).
                if project == "default" and dataset == "default":
                    parts = filename_base.split('.')
                    if len(parts) == 3:
                        project, dataset, target_table_name = parts
                    elif len(parts) == 2:
                        dataset, target_table_name = parts

                # Fallback: treat the parent folder as the dataset, unless it
                # is just a layer name. (Riskier heuristics — e.g. grandparent
                # as project — are deliberately avoided.)
                if project == "default" and dataset == "default":
                    path_parts = os.path.normpath(filepath).split(os.sep)
                    parent_dir = path_parts[-2] if len(path_parts) > 1 else ""
                    if parent_dir.lower() not in ["bronze", "bronce", "silver", "gold", "other"] and dataset == "default":
                        dataset = parent_dir

                # Collect every table referenced in the query, skipping
                # self-references to the creation target. Fuzzy matching of
                # partially-qualified names happens later in build_graph.
                dependencies = set()
                for table in parsed.find_all(exp.Table):
                    dep_name = table.name
                    full_name = dep_name
                    if table.db:
                        full_name = f"{table.db}.{dep_name}"
                    if table.catalog:
                        full_name = f"{table.catalog}.{table.db}.{dep_name}"

                    if dep_name == target_table_name:
                        continue

                    dependencies.add(full_name)

                tables[filename_base] = {
                    # filename_base is the unique graph id; the visual label
                    # is the actual table name.
                    "id": filename_base,
                    "label": target_table_name,
                    "layer": layer,
                    "type": node_type,
                    "project": project,
                    "dataset": dataset,
                    "path": filepath,
                    "dependencies": list(dependencies),
                    "content": sql_content
                }
            except Exception as e:
                # Keep a placeholder node so the file still appears in the
                # graph with the error attached.
                print(f"Error parsing {filepath}: {e}")
                tables[filename_base] = {
                    "id": filename_base,
                    "label": filename_base,
                    "layer": layer,
                    "type": "unknown",
                    "project": "n/a",
                    "dataset": "n/a",
                    "path": filepath,
                    "dependencies": [],
                    "error": str(e),
                    "content": sql_content
                }

    return tables
209
+
210
+
211
def build_graph(tables):
    """
    Constructs nodes and edges for React Flow.

    Args:
        tables: Mapping of node id -> metadata dict as produced by
            parse_sql_files (must contain "label", "layer", "dependencies";
            "project"/"dataset" are optional).

    Returns:
        (nodes, edges): React Flow lists. Each node's data carries
        incomingCount (direct upstream tables) and nestedCount (all
        transitive upstream tables).
    """
    nodes = []
    edges = []

    # Lookup map: every identifier a dependency string might use -> node id.
    lookup = {}
    for node_id, data in tables.items():
        lookup[node_id] = node_id
        if "label" in data:
            lookup[data["label"]] = node_id

        project = data.get("project", "default")
        dataset = data.get("dataset", "default")
        table = data.get("label", "")

        if table:
            if dataset != "default":
                lookup[f"{dataset}.{table}"] = node_id
            if project != "default":
                lookup[f"{project}.{dataset}.{table}"] = node_id

    # Track incoming edges for accurate dependency counting
    incoming_edges_count = {node_id: 0 for node_id in tables}

    # Create edges. Deduplicate (target, source) pairs: a table referenced
    # under several qualified names would otherwise produce multiple edges
    # with the same id, which React Flow requires to be unique, and would
    # double-count incomingCount.
    seen_pairs = set()
    for source_id, data in tables.items():
        for dep in data["dependencies"]:
            target_id = lookup.get(dep)

            # Fuzzy lookup: if the exact match fails, fall back to the bare
            # table name (last dotted component).
            if not target_id and "." in dep:
                target_id = lookup.get(dep.split(".")[-1])

            if not target_id or target_id == source_id:
                continue  # unresolved dependency or self-reference
            if (target_id, source_id) in seen_pairs:
                continue  # duplicate reference to the same upstream table
            seen_pairs.add((target_id, source_id))

            edges.append({
                "id": f"{target_id}-{source_id}",
                "source": target_id,
                "target": source_id,
                "animated": True,
                "style": {"stroke": "#b1b1b7"}
            })
            incoming_edges_count[source_id] = incoming_edges_count.get(source_id, 0) + 1

    # Transitive dependency counts via DFS over reverse edges. This replaces
    # the previous networkx ancestors() call with an equivalent stdlib walk
    # (same result for DAGs; safe with accidental cycles).
    predecessors = {}
    for edge in edges:
        predecessors.setdefault(edge["target"], set()).add(edge["source"])

    def _ancestor_count(node_id):
        # Count all nodes with a path INTO node_id, excluding node_id itself.
        reached = set()
        stack = list(predecessors.get(node_id, ()))
        while stack:
            current = stack.pop()
            if current in reached:
                continue
            reached.add(current)
            stack.extend(predecessors.get(current, ()))
        reached.discard(node_id)  # guard: a cycle could reach the start node
        return len(reached)

    for table_name, data in tables.items():
        nodes.append({
            "id": table_name,
            "data": {
                "label": data["label"],
                "layer": data["layer"],
                "details": data,
                "incomingCount": incoming_edges_count.get(table_name, 0),
                "nestedCount": _ancestor_count(table_name)
            },
            "position": {"x": 0, "y": 0},
            "type": "custom",
        })

    return nodes, edges