modaryn 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. modaryn-0.1.0/LICENSE +21 -0
  2. modaryn-0.1.0/PKG-INFO +19 -0
  3. modaryn-0.1.0/README.md +183 -0
  4. modaryn-0.1.0/modaryn/__init__.py +0 -0
  5. modaryn-0.1.0/modaryn/analyzers/__init__.py +0 -0
  6. modaryn-0.1.0/modaryn/analyzers/dependency.py +0 -0
  7. modaryn-0.1.0/modaryn/analyzers/importance.py +0 -0
  8. modaryn-0.1.0/modaryn/analyzers/lineage.py +140 -0
  9. modaryn-0.1.0/modaryn/analyzers/sql_complexity.py +48 -0
  10. modaryn-0.1.0/modaryn/assets/logo.txt +6 -0
  11. modaryn-0.1.0/modaryn/cli.py +500 -0
  12. modaryn-0.1.0/modaryn/config/default.yml +16 -0
  13. modaryn-0.1.0/modaryn/domain/__init__.py +0 -0
  14. modaryn-0.1.0/modaryn/domain/model.py +106 -0
  15. modaryn-0.1.0/modaryn/loaders/__init__.py +0 -0
  16. modaryn-0.1.0/modaryn/loaders/manifest.py +156 -0
  17. modaryn-0.1.0/modaryn/loaders/models.py +0 -0
  18. modaryn-0.1.0/modaryn/outputs/__init__.py +26 -0
  19. modaryn-0.1.0/modaryn/outputs/graph.py +70 -0
  20. modaryn-0.1.0/modaryn/outputs/html.py +408 -0
  21. modaryn-0.1.0/modaryn/outputs/logo.py +21 -0
  22. modaryn-0.1.0/modaryn/outputs/markdown.py +54 -0
  23. modaryn-0.1.0/modaryn/outputs/terminal.py +98 -0
  24. modaryn-0.1.0/modaryn/scorers/__init__.py +0 -0
  25. modaryn-0.1.0/modaryn/scorers/score.py +119 -0
  26. modaryn-0.1.0/modaryn.egg-info/PKG-INFO +19 -0
  27. modaryn-0.1.0/modaryn.egg-info/SOURCES.txt +36 -0
  28. modaryn-0.1.0/modaryn.egg-info/dependency_links.txt +1 -0
  29. modaryn-0.1.0/modaryn.egg-info/entry_points.txt +2 -0
  30. modaryn-0.1.0/modaryn.egg-info/requires.txt +11 -0
  31. modaryn-0.1.0/modaryn.egg-info/top_level.txt +1 -0
  32. modaryn-0.1.0/pyproject.toml +37 -0
  33. modaryn-0.1.0/setup.cfg +4 -0
  34. modaryn-0.1.0/tests/test_cli.py +413 -0
  35. modaryn-0.1.0/tests/test_dbtmodel.py +39 -0
  36. modaryn-0.1.0/tests/test_lineage.py +218 -0
  37. modaryn-0.1.0/tests/test_logo.py +48 -0
  38. modaryn-0.1.0/tests/test_scorer.py +179 -0
modaryn-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 yujikawa
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
modaryn-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,19 @@
1
+ Metadata-Version: 2.4
2
+ Name: modaryn
3
+ Version: 0.1.0
4
+ Summary: modaryn analyzes dbt projects to score model complexity and structural importance, helping teams identify high-risk and high-impact data models.
5
+ Author: yujikawa
6
+ License: MIT
7
+ Requires-Python: >=3.9
8
+ License-File: LICENSE
9
+ Requires-Dist: typer[all]
10
+ Requires-Dist: sqlglot
11
+ Requires-Dist: rich
12
+ Requires-Dist: pyyaml
13
+ Requires-Dist: jinja2
14
+ Requires-Dist: plotly
15
+ Requires-Dist: numpy
16
+ Requires-Dist: pandas
17
+ Provides-Extra: test
18
+ Requires-Dist: pytest; extra == "test"
19
+ Dynamic: license-file
@@ -0,0 +1,183 @@
1
+ # modaryn
2
+ ![modaryn](./docs/assets/header.png)
3
+
4
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
5
+ [![Python](https://img.shields.io/badge/python-3.9%2B-blue.svg)](https://www.python.org/)
6
+ [![dbt](https://img.shields.io/badge/dbt-compatible-orange.svg)](https://www.getdbt.com/)
7
+ [![sqlglot](https://img.shields.io/badge/powered%20by-sqlglot-blueviolet.svg)](https://github.com/tobymao/sqlglot)
8
+
9
+ modaryn analyzes dbt projects to score model complexity and structural importance,
10
+ helping teams identify high-risk and high-impact data models.
11
+
12
+ ### Overview
13
+ `modaryn` is a Python-based CLI tool that analyzes dbt projects and scores each model based on three pillars:
14
+
15
+ - **Complexity** — SQL metrics (JOINs, CTEs, conditionals, WHERE clauses, character count)
16
+ - **Importance** — Structural metrics (downstream model/column counts)
17
+ - **Quality** — Test coverage metrics (test count, column coverage %)
18
+
19
+ **Final score:** `raw_score = complexity_score + importance_score - quality_score` (higher = riskier)
20
+
21
+ The SQL dialect is auto-detected from `manifest.json`. Column-level lineage is traced via `sqlglot` to compute downstream column impact.
22
+
23
+ ### Installation
24
+ ```bash
25
+ uv pip install git+https://github.com/yujikawa/modaryn.git
26
+ ```
27
+
28
+ ### Usage
29
+
30
+ #### `score` command
31
+ Analyzes and scores all dbt models, displaying a combined scan and score report.
32
+
33
+ ```bash
34
+ modaryn score --project-path . --apply-zscore --format html --output report.html
35
+ ```
36
+
37
+ | Option | Short | Description | Default |
38
+ |--------|-------|-------------|---------|
39
+ | `--project-path` | `-p` | Path to the dbt project directory | `.` |
40
+ | `--dialect` | `-d` | SQL dialect (`bigquery`, `snowflake`, `duckdb`, etc.). Auto-detected from `manifest.json` if omitted. | auto |
41
+ | `--config` | `-c` | Path to a custom weights YAML file | `None` |
42
+ | `--apply-zscore` | `-z` | Apply Z-score normalization to scores | `False` |
43
+ | `--format` | `-f` | Output format: `terminal`, `markdown`, `html` | `terminal` |
44
+ | `--output` | `-o` | Path to write the output file | `None` |
45
+ | `--select` | `-s` | Filter models by selector (repeatable, OR logic) | `None` |
46
+ | `--verbose` | `-v` | Show detailed warnings (missing SQL, skipped columns) | `False` |
47
+
48
+ **`--select` selector syntax:**
49
+ ```bash
50
+ # Model name glob
51
+ modaryn score --project-path . --select "fct_*"
52
+
53
+ # Path prefix
54
+ modaryn score --project-path . --select path:marts/finance
55
+
56
+ # dbt tag
57
+ modaryn score --project-path . --select tag:daily
58
+
59
+ # Multiple selectors (OR logic)
60
+ modaryn score --project-path . --select path:marts/customer --select path:marts/finance
61
+ ```
62
+
63
+ ---
64
+
65
+ #### `ci-check` command
66
+ Checks model scores against a threshold for use in CI/CD pipelines. Exits with code `1` if any model exceeds the threshold, `0` otherwise.
67
+
68
+ ```bash
69
+ modaryn ci-check --project-path . --threshold 20.0 --apply-zscore
70
+ ```
71
+
72
+ | Option | Short | Description | Default |
73
+ |--------|-------|-------------|---------|
74
+ | `--project-path` | `-p` | Path to the dbt project directory | `.` |
75
+ | `--threshold` | `-t` | Maximum allowed score (**required**) | — |
76
+ | `--dialect` | `-d` | SQL dialect. Auto-detected if omitted. | auto |
77
+ | `--config` | `-c` | Path to a custom weights YAML file | `None` |
78
+ | `--apply-zscore` | `-z` | Check against Z-scores instead of raw scores | `False` |
79
+ | `--format` | `-f` | Output format: `terminal`, `markdown`, `html` | `terminal` |
80
+ | `--output` | `-o` | Path to write the output file | `None` |
81
+ | `--select` | `-s` | Filter models by selector (repeatable, OR logic) | `None` |
82
+ | `--verbose` | `-v` | Show detailed warnings | `False` |
83
+
84
+ ---
85
+
86
+ #### `impact` command
87
+ Traces all downstream columns affected by a change to a specific column (BFS column-level impact analysis).
88
+
89
+ ```bash
90
+ modaryn impact --project-path . --model fct_orders --column order_id
91
+ ```
92
+
93
+ | Option | Short | Description | Default |
94
+ |--------|-------|-------------|---------|
95
+ | `--project-path` | `-p` | Path to the dbt project directory | `.` |
96
+ | `--model` | `-m` | Model name to trace impact from (**required**) | — |
97
+ | `--column` | `-c` | Column name to trace impact from (**required**) | — |
98
+ | `--dialect` | `-d` | SQL dialect. Auto-detected if omitted. | auto |
99
+ | `--select` | `-s` | Filter models by selector (restricts lineage scope) | `None` |
100
+ | `--verbose` | `-v` | Show detailed warnings | `False` |
101
+
102
+ ---
103
+
104
+ ### Missing compiled SQL (N/A columns)
105
+
106
+ Complexity metrics require compiled SQL from `target/compiled/`. If `dbt compile` has not been run or a model failed to compile, those columns will show `N/A` in the report. A warning summary is printed at the end of the output. Use `--verbose` to see the full list of affected models.
107
+
108
+ ```
109
+ ⚠ 3 model(s) show N/A for complexity columns because compiled SQL was not found.
110
+ Run `dbt compile` to enable full analysis: model_a, model_b, model_c
111
+ ```
112
+
113
+ ---
114
+
115
+ ### Report Columns and Calculation Logic
116
+
117
+ #### 1. SQL Complexity Metrics
118
+
119
+ | Metric | Calculation | Example |
120
+ |--------|-------------|---------|
121
+ | **JOINs** | Count of all `JOIN` clauses | `JOIN`, `LEFT JOIN`, `CROSS JOIN` each count as 1 |
122
+ | **CTEs** | Count of all CTEs defined | `WITH a AS (...), b AS (...)` = 2 |
123
+ | **Conditionals** | Count of `IF` expressions (each `WHEN` branch in a `CASE`) | A `CASE WHEN ... WHEN ... END` with 2 branches = 2 |
124
+ | **WHEREs** | Count of `WHERE` clauses including subqueries | Main `WHERE` + subquery `WHERE` = 2 |
125
+ | **SQL Chars** | Total character count of the compiled SQL | — |
126
+
127
+ #### 2. Structural Importance Metrics
128
+
129
+ | Metric | Calculation | Example |
130
+ |--------|-------------|---------|
131
+ | **Downstream** | Number of dbt models that directly reference this model | Models B and C use A → A has **2** |
132
+ | **Col. Down** | Total count of downstream column references | B's `col1` and `col2` both reference A's `id` → **2** |
133
+
134
+ #### 3. Quality Metrics
135
+
136
+ | Metric | Calculation | Example |
137
+ |--------|-------------|---------|
138
+ | **Tests** | Total dbt tests attached to the model | 4 column tests → **4** |
139
+ | **Coverage (%)** | % of columns with at least one test | 8 of 10 columns tested → **80%** |
140
+
141
+ ---
142
+
143
+ ### Scoring Formula
144
+
145
+ 1. **Complexity Score** = `(JOINs × w1) + (CTEs × w2) + (Conditionals × w3) + (WHEREs × w4) + (Chars × w5)`
146
+ 2. **Importance Score** = `(Downstream Models × w6) + (Col. Down × w7)`
147
+ 3. **Quality Score** = `(Tests × w8) + (Coverage % × w9)`
148
+
149
+ **Raw Score** = `Complexity Score + Importance Score − Quality Score` (minimum 0)
150
+
151
+ #### Z-Score Normalization
152
+ When `--apply-zscore` is used:
153
+ `Z-Score = (Raw Score − Mean) / Standard Deviation`
154
+
155
+ ---
156
+
157
+ ### Custom Weights Configuration
158
+
159
+ Override default weights by passing a YAML file via `--config`:
160
+
161
+ ```yaml
162
+ sql_complexity:
163
+ join_count: 2.0
164
+ cte_count: 1.5
165
+ conditional_count: 1.0
166
+ where_count: 0.5
167
+ sql_char_count: 0.01
168
+
169
+ importance:
170
+ downstream_model_count: 1.0
171
+
172
+ quality:
173
+ test_count: 0.5
174
+ column_coverage: 1.0
175
+ ```
176
+
177
+ Unknown sections or keys are reported as warnings at runtime.
178
+
179
+ ---
180
+
181
+ ![modaryn](./docs/assets/result.png)
182
+
183
+ ![modaryn](./docs/assets/result2.png)
File without changes
File without changes
File without changes
File without changes
@@ -0,0 +1,140 @@
1
+ import warnings
2
+ from typing import Callable, Dict, List, Optional, Set
3
+ import sqlglot
4
+ from sqlglot import exp
5
+ from sqlglot.lineage import lineage
6
+ from modaryn.domain.model import DbtProject, ColumnReference, DbtModel
7
+
8
+
9
class LineageAnalyzer:
    """Traces column-level lineage between dbt models via sqlglot's lineage API."""

    def __init__(self, dialect: str = "bigquery"):
        # sqlglot dialect name used when parsing model SQL.
        self.dialect = dialect

    def analyze(self, project: DbtProject, on_progress: Optional[Callable[[int, int], None]] = None):
        """
        Analyzes column-level lineage for all models in the project.

        Populates each column's ``upstream_columns`` / ``downstream_columns``
        in place. ``on_progress`` is an optional ``callback(current, total)``
        invoked once per model as it is processed.
        """
        schema = self._build_schema(project)
        # Lowercased model name -> unique_id, for case-insensitive table lookup.
        table_to_id = {m.model_name.lower(): m.unique_id for m in project.models.values()}

        all_models = list(project.models.values())
        model_count = len(all_models)
        for position, current_model in enumerate(all_models, start=1):
            if on_progress:
                on_progress(position, model_count)
            if not current_model.raw_sql:
                # No compiled SQL available; lineage cannot be computed.
                continue

            for column_name in current_model.columns:
                try:
                    # Identifier quoting/casing differs per dialect, so try name
                    # variations ordered by likelihood to minimize failed attempts.
                    node = None
                    last_error = None
                    for candidate in self._get_column_variations(column_name):
                        try:
                            node = lineage(candidate, sql=current_model.raw_sql, schema=schema, dialect=self.dialect)
                        except Exception as err:
                            last_error = err
                            continue
                        if node:
                            break

                    if node:
                        self._extract_source_columns(current_model, column_name, node, table_to_id, project)
                    elif last_error:
                        warnings.warn(
                            f"Lineage unavailable for column '{column_name}' in model '{current_model.model_name}': {last_error}",
                            UserWarning,
                            stacklevel=2,
                        )
                except Exception as err:
                    # Best-effort: a single bad column must not abort the whole scan.
                    warnings.warn(
                        f"Lineage analysis failed for column '{column_name}' in model '{current_model.model_name}': {err}",
                        UserWarning,
                        stacklevel=2,
                    )
                    continue

    def _get_column_variations(self, column_name: str) -> List[str]:
        """Returns column name variations ordered by likelihood for the current dialect."""
        bare = column_name
        upper = column_name.upper()
        double_quoted = f'"{column_name}"'
        backticked = f'`{column_name}`'
        if self.dialect == "bigquery":
            # BigQuery quotes identifiers with backticks.
            return [backticked, bare, upper, double_quoted]
        if self.dialect in ("snowflake", "redshift"):
            # Snowflake/Redshift default unquoted identifiers to uppercase.
            return [upper, bare, double_quoted, backticked]
        return [bare, upper, double_quoted, backticked]

    def _build_schema(self, project: DbtProject) -> Dict:
        """
        Builds a sqlglot-compatible schema from the dbt project.

        Table and column names are lowercased so lookups succeed regardless
        of the casing used in the SQL.
        """
        return {
            m.model_name.lower(): {col.name.lower(): "UNKNOWN" for col in m.columns.values()}
            for m in project.models.values()
        }

    def _extract_source_columns(self, target_model: DbtModel, target_column_name: str, node, table_to_id: Dict[str, str], project: DbtProject):
        """
        Recursively finds the source columns from the lineage node and populates the project model.
        """
        seen = set()

        def visit(current):
            # Guard against revisiting shared nodes in the lineage graph.
            if id(current) in seen:
                return
            seen.add(id(current))

            # A node counts as a source reference either when its expression is
            # a Table, or when its dotted name encodes 'table.column'.
            table_name = None
            source_col_raw = None

            if isinstance(current.expression, exp.Table):
                raw_ident = current.expression.this
                if hasattr(raw_ident, 'name'):
                    table_name = raw_ident.name.lower().strip('"`')
                else:
                    table_name = str(raw_ident).lower().strip('"`')
                source_col_raw = current.name.split('.')[-1].lower().strip('"`')
            elif '.' in current.name:
                parts = current.name.split('.')
                table_name = parts[-2].lower().strip('"`')
                source_col_raw = parts[-1].lower().strip('"`')

            # Only record references to tables that are models of this dbt project.
            if table_name and table_name in table_to_id:
                source_model_id = table_to_id[table_name]
                source_model = project.models.get(source_model_id)

                if source_model:
                    # Map the normalized (lowercased) name back to the actual
                    # column name as declared on the source model.
                    actual_source_col = next(
                        (c for c in source_model.columns if c.lower() == source_col_raw),
                        None,
                    )

                    if actual_source_col:
                        # Append the up/down reference pair only once.
                        already_linked = any(
                            ref.model_unique_id == source_model_id and ref.column_name == actual_source_col
                            for ref in target_model.columns[target_column_name].upstream_columns
                        )
                        if not already_linked:
                            target_model.columns[target_column_name].upstream_columns.append(
                                ColumnReference(model_unique_id=source_model_id, column_name=actual_source_col)
                            )
                            source_model.columns[actual_source_col].downstream_columns.append(
                                ColumnReference(model_unique_id=target_model.unique_id, column_name=target_column_name)
                            )

            for child in current.downstream:
                visit(child)

        visit(node)
@@ -0,0 +1,48 @@
1
+ from dataclasses import dataclass
2
+ import sqlglot
3
+
4
+
5
@dataclass
class SqlComplexityResult:
    """Per-statement SQL complexity metrics produced by SqlComplexityAnalyzer."""

    # Number of JOIN clauses (any join type).
    join_count: int
    # Number of CTEs defined in WITH clauses.
    cte_count: int
    # Number of IF expressions (each CASE WHEN branch counts as one).
    conditional_count: int
    # Number of WHERE clauses, including those inside subqueries.
    where_count: int
    # Character count of the SQL text with spaces removed.
    sql_char_count: int
12
+
13
+
14
+ class SqlComplexityAnalyzer:
15
+ def __init__(self, dialect: str = "bigquery"):
16
+ self.dialect = dialect
17
+
18
+ def analyze(self, sql: str) -> SqlComplexityResult:
19
+ """
20
+ Analyzes the complexity of a SQL query.
21
+
22
+ Args:
23
+ sql: The SQL query string to analyze.
24
+
25
+ Returns:
26
+ A dictionary containing complexity metrics.
27
+ """
28
+ try:
29
+ expression = sqlglot.parse_one(sql, read=self.dialect)
30
+ except sqlglot.errors.ParseError as e:
31
+ # If sqlglot can't parse, return zero for all metrics
32
+ # We don't print warnings here to avoid polluting test output
33
+ return SqlComplexityResult(join_count=0, cte_count=0, conditional_count=0, where_count=0, sql_char_count=0)
34
+
35
+ join_count = len(list(expression.find_all(sqlglot.exp.Join)))
36
+ cte_count = len(list(expression.find_all(sqlglot.exp.CTE)))
37
+ conditional_count = len(list(expression.find_all(sqlglot.exp.If)))
38
+ where_count = len(list(expression.find_all(sqlglot.exp.Where)))
39
+ sql_char_count = len(sql.replace(' ', '').strip())
40
+
41
+
42
+ return SqlComplexityResult(
43
+ join_count=join_count,
44
+ cte_count=cte_count,
45
+ conditional_count=conditional_count,
46
+ where_count=where_count,
47
+ sql_char_count=sql_char_count
48
+ )
@@ -0,0 +1,6 @@
1
+ ███╗ ███╗ ██████╗ ██████╗ █████╗ ██████╗ ██╗ ██╗███╗ ██╗
2
+ ████╗ ████║██╔═══██╗██╔══██╗██╔══██╗██╔══██╗╚██╗ ██╔╝████╗ ██║
3
+ ██╔████╔██║██║ ██║██║ ██║███████║██████╔╝ ╚████╔╝ ██╔██╗ ██║
4
+ ██║╚██╔╝██║██║ ██║██║ ██║██╔══██║██╔══██╗ ╚██╔╝ ██║╚██╗██║
5
+ ██║ ╚═╝ ██║╚██████╔╝██████╔╝██║ ██║██║ ██║ ██║ ██║ ╚████║
6
+ ╚═╝ ╚═╝ ╚═════╝ ╚═════╝ ╚═╝ ╚═╝╚═╝ ╚═╝ ╚═╝ ╚═╝ ╚═══╝