modaryn 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. modaryn-0.1.0/LICENSE +21 -0
  2. modaryn-0.1.0/PKG-INFO +19 -0
  3. modaryn-0.1.0/README.md +183 -0
  4. modaryn-0.1.0/modaryn/__init__.py +0 -0
  5. modaryn-0.1.0/modaryn/analyzers/__init__.py +0 -0
  6. modaryn-0.1.0/modaryn/analyzers/dependency.py +0 -0
  7. modaryn-0.1.0/modaryn/analyzers/importance.py +0 -0
  8. modaryn-0.1.0/modaryn/analyzers/lineage.py +140 -0
  9. modaryn-0.1.0/modaryn/analyzers/sql_complexity.py +48 -0
  10. modaryn-0.1.0/modaryn/assets/logo.txt +6 -0
  11. modaryn-0.1.0/modaryn/cli.py +500 -0
  12. modaryn-0.1.0/modaryn/config/default.yml +16 -0
  13. modaryn-0.1.0/modaryn/domain/__init__.py +0 -0
  14. modaryn-0.1.0/modaryn/domain/model.py +106 -0
  15. modaryn-0.1.0/modaryn/loaders/__init__.py +0 -0
  16. modaryn-0.1.0/modaryn/loaders/manifest.py +156 -0
  17. modaryn-0.1.0/modaryn/loaders/models.py +0 -0
  18. modaryn-0.1.0/modaryn/outputs/__init__.py +26 -0
  19. modaryn-0.1.0/modaryn/outputs/graph.py +70 -0
  20. modaryn-0.1.0/modaryn/outputs/html.py +408 -0
  21. modaryn-0.1.0/modaryn/outputs/logo.py +21 -0
  22. modaryn-0.1.0/modaryn/outputs/markdown.py +54 -0
  23. modaryn-0.1.0/modaryn/outputs/terminal.py +98 -0
  24. modaryn-0.1.0/modaryn/scorers/__init__.py +0 -0
  25. modaryn-0.1.0/modaryn/scorers/score.py +119 -0
  26. modaryn-0.1.0/modaryn.egg-info/PKG-INFO +19 -0
  27. modaryn-0.1.0/modaryn.egg-info/SOURCES.txt +36 -0
  28. modaryn-0.1.0/modaryn.egg-info/dependency_links.txt +1 -0
  29. modaryn-0.1.0/modaryn.egg-info/entry_points.txt +2 -0
  30. modaryn-0.1.0/modaryn.egg-info/requires.txt +11 -0
  31. modaryn-0.1.0/modaryn.egg-info/top_level.txt +1 -0
  32. modaryn-0.1.0/pyproject.toml +37 -0
  33. modaryn-0.1.0/setup.cfg +4 -0
  34. modaryn-0.1.0/tests/test_cli.py +413 -0
  35. modaryn-0.1.0/tests/test_dbtmodel.py +39 -0
  36. modaryn-0.1.0/tests/test_lineage.py +218 -0
  37. modaryn-0.1.0/tests/test_logo.py +48 -0
  38. modaryn-0.1.0/tests/test_scorer.py +179 -0
modaryn-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 yujikawa
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
modaryn-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,19 @@
1
+ Metadata-Version: 2.4
2
+ Name: modaryn
3
+ Version: 0.1.0
4
+ Summary: modaryn analyzes dbt projects to score model complexity and structural importance, helping teams identify high-risk and high-impact data models.
5
+ Author: yujikawa
6
+ License: MIT
7
+ Requires-Python: >=3.9
8
+ License-File: LICENSE
9
+ Requires-Dist: typer[all]
10
+ Requires-Dist: sqlglot
11
+ Requires-Dist: rich
12
+ Requires-Dist: pyyaml
13
+ Requires-Dist: jinja2
14
+ Requires-Dist: plotly
15
+ Requires-Dist: numpy
16
+ Requires-Dist: pandas
17
+ Provides-Extra: test
18
+ Requires-Dist: pytest; extra == "test"
19
+ Dynamic: license-file
@@ -0,0 +1,183 @@
1
+ # modaryn
2
+ ![modaryn](./docs/assets/header.png)
3
+
4
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
5
+ [![Python](https://img.shields.io/badge/python-3.9%2B-blue.svg)](https://www.python.org/)
6
+ [![dbt](https://img.shields.io/badge/dbt-compatible-orange.svg)](https://www.getdbt.com/)
7
+ [![sqlglot](https://img.shields.io/badge/powered%20by-sqlglot-blueviolet.svg)](https://github.com/tobymao/sqlglot)
8
+
9
+ modaryn analyzes dbt projects to score model complexity and structural importance,
10
+ helping teams identify high-risk and high-impact data models.
11
+
12
+ ### Overview
13
+ `modaryn` is a Python-based CLI tool that analyzes dbt projects and scores each model based on three pillars:
14
+
15
+ - **Complexity** — SQL metrics (JOINs, CTEs, conditionals, WHERE clauses, character count)
16
+ - **Importance** — Structural metrics (downstream model/column counts)
17
+ - **Quality** — Test coverage metrics (test count, column coverage %)
18
+
19
+ **Final score:** `raw_score = complexity_score + importance_score - quality_score` (higher = riskier)
20
+
21
+ The SQL dialect is auto-detected from `manifest.json`. Column-level lineage is traced via `sqlglot` to compute downstream column impact.
22
+
23
+ ### Installation
24
+ ```bash
25
+ uv pip install git+https://github.com/yujikawa/modaryn.git
26
+ ```
27
+
28
+ ### Usage
29
+
30
+ #### `score` command
31
+ Analyzes and scores all dbt models, displaying a combined scan and score report.
32
+
33
+ ```bash
34
+ modaryn score --project-path . --apply-zscore --format html --output report.html
35
+ ```
36
+
37
+ | Option | Short | Description | Default |
38
+ |--------|-------|-------------|---------|
39
+ | `--project-path` | `-p` | Path to the dbt project directory | `.` |
40
+ | `--dialect` | `-d` | SQL dialect (`bigquery`, `snowflake`, `duckdb`, etc.). Auto-detected from `manifest.json` if omitted. | auto |
41
+ | `--config` | `-c` | Path to a custom weights YAML file | `None` |
42
+ | `--apply-zscore` | `-z` | Apply Z-score normalization to scores | `False` |
43
+ | `--format` | `-f` | Output format: `terminal`, `markdown`, `html` | `terminal` |
44
+ | `--output` | `-o` | Path to write the output file | `None` |
45
+ | `--select` | `-s` | Filter models by selector (repeatable, OR logic) | `None` |
46
+ | `--verbose` | `-v` | Show detailed warnings (missing SQL, skipped columns) | `False` |
47
+
48
+ **`--select` selector syntax:**
49
+ ```bash
50
+ # Model name glob
51
+ modaryn score --project-path . --select "fct_*"
52
+
53
+ # Path prefix
54
+ modaryn score --project-path . --select path:marts/finance
55
+
56
+ # dbt tag
57
+ modaryn score --project-path . --select tag:daily
58
+
59
+ # Multiple selectors (OR logic)
60
+ modaryn score --project-path . --select path:marts/customer --select path:marts/finance
61
+ ```
62
+
63
+ ---
64
+
65
+ #### `ci-check` command
66
+ Checks model scores against a threshold for use in CI/CD pipelines. Exits with code `1` if any model exceeds the threshold, `0` otherwise.
67
+
68
+ ```bash
69
+ modaryn ci-check --project-path . --threshold 20.0 --apply-zscore
70
+ ```
71
+
72
+ | Option | Short | Description | Default |
73
+ |--------|-------|-------------|---------|
74
+ | `--project-path` | `-p` | Path to the dbt project directory | `.` |
75
+ | `--threshold` | `-t` | Maximum allowed score (**required**) | — |
76
+ | `--dialect` | `-d` | SQL dialect. Auto-detected if omitted. | auto |
77
+ | `--config` | `-c` | Path to a custom weights YAML file | `None` |
78
+ | `--apply-zscore` | `-z` | Check against Z-scores instead of raw scores | `False` |
79
+ | `--format` | `-f` | Output format: `terminal`, `markdown`, `html` | `terminal` |
80
+ | `--output` | `-o` | Path to write the output file | `None` |
81
+ | `--select` | `-s` | Filter models by selector (repeatable, OR logic) | `None` |
82
+ | `--verbose` | `-v` | Show detailed warnings | `False` |
83
+
84
+ ---
85
+
86
+ #### `impact` command
87
+ Traces all downstream columns affected by a change to a specific column (BFS column-level impact analysis).
88
+
89
+ ```bash
90
+ modaryn impact --project-path . --model fct_orders --column order_id
91
+ ```
92
+
93
+ | Option | Short | Description | Default |
94
+ |--------|-------|-------------|---------|
95
+ | `--project-path` | `-p` | Path to the dbt project directory | `.` |
96
+ | `--model` | `-m` | Model name to trace impact from (**required**) | — |
97
+ | `--column` | `-c` | Column name to trace impact from (**required**) | — |
98
+ | `--dialect` | `-d` | SQL dialect. Auto-detected if omitted. | auto |
99
+ | `--select` | `-s` | Filter models by selector (restricts lineage scope) | `None` |
100
+ | `--verbose` | `-v` | Show detailed warnings | `False` |
101
+
102
+ ---
103
+
104
+ ### Missing compiled SQL (N/A columns)
105
+
106
+ Complexity metrics require compiled SQL from `target/compiled/`. If `dbt compile` has not been run or a model failed to compile, those columns will show `N/A` in the report. A warning summary is printed at the end of the output. Use `--verbose` to see the full list of affected models.
107
+
108
+ ```
109
+ ⚠ 3 model(s) show N/A for complexity columns because compiled SQL was not found.
110
+ Run `dbt compile` to enable full analysis: model_a, model_b, model_c
111
+ ```
112
+
113
+ ---
114
+
115
+ ### Report Columns and Calculation Logic
116
+
117
+ #### 1. SQL Complexity Metrics
118
+
119
+ | Metric | Calculation | Example |
120
+ |--------|-------------|---------|
121
+ | **JOINs** | Count of all `JOIN` clauses | `JOIN`, `LEFT JOIN`, `CROSS JOIN` each count as 1 |
122
+ | **CTEs** | Count of all CTEs defined | `WITH a AS (...), b AS (...)` = 2 |
123
+ | **Conditionals** | Count of `IF` expressions (each `WHEN` branch in a `CASE`) | A `CASE WHEN ... WHEN ... END` with 2 branches = 2 |
124
+ | **WHEREs** | Count of `WHERE` clauses including subqueries | Main `WHERE` + subquery `WHERE` = 2 |
125
+ | **SQL Chars** | Total character count of the compiled SQL | — |
126
+
127
+ #### 2. Structural Importance Metrics
128
+
129
+ | Metric | Calculation | Example |
130
+ |--------|-------------|---------|
131
+ | **Downstream** | Number of dbt models that directly reference this model | Models B and C use A → A has **2** |
132
+ | **Col. Down** | Total count of downstream column references | B's `col1` and `col2` both reference A's `id` → **2** |
133
+
134
+ #### 3. Quality Metrics
135
+
136
+ | Metric | Calculation | Example |
137
+ |--------|-------------|---------|
138
+ | **Tests** | Total dbt tests attached to the model | 4 column tests → **4** |
139
+ | **Coverage (%)** | % of columns with at least one test | 8 of 10 columns tested → **80%** |
140
+
141
+ ---
142
+
143
+ ### Scoring Formula
144
+
145
+ 1. **Complexity Score** = `(JOINs × w1) + (CTEs × w2) + (Conditionals × w3) + (WHEREs × w4) + (Chars × w5)`
146
+ 2. **Importance Score** = `(Downstream Models × w6) + (Col. Down × w7)`
147
+ 3. **Quality Score** = `(Tests × w8) + (Coverage % × w9)`
148
+
149
+ **Raw Score** = `Complexity Score + Importance Score − Quality Score` (minimum 0)
150
+
151
+ #### Z-Score Normalization
152
+ When `--apply-zscore` is used:
153
+ `Z-Score = (Raw Score − Mean) / Standard Deviation`
154
+
155
+ ---
156
+
157
+ ### Custom Weights Configuration
158
+
159
+ Override default weights by passing a YAML file via `--config`:
160
+
161
+ ```yaml
162
+ sql_complexity:
163
+ join_count: 2.0
164
+ cte_count: 1.5
165
+ conditional_count: 1.0
166
+ where_count: 0.5
167
+ sql_char_count: 0.01
168
+
169
+ importance:
170
+ downstream_model_count: 1.0
171
+
172
+ quality:
173
+ test_count: 0.5
174
+ column_coverage: 1.0
175
+ ```
176
+
177
+ Unknown sections or keys are reported as warnings at runtime.
178
+
179
+ ---
180
+
181
+ ![modaryn](./docs/assets/result.png)
182
+
183
+ ![modaryn](./docs/assets/result2.png)
File without changes
File without changes
File without changes
File without changes
@@ -0,0 +1,140 @@
1
+ import warnings
2
+ from typing import Callable, Dict, List, Optional, Set
3
+ import sqlglot
4
+ from sqlglot import exp
5
+ from sqlglot.lineage import lineage
6
+ from modaryn.domain.model import DbtProject, ColumnReference, DbtModel
7
+
8
+
9
class LineageAnalyzer:
    """Traces column-level lineage between dbt models via sqlglot's lineage API."""

    def __init__(self, dialect: str = "bigquery"):
        # sqlglot dialect name used when parsing model SQL.
        self.dialect = dialect

    def analyze(self, project: DbtProject, on_progress: Optional[Callable[[int, int], None]] = None):
        """
        Analyzes column-level lineage for all models in the project.

        Populates each column's ``upstream_columns`` / ``downstream_columns``
        in place. ``on_progress`` is an optional ``callback(current, total)``
        invoked once per model as it is processed.
        """
        schema = self._build_schema(project)
        # Lowercased model name -> unique_id, for case-insensitive table lookup.
        table_to_id = {m.model_name.lower(): m.unique_id for m in project.models.values()}

        all_models = list(project.models.values())
        model_count = len(all_models)
        for position, current_model in enumerate(all_models, start=1):
            if on_progress:
                on_progress(position, model_count)
            if not current_model.raw_sql:
                # No compiled SQL available; lineage cannot be computed.
                continue

            for column_name in current_model.columns:
                try:
                    # Identifier quoting/casing differs per dialect, so try name
                    # variations ordered by likelihood to minimize failed attempts.
                    node = None
                    last_error = None
                    for candidate in self._get_column_variations(column_name):
                        try:
                            node = lineage(candidate, sql=current_model.raw_sql, schema=schema, dialect=self.dialect)
                        except Exception as err:
                            last_error = err
                            continue
                        if node:
                            break

                    if node:
                        self._extract_source_columns(current_model, column_name, node, table_to_id, project)
                    elif last_error:
                        warnings.warn(
                            f"Lineage unavailable for column '{column_name}' in model '{current_model.model_name}': {last_error}",
                            UserWarning,
                            stacklevel=2,
                        )
                except Exception as err:
                    # Best-effort: a single bad column must not abort the whole scan.
                    warnings.warn(
                        f"Lineage analysis failed for column '{column_name}' in model '{current_model.model_name}': {err}",
                        UserWarning,
                        stacklevel=2,
                    )
                    continue

    def _get_column_variations(self, column_name: str) -> List[str]:
        """Returns column name variations ordered by likelihood for the current dialect."""
        bare = column_name
        upper = column_name.upper()
        double_quoted = f'"{column_name}"'
        backticked = f'`{column_name}`'
        if self.dialect == "bigquery":
            # BigQuery quotes identifiers with backticks.
            return [backticked, bare, upper, double_quoted]
        if self.dialect in ("snowflake", "redshift"):
            # Snowflake/Redshift default unquoted identifiers to uppercase.
            return [upper, bare, double_quoted, backticked]
        return [bare, upper, double_quoted, backticked]

    def _build_schema(self, project: DbtProject) -> Dict:
        """
        Builds a sqlglot-compatible schema from the dbt project.

        Table and column names are lowercased so lookups succeed regardless
        of the casing used in the SQL.
        """
        return {
            m.model_name.lower(): {col.name.lower(): "UNKNOWN" for col in m.columns.values()}
            for m in project.models.values()
        }

    def _extract_source_columns(self, target_model: DbtModel, target_column_name: str, node, table_to_id: Dict[str, str], project: DbtProject):
        """
        Recursively finds the source columns from the lineage node and populates the project model.
        """
        seen = set()

        def visit(current):
            # Guard against revisiting shared nodes in the lineage graph.
            if id(current) in seen:
                return
            seen.add(id(current))

            # A node counts as a source reference either when its expression is
            # a Table, or when its dotted name encodes 'table.column'.
            table_name = None
            source_col_raw = None

            if isinstance(current.expression, exp.Table):
                raw_ident = current.expression.this
                if hasattr(raw_ident, 'name'):
                    table_name = raw_ident.name.lower().strip('"`')
                else:
                    table_name = str(raw_ident).lower().strip('"`')
                source_col_raw = current.name.split('.')[-1].lower().strip('"`')
            elif '.' in current.name:
                parts = current.name.split('.')
                table_name = parts[-2].lower().strip('"`')
                source_col_raw = parts[-1].lower().strip('"`')

            # Only record references to tables that are models of this dbt project.
            if table_name and table_name in table_to_id:
                source_model_id = table_to_id[table_name]
                source_model = project.models.get(source_model_id)

                if source_model:
                    # Map the normalized (lowercased) name back to the actual
                    # column name as declared on the source model.
                    actual_source_col = next(
                        (c for c in source_model.columns if c.lower() == source_col_raw),
                        None,
                    )

                    if actual_source_col:
                        # Append the up/down reference pair only once.
                        already_linked = any(
                            ref.model_unique_id == source_model_id and ref.column_name == actual_source_col
                            for ref in target_model.columns[target_column_name].upstream_columns
                        )
                        if not already_linked:
                            target_model.columns[target_column_name].upstream_columns.append(
                                ColumnReference(model_unique_id=source_model_id, column_name=actual_source_col)
                            )
                            source_model.columns[actual_source_col].downstream_columns.append(
                                ColumnReference(model_unique_id=target_model.unique_id, column_name=target_column_name)
                            )

            for child in current.downstream:
                visit(child)

        visit(node)
@@ -0,0 +1,48 @@
1
+ from dataclasses import dataclass
2
+ import sqlglot
3
+
4
+
5
@dataclass
class SqlComplexityResult:
    """Per-statement SQL complexity metrics produced by SqlComplexityAnalyzer."""

    # Number of JOIN clauses (any join type).
    join_count: int
    # Number of CTEs defined in WITH clauses.
    cte_count: int
    # Number of IF expressions (each CASE WHEN branch counts as one).
    conditional_count: int
    # Number of WHERE clauses, including those inside subqueries.
    where_count: int
    # Character count of the SQL text with spaces removed.
    sql_char_count: int
12
+
13
+
14
+ class SqlComplexityAnalyzer:
15
+ def __init__(self, dialect: str = "bigquery"):
16
+ self.dialect = dialect
17
+
18
+ def analyze(self, sql: str) -> SqlComplexityResult:
19
+ """
20
+ Analyzes the complexity of a SQL query.
21
+
22
+ Args:
23
+ sql: The SQL query string to analyze.
24
+
25
+ Returns:
26
+ A dictionary containing complexity metrics.
27
+ """
28
+ try:
29
+ expression = sqlglot.parse_one(sql, read=self.dialect)
30
+ except sqlglot.errors.ParseError as e:
31
+ # If sqlglot can't parse, return zero for all metrics
32
+ # We don't print warnings here to avoid polluting test output
33
+ return SqlComplexityResult(join_count=0, cte_count=0, conditional_count=0, where_count=0, sql_char_count=0)
34
+
35
+ join_count = len(list(expression.find_all(sqlglot.exp.Join)))
36
+ cte_count = len(list(expression.find_all(sqlglot.exp.CTE)))
37
+ conditional_count = len(list(expression.find_all(sqlglot.exp.If)))
38
+ where_count = len(list(expression.find_all(sqlglot.exp.Where)))
39
+ sql_char_count = len(sql.replace(' ', '').strip())
40
+
41
+
42
+ return SqlComplexityResult(
43
+ join_count=join_count,
44
+ cte_count=cte_count,
45
+ conditional_count=conditional_count,
46
+ where_count=where_count,
47
+ sql_char_count=sql_char_count
48
+ )
@@ -0,0 +1,6 @@
1
+ ███╗ ███╗ ██████╗ ██████╗ █████╗ ██████╗ ██╗ ██╗███╗ ██╗
2
+ ████╗ ████║██╔═══██╗██╔══██╗██╔══██╗██╔══██╗╚██╗ ██╔╝████╗ ██║
3
+ ██╔████╔██║██║ ██║██║ ██║███████║██████╔╝ ╚████╔╝ ██╔██╗ ██║
4
+ ██║╚██╔╝██║██║ ██║██║ ██║██╔══██║██╔══██╗ ╚██╔╝ ██║╚██╗██║
5
+ ██║ ╚═╝ ██║╚██████╔╝██████╔╝██║ ██║██║ ██║ ██║ ██║ ╚████║
6
+ ╚═╝ ╚═╝ ╚═════╝ ╚═════╝ ╚═╝ ╚═╝╚═╝ ╚═╝ ╚═╝ ╚═╝ ╚═══╝