tree-sitter-analyzer 1.9.17.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tree_sitter_analyzer/__init__.py +132 -0
- tree_sitter_analyzer/__main__.py +11 -0
- tree_sitter_analyzer/api.py +853 -0
- tree_sitter_analyzer/cli/__init__.py +39 -0
- tree_sitter_analyzer/cli/__main__.py +12 -0
- tree_sitter_analyzer/cli/argument_validator.py +89 -0
- tree_sitter_analyzer/cli/commands/__init__.py +26 -0
- tree_sitter_analyzer/cli/commands/advanced_command.py +226 -0
- tree_sitter_analyzer/cli/commands/base_command.py +181 -0
- tree_sitter_analyzer/cli/commands/default_command.py +18 -0
- tree_sitter_analyzer/cli/commands/find_and_grep_cli.py +188 -0
- tree_sitter_analyzer/cli/commands/list_files_cli.py +133 -0
- tree_sitter_analyzer/cli/commands/partial_read_command.py +139 -0
- tree_sitter_analyzer/cli/commands/query_command.py +109 -0
- tree_sitter_analyzer/cli/commands/search_content_cli.py +161 -0
- tree_sitter_analyzer/cli/commands/structure_command.py +156 -0
- tree_sitter_analyzer/cli/commands/summary_command.py +116 -0
- tree_sitter_analyzer/cli/commands/table_command.py +414 -0
- tree_sitter_analyzer/cli/info_commands.py +124 -0
- tree_sitter_analyzer/cli_main.py +472 -0
- tree_sitter_analyzer/constants.py +85 -0
- tree_sitter_analyzer/core/__init__.py +15 -0
- tree_sitter_analyzer/core/analysis_engine.py +580 -0
- tree_sitter_analyzer/core/cache_service.py +333 -0
- tree_sitter_analyzer/core/engine.py +585 -0
- tree_sitter_analyzer/core/parser.py +293 -0
- tree_sitter_analyzer/core/query.py +605 -0
- tree_sitter_analyzer/core/query_filter.py +200 -0
- tree_sitter_analyzer/core/query_service.py +340 -0
- tree_sitter_analyzer/encoding_utils.py +530 -0
- tree_sitter_analyzer/exceptions.py +747 -0
- tree_sitter_analyzer/file_handler.py +246 -0
- tree_sitter_analyzer/formatters/__init__.py +1 -0
- tree_sitter_analyzer/formatters/base_formatter.py +201 -0
- tree_sitter_analyzer/formatters/csharp_formatter.py +367 -0
- tree_sitter_analyzer/formatters/formatter_config.py +197 -0
- tree_sitter_analyzer/formatters/formatter_factory.py +84 -0
- tree_sitter_analyzer/formatters/formatter_registry.py +377 -0
- tree_sitter_analyzer/formatters/formatter_selector.py +96 -0
- tree_sitter_analyzer/formatters/go_formatter.py +368 -0
- tree_sitter_analyzer/formatters/html_formatter.py +498 -0
- tree_sitter_analyzer/formatters/java_formatter.py +423 -0
- tree_sitter_analyzer/formatters/javascript_formatter.py +611 -0
- tree_sitter_analyzer/formatters/kotlin_formatter.py +268 -0
- tree_sitter_analyzer/formatters/language_formatter_factory.py +123 -0
- tree_sitter_analyzer/formatters/legacy_formatter_adapters.py +228 -0
- tree_sitter_analyzer/formatters/markdown_formatter.py +725 -0
- tree_sitter_analyzer/formatters/php_formatter.py +301 -0
- tree_sitter_analyzer/formatters/python_formatter.py +830 -0
- tree_sitter_analyzer/formatters/ruby_formatter.py +278 -0
- tree_sitter_analyzer/formatters/rust_formatter.py +233 -0
- tree_sitter_analyzer/formatters/sql_formatter_wrapper.py +689 -0
- tree_sitter_analyzer/formatters/sql_formatters.py +536 -0
- tree_sitter_analyzer/formatters/typescript_formatter.py +543 -0
- tree_sitter_analyzer/formatters/yaml_formatter.py +462 -0
- tree_sitter_analyzer/interfaces/__init__.py +9 -0
- tree_sitter_analyzer/interfaces/cli.py +535 -0
- tree_sitter_analyzer/interfaces/cli_adapter.py +359 -0
- tree_sitter_analyzer/interfaces/mcp_adapter.py +224 -0
- tree_sitter_analyzer/interfaces/mcp_server.py +428 -0
- tree_sitter_analyzer/language_detector.py +553 -0
- tree_sitter_analyzer/language_loader.py +271 -0
- tree_sitter_analyzer/languages/__init__.py +10 -0
- tree_sitter_analyzer/languages/csharp_plugin.py +1076 -0
- tree_sitter_analyzer/languages/css_plugin.py +449 -0
- tree_sitter_analyzer/languages/go_plugin.py +836 -0
- tree_sitter_analyzer/languages/html_plugin.py +496 -0
- tree_sitter_analyzer/languages/java_plugin.py +1299 -0
- tree_sitter_analyzer/languages/javascript_plugin.py +1622 -0
- tree_sitter_analyzer/languages/kotlin_plugin.py +656 -0
- tree_sitter_analyzer/languages/markdown_plugin.py +1928 -0
- tree_sitter_analyzer/languages/php_plugin.py +862 -0
- tree_sitter_analyzer/languages/python_plugin.py +1636 -0
- tree_sitter_analyzer/languages/ruby_plugin.py +757 -0
- tree_sitter_analyzer/languages/rust_plugin.py +673 -0
- tree_sitter_analyzer/languages/sql_plugin.py +2444 -0
- tree_sitter_analyzer/languages/typescript_plugin.py +1892 -0
- tree_sitter_analyzer/languages/yaml_plugin.py +695 -0
- tree_sitter_analyzer/legacy_table_formatter.py +860 -0
- tree_sitter_analyzer/mcp/__init__.py +34 -0
- tree_sitter_analyzer/mcp/resources/__init__.py +43 -0
- tree_sitter_analyzer/mcp/resources/code_file_resource.py +208 -0
- tree_sitter_analyzer/mcp/resources/project_stats_resource.py +586 -0
- tree_sitter_analyzer/mcp/server.py +869 -0
- tree_sitter_analyzer/mcp/tools/__init__.py +28 -0
- tree_sitter_analyzer/mcp/tools/analyze_scale_tool.py +779 -0
- tree_sitter_analyzer/mcp/tools/analyze_scale_tool_cli_compatible.py +291 -0
- tree_sitter_analyzer/mcp/tools/base_tool.py +139 -0
- tree_sitter_analyzer/mcp/tools/fd_rg_utils.py +816 -0
- tree_sitter_analyzer/mcp/tools/find_and_grep_tool.py +686 -0
- tree_sitter_analyzer/mcp/tools/list_files_tool.py +413 -0
- tree_sitter_analyzer/mcp/tools/output_format_validator.py +148 -0
- tree_sitter_analyzer/mcp/tools/query_tool.py +443 -0
- tree_sitter_analyzer/mcp/tools/read_partial_tool.py +464 -0
- tree_sitter_analyzer/mcp/tools/search_content_tool.py +836 -0
- tree_sitter_analyzer/mcp/tools/table_format_tool.py +572 -0
- tree_sitter_analyzer/mcp/tools/universal_analyze_tool.py +653 -0
- tree_sitter_analyzer/mcp/utils/__init__.py +113 -0
- tree_sitter_analyzer/mcp/utils/error_handler.py +569 -0
- tree_sitter_analyzer/mcp/utils/file_output_factory.py +217 -0
- tree_sitter_analyzer/mcp/utils/file_output_manager.py +322 -0
- tree_sitter_analyzer/mcp/utils/gitignore_detector.py +358 -0
- tree_sitter_analyzer/mcp/utils/path_resolver.py +414 -0
- tree_sitter_analyzer/mcp/utils/search_cache.py +343 -0
- tree_sitter_analyzer/models.py +840 -0
- tree_sitter_analyzer/mypy_current_errors.txt +2 -0
- tree_sitter_analyzer/output_manager.py +255 -0
- tree_sitter_analyzer/platform_compat/__init__.py +3 -0
- tree_sitter_analyzer/platform_compat/adapter.py +324 -0
- tree_sitter_analyzer/platform_compat/compare.py +224 -0
- tree_sitter_analyzer/platform_compat/detector.py +67 -0
- tree_sitter_analyzer/platform_compat/fixtures.py +228 -0
- tree_sitter_analyzer/platform_compat/profiles.py +217 -0
- tree_sitter_analyzer/platform_compat/record.py +55 -0
- tree_sitter_analyzer/platform_compat/recorder.py +155 -0
- tree_sitter_analyzer/platform_compat/report.py +92 -0
- tree_sitter_analyzer/plugins/__init__.py +280 -0
- tree_sitter_analyzer/plugins/base.py +647 -0
- tree_sitter_analyzer/plugins/manager.py +384 -0
- tree_sitter_analyzer/project_detector.py +328 -0
- tree_sitter_analyzer/queries/__init__.py +27 -0
- tree_sitter_analyzer/queries/csharp.py +216 -0
- tree_sitter_analyzer/queries/css.py +615 -0
- tree_sitter_analyzer/queries/go.py +275 -0
- tree_sitter_analyzer/queries/html.py +543 -0
- tree_sitter_analyzer/queries/java.py +402 -0
- tree_sitter_analyzer/queries/javascript.py +724 -0
- tree_sitter_analyzer/queries/kotlin.py +192 -0
- tree_sitter_analyzer/queries/markdown.py +258 -0
- tree_sitter_analyzer/queries/php.py +95 -0
- tree_sitter_analyzer/queries/python.py +859 -0
- tree_sitter_analyzer/queries/ruby.py +92 -0
- tree_sitter_analyzer/queries/rust.py +223 -0
- tree_sitter_analyzer/queries/sql.py +555 -0
- tree_sitter_analyzer/queries/typescript.py +871 -0
- tree_sitter_analyzer/queries/yaml.py +236 -0
- tree_sitter_analyzer/query_loader.py +272 -0
- tree_sitter_analyzer/security/__init__.py +22 -0
- tree_sitter_analyzer/security/boundary_manager.py +277 -0
- tree_sitter_analyzer/security/regex_checker.py +297 -0
- tree_sitter_analyzer/security/validator.py +599 -0
- tree_sitter_analyzer/table_formatter.py +782 -0
- tree_sitter_analyzer/utils/__init__.py +53 -0
- tree_sitter_analyzer/utils/logging.py +433 -0
- tree_sitter_analyzer/utils/tree_sitter_compat.py +289 -0
- tree_sitter_analyzer-1.9.17.1.dist-info/METADATA +485 -0
- tree_sitter_analyzer-1.9.17.1.dist-info/RECORD +149 -0
- tree_sitter_analyzer-1.9.17.1.dist-info/WHEEL +4 -0
- tree_sitter_analyzer-1.9.17.1.dist-info/entry_points.txt +25 -0
|
@@ -0,0 +1,2444 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
SQL Language Plugin
|
|
4
|
+
|
|
5
|
+
Provides SQL-specific parsing and element extraction functionality.
|
|
6
|
+
Supports extraction of tables, views, stored procedures, functions, triggers, and indexes.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from collections.abc import Iterator
|
|
10
|
+
from typing import TYPE_CHECKING, Any
|
|
11
|
+
|
|
12
|
+
if TYPE_CHECKING:
|
|
13
|
+
import tree_sitter
|
|
14
|
+
|
|
15
|
+
from ..core.analysis_engine import AnalysisRequest
|
|
16
|
+
from ..models import AnalysisResult
|
|
17
|
+
|
|
18
|
+
try:
|
|
19
|
+
import tree_sitter
|
|
20
|
+
|
|
21
|
+
TREE_SITTER_AVAILABLE = True
|
|
22
|
+
except ImportError:
|
|
23
|
+
TREE_SITTER_AVAILABLE = False
|
|
24
|
+
|
|
25
|
+
from ..encoding_utils import extract_text_slice, safe_encode
|
|
26
|
+
from ..models import (
|
|
27
|
+
Class,
|
|
28
|
+
Function,
|
|
29
|
+
Import,
|
|
30
|
+
SQLColumn,
|
|
31
|
+
SQLConstraint,
|
|
32
|
+
SQLElement,
|
|
33
|
+
SQLFunction,
|
|
34
|
+
SQLIndex,
|
|
35
|
+
SQLParameter,
|
|
36
|
+
SQLProcedure,
|
|
37
|
+
SQLTable,
|
|
38
|
+
SQLTrigger,
|
|
39
|
+
SQLView,
|
|
40
|
+
Variable,
|
|
41
|
+
)
|
|
42
|
+
from ..platform_compat.adapter import CompatibilityAdapter
|
|
43
|
+
from ..platform_compat.detector import PlatformDetector
|
|
44
|
+
from ..platform_compat.profiles import BehaviorProfile
|
|
45
|
+
from ..plugins.base import ElementExtractor, LanguagePlugin
|
|
46
|
+
from ..utils import log_debug, log_error
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
class SQLElementExtractor(ElementExtractor):
|
|
50
|
+
"""
|
|
51
|
+
SQL-specific element extractor.
|
|
52
|
+
|
|
53
|
+
This extractor parses SQL AST and extracts database elements, mapping them
|
|
54
|
+
to the unified element model:
|
|
55
|
+
- Tables and Views → Class elements
|
|
56
|
+
- Stored Procedures, Functions, Triggers → Function elements
|
|
57
|
+
- Indexes → Variable elements
|
|
58
|
+
- Schema references → Import elements
|
|
59
|
+
|
|
60
|
+
The extractor handles standard SQL (ANSI SQL) syntax and supports
|
|
61
|
+
CREATE TABLE, CREATE VIEW, CREATE PROCEDURE, CREATE FUNCTION,
|
|
62
|
+
CREATE TRIGGER, and CREATE INDEX statements.
|
|
63
|
+
"""
|
|
64
|
+
|
|
65
|
+
def __init__(self, diagnostic_mode: bool = False) -> None:
    """Create a new SQL element extractor.

    Args:
        diagnostic_mode: When True, emit extra debug logging around
            platform-compatibility adaptation steps.
    """
    super().__init__()

    # Raw source under analysis; populated by each extract_* entry point.
    self.source_code: str = ""
    self.content_lines: list[str] = []

    # Diagnostic / platform bookkeeping.
    self.diagnostic_mode = diagnostic_mode
    self.platform_info = None

    # Memoization: node id -> extracted text, plus the set of node ids
    # already handled, so large trees are not re-walked redundantly.
    self._node_text_cache: dict[int, str] = {}
    self._processed_nodes: set[int] = set()

    # Encoding used when slicing node text by byte offsets (None -> utf-8).
    self._file_encoding: str | None = None

    # Optional platform compatibility adapter; see set_adapter().
    self.adapter: CompatibilityAdapter | None = None
|
|
88
|
+
|
|
89
|
+
def set_adapter(self, adapter: CompatibilityAdapter) -> None:
    """Attach a platform compatibility adapter.

    The adapter, when set, is applied by extract_sql_elements() to
    post-process extracted elements for platform-specific parser quirks.
    """
    self.adapter = adapter
|
|
92
|
+
|
|
93
|
+
def extract_sql_elements(
    self, tree: "tree_sitter.Tree", source_code: str
) -> list[SQLElement]:
    """
    Extract all SQL elements with enhanced metadata.

    This is the enhanced extraction entry point: it returns SQL-specific
    element types (tables, views, procedures, functions, triggers, indexes)
    with detailed metadata such as columns, constraints, parameters, and
    dependencies. If a compatibility adapter is set, elements are run
    through it, and the result is validated/fixed for platform-specific
    parsing errors before being returned.

    Args:
        tree: Tree-sitter AST tree parsed from SQL source
        source_code: Original SQL source code as string

    Returns:
        List of SQLElement objects with detailed metadata (possibly empty
        if extraction fails; errors are logged, not raised, so analysis of
        other languages can continue).
    """
    self.source_code = source_code or ""
    self.content_lines = self.source_code.split("\n")
    self._reset_caches()

    sql_elements: list[SQLElement] = []

    if tree is not None and tree.root_node is not None:
        try:
            # Extract all SQL element types with enhanced metadata.
            self._extract_sql_tables(tree.root_node, sql_elements)
            self._extract_sql_views(tree.root_node, sql_elements)
            self._extract_sql_procedures(tree.root_node, sql_elements)
            self._extract_sql_functions_enhanced(tree.root_node, sql_elements)
            self._extract_sql_triggers(tree.root_node, sql_elements)
            self._extract_sql_indexes(tree.root_node, sql_elements)

            # Apply platform compatibility adapter if available.
            if self.adapter:
                if self.diagnostic_mode:
                    log_debug(
                        f"Diagnostic: Before adaptation: {[e.name for e in sql_elements]}"
                    )

                sql_elements = self.adapter.adapt_elements(
                    sql_elements, self.source_code
                )

                if self.diagnostic_mode:
                    log_debug(
                        f"Diagnostic: After adaptation: {[e.name for e in sql_elements]}"
                    )

            # Post-process to fix platform-specific parsing errors.
            sql_elements = self._validate_and_fix_elements(sql_elements)

            log_debug(f"Extracted {len(sql_elements)} SQL elements with metadata")
        except Exception as e:
            # Deliberate best-effort: log and fall through with whatever was
            # collected so far, so other languages can continue.
            # NOTE(review): self.platform_info is initialized to None and is
            # not updated within this extractor — confirm it is set elsewhere.
            log_error(
                f"Error during enhanced SQL extraction on {self.platform_info}: {e}"
            )
            log_error(
                "Suggestion: Check platform compatibility documentation or enable diagnostic mode for more details."
            )
            # (Removed a dead `if not sql_elements: sql_elements = []` —
            # reassigning an empty list to an empty list was a no-op.)

    return sql_elements
|
|
158
|
+
|
|
159
|
+
def _validate_and_fix_elements(
    self, elements: list[SQLElement]
) -> list[SQLElement]:
    """
    Post-process elements to fix parsing errors caused by platform-specific
    tree-sitter behavior (e.g. ERROR nodes misidentifying triggers).

    Performs four passes per element:
      1. Drop "phantom" elements whose raw_text does not actually contain
         the CREATE statement matching their declared type.
      2. Repair element names by re-deriving them from raw_text with regex.
      3. Deduplicate on (element type, name, start line).
      4. Finally, regex-scan the whole source to recover CREATE VIEW
         statements the parser missed entirely.

    Args:
        elements: Elements produced by the tree-sitter extraction passes.

    Returns:
        The validated, de-duplicated element list (plus recovered views).
    """
    import re

    validated = []
    seen_names = set()

    for elem in elements:
        # sql_element_type is optional metadata; plain elements pass through.
        elem_type = getattr(elem, "sql_element_type", None)

        # 1. Check for Phantom Elements (mismatch between type and content).
        if elem_type and elem.raw_text:
            raw_text_stripped = elem.raw_text.strip()
            is_valid = True

            # Fix Ubuntu 3.12 phantom trigger issue (Trigger type but
            # Function content): the text must contain CREATE TRIGGER
            # (comments/whitespace before it are allowed by re.search).
            if elem_type.value == "trigger":
                if not re.search(
                    r"CREATE\s+TRIGGER", raw_text_stripped, re.IGNORECASE
                ):
                    log_debug(
                        f"Removing phantom trigger: {elem.name} (content mismatch)"
                    )
                    is_valid = False

            # Same phantom check for functions.
            elif elem_type.value == "function":
                if not re.search(
                    r"CREATE\s+FUNCTION", raw_text_stripped, re.IGNORECASE
                ):
                    log_debug(
                        f"Removing phantom function: {elem.name} (content mismatch)"
                    )
                    is_valid = False

            if not is_valid:
                continue

        # 2. Fix Names (re-derive the identifier from raw_text).
        if elem_type and elem.raw_text:
            # Fix Trigger name issues (e.g. macOS "description" bug).
            if elem_type.value == "trigger":
                match = re.search(
                    r"CREATE\s+TRIGGER\s+([a-zA-Z_][a-zA-Z0-9_]*)",
                    elem.raw_text,
                    re.IGNORECASE,
                )
                if match:
                    correct_name = match.group(1)
                    # Only overwrite with a name that passes identifier
                    # validation, to avoid swapping one bad name for another.
                    if elem.name != correct_name and self._is_valid_identifier(
                        correct_name
                    ):
                        log_debug(
                            f"Fixing trigger name: {elem.name} -> {correct_name}"
                        )
                        elem.name = correct_name

            # Fix Function name issues (e.g. Windows/Ubuntu "AUTO_INCREMENT" bug).
            elif elem_type.value == "function":
                # Filter out obvious garbage names if they match keywords.
                if elem.name and elem.name.upper() in (
                    "AUTO_INCREMENT",
                    "KEY",
                    "PRIMARY",
                    "FOREIGN",
                ):
                    # Try to recover the correct name from the raw text;
                    # if none can be found, drop the element entirely.
                    match = re.search(
                        r"CREATE\s+FUNCTION\s+([a-zA-Z_][a-zA-Z0-9_]*)",
                        elem.raw_text,
                        re.IGNORECASE,
                    )
                    if match:
                        correct_name = match.group(1)
                        log_debug(
                            f"Fixing garbage function name: {elem.name} -> {correct_name}"
                        )
                        elem.name = correct_name
                    else:
                        log_debug(f"Removing garbage function name: {elem.name}")
                        continue

                # General name verification (runs even after the garbage-name
                # fix above; re-checks against the raw text).
                match = re.search(
                    r"CREATE\s+FUNCTION\s+([a-zA-Z_][a-zA-Z0-9_]*)",
                    elem.raw_text,
                    re.IGNORECASE,
                )
                if match:
                    correct_name = match.group(1)
                    if elem.name != correct_name and self._is_valid_identifier(
                        correct_name
                    ):
                        log_debug(
                            f"Fixing function name: {elem.name} -> {correct_name}"
                        )
                        elem.name = correct_name

        # 3. Deduplication on (type, name, start line).
        key = (getattr(elem, "sql_element_type", None), elem.name, elem.start_line)
        if key in seen_names:
            continue
        seen_names.add(key)

        validated.append(elem)

    # 4. Recover missing Views (often missed in ERROR nodes on some platforms).
    # This is a fallback regex scan of the entire source code.
    if self.source_code:
        existing_views = {
            e.name
            for e in validated
            if hasattr(e, "sql_element_type") and e.sql_element_type.value == "view"
        }

        view_matches = re.finditer(
            r"^\s*CREATE\s+VIEW\s+(?:IF\s+NOT\s+EXISTS\s+)?(\w+)\s+AS",
            self.source_code,
            re.IGNORECASE | re.MULTILINE,
        )

        for match in view_matches:
            view_name = match.group(1)
            if view_name not in existing_views and self._is_valid_identifier(
                view_name
            ):
                log_debug(f"Recovering missing view: {view_name}")

                # Calculate approximate line numbers: count newlines before
                # the match start (line numbers are 1-based).
                start_pos = match.start()
                start_line = self.source_code.count("\n", 0, start_pos) + 1

                # Estimate end line: up to the terminating semicolon, or a
                # fixed 5-line fallback when no semicolon is found.
                view_context = self.source_code[start_pos:]
                semicolon_match = re.search(r";", view_context)
                if semicolon_match:
                    end_pos = start_pos + semicolon_match.end()
                    end_line = self.source_code.count("\n", 0, end_pos) + 1
                else:
                    end_line = start_line + 5  # Fallback estimate

                # Extract source tables roughly: any FROM/JOIN targets within
                # the statement body (capped at 500 chars with no semicolon).
                source_tables = []
                table_matches = re.findall(
                    r"(?:FROM|JOIN)\s+([a-zA-Z_][a-zA-Z0-9_]*)",
                    view_context[
                        : semicolon_match.end() if semicolon_match else 500
                    ],
                    re.IGNORECASE,
                )
                source_tables.extend(table_matches)

                view = SQLView(
                    name=view_name,
                    start_line=start_line,
                    end_line=end_line,
                    raw_text=f"CREATE VIEW {view_name} ...",
                    language="sql",
                    source_tables=sorted(set(source_tables)),
                    dependencies=sorted(set(source_tables)),
                )
                validated.append(view)
                existing_views.add(view_name)

    return validated
|
|
331
|
+
|
|
332
|
+
def extract_functions(
    self, tree: "tree_sitter.Tree", source_code: str
) -> list[Function]:
    """
    Extract SQL executable units as unified Function elements.

    CREATE PROCEDURE, CREATE FUNCTION, and CREATE TRIGGER statements are
    each mapped to a Function element. Extraction errors are logged and
    swallowed so callers receive a (possibly partial) list.

    Args:
        tree: Tree-sitter AST tree parsed from SQL source
        source_code: Original SQL source code as string

    Returns:
        Function elements for procedures, functions, and triggers.
    """
    self.source_code = source_code or ""
    self.content_lines = self.source_code.split("\n")
    self._reset_caches()

    functions: list[Function] = []

    # Guard clause: nothing to walk without a parsed root node.
    if tree is None or tree.root_node is None:
        return functions

    root = tree.root_node
    try:
        self._extract_procedures(root, functions)
        self._extract_sql_functions(root, functions)
        self._extract_triggers(root, functions)
        log_debug(
            f"Extracted {len(functions)} SQL functions/procedures/triggers"
        )
    except Exception as e:
        # Best-effort: log and return whatever was collected.
        log_debug(f"Error during function extraction: {e}")

    return functions
|
|
369
|
+
|
|
370
|
+
def extract_classes(
    self, tree: "tree_sitter.Tree", source_code: str
) -> list[Class]:
    """
    Extract SQL structural definitions as unified Class elements.

    CREATE TABLE and CREATE VIEW statements are each mapped to a Class
    element. Extraction errors are logged and swallowed so callers
    receive a (possibly partial) list.

    Args:
        tree: Tree-sitter AST tree parsed from SQL source
        source_code: Original SQL source code as string

    Returns:
        Class elements for tables and views.
    """
    self.source_code = source_code or ""
    self.content_lines = self.source_code.split("\n")
    self._reset_caches()

    classes: list[Class] = []

    # Guard clause: nothing to walk without a parsed root node.
    if tree is None or tree.root_node is None:
        return classes

    root = tree.root_node
    try:
        self._extract_tables(root, classes)
        self._extract_views(root, classes)
        log_debug(f"Extracted {len(classes)} SQL tables/views")
    except Exception as e:
        # Best-effort: log and return whatever was collected.
        log_debug(f"Error during class extraction: {e}")

    return classes
|
|
403
|
+
|
|
404
|
+
def extract_variables(
    self, tree: "tree_sitter.Tree", source_code: str
) -> list[Variable]:
    """
    Extract SQL metadata definitions as unified Variable elements.

    CREATE INDEX statements are mapped to Variable elements. Extraction
    errors are logged and swallowed so callers receive a (possibly
    partial) list.

    Args:
        tree: Tree-sitter AST tree parsed from SQL source
        source_code: Original SQL source code as string

    Returns:
        Variable elements representing indexes.
    """
    self.source_code = source_code or ""
    self.content_lines = self.source_code.split("\n")
    self._reset_caches()

    variables: list[Variable] = []

    # Guard clause: nothing to walk without a parsed root node.
    if tree is None or tree.root_node is None:
        return variables

    try:
        self._extract_indexes(tree.root_node, variables)
        log_debug(f"Extracted {len(variables)} SQL indexes")
    except Exception as e:
        # Best-effort: log and return whatever was collected.
        log_debug(f"Error during variable extraction: {e}")

    return variables
|
|
435
|
+
|
|
436
|
+
def extract_imports(
    self, tree: "tree_sitter.Tree", source_code: str
) -> list[Import]:
    """
    Extract schema references as unified Import elements.

    Qualified names (schema.table) represent cross-schema dependencies
    and are mapped to Import elements. Extraction errors are logged and
    swallowed so callers receive a (possibly partial) list.

    Args:
        tree: Tree-sitter AST tree parsed from SQL source
        source_code: Original SQL source code as string

    Returns:
        Import elements representing schema references.
    """
    self.source_code = source_code or ""
    self.content_lines = self.source_code.split("\n")
    self._reset_caches()

    imports: list[Import] = []

    # Guard clause: nothing to walk without a parsed root node.
    if tree is None or tree.root_node is None:
        return imports

    try:
        # Qualified references such as FROM schema.table.
        self._extract_schema_references(tree.root_node, imports)
        log_debug(f"Extracted {len(imports)} SQL schema references")
    except Exception as e:
        # Best-effort: log and return whatever was collected.
        log_debug(f"Error during import extraction: {e}")

    return imports
|
|
467
|
+
|
|
468
|
+
def _reset_caches(self) -> None:
    """Clear per-parse memoization (node-text cache and processed-node set)."""
    for cache in (self._node_text_cache, self._processed_nodes):
        cache.clear()
|
|
472
|
+
|
|
473
|
+
def _get_node_text(self, node: "tree_sitter.Node") -> str:
    """
    Get text content from a tree-sitter node with caching.

    Uses byte-based extraction against the encoded source first, falling
    back to (row, column) line-based extraction if byte extraction fails.
    Per-node results are cached, and the encoded source bytes are now
    memoized as well, so the whole file is no longer re-joined and
    re-encoded for every uncached node (previously O(file) work per node).

    Args:
        node: Tree-sitter node to extract text from

    Returns:
        Text content of the node, or empty string if extraction fails
    """
    node_id = id(node)

    if node_id in self._node_text_cache:
        return self._node_text_cache[node_id]

    try:
        start_byte = node.start_byte
        end_byte = node.end_byte
        encoding = self._file_encoding or "utf-8"

        # Memoize the encoded source. content_lines is always derived via
        # source_code.split("\n") in the extract_* entry points, so encoding
        # source_code directly is equivalent to the former
        # "\n".join(content_lines). The memo is keyed on the source object
        # identity and the encoding, so rebinding either invalidates it.
        cached = getattr(self, "_encoded_source_cache", None)
        if (
            cached is None
            or cached[0] is not self.source_code
            or cached[1] != encoding
        ):
            cached = (
                self.source_code,
                encoding,
                safe_encode(self.source_code, encoding),
            )
            self._encoded_source_cache = cached
        content_bytes = cached[2]

        text = extract_text_slice(content_bytes, start_byte, end_byte, encoding)

        if text:
            self._node_text_cache[node_id] = text
            return text
    except Exception as e:
        log_debug(f"Error in _get_node_text: {e}")

    # Fallback to line-based extraction using (row, column) points.
    try:
        start_point = node.start_point
        end_point = node.end_point

        # Reject out-of-range rows outright.
        if start_point[0] < 0 or start_point[0] >= len(self.content_lines):
            return ""

        if end_point[0] < 0 or end_point[0] >= len(self.content_lines):
            return ""

        if start_point[0] == end_point[0]:
            # Single-line node: slice within the line, clamping columns.
            line = self.content_lines[start_point[0]]
            start_col = max(0, min(start_point[1], len(line)))
            end_col = max(start_col, min(end_point[1], len(line)))
            result: str = line[start_col:end_col]
            self._node_text_cache[node_id] = result
            return result
        else:
            # Multi-line node: partial first and last lines, full middles.
            lines = []
            for i in range(
                start_point[0], min(end_point[0] + 1, len(self.content_lines))
            ):
                if i < len(self.content_lines):
                    line = self.content_lines[i]
                    if i == start_point[0] and i == end_point[0]:
                        start_col = max(0, min(start_point[1], len(line)))
                        end_col = max(start_col, min(end_point[1], len(line)))
                        lines.append(line[start_col:end_col])
                    elif i == start_point[0]:
                        start_col = max(0, min(start_point[1], len(line)))
                        lines.append(line[start_col:])
                    elif i == end_point[0]:
                        end_col = max(0, min(end_point[1], len(line)))
                        lines.append(line[:end_col])
                    else:
                        lines.append(line)
            result = "\n".join(lines)
            self._node_text_cache[node_id] = result
            return result
    except Exception as fallback_error:
        log_debug(f"Fallback text extraction also failed: {fallback_error}")
        return ""
|
|
547
|
+
|
|
548
|
+
def _traverse_nodes(self, node: "tree_sitter.Node") -> Iterator["tree_sitter.Node"]:
    """
    Traverse tree nodes in depth-first pre-order.

    Args:
        node: Root node to start traversal from

    Yields:
        Each node in the tree, starting with the root node
    """
    # Explicit stack instead of recursion; children are pushed in reverse
    # so that pop order matches the recursive left-to-right pre-order.
    stack = [node]
    while stack:
        current = stack.pop()
        yield current
        children = getattr(current, "children", None)
        if children:
            stack.extend(reversed(children))
|
|
562
|
+
|
|
563
|
+
def _is_valid_identifier(self, name: str) -> bool:
|
|
564
|
+
"""
|
|
565
|
+
Validate that a name is a valid SQL identifier.
|
|
566
|
+
|
|
567
|
+
This prevents accepting multi-line text or SQL statements as identifiers.
|
|
568
|
+
Also rejects common column names and SQL reserved keywords.
|
|
569
|
+
|
|
570
|
+
Args:
|
|
571
|
+
name: The identifier to validate
|
|
572
|
+
|
|
573
|
+
Returns:
|
|
574
|
+
True if the name is a valid identifier, False otherwise
|
|
575
|
+
"""
|
|
576
|
+
if not name:
|
|
577
|
+
return False
|
|
578
|
+
|
|
579
|
+
# Reject if contains newlines or other control characters
|
|
580
|
+
if "\n" in name or "\r" in name or "\t" in name:
|
|
581
|
+
return False
|
|
582
|
+
|
|
583
|
+
# Reject if matches SQL statement patterns (keyword followed by space)
|
|
584
|
+
# This catches "CREATE TABLE" but allows "create_table" as an identifier
|
|
585
|
+
name_upper = name.upper()
|
|
586
|
+
sql_statement_patterns = [
|
|
587
|
+
"CREATE ",
|
|
588
|
+
"SELECT ",
|
|
589
|
+
"INSERT ",
|
|
590
|
+
"UPDATE ",
|
|
591
|
+
"DELETE ",
|
|
592
|
+
"DROP ",
|
|
593
|
+
"ALTER ",
|
|
594
|
+
"TABLE ",
|
|
595
|
+
"VIEW ",
|
|
596
|
+
"PROCEDURE ",
|
|
597
|
+
"FUNCTION ",
|
|
598
|
+
"TRIGGER ",
|
|
599
|
+
]
|
|
600
|
+
if any(name_upper.startswith(pattern) for pattern in sql_statement_patterns):
|
|
601
|
+
return False
|
|
602
|
+
|
|
603
|
+
# Reject common column names that should never be function names
|
|
604
|
+
# These are typical column names that might appear in SELECT statements
|
|
605
|
+
common_column_names = {
|
|
606
|
+
"PRICE",
|
|
607
|
+
"QUANTITY",
|
|
608
|
+
"TOTAL",
|
|
609
|
+
"AMOUNT",
|
|
610
|
+
"COUNT",
|
|
611
|
+
"SUM",
|
|
612
|
+
"CREATED_AT",
|
|
613
|
+
"UPDATED_AT",
|
|
614
|
+
"ID",
|
|
615
|
+
"NAME",
|
|
616
|
+
"EMAIL",
|
|
617
|
+
"STATUS",
|
|
618
|
+
"VALUE",
|
|
619
|
+
"DATE",
|
|
620
|
+
"TIME",
|
|
621
|
+
"TIMESTAMP",
|
|
622
|
+
"USER_ID",
|
|
623
|
+
"ORDER_ID",
|
|
624
|
+
"PRODUCT_ID",
|
|
625
|
+
}
|
|
626
|
+
if name_upper in common_column_names:
|
|
627
|
+
return False
|
|
628
|
+
|
|
629
|
+
# Reject common SQL keywords that should never be identifiers
|
|
630
|
+
sql_keywords = {
|
|
631
|
+
"SELECT",
|
|
632
|
+
"FROM",
|
|
633
|
+
"WHERE",
|
|
634
|
+
"AS",
|
|
635
|
+
"IF",
|
|
636
|
+
"NOT",
|
|
637
|
+
"EXISTS",
|
|
638
|
+
"NULL",
|
|
639
|
+
"CURRENT_TIMESTAMP",
|
|
640
|
+
"NOW",
|
|
641
|
+
"SYSDATE",
|
|
642
|
+
"AVG",
|
|
643
|
+
"MAX",
|
|
644
|
+
"MIN",
|
|
645
|
+
"AND",
|
|
646
|
+
"OR",
|
|
647
|
+
"IN",
|
|
648
|
+
"LIKE",
|
|
649
|
+
"BETWEEN",
|
|
650
|
+
"JOIN",
|
|
651
|
+
"LEFT",
|
|
652
|
+
"RIGHT",
|
|
653
|
+
"INNER",
|
|
654
|
+
"OUTER",
|
|
655
|
+
"CROSS",
|
|
656
|
+
"ON",
|
|
657
|
+
"USING",
|
|
658
|
+
"GROUP",
|
|
659
|
+
"BY",
|
|
660
|
+
"ORDER",
|
|
661
|
+
"HAVING",
|
|
662
|
+
"LIMIT",
|
|
663
|
+
"OFFSET",
|
|
664
|
+
"DISTINCT",
|
|
665
|
+
"ALL",
|
|
666
|
+
"UNION",
|
|
667
|
+
"INTERSECT",
|
|
668
|
+
"EXCEPT",
|
|
669
|
+
"INSERT",
|
|
670
|
+
"UPDATE",
|
|
671
|
+
"DELETE",
|
|
672
|
+
"CREATE",
|
|
673
|
+
"DROP",
|
|
674
|
+
"ALTER",
|
|
675
|
+
"TABLE",
|
|
676
|
+
"VIEW",
|
|
677
|
+
"INDEX",
|
|
678
|
+
"TRIGGER",
|
|
679
|
+
"PROCEDURE",
|
|
680
|
+
"FUNCTION",
|
|
681
|
+
"PRIMARY",
|
|
682
|
+
"FOREIGN",
|
|
683
|
+
"KEY",
|
|
684
|
+
"UNIQUE",
|
|
685
|
+
"CHECK",
|
|
686
|
+
"DEFAULT",
|
|
687
|
+
"REFERENCES",
|
|
688
|
+
"CASCADE",
|
|
689
|
+
"RESTRICT",
|
|
690
|
+
"SET",
|
|
691
|
+
"NO",
|
|
692
|
+
"ACTION",
|
|
693
|
+
"INTO",
|
|
694
|
+
"VALUES",
|
|
695
|
+
"BEGIN",
|
|
696
|
+
"END",
|
|
697
|
+
"DECLARE",
|
|
698
|
+
"RETURN",
|
|
699
|
+
"RETURNS",
|
|
700
|
+
"READS",
|
|
701
|
+
"SQL",
|
|
702
|
+
"DATA",
|
|
703
|
+
"DETERMINISTIC",
|
|
704
|
+
"BEFORE",
|
|
705
|
+
"AFTER",
|
|
706
|
+
"EACH",
|
|
707
|
+
"ROW",
|
|
708
|
+
"FOR",
|
|
709
|
+
"COALESCE",
|
|
710
|
+
"CASE",
|
|
711
|
+
"WHEN",
|
|
712
|
+
"THEN",
|
|
713
|
+
"ELSE",
|
|
714
|
+
}
|
|
715
|
+
if name_upper in sql_keywords:
|
|
716
|
+
return False
|
|
717
|
+
|
|
718
|
+
# Reject if contains parentheses (like "users (" or "(id")
|
|
719
|
+
if "(" in name or ")" in name:
|
|
720
|
+
return False
|
|
721
|
+
|
|
722
|
+
# Reject if too long (identifiers should be reasonable length)
|
|
723
|
+
if len(name) > 128:
|
|
724
|
+
return False
|
|
725
|
+
|
|
726
|
+
# Accept if it matches standard identifier pattern
|
|
727
|
+
import re
|
|
728
|
+
|
|
729
|
+
# Allow alphanumeric, underscore, and some special chars used in SQL identifiers
|
|
730
|
+
if re.match(r"^[a-zA-Z_][a-zA-Z0-9_]*$", name):
|
|
731
|
+
return True
|
|
732
|
+
|
|
733
|
+
# Also allow quoted identifiers (backticks, double quotes, square brackets)
|
|
734
|
+
if re.match(r'^[`"\[].*[`"\]]$', name):
|
|
735
|
+
return True
|
|
736
|
+
|
|
737
|
+
return False
|
|
738
|
+
|
|
739
|
+
def _extract_tables(
    self, root_node: "tree_sitter.Node", classes: list[Class]
) -> None:
    """
    Extract CREATE TABLE statements from SQL AST.

    Searches for create_table nodes and identifies table names from
    object_reference.identifier, supporting both simple identifiers
    and qualified names (schema.table).

    Args:
        root_node: Root node of the SQL AST
        classes: List to append extracted table Class elements to
    """

    def first_valid_name(reference: "tree_sitter.Node") -> str | None:
        # Return the first identifier under this reference that passes
        # validation, or None when nothing qualifies.
        for part in reference.children:
            if part.type == "identifier":
                text = self._get_node_text(part).strip()
                if text and self._is_valid_identifier(text):
                    return text
        return None

    for node in self._traverse_nodes(root_node):
        if node.type != "create_table":
            continue

        # Scan object_reference children until one yields a usable name.
        table_name = None
        for child in node.children:
            if child.type == "object_reference":
                table_name = first_valid_name(child)
                if table_name:
                    break

        if not table_name:
            continue

        try:
            classes.append(
                Class(
                    name=table_name,
                    start_line=node.start_point[0] + 1,
                    end_line=node.end_point[0] + 1,
                    raw_text=self._get_node_text(node),
                    language="sql",
                )
            )
        except Exception as e:
            log_debug(f"Failed to extract table: {e}")
|
|
787
|
+
|
|
788
|
+
def _extract_views(
    self, root_node: "tree_sitter.Node", classes: list[Class]
) -> None:
    """
    Extract CREATE VIEW statements from SQL AST.

    Searches for create_view nodes and extracts view names from
    object_reference.identifier, supporting qualified names.

    Strategy: a regex over the node's raw text is tried FIRST (most
    reliable for CREATE VIEW); walking the AST children is only a
    fallback.  Nodes whose reported span collapses to a single line (a
    known tree-sitter misparse) have their end line recovered by scanning
    the source for the statement terminator.

    Args:
        root_node: Root node of the SQL AST
        classes: List to append extracted view Class elements to
    """
    import re

    for node in self._traverse_nodes(root_node):
        if node.type == "create_view":
            # Get raw text first for fallback regex
            raw_text = self._get_node_text(node)
            view_name = None

            # FIRST: Try regex parsing (most reliable for CREATE VIEW)
            if raw_text:
                # Pattern: CREATE VIEW [IF NOT EXISTS] view_name AS
                match = re.search(
                    r"CREATE\s+VIEW\s+(?:IF\s+NOT\s+EXISTS\s+)?(\w+)\s+AS",
                    raw_text,
                    re.IGNORECASE,
                )
                if match:
                    potential_name = match.group(1).strip()
                    if self._is_valid_identifier(potential_name):
                        view_name = potential_name

            # Fallback: Try AST parsing if regex didn't work
            if not view_name:
                for child in node.children:
                    if child.type == "object_reference":
                        # object_reference contains identifier
                        for subchild in child.children:
                            if subchild.type == "identifier":
                                potential_name = self._get_node_text(subchild)
                                if potential_name:
                                    potential_name = potential_name.strip()
                                # Validate view name - exclude SQL keywords
                                # (a defensive second filter on top of
                                # _is_valid_identifier).
                                if (
                                    potential_name
                                    and self._is_valid_identifier(
                                        potential_name
                                    )
                                    and potential_name.upper()
                                    not in (
                                        "SELECT",
                                        "FROM",
                                        "WHERE",
                                        "AS",
                                        "IF",
                                        "NOT",
                                        "EXISTS",
                                        "NULL",
                                        "CURRENT_TIMESTAMP",
                                        "NOW",
                                        "SYSDATE",
                                    )
                                ):
                                    view_name = potential_name
                                    break
                        if view_name:
                            break

            if view_name:
                try:
                    start_line = node.start_point[0] + 1
                    end_line = node.end_point[0] + 1

                    # Fix for truncated view definitions (single-line misparsing)
                    # When tree-sitter misparses a view as a single line (e.g. lines 47-47),
                    # we need to expand the range to include the actual query definition.
                    # We look for the next semicolon or empty line to find the true end.
                    if start_line == end_line and self.source_code:
                        # This logic is similar to the recovery logic in _validate_and_fix_elements
                        # Find where the view definition actually ends
                        current_line_idx = start_line - 1

                        # Scan forward for semicolon to find end of statement
                        found_end = False
                        for i in range(current_line_idx, len(self.content_lines)):
                            line = self.content_lines[i]
                            if ";" in line:
                                end_line = i + 1
                                found_end = True
                                break

                        # If no semicolon found within reasonable range, use a fallback
                        if not found_end:
                            # Look for empty line as separator or next CREATE statement
                            # (the 50-line cap bounds the scan when the
                            # statement has no terminator at all)
                            for i in range(
                                current_line_idx + 1,
                                min(len(self.content_lines), current_line_idx + 50),
                            ):
                                line = self.content_lines[i].strip()
                                if not line or line.upper().startswith("CREATE "):
                                    end_line = i  # End before this line
                                    found_end = True
                                    break

                        # Update raw_text to cover the full range
                        # Re-extract text for the corrected range
                        if found_end and end_line > start_line:
                            raw_text = "\n".join(
                                self.content_lines[current_line_idx:end_line]
                            )
                            log_debug(
                                f"Corrected view span for {view_name}: {start_line}-{end_line}"
                            )

                    cls = Class(
                        name=view_name,
                        start_line=start_line,
                        end_line=end_line,
                        raw_text=raw_text,
                        language="sql",
                    )
                    classes.append(cls)
                except Exception as e:
                    log_debug(f"Failed to extract view: {e}")
|
|
914
|
+
|
|
915
|
+
def _extract_procedures(
    self, root_node: "tree_sitter.Node", functions: list[Function]
) -> None:
    """
    Extract CREATE PROCEDURE statements from SQL AST.

    Since tree-sitter-sql doesn't fully support PROCEDURE syntax, these
    appear as ERROR nodes. The PROCEDURE keyword is not tokenized, so we
    need to check the raw text content of ERROR nodes that contain
    keyword_create and look for "PROCEDURE" in the text.

    Args:
        root_node: Root node of the SQL AST
        functions: List to append extracted procedure Function elements to
    """
    # Hoisted out of the node loop (previously re-imported per ERROR node).
    import re

    for node in self._traverse_nodes(root_node):
        if node.type != "ERROR":
            continue

        node_text = self._get_node_text(node)
        node_text_upper = node_text.upper()

        # A candidate ERROR node must carry a keyword_create child AND
        # mention PROCEDURE somewhere in its text.
        has_create = any(
            child.type == "keyword_create" for child in node.children
        )
        if not (has_create and "PROCEDURE" in node_text_upper):
            continue

        # Use finditer so every procedure lumped into one ERROR node is
        # reported; group(1) preserves the original case of each name.
        matches = re.finditer(
            r"CREATE\s+PROCEDURE\s+([a-zA-Z_][a-zA-Z0-9_]*)",
            node_text,
            re.IGNORECASE,
        )
        for match in matches:
            proc_name = match.group(1)
            if not proc_name:
                continue
            try:
                # Offset the start line by the newlines preceding this
                # match within the ERROR node's text.
                newlines_before = node_text[: match.start()].count("\n")
                start_line = node.start_point[0] + 1 + newlines_before
                end_line = node.end_point[0] + 1

                # Reuse node_text instead of calling _get_node_text again;
                # the previous per-match re-fetch returned the same value.
                # Keeping the whole ERROR node text as raw_text is safer
                # for legacy extraction than slicing per procedure.
                func = Function(
                    name=proc_name,
                    start_line=start_line,
                    end_line=end_line,
                    raw_text=node_text,
                    language="sql",
                )
                functions.append(func)
            except Exception as e:
                log_debug(f"Failed to extract procedure: {e}")
|
|
980
|
+
|
|
981
|
+
def _extract_sql_functions(
    self, root_node: "tree_sitter.Node", functions: list[Function]
) -> None:
    """
    Extract CREATE FUNCTION statements from SQL AST.

    Functions are properly parsed as create_function nodes; the name is
    taken from the FIRST object_reference > identifier, with a regex
    over the raw statement text as a fallback.

    Args:
        root_node: Root node of the SQL AST
        functions: List to append extracted function Function elements to
    """
    import re

    for node in self._traverse_nodes(root_node):
        if node.type != "create_function":
            continue

        func_name = None

        # Only the FIRST object_reference may carry the function name.
        first_ref = next(
            (c for c in node.children if c.type == "object_reference"),
            None,
        )
        if first_ref is not None:
            for part in first_ref.children:
                if part.type == "identifier":
                    candidate = self._get_node_text(part).strip()
                    if candidate and self._is_valid_identifier(candidate):
                        func_name = candidate
                        break

        # Fallback: pull the name from raw text when the AST walk failed
        # or produced an invalid name.
        if not func_name:
            statement_text = self._get_node_text(node)
            fallback = re.search(
                r"CREATE\s+FUNCTION\s+(\w+)\s*\(", statement_text, re.IGNORECASE
            )
            if fallback:
                candidate = fallback.group(1).strip()
                if self._is_valid_identifier(candidate):
                    func_name = candidate

        if not func_name:
            continue

        try:
            functions.append(
                Function(
                    name=func_name,
                    start_line=node.start_point[0] + 1,
                    end_line=node.end_point[0] + 1,
                    raw_text=self._get_node_text(node),
                    language="sql",
                )
            )
        except Exception as e:
            log_debug(f"Failed to extract function: {e}")
|
|
1038
|
+
|
|
1039
|
+
def _extract_triggers(
    self, root_node: "tree_sitter.Node", functions: list[Function]
) -> None:
    """
    Extract CREATE TRIGGER statements from SQL AST.

    tree-sitter-sql doesn't fully support TRIGGER syntax, so trigger
    statements surface inside ERROR nodes.  Because several triggers may
    be lumped into a single ERROR node, the node's raw text is scanned
    with a regex rather than relying on child nodes.

    Args:
        root_node: Root node of the SQL AST
        functions: List to append extracted trigger Function elements to
    """
    import re

    # Words the regex may capture that can never be real trigger names.
    noise_words = frozenset(
        {
            "KEY", "AUTO_INCREMENT", "PRIMARY", "FOREIGN", "INDEX",
            "UNIQUE", "PRICE", "QUANTITY", "TOTAL", "SUM", "COUNT",
            "AVG", "MAX", "MIN", "CONSTRAINT", "CHECK", "DEFAULT",
            "REFERENCES", "ON", "UPDATE", "DELETE", "INSERT", "BEFORE",
            "AFTER", "INSTEAD", "OF",
        }
    )

    for node in self._traverse_nodes(root_node):
        if node.type != "ERROR":
            continue

        node_text = self._get_node_text(node)
        if not node_text:
            continue

        upper_text = node_text.upper()
        if "CREATE" not in upper_text or "TRIGGER" not in upper_text:
            continue

        # Matches: CREATE TRIGGER [IF NOT EXISTS] trigger_name
        for match in re.finditer(
            r"CREATE\s+TRIGGER\s+(?:IF\s+NOT\s+EXISTS\s+)?([a-zA-Z_][a-zA-Z0-9_]*)",
            node_text,
            re.IGNORECASE,
        ):
            trigger_name = match.group(1)
            if not trigger_name or not self._is_valid_identifier(trigger_name):
                continue
            if trigger_name.upper() in noise_words:
                continue

            try:
                # Offset the start line by newlines preceding this match
                # within the ERROR node's text.
                offset = node_text[: match.start()].count("\n")
                start_line = node.start_point[0] + 1 + offset
                end_line = node.end_point[0] + 1

                # Keep the whole ERROR node text as raw text for now.
                functions.append(
                    Function(
                        name=trigger_name,
                        start_line=start_line,
                        end_line=end_line,
                        raw_text=node_text,
                        language="sql",
                    )
                )
            except Exception as e:
                log_debug(f"Failed to extract trigger: {e}")
|
|
1132
|
+
|
|
1133
|
+
def _extract_indexes(
    self, root_node: "tree_sitter.Node", variables: list[Variable]
) -> None:
    """
    Extract CREATE INDEX statements from SQL AST.

    Searches for create_index nodes; the index name is taken from the
    first direct identifier child.

    Args:
        root_node: Root node of the SQL AST
        variables: List to append extracted index Variable elements to
    """
    for node in self._traverse_nodes(root_node):
        if node.type != "create_index":
            continue

        # The index name lives in the first direct identifier child.
        name_node = next(
            (c for c in node.children if c.type == "identifier"), None
        )
        index_name = (
            self._get_node_text(name_node).strip()
            if name_node is not None
            else None
        )
        if not index_name:
            continue

        try:
            variables.append(
                Variable(
                    name=index_name,
                    start_line=node.start_point[0] + 1,
                    end_line=node.end_point[0] + 1,
                    raw_text=self._get_node_text(node),
                    language="sql",
                )
            )
        except Exception as e:
            log_debug(f"Failed to extract index: {e}")
|
|
1171
|
+
|
|
1172
|
+
def _extract_schema_references(
    self, root_node: "tree_sitter.Node", imports: list[Import]
) -> None:
    """Extract schema references (e.g., FROM schema.table)."""
    # Simplified implementation: any qualified_name whose text has
    # exactly one dot (i.e. two parts, schema.object) is surfaced as an
    # Import element.  A fuller version would resolve real references.
    for node in self._traverse_nodes(root_node):
        if node.type != "qualified_name":
            continue

        text = self._get_node_text(node)
        # Exactly one "." means the text splits into two parts.
        if text.count(".") != 1:
            continue

        try:
            imports.append(
                Import(
                    name=text,
                    start_line=node.start_point[0] + 1,
                    end_line=node.end_point[0] + 1,
                    raw_text=text,
                    language="sql",
                )
            )
        except Exception as e:
            log_debug(f"Failed to extract schema reference: {e}")
|
|
1199
|
+
|
|
1200
|
+
def _extract_sql_tables(
    self, root_node: "tree_sitter.Node", sql_elements: list[SQLElement]
) -> None:
    """
    Extract CREATE TABLE statements with enhanced metadata.

    Extracts table information including columns, data types, constraints,
    and dependencies for comprehensive table analysis.
    """

    def resolve_table_name(create_node: "tree_sitter.Node") -> str | None:
        # First identifier under any object_reference that validates.
        for child in create_node.children:
            if child.type != "object_reference":
                continue
            for part in child.children:
                if part.type == "identifier":
                    text = self._get_node_text(part).strip()
                    if text and self._is_valid_identifier(text):
                        return text
        return None

    for node in self._traverse_nodes(root_node):
        if node.type != "create_table":
            continue

        table_name = resolve_table_name(node)

        # Column/constraint metadata is gathered regardless of whether a
        # name was resolved, mirroring the original extraction order.
        columns: list[SQLColumn] = []
        constraints: list[SQLConstraint] = []
        self._extract_table_columns(node, columns, constraints)

        if not table_name:
            continue

        try:
            sql_elements.append(
                SQLTable(
                    name=table_name,
                    start_line=node.start_point[0] + 1,
                    end_line=node.end_point[0] + 1,
                    raw_text=self._get_node_text(node),
                    language="sql",
                    columns=columns,
                    constraints=constraints,
                )
            )
        except Exception as e:
            log_debug(f"Failed to extract enhanced table: {e}")
|
|
1250
|
+
|
|
1251
|
+
def _extract_table_columns(
    self,
    table_node: "tree_sitter.Node",
    columns: list[SQLColumn],
    constraints: list[SQLConstraint],
) -> None:
    """Extract column definitions from CREATE TABLE statement.

    Two passes run in a fixed order: a regex pass over the raw statement
    text (primary), then a tree-sitter pass over column_definition nodes
    that only adds columns the regex pass missed (deduplicated by name).
    NOTE(review): nothing in this method appends to ``constraints`` —
    presumably it is populated elsewhere; confirm before relying on it.
    """
    # Use a more robust approach to extract columns
    table_text = self._get_node_text(table_node)

    # Parse the table definition using regex as fallback
    import re

    # Extract the content between parentheses (DOTALL so the column list
    # may span multiple lines; the optional trailing ";" is excluded)
    table_content_match = re.search(
        r"\(\s*(.*?)\s*\)(?:\s*;)?$", table_text, re.DOTALL
    )
    if table_content_match:
        table_content = table_content_match.group(1)

        # Split by commas, but be careful with nested parentheses
        column_definitions = self._split_column_definitions(table_content)

        for col_def in column_definitions:
            col_def = col_def.strip()
            # Skip table-level constraint clauses; only real column
            # definitions are parsed here.
            if not col_def or col_def.upper().startswith(
                ("PRIMARY KEY", "FOREIGN KEY", "UNIQUE", "INDEX", "KEY")
            ):
                continue

            # Parse individual column definition
            column = self._parse_column_definition(col_def)
            if column:
                columns.append(column)

    # Also try tree-sitter approach as backup
    for node in self._traverse_nodes(table_node):
        if node.type == "column_definition":
            column_name = None
            data_type = None
            nullable = True          # default until a NOT NULL marker is seen
            is_primary_key = False

            # Extract column name and type; the first identifier child is
            # taken as the column name.
            for child in node.children:
                if child.type == "identifier" and column_name is None:
                    column_name = self._get_node_text(child).strip()
                elif child.type in ["data_type", "type_name"]:
                    data_type = self._get_node_text(child).strip()
                elif (
                    child.type == "not_null"
                    or "NOT NULL" in self._get_node_text(child).upper()
                ):
                    nullable = False
                elif (
                    child.type == "primary_key"
                    or "PRIMARY KEY" in self._get_node_text(child).upper()
                ):
                    is_primary_key = True

            if column_name and data_type:
                # Check if this column is already added by regex parsing
                existing_column = next(
                    (c for c in columns if c.name == column_name), None
                )
                if not existing_column:
                    column = SQLColumn(
                        name=column_name,
                        data_type=data_type,
                        nullable=nullable,
                        is_primary_key=is_primary_key,
                    )
                    columns.append(column)
|
|
1324
|
+
|
|
1325
|
+
def _split_column_definitions(self, content: str) -> list[str]:
|
|
1326
|
+
"""Split column definitions by commas, handling nested parentheses."""
|
|
1327
|
+
definitions = []
|
|
1328
|
+
current_def = ""
|
|
1329
|
+
paren_count = 0
|
|
1330
|
+
|
|
1331
|
+
for char in content:
|
|
1332
|
+
if char == "(":
|
|
1333
|
+
paren_count += 1
|
|
1334
|
+
elif char == ")":
|
|
1335
|
+
paren_count -= 1
|
|
1336
|
+
elif char == "," and paren_count == 0:
|
|
1337
|
+
if current_def.strip():
|
|
1338
|
+
definitions.append(current_def.strip())
|
|
1339
|
+
current_def = ""
|
|
1340
|
+
continue
|
|
1341
|
+
|
|
1342
|
+
current_def += char
|
|
1343
|
+
|
|
1344
|
+
if current_def.strip():
|
|
1345
|
+
definitions.append(current_def.strip())
|
|
1346
|
+
|
|
1347
|
+
return definitions
|
|
1348
|
+
|
|
1349
|
+
def _parse_column_definition(self, col_def: str) -> SQLColumn | None:
    """Parse a single column definition string.

    Expects "column_name data_type [constraints]"; returns None when the
    leading name/type pair cannot be matched.
    """
    import re

    # Basic pattern: column_name data_type [constraints]
    header = re.match(
        r"^\s*([a-zA-Z_][a-zA-Z0-9_]*)\s+([A-Z]+(?:\([^)]*\))?)",
        col_def,
        re.IGNORECASE,
    )
    if header is None:
        return None

    name = header.group(1)
    declared_type = header.group(2)

    # Constraint flags come from substring checks on the whole definition.
    upper_def = col_def.upper()
    has_pk = "PRIMARY KEY" in upper_def or "AUTO_INCREMENT" in upper_def
    has_fk = "REFERENCES" in upper_def

    fk_target = None
    if has_fk:
        ref = re.search(
            r"REFERENCES\s+([a-zA-Z_][a-zA-Z0-9_]*)\s*\(([^)]+)\)",
            col_def,
            re.IGNORECASE,
        )
        if ref:
            fk_target = f"{ref.group(1)}({ref.group(2)})"

    return SQLColumn(
        name=name,
        data_type=declared_type,
        nullable="NOT NULL" not in upper_def,
        is_primary_key=has_pk,
        is_foreign_key=has_fk,
        foreign_key_reference=fk_target,
    )
|
|
1391
|
+
|
|
1392
|
+
def _extract_sql_views(
|
|
1393
|
+
self, root_node: "tree_sitter.Node", sql_elements: list[SQLElement]
|
|
1394
|
+
) -> None:
|
|
1395
|
+
"""Extract CREATE VIEW statements with enhanced metadata."""
|
|
1396
|
+
for node in self._traverse_nodes(root_node):
|
|
1397
|
+
if node.type == "ERROR":
|
|
1398
|
+
# Handle views inside ERROR nodes (common in some environments)
|
|
1399
|
+
raw_text = self._get_node_text(node)
|
|
1400
|
+
if not raw_text:
|
|
1401
|
+
continue
|
|
1402
|
+
|
|
1403
|
+
import re
|
|
1404
|
+
|
|
1405
|
+
# Find all views in this error node
|
|
1406
|
+
view_matches = re.finditer(
|
|
1407
|
+
r"CREATE\s+VIEW\s+(?:IF\s+NOT\s+EXISTS\s+)?(\w+)\s+AS",
|
|
1408
|
+
raw_text,
|
|
1409
|
+
re.IGNORECASE,
|
|
1410
|
+
)
|
|
1411
|
+
|
|
1412
|
+
for match in view_matches:
|
|
1413
|
+
view_name = match.group(1).strip()
|
|
1414
|
+
if not self._is_valid_identifier(view_name):
|
|
1415
|
+
continue
|
|
1416
|
+
|
|
1417
|
+
# Avoid duplicates
|
|
1418
|
+
if any(
|
|
1419
|
+
e.name == view_name and isinstance(e, SQLView)
|
|
1420
|
+
for e in sql_elements
|
|
1421
|
+
):
|
|
1422
|
+
continue
|
|
1423
|
+
|
|
1424
|
+
start_line = node.start_point[0] + 1
|
|
1425
|
+
end_line = node.end_point[0] + 1
|
|
1426
|
+
|
|
1427
|
+
# Extract source tables from context following the view definition
|
|
1428
|
+
view_context = raw_text[match.end() :]
|
|
1429
|
+
semicolon_match = re.search(r";", view_context)
|
|
1430
|
+
if semicolon_match:
|
|
1431
|
+
view_context = view_context[: semicolon_match.end()]
|
|
1432
|
+
|
|
1433
|
+
source_tables = []
|
|
1434
|
+
# Simple extraction for source tables
|
|
1435
|
+
table_matches = re.findall(
|
|
1436
|
+
r"(?:FROM|JOIN)\s+([a-zA-Z_][a-zA-Z0-9_]*)",
|
|
1437
|
+
view_context,
|
|
1438
|
+
re.IGNORECASE,
|
|
1439
|
+
)
|
|
1440
|
+
source_tables.extend(table_matches)
|
|
1441
|
+
|
|
1442
|
+
view = SQLView(
|
|
1443
|
+
name=view_name,
|
|
1444
|
+
start_line=start_line,
|
|
1445
|
+
end_line=end_line,
|
|
1446
|
+
raw_text=f"CREATE VIEW {view_name} ...",
|
|
1447
|
+
language="sql",
|
|
1448
|
+
source_tables=sorted(set(source_tables)),
|
|
1449
|
+
dependencies=sorted(set(source_tables)),
|
|
1450
|
+
)
|
|
1451
|
+
sql_elements.append(view)
|
|
1452
|
+
|
|
1453
|
+
elif node.type == "create_view":
|
|
1454
|
+
view_name = None
|
|
1455
|
+
source_tables = []
|
|
1456
|
+
|
|
1457
|
+
# Get raw text for regex parsing
|
|
1458
|
+
raw_text = self._get_node_text(node)
|
|
1459
|
+
|
|
1460
|
+
# FIRST: Try regex parsing (most reliable for CREATE VIEW)
|
|
1461
|
+
if raw_text:
|
|
1462
|
+
# Pattern: CREATE VIEW [IF NOT EXISTS] view_name AS
|
|
1463
|
+
import re
|
|
1464
|
+
|
|
1465
|
+
match = re.search(
|
|
1466
|
+
r"CREATE\s+VIEW\s+(?:IF\s+NOT\s+EXISTS\s+)?(\w+)\s+AS",
|
|
1467
|
+
raw_text,
|
|
1468
|
+
re.IGNORECASE,
|
|
1469
|
+
)
|
|
1470
|
+
if match:
|
|
1471
|
+
potential_name = match.group(1).strip()
|
|
1472
|
+
if self._is_valid_identifier(potential_name):
|
|
1473
|
+
view_name = potential_name
|
|
1474
|
+
|
|
1475
|
+
# Fallback: Try AST parsing if regex didn't work
|
|
1476
|
+
if not view_name:
|
|
1477
|
+
for child in node.children:
|
|
1478
|
+
if child.type == "object_reference":
|
|
1479
|
+
for subchild in child.children:
|
|
1480
|
+
if subchild.type == "identifier":
|
|
1481
|
+
potential_name = self._get_node_text(
|
|
1482
|
+
subchild
|
|
1483
|
+
).strip()
|
|
1484
|
+
# Validate view name more strictly - exclude SQL keywords
|
|
1485
|
+
if (
|
|
1486
|
+
potential_name
|
|
1487
|
+
and self._is_valid_identifier(potential_name)
|
|
1488
|
+
and potential_name.upper()
|
|
1489
|
+
not in (
|
|
1490
|
+
"SELECT",
|
|
1491
|
+
"FROM",
|
|
1492
|
+
"WHERE",
|
|
1493
|
+
"AS",
|
|
1494
|
+
"IF",
|
|
1495
|
+
"NOT",
|
|
1496
|
+
"EXISTS",
|
|
1497
|
+
"NULL",
|
|
1498
|
+
"CURRENT_TIMESTAMP",
|
|
1499
|
+
"NOW",
|
|
1500
|
+
"SYSDATE",
|
|
1501
|
+
"COUNT",
|
|
1502
|
+
"SUM",
|
|
1503
|
+
"AVG",
|
|
1504
|
+
"MAX",
|
|
1505
|
+
"MIN",
|
|
1506
|
+
)
|
|
1507
|
+
):
|
|
1508
|
+
view_name = potential_name
|
|
1509
|
+
break
|
|
1510
|
+
if view_name:
|
|
1511
|
+
break
|
|
1512
|
+
|
|
1513
|
+
# Extract source tables from SELECT statement
|
|
1514
|
+
self._extract_view_sources(node, source_tables)
|
|
1515
|
+
|
|
1516
|
+
if view_name:
|
|
1517
|
+
try:
|
|
1518
|
+
start_line = node.start_point[0] + 1
|
|
1519
|
+
end_line = node.end_point[0] + 1
|
|
1520
|
+
raw_text = self._get_node_text(node)
|
|
1521
|
+
|
|
1522
|
+
view = SQLView(
|
|
1523
|
+
name=view_name,
|
|
1524
|
+
start_line=start_line,
|
|
1525
|
+
end_line=end_line,
|
|
1526
|
+
raw_text=raw_text,
|
|
1527
|
+
language="sql",
|
|
1528
|
+
source_tables=source_tables,
|
|
1529
|
+
dependencies=source_tables,
|
|
1530
|
+
)
|
|
1531
|
+
sql_elements.append(view)
|
|
1532
|
+
except Exception as e:
|
|
1533
|
+
log_debug(f"Failed to extract enhanced view: {e}")
|
|
1534
|
+
|
|
1535
|
+
def _extract_view_sources(
|
|
1536
|
+
self, view_node: "tree_sitter.Node", source_tables: list[str]
|
|
1537
|
+
) -> None:
|
|
1538
|
+
"""Extract source tables from view definition."""
|
|
1539
|
+
for node in self._traverse_nodes(view_node):
|
|
1540
|
+
if node.type == "from_clause":
|
|
1541
|
+
for child in self._traverse_nodes(node):
|
|
1542
|
+
if child.type == "object_reference":
|
|
1543
|
+
for subchild in child.children:
|
|
1544
|
+
if subchild.type == "identifier":
|
|
1545
|
+
table_name = self._get_node_text(subchild).strip()
|
|
1546
|
+
if table_name and table_name not in source_tables:
|
|
1547
|
+
source_tables.append(table_name)
|
|
1548
|
+
|
|
1549
|
+
def _extract_sql_procedures(
    self, root_node: "tree_sitter.Node", sql_elements: list[SQLElement]
) -> None:
    """Extract CREATE PROCEDURE statements with enhanced metadata.

    Two passes are performed, both appending to *sql_elements* in place:

    1. A line-oriented regex scan over ``self.source_code`` (primary;
       robust when tree-sitter fails to parse dialect-specific syntax).
    2. A scan of tree-sitter ``ERROR`` nodes (fallback; catches statements
       the grammar rejected that the line scan may have missed).
    """
    # Use regex-based approach to find all procedures in the source code
    import re

    lines = self.source_code.split("\n")

    # Pattern to match CREATE PROCEDURE statements
    # (pattern is applied per line via .match(); MULTILINE is harmless here)
    procedure_pattern = re.compile(
        r"^\s*CREATE\s+PROCEDURE\s+([a-zA-Z_][a-zA-Z0-9_]*)",
        re.IGNORECASE | re.MULTILINE,
    )

    i = 0
    while i < len(lines):
        line = lines[i].strip()
        if line.upper().startswith("CREATE") and "PROCEDURE" in line.upper():
            match = procedure_pattern.match(lines[i])
            if match:
                proc_name = match.group(1)
                start_line = i + 1  # 1-indexed line of the CREATE keyword

                # Find the end of the procedure (look for END; or END$$)
                end_line = start_line
                for j in range(i + 1, len(lines)):
                    if lines[j].strip().upper() in ["END;", "END$$", "END"]:
                        end_line = j + 1
                        break
                    elif lines[j].strip().upper().startswith("END;"):
                        # Catches e.g. "END; -- comment" on the same line
                        end_line = j + 1
                        break

                # Extract the full procedure text
                proc_lines = lines[i:end_line]
                raw_text = "\n".join(proc_lines)

                parameters = []
                dependencies = []

                # Extract parameters and dependencies from the text
                self._extract_procedure_parameters(raw_text, parameters)

                try:
                    procedure = SQLProcedure(
                        name=proc_name,
                        start_line=start_line,
                        end_line=end_line,
                        raw_text=raw_text,
                        language="sql",
                        parameters=parameters,
                        dependencies=dependencies,
                    )
                    sql_elements.append(procedure)
                    log_debug(
                        f"Extracted procedure: {proc_name} at lines {start_line}-{end_line}"
                    )
                except Exception as e:
                    log_debug(f"Failed to extract enhanced procedure: {e}")

                # Resume scanning after the procedure body; if no END was
                # found end_line == start_line == i + 1, so we still advance.
                i = end_line
            else:
                i += 1
        else:
            i += 1

    # Also try the original tree-sitter approach as fallback
    for node in self._traverse_nodes(root_node):
        if node.type == "ERROR":
            has_create = False
            node_text = self._get_node_text(node)
            node_text_upper = node_text.upper()

            for child in node.children:
                if child.type == "keyword_create":
                    has_create = True
                    break

            if has_create and "PROCEDURE" in node_text_upper:
                # Extract procedure name
                # Use finditer to find ALL procedures in the ERROR node
                matches = re.finditer(
                    r"CREATE\s+PROCEDURE\s+([a-zA-Z_][a-zA-Z0-9_]*)",
                    node_text,
                    re.IGNORECASE,
                )

                for match in matches:
                    proc_name = match.group(1)

                    # Check if this procedure was already extracted by regex
                    already_extracted = any(
                        hasattr(elem, "name") and elem.name == proc_name
                        for elem in sql_elements
                        if hasattr(elem, "sql_element_type")
                        and elem.sql_element_type.value == "procedure"
                    )

                    if not already_extracted:
                        # Extract parameters
                        # Note: This extracts parameters from the WHOLE node text, which might be wrong
                        # if there are multiple procedures. Ideally we should slice the text.
                        # But _extract_procedure_parameters parses the whole text.
                        # For now, we use the text starting from the match.
                        current_proc_text = node_text[match.start() :]

                        # Reset parameters and dependencies for each procedure
                        parameters = []
                        dependencies = []

                        self._extract_procedure_parameters(
                            current_proc_text, parameters
                        )

                        # Extract dependencies (table references)
                        # This still uses the whole node for dependencies, which is hard to fix without
                        # proper parsing, but acceptable for fallback.
                        self._extract_procedure_dependencies(node, dependencies)

                        try:
                            # Calculate start line
                            newlines_before = node_text[: match.start()].count("\n")
                            start_line = node.start_point[0] + 1 + newlines_before
                            end_line = node.end_point[0] + 1

                            # Use current_proc_text as raw_text
                            raw_text = current_proc_text

                            procedure = SQLProcedure(
                                name=proc_name,
                                start_line=start_line,
                                end_line=end_line,
                                raw_text=raw_text,
                                language="sql",
                                parameters=parameters,
                                dependencies=dependencies,
                            )
                            sql_elements.append(procedure)
                        except Exception as e:
                            log_debug(f"Failed to extract enhanced procedure: {e}")
def _extract_procedure_parameters(
    self, proc_text: str, parameters: list[SQLParameter]
) -> None:
    """Extract parameters from a procedure/function definition.

    Parses the parenthesised parameter list immediately after the
    PROCEDURE/FUNCTION name in *proc_text* and appends one
    ``SQLParameter`` per ``[IN|OUT|INOUT] name TYPE`` entry to
    *parameters* in place. Entries whose name collides with common SQL
    keywords/column names are skipped as regex false positives.
    """
    import re

    # First, extract the parameter section from the procedure/function definition
    # Look for the parameter list in parentheses after the procedure/function name
    param_section_match = re.search(
        r"(?:PROCEDURE|FUNCTION)\s+[a-zA-Z_][a-zA-Z0-9_]*\s*\(([^)]*)\)",
        proc_text,
        re.IGNORECASE | re.DOTALL,
    )

    if not param_section_match:
        return

    param_section = param_section_match.group(1).strip()
    if not param_section:
        return

    # Look for parameter patterns like: IN param_name TYPE
    # Only search within the parameter section to avoid SQL statement content
    # Ensure IN/OUT/INOUT is followed by space to avoid ambiguity
    param_matches = re.findall(
        r"(?:(?:IN|OUT|INOUT)\s+)?([a-zA-Z_][a-zA-Z0-9_]*)\s+([A-Z]+(?:\([^)]*\))?)",
        param_section,
        re.IGNORECASE,
    )
    for match in param_matches:
        param_name = match[0]
        data_type = match[1]

        # Skip common SQL keywords and column names that might be incorrectly matched
        if param_name.upper() in (
            "SELECT",
            "FROM",
            "WHERE",
            "INTO",
            "VALUES",
            "SET",
            "UPDATE",
            "INSERT",
            "DELETE",
            "CREATED_AT",
            "UPDATED_AT",
            "ID",
            "NAME",
            "EMAIL",
            "STATUS",
            "IN",
            "OUT",
            "INOUT",
        ):
            continue

        # Determine direction from the original text.
        # BUG FIX: INOUT must be checked BEFORE OUT — the substring
        # "OUT <name>" is contained in "INOUT <name>", so testing OUT
        # first misclassified every INOUT parameter as OUT.
        direction = "IN"  # Default
        if f"INOUT {param_name}" in param_section:
            direction = "INOUT"
        elif f"OUT {param_name}" in param_section:
            direction = "OUT"

        parameter = SQLParameter(
            name=param_name,
            data_type=data_type,
            direction=direction,
        )
        parameters.append(parameter)
def _extract_procedure_dependencies(
|
|
1762
|
+
self, proc_node: "tree_sitter.Node", dependencies: list[str]
|
|
1763
|
+
) -> None:
|
|
1764
|
+
"""Extract table dependencies from procedure body."""
|
|
1765
|
+
for node in self._traverse_nodes(proc_node):
|
|
1766
|
+
if node.type == "object_reference":
|
|
1767
|
+
for child in node.children:
|
|
1768
|
+
if child.type == "identifier":
|
|
1769
|
+
table_name = self._get_node_text(child).strip()
|
|
1770
|
+
if table_name and table_name not in dependencies:
|
|
1771
|
+
# Simple heuristic: if it's referenced in FROM, UPDATE, INSERT, etc.
|
|
1772
|
+
dependencies.append(table_name)
|
|
1773
|
+
|
|
1774
|
+
def _extract_sql_functions_enhanced(
    self, root_node: "tree_sitter.Node", sql_elements: list[SQLElement]
) -> None:
    """Extract CREATE FUNCTION statements with enhanced metadata.

    Two passes, both appending to *sql_elements* in place: a line-oriented
    regex scan of ``self.source_code`` with BEGIN/END nesting tracking
    (primary), then a tree-sitter ``create_function`` node scan (fallback,
    skipping names the first pass already produced).
    """
    # Use regex-based approach to find all functions in the source code
    import re

    lines = self.source_code.split("\n")

    # Pattern to match CREATE FUNCTION statements - requires opening parenthesis
    function_pattern = re.compile(
        r"^\s*CREATE\s+FUNCTION\s+([a-zA-Z_][a-zA-Z0-9_]*)\s*\(",
        re.IGNORECASE,
    )

    i = 0
    inside_function = False

    while i < len(lines):
        # Skip lines if we're inside a function body
        # NOTE(review): this list uses "END$" while the procedure scanner
        # uses "END$$" — confirm which delimiter convention is intended.
        if inside_function:
            if lines[i].strip().upper() in ["END;", "END$"] or lines[
                i
            ].strip().upper().startswith("END;"):
                inside_function = False
            i += 1
            continue

        # Only check for CREATE FUNCTION when not inside a function
        match = function_pattern.match(lines[i])
        if match:
            func_name = match.group(1)

            # Validate the function name using the centralized validation method
            if not self._is_valid_identifier(func_name):
                i += 1
                continue

            start_line = i + 1  # 1-indexed
            inside_function = True

            # Find the end of the function (look for END; or END$$)
            end_line = start_line
            nesting_level = 0

            for j in range(i + 1, len(lines)):
                line_stripped = lines[j].strip().upper()

                # Skip comments to avoid false positives
                if line_stripped.startswith("--") or line_stripped.startswith("#"):
                    continue

                # Handle nesting of BEGIN ... END blocks
                # This is a heuristic: if we see BEGIN, we expect a matching END;
                # We use word boundaries to avoid matching BEGIN in other contexts if possible
                if re.search(r"\bBEGIN\b", line_stripped):
                    nesting_level += 1

                is_end = False
                if line_stripped in ["END;", "END$", "END"]:
                    is_end = True
                elif line_stripped.startswith("END;"):
                    is_end = True

                if is_end:
                    if nesting_level > 0:
                        nesting_level -= 1

                    # Outermost END closes the function body
                    if nesting_level == 0:
                        end_line = j + 1
                        inside_function = False
                        break

            # Extract the full function text
            func_lines = lines[i:end_line]
            raw_text = "\n".join(func_lines)

            parameters = []
            dependencies = []
            return_type = None

            # Extract parameters, return type and dependencies from the text
            self._extract_procedure_parameters(raw_text, parameters)

            # Extract return type
            returns_match = re.search(
                r"RETURNS\s+([A-Z]+(?:\([^)]*\))?)", raw_text, re.IGNORECASE
            )
            if returns_match:
                return_type = returns_match.group(1)

            try:
                function = SQLFunction(
                    name=func_name,
                    start_line=start_line,
                    end_line=end_line,
                    raw_text=raw_text,
                    language="sql",
                    parameters=parameters,
                    dependencies=dependencies,
                    return_type=return_type,
                )
                sql_elements.append(function)
                log_debug(
                    f"Extracted function: {func_name} at lines {start_line}-{end_line}"
                )
            except Exception as e:
                log_debug(f"Failed to extract enhanced function: {e}")

            # Resume after the function body
            i = end_line
        else:
            i += 1

    # Also try the original tree-sitter approach as fallback
    for node in self._traverse_nodes(root_node):
        if node.type == "create_function":
            func_name = None
            parameters = []
            return_type = None
            dependencies = []

            # Extract function name - only from the FIRST object_reference child
            # This should be the function name, not references within the function body
            found_first_object_ref = False
            for child in node.children:
                if child.type == "object_reference" and not found_first_object_ref:
                    found_first_object_ref = True
                    for subchild in child.children:
                        if subchild.type == "identifier":
                            func_name = self._get_node_text(subchild).strip()
                            # Validate function name using centralized validation
                            if func_name and self._is_valid_identifier(func_name):
                                break
                            else:
                                func_name = None
                    if func_name:
                        break

            if func_name:
                # Check if this function was already extracted by regex
                already_extracted = any(
                    hasattr(elem, "name") and elem.name == func_name
                    for elem in sql_elements
                    if hasattr(elem, "sql_element_type")
                    and elem.sql_element_type.value == "function"
                )

                if not already_extracted:
                    # Extract return type and other metadata
                    # NOTE(review): rebinding the str parameter inside the
                    # helper cannot propagate, so return_type stays None on
                    # this path — confirm intended.
                    self._extract_function_metadata(
                        node, parameters, return_type, dependencies
                    )

                    try:
                        start_line = node.start_point[0] + 1
                        end_line = node.end_point[0] + 1
                        raw_text = self._get_node_text(node)

                        function = SQLFunction(
                            name=func_name,
                            start_line=start_line,
                            end_line=end_line,
                            raw_text=raw_text,
                            language="sql",
                            parameters=parameters,
                            dependencies=dependencies,
                            return_type=return_type,
                        )
                        sql_elements.append(function)
                    except Exception as e:
                        log_debug(f"Failed to extract enhanced function: {e}")
def _extract_function_metadata(
|
|
1947
|
+
self,
|
|
1948
|
+
func_node: "tree_sitter.Node",
|
|
1949
|
+
parameters: list[SQLParameter],
|
|
1950
|
+
return_type: str | None,
|
|
1951
|
+
dependencies: list[str],
|
|
1952
|
+
) -> None:
|
|
1953
|
+
"""Extract function metadata including parameters and return type."""
|
|
1954
|
+
func_text = self._get_node_text(func_node)
|
|
1955
|
+
|
|
1956
|
+
# Extract return type
|
|
1957
|
+
import re
|
|
1958
|
+
|
|
1959
|
+
returns_match = re.search(
|
|
1960
|
+
r"RETURNS\s+([A-Z]+(?:\([^)]*\))?)", func_text, re.IGNORECASE
|
|
1961
|
+
)
|
|
1962
|
+
if returns_match:
|
|
1963
|
+
_return_type = returns_match.group(1) # Reserved for future use
|
|
1964
|
+
|
|
1965
|
+
# Extract parameters (similar to procedure parameters)
|
|
1966
|
+
self._extract_procedure_parameters(func_text, parameters)
|
|
1967
|
+
|
|
1968
|
+
# Extract dependencies
|
|
1969
|
+
self._extract_procedure_dependencies(func_node, dependencies)
|
|
1970
|
+
|
|
1971
|
+
def _extract_sql_triggers(
    self, root_node: "tree_sitter.Node", sql_elements: list[SQLElement]
) -> None:
    """Extract CREATE TRIGGER statements with enhanced metadata.

    Scans the raw source with a regex (rather than the AST) for accurate
    character positions, derives line numbers by counting newlines, and
    appends one ``SQLTrigger`` per unique, valid trigger name to
    *sql_elements* in place.
    """
    import re

    # Use self.source_code which is set by parent method _extract_sql_elements
    # This is more reliable than _get_node_text(root_node) which may fail
    # on some platforms due to encoding or byte offset issues
    source_code = self.source_code

    if not source_code:
        log_debug("WARNING: source_code is empty in _extract_sql_triggers")
        return

    # Track processed triggers by name to avoid duplicates
    processed_triggers = set()

    # Use regex on the full source to find all triggers with accurate positions
    trigger_pattern = re.compile(
        r"CREATE\s+TRIGGER\s+([a-zA-Z_][a-zA-Z0-9_]*)", re.IGNORECASE | re.MULTILINE
    )

    trigger_matches = list(trigger_pattern.finditer(source_code))
    log_debug(f"Found {len(trigger_matches)} CREATE TRIGGER statements in source")

    for match in trigger_matches:
        trigger_name = match.group(1)

        # Skip if already processed
        if trigger_name in processed_triggers:
            continue

        if not self._is_valid_identifier(trigger_name):
            continue

        # Skip invalid trigger names (too short or common SQL keywords)
        if len(trigger_name) <= 2:
            continue

        # Skip common SQL keywords that might be incorrectly identified
        if trigger_name.upper() in (
            "KEY",
            "AUTO_INCREMENT",
            "PRIMARY",
            "FOREIGN",
            "INDEX",
            "UNIQUE",
        ):
            continue

        # Mark as processed
        processed_triggers.add(trigger_name)

        # Calculate start line (1-indexed)
        start_line = source_code[: match.start()].count("\n") + 1

        # Find the end of this trigger statement (looking for the END keyword followed by semicolon)
        trigger_start_pos = match.start()
        # Search for END; after the trigger definition
        # NOTE(review): this finds the FIRST "END;" after the trigger start,
        # which may close a nested block rather than the trigger — confirm
        # against triggers containing inner BEGIN...END blocks.
        end_pattern = re.compile(r"\bEND\s*;", re.IGNORECASE)
        end_match = end_pattern.search(source_code, trigger_start_pos)

        if end_match:
            end_line = source_code[: end_match.end()].count("\n") + 1
            trigger_text = source_code[trigger_start_pos : end_match.end()]
        else:
            # Fallback: use a reasonable default
            end_line = start_line + 20
            trigger_text = source_code[trigger_start_pos : trigger_start_pos + 500]

        # Extract trigger metadata from the extracted text
        trigger_timing, trigger_event, table_name = self._extract_trigger_metadata(
            trigger_text
        )

        try:
            trigger = SQLTrigger(
                name=trigger_name,
                start_line=start_line,
                end_line=end_line,
                raw_text=trigger_text,
                language="sql",
                table_name=table_name,
                trigger_timing=trigger_timing,
                trigger_event=trigger_event,
                dependencies=[table_name] if table_name else [],
            )
            sql_elements.append(trigger)
        except Exception as e:
            log_debug(f"Failed to extract enhanced trigger: {e}")
def _extract_trigger_metadata(
|
|
2064
|
+
self,
|
|
2065
|
+
trigger_text: str,
|
|
2066
|
+
) -> tuple[str | None, str | None, str | None]:
|
|
2067
|
+
"""Extract trigger timing, event, and target table."""
|
|
2068
|
+
import re
|
|
2069
|
+
|
|
2070
|
+
timing = None
|
|
2071
|
+
event = None
|
|
2072
|
+
table_name = None
|
|
2073
|
+
|
|
2074
|
+
# Extract timing (BEFORE/AFTER)
|
|
2075
|
+
timing_match = re.search(r"(BEFORE|AFTER)", trigger_text, re.IGNORECASE)
|
|
2076
|
+
if timing_match:
|
|
2077
|
+
timing = timing_match.group(1).upper()
|
|
2078
|
+
|
|
2079
|
+
# Extract event (INSERT/UPDATE/DELETE)
|
|
2080
|
+
event_match = re.search(r"(INSERT|UPDATE|DELETE)", trigger_text, re.IGNORECASE)
|
|
2081
|
+
if event_match:
|
|
2082
|
+
event = event_match.group(1).upper()
|
|
2083
|
+
|
|
2084
|
+
# Extract target table
|
|
2085
|
+
table_match = re.search(
|
|
2086
|
+
r"ON\s+([a-zA-Z_][a-zA-Z0-9_]*)", trigger_text, re.IGNORECASE
|
|
2087
|
+
)
|
|
2088
|
+
if table_match:
|
|
2089
|
+
table_name = table_match.group(1)
|
|
2090
|
+
|
|
2091
|
+
return timing, event, table_name
|
|
2092
|
+
|
|
2093
|
+
def _extract_sql_indexes(
    self, root_node: "tree_sitter.Node", sql_elements: list[SQLElement]
) -> None:
    """Extract CREATE INDEX statements with enhanced metadata.

    Primary pass walks tree-sitter ``create_index`` nodes; a regex scan
    of the raw source runs afterwards as a fallback. *processed_indexes*
    is shared between the two passes so each index name is emitted once.
    """
    processed_indexes = set()  # Track processed indexes to avoid duplicates

    # First try tree-sitter parsing
    for node in self._traverse_nodes(root_node):
        if node.type == "create_index":
            index_name = None

            # Use regex to extract index name from raw text for better accuracy
            import re

            raw_text = self._get_node_text(node)
            # Pattern: CREATE [UNIQUE] INDEX index_name ON table_name
            index_pattern = re.search(
                r"CREATE\s+(?:UNIQUE\s+)?INDEX\s+([a-zA-Z_][a-zA-Z0-9_]*)\s+ON",
                raw_text,
                re.IGNORECASE,
            )
            if index_pattern:
                extracted_name = index_pattern.group(1)
                # Validate index name
                if self._is_valid_identifier(extracted_name):
                    index_name = extracted_name

            if index_name and index_name not in processed_indexes:
                try:
                    start_line = node.start_point[0] + 1  # 1-indexed
                    end_line = node.end_point[0] + 1
                    raw_text = self._get_node_text(node)

                    # Create index object first (placeholder metadata;
                    # populated by _extract_index_metadata below)
                    index = SQLIndex(
                        name=index_name,
                        start_line=start_line,
                        end_line=end_line,
                        raw_text=raw_text,
                        language="sql",
                        table_name=None,
                        indexed_columns=[],
                        is_unique=False,
                        dependencies=[],
                    )

                    # Extract metadata and populate the index object
                    self._extract_index_metadata(node, index)

                    sql_elements.append(index)
                    processed_indexes.add(index_name)
                    log_debug(
                        f"Extracted index: {index_name} on table {index.table_name}"
                    )
                except Exception as e:
                    log_debug(f"Failed to extract enhanced index {index_name}: {e}")

    # Add regex-based fallback for indexes that tree-sitter might miss
    self._extract_indexes_with_regex(sql_elements, processed_indexes)
def _extract_index_metadata(
|
|
2154
|
+
self,
|
|
2155
|
+
index_node: "tree_sitter.Node",
|
|
2156
|
+
index: "SQLIndex",
|
|
2157
|
+
) -> None:
|
|
2158
|
+
"""Extract index metadata including target table and columns."""
|
|
2159
|
+
index_text = self._get_node_text(index_node)
|
|
2160
|
+
|
|
2161
|
+
# Check for UNIQUE keyword
|
|
2162
|
+
if "UNIQUE" in index_text.upper():
|
|
2163
|
+
index.is_unique = True
|
|
2164
|
+
|
|
2165
|
+
# Extract table name
|
|
2166
|
+
import re
|
|
2167
|
+
|
|
2168
|
+
table_match = re.search(
|
|
2169
|
+
r"ON\s+([a-zA-Z_][a-zA-Z0-9_]*)", index_text, re.IGNORECASE
|
|
2170
|
+
)
|
|
2171
|
+
if table_match:
|
|
2172
|
+
index.table_name = table_match.group(1)
|
|
2173
|
+
# Update dependencies
|
|
2174
|
+
if index.table_name and index.table_name not in index.dependencies:
|
|
2175
|
+
index.dependencies.append(index.table_name)
|
|
2176
|
+
|
|
2177
|
+
# Extract column names
|
|
2178
|
+
columns_match = re.search(r"\(([^)]+)\)", index_text)
|
|
2179
|
+
if columns_match:
|
|
2180
|
+
columns_str = columns_match.group(1)
|
|
2181
|
+
columns = [col.strip() for col in columns_str.split(",")]
|
|
2182
|
+
index.indexed_columns.extend(columns)
|
|
2183
|
+
|
|
2184
|
+
def _extract_indexes_with_regex(
    self, sql_elements: list[SQLElement], processed_indexes: set[str]
) -> None:
    """Regex fallback that scans raw source lines for CREATE INDEX statements.

    Complements the tree-sitter pass: any single-line CREATE [UNIQUE]
    INDEX whose name is not already in *processed_indexes* is appended to
    *sql_elements*, and its name is recorded to prevent duplicates.
    """
    import re

    # Pattern to match single-line CREATE INDEX statements
    statement_re = re.compile(
        r"^\s*CREATE\s+(UNIQUE\s+)?INDEX\s+([a-zA-Z_][a-zA-Z0-9_]*)\s+ON\s+([a-zA-Z_][a-zA-Z0-9_]*)\s*\(([^)]+)\)",
        re.IGNORECASE | re.MULTILINE,
    )

    for line_no, raw_line in enumerate(self.source_code.split("\n"), 1):
        stripped = raw_line.strip()
        upper = stripped.upper()
        # Cheap pre-filter before running the full regex
        if not upper.startswith("CREATE") or "INDEX" not in upper:
            continue

        found = statement_re.match(stripped)
        if not found:
            continue

        unique_flag, index_name, table_name, columns_raw = found.group(1, 2, 3, 4)

        # Skip anything the tree-sitter pass already produced
        if index_name in processed_indexes:
            continue

        # Parse the comma-separated column list
        columns = [col.strip() for col in columns_raw.split(",")]

        try:
            sql_elements.append(
                SQLIndex(
                    name=index_name,
                    start_line=line_no,
                    end_line=line_no,
                    raw_text=stripped,
                    language="sql",
                    table_name=table_name,
                    indexed_columns=columns,
                    is_unique=unique_flag is not None,
                    dependencies=[table_name] if table_name else [],
                )
            )
            processed_indexes.add(index_name)
            log_debug(
                f"Regex extracted index: {index_name} on table {table_name}"
            )
        except Exception as e:
            log_debug(
                f"Failed to create regex-extracted index {index_name}: {e}"
            )
class SQLPlugin(LanguagePlugin):
|
|
2244
|
+
"""
|
|
2245
|
+
SQL language plugin implementation.
|
|
2246
|
+
|
|
2247
|
+
Provides SQL language support for tree-sitter-analyzer, enabling analysis
|
|
2248
|
+
of SQL files including database schema definitions, stored procedures,
|
|
2249
|
+
functions, triggers, and indexes.
|
|
2250
|
+
|
|
2251
|
+
The plugin follows the standard LanguagePlugin interface and integrates
|
|
2252
|
+
with the plugin manager for automatic discovery. It requires the
|
|
2253
|
+
tree-sitter-sql package to be installed (available as optional dependency).
|
|
2254
|
+
"""
|
|
2255
|
+
|
|
2256
|
+
def __init__(self, diagnostic_mode: bool = False) -> None:
    """
    Initialize the SQL language plugin.

    Sets up the extractor instance and caches for tree-sitter language
    loading. The plugin supports .sql file extensions.

    Args:
        diagnostic_mode: When True, emit verbose platform/profile
            diagnostics via log_debug during initialization.
    """
    super().__init__()
    self.diagnostic_mode = diagnostic_mode
    self.extractor = SQLElementExtractor(diagnostic_mode=diagnostic_mode)
    self.language = "sql"  # Add language property for test compatibility
    self.supported_extensions = self.get_file_extensions()
    self._cached_language: Any | None = None  # Cache for tree-sitter language

    # Platform compatibility initialization
    # Any failure here falls back to a default CompatibilityAdapter so the
    # plugin remains usable without platform-specific behavior profiles.
    self.platform_info = None
    try:
        self.platform_info = PlatformDetector.detect()
        self.extractor.platform_info = self.platform_info

        platform_info = self.platform_info
        profile = BehaviorProfile.load(platform_info.platform_key)

        if self.diagnostic_mode:
            log_debug(f"Diagnostic: Platform detected: {platform_info}")
            if profile:
                log_debug(
                    f"Diagnostic: Loaded SQL behavior profile for {platform_info.platform_key}"
                )
                log_debug(f"Diagnostic: Profile rules: {profile.adaptation_rules}")
            else:
                log_debug(
                    f"Diagnostic: No SQL behavior profile found for {platform_info.platform_key}"
                )
        elif profile:
            log_debug(
                f"Loaded SQL behavior profile for {platform_info.platform_key}"
            )
        else:
            log_debug(
                f"No SQL behavior profile found for {platform_info.platform_key}, using defaults"
            )

        self.adapter = CompatibilityAdapter(profile)
        self.extractor.set_adapter(self.adapter)
    except Exception as e:
        log_error(f"Failed to initialize SQL platform compatibility: {e}")
        self.adapter = CompatibilityAdapter(None)  # Use default adapter
        self.extractor.set_adapter(self.adapter)
|
|
2306
|
+
def get_tree_sitter_language(self) -> Any:
    """Return the tree-sitter language object for SQL.

    The language object is built once and memoized on the instance
    (``self._cached_language``); later calls return the cached object.

    Returns:
        The tree-sitter language object.

    Raises:
        RuntimeError: If tree-sitter-sql is not installed.
    """
    cached = self._cached_language
    if cached:
        return cached

    # Import lazily so the plugin can be constructed without the
    # optional tree-sitter-sql dependency installed.
    try:
        import tree_sitter
        import tree_sitter_sql
    except ImportError as e:
        raise RuntimeError(
            "tree-sitter-sql is required for SQL analysis but not installed."
        ) from e

    self._cached_language = tree_sitter.Language(tree_sitter_sql.language())
    return self._cached_language
|
|
2329
|
+
|
|
2330
|
+
def get_language_name(self) -> str:
    """Return the canonical name of the language handled by this plugin."""
    return "sql"
|
|
2333
|
+
|
|
2334
|
+
def get_file_extensions(self) -> list[str]:
    """Return the file extensions (with leading dot) this plugin supports."""
    return [".sql"]
|
|
2337
|
+
|
|
2338
|
+
def create_extractor(self) -> ElementExtractor:
    """Create a new SQL element extractor instance.

    The extractor is created with this plugin's diagnostic mode so that
    factory-created extractors behave consistently with ``self.extractor``
    built in ``__init__`` (which forwards ``diagnostic_mode``); previously
    the flag was silently dropped here.

    Returns:
        A fresh ``SQLElementExtractor``.
    """
    return SQLElementExtractor(diagnostic_mode=self.diagnostic_mode)
|
|
2341
|
+
|
|
2342
|
+
def extract_elements(self, tree: Any, source_code: str) -> dict[str, list[Any]]:
    """
    Legacy element-extraction entry point.

    Maintained for backward compatibility and testing. Delegates to the
    plugin's extractor, then sorts each SQL element into one of four
    legacy buckets by its ``element_type``.

    Args:
        tree: Tree-sitter AST tree
        source_code: Source code string

    Returns:
        Dictionary with keys 'functions', 'classes', 'variables', 'imports'
    """
    # Map each recognized element_type to its legacy result bucket;
    # unrecognized types are dropped, matching the historical behavior.
    bucket_for = {
        "function": "functions",
        "procedure": "functions",
        "trigger": "functions",
        "class": "classes",
        "table": "classes",
        "view": "classes",
        "variable": "variables",
        "index": "variables",
        "import": "imports",
    }

    grouped: dict[str, list[Any]] = {
        "functions": [],
        "classes": [],
        "variables": [],
        "imports": [],
    }

    for element in self.extractor.extract_sql_elements(tree, source_code):
        bucket = bucket_for.get(element.element_type)
        if bucket is not None:
            grouped[bucket].append(element)

    return grouped
|
|
2369
|
+
|
|
2370
|
+
async def analyze_file(
    self, file_path: str, request: "AnalysisRequest"
) -> "AnalysisResult":
    """
    Analyze an SQL file and return structured results.

    Parses the SQL file using tree-sitter-sql, extracts database elements
    (tables, views, procedures, functions, triggers, indexes), and returns
    an AnalysisResult with all extracted information. Any failure (I/O,
    parse, extraction) is captured and reported via a failed result rather
    than raised.

    Args:
        file_path: Path to the file to analyze
        request: Analysis request object (currently unused here)

    Returns:
        AnalysisResult object containing extracted elements
    """
    from ..core.parser import Parser
    from ..models import AnalysisResult

    def _result(
        source: str,
        *,
        elements: list,
        node_count: int,
        success: bool,
        error_message: str | None,
    ) -> "AnalysisResult":
        # Assemble an AnalysisResult with the fields common to every outcome.
        return AnalysisResult(
            file_path=file_path,
            language="sql",
            line_count=len(source.splitlines()),
            elements=elements,
            node_count=node_count,
            query_results={},
            source_code=source,
            success=success,
            error_message=error_message,
        )

    try:
        # Read file content.
        with open(file_path, encoding="utf-8") as f:
            source_code = f.read()

        # Parse using the core parser.
        parse_result = Parser().parse_code(source_code, "sql", file_path)

        if not parse_result.success:
            return _result(
                source_code,
                elements=[],
                node_count=0,
                success=False,
                error_message=parse_result.error_message,
            )

        # Extract database elements from the parse tree.
        extracted = self.extractor.extract_sql_elements(
            parse_result.tree, source_code
        )
        nodes = parse_result.tree.root_node.end_byte if parse_result.tree else 0
        return _result(
            source_code,
            elements=extracted,
            node_count=nodes,
            success=True,
            error_message=None,
        )
    except Exception as e:
        log_error(f"Failed to analyze SQL file {file_path}: {e}")
        return _result(
            "",
            elements=[],
            node_count=0,
            success=False,
            error_message=str(e),
        )
|